1
1

Add a couple of new tests to the orte system.

Modify the job_complete check so that we don't kill the job when a single proc is terminated by ORTE command via plm.terminate_procs.

The job still dies gracefully on a ctrl-c, and behaves as before when using plm.terminate_job.

This commit was SVN r22227.
Этот коммит содержится в:
Ralph Castain 2009-11-20 01:47:49 +00:00
родитель 5e031d9ded
Коммит 92733b13d9
5 изменённых файлов: 90 добавлений и 22 удалений

Просмотреть файл

@ -1311,7 +1311,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
* as abnormally terminated, then do not update its state
*
* Treat termination of any process in a continuously operating job as
* an error
* an error unless it was specifically commanded
*/
if (jdata->state < ORTE_JOB_STATE_TERMINATED ||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
@ -1375,12 +1375,16 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
}
break;
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
/* we ordered this proc to die */
jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD;
jdata->aborted_proc = NULL; /* no reason to save it */
/* use the default exit code since we ordered the termination */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* now just check the remaining jobs to see if anyone is still alive */
/* we ordered this proc to die, so it isn't an abnormal termination
* and we don't flag it as such - just check the remaining jobs to
* see if anyone is still alive
*/
if (jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated - now we need to check to see if ALL
* the other jobs have also completed and wakeup if that is true
*/
jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD;
}
goto CHECK_ALL_JOBS;
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
@ -1446,7 +1450,7 @@ CHECK_ALL_JOBS:
* anything further - just return here
*/
if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) {
return;
goto CHECK_ALIVE;
}
/* if the job that is being checked is the HNP, then we are
@ -1511,6 +1515,7 @@ CHECK_ALL_JOBS:
jdata->map = NULL;
}
CHECK_ALIVE:
/* now check to see if all jobs are done - release this jdata
* object when we find it
*/
@ -1529,7 +1534,8 @@ CHECK_ALL_JOBS:
* report appropriately to the user
*/
if (NULL != jdata && job->jobid == jdata->jobid &&
jdata->state == ORTE_JOB_STATE_TERMINATED) {
(jdata->state == ORTE_JOB_STATE_TERMINATED ||
jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) {
/* release this object, ensuring that the
* pointer array internal accounting
* is maintained!

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault
all: $(PROGS)

Просмотреть файл

@ -8,6 +8,7 @@
#include "opal/dss/dss.h"
#include "opal/event/event.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
@ -43,6 +44,7 @@ int main(int argc, char* argv[])
opal_buffer_t buf, *bfptr;
int32_t i32=1;
struct iovec iovec_array[3];
orte_rmcast_channel_t chan=4;
if (0 > (rc = orte_init(ORTE_PROC_NON_MPI))) {
fprintf(stderr, "orte_nodename: couldn't init orte - error code %d\n", rc);
@ -58,13 +60,17 @@ int main(int argc, char* argv[])
if (0 == ORTE_PROC_MY_NAME->vpid) {
/* open a new channel */
if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
ORTE_ERROR_LOG(rc);
goto blast;
}
orte_grpcomm.barrier();
fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid);
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &i32, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
ORTE_RMCAST_TAG_WILDCARD, &buf))) {
ORTE_RMCAST_TAG_ANNOUNCE, &buf))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
goto blast;
@ -74,8 +80,8 @@ int main(int argc, char* argv[])
bfptr = OBJ_NEW(opal_buffer_t);
i32 = 2;
opal_dss.pack(bfptr, &i32, 1, OPAL_INT32);
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
ORTE_RMCAST_TAG_WILDCARD, bfptr,
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(chan,
ORTE_RMCAST_TAG_OUTPUT, bfptr,
cbfunc_buf_snt, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(bfptr);
@ -98,12 +104,22 @@ int main(int argc, char* argv[])
orte_finalize();
return 0;
} else {
/* open a new channel */
if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_RECV))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
ORTE_RMCAST_TAG_WILDCARD,
ORTE_RMCAST_PERSISTENT,
cbfunc, NULL))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(chan,
ORTE_RMCAST_TAG_WILDCARD,
ORTE_RMCAST_PERSISTENT,
cbfunc, NULL))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
ORTE_RMCAST_TAG_WILDCARD,
ORTE_RMCAST_PERSISTENT,
@ -112,7 +128,6 @@ int main(int argc, char* argv[])
}
orte_grpcomm.barrier();
fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid);
}
opal_event_dispatch();
@ -134,10 +149,9 @@ static void cbfunc(int status,
rc = 1;
opal_dss.unpack(buffer, &i32, &rc, OPAL_INT32);
fprintf(stderr, "%s GOT BUFFER MESSAGE from %s with value %d\n",
opal_output(0, "%s GOT BUFFER MESSAGE from %s on channel %d tag %d with value %d\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender), i32);
fflush(stderr);
ORTE_NAME_PRINT(sender), channel, tag, i32);
orte_grpcomm.barrier();
@ -172,9 +186,9 @@ static void cbfunc_iovec(int status,
{
int rc;
fprintf(stderr, "%s GOT IOVEC MESSAGE from %s of %d elements\n",
opal_output(0, "%s GOT IOVEC MESSAGE from %s of %d elements\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), count);
fflush(stderr);
#if 0
if (0 != ORTE_PROC_MY_NAME->vpid) {
/* send it back */
@ -195,8 +209,7 @@ static void cbfunc_buf_snt(int status,
orte_process_name_t *sender,
opal_buffer_t *buf, void *cbdata)
{
fprintf(stderr, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
fflush(stderr);
opal_output(0, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
OBJ_RELEASE(buf);
}

31
orte/test/system/orte_spin.c Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that just spins - provides mechanism for testing user-driven
* abnormal program termination
*/
#include <stdio.h>
#include "orte/runtime/runtime.h"
/*
 * Initialize ORTE and then spin forever, burning CPU.
 *
 * The process never exits on its own; it exists so that externally driven
 * termination can be exercised against a proc that is busy running.
 *
 * Returns 1 if ORTE could not be initialized. Otherwise it does not return.
 */
int main(int argc, char* argv[])
{
    int i;
    int rc;
    double pi;

    /* do not start spinning as an orphan if the runtime failed to come up */
    if (0 > (rc = orte_init(ORTE_PROC_NON_MPI))) {
        fprintf(stderr, "orte_spin: couldn't init orte - error code %d\n", rc);
        return 1;
    }

    i = 0;
    while (1) {
        /* throwaway arithmetic - the result is never consumed */
        i++;
        pi = i / 3.14159265;
        /* keep the counter bounded so it can never overflow */
        if (i > 100) i = 0;
    }

    /* not reached: the loop above has no exit */
    orte_finalize();
    return 0;
}

18
orte/test/system/segfault.c Обычный файл
Просмотреть файл

@ -0,0 +1,18 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that just segfaults
*/
#include <stdio.h>
/*
 * Deliberately crash by reading through a NULL pointer.
 *
 * The pointed-to type is volatile so the compiler must actually perform
 * the load; with a plain pointer the unused read may be optimized away,
 * letting the program exit normally instead of faulting.
 */
int main(int argc, char* argv[])
{
    double pi;
    volatile char *dum = NULL;

    /* this read is the whole point of the program - it is expected to fault */
    pi = (double)*dum;
    (void)pi;

    /* only reached if the NULL read did not fault */
    return 0;
}