Add a couple of new tests to the orte system.
Modify the job_complete check so we don't kill jobs when a single proc was terminated by ORTE command via plm.terminate_procs. Still dies gracefully with a ctrl-c, and behaves as before when using plm.terminate_job. This commit was SVN r22227.
Этот коммит содержится в:
родитель
5e031d9ded
Коммит
92733b13d9
@ -1311,7 +1311,7 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
* as abnormally terminated, then do not update its state
|
||||
*
|
||||
* Treat termination of any process in a continuously operating job as
|
||||
* an error
|
||||
* an error unless it was specifically commanded
|
||||
*/
|
||||
if (jdata->state < ORTE_JOB_STATE_TERMINATED ||
|
||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
||||
@ -1375,12 +1375,16 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
||||
}
|
||||
break;
|
||||
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||
/* we ordered this proc to die */
|
||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||
* and we don't flag it as such - just check the remaining jobs to
|
||||
* see if anyone is still alive
|
||||
*/
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated - now we need to check to see if ALL
|
||||
* the other jobs have also completed and wakeup if that is true
|
||||
*/
|
||||
jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD;
|
||||
jdata->aborted_proc = NULL; /* no reason to save it */
|
||||
/* use the default exit code since we ordered the termination */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* now just check the remaining jobs to see if anyone is still alive */
|
||||
}
|
||||
goto CHECK_ALL_JOBS;
|
||||
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
||||
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
|
||||
@ -1446,7 +1450,7 @@ CHECK_ALL_JOBS:
|
||||
* anything further - just return here
|
||||
*/
|
||||
if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) {
|
||||
return;
|
||||
goto CHECK_ALIVE;
|
||||
}
|
||||
|
||||
/* if the job that is being checked is the HNP, then we are
|
||||
@ -1511,6 +1515,7 @@ CHECK_ALL_JOBS:
|
||||
jdata->map = NULL;
|
||||
}
|
||||
|
||||
CHECK_ALIVE:
|
||||
/* now check to see if all jobs are done - release this jdata
|
||||
* object when we find it
|
||||
*/
|
||||
@ -1529,7 +1534,8 @@ CHECK_ALL_JOBS:
|
||||
* report appropriately to the user
|
||||
*/
|
||||
if (NULL != jdata && job->jobid == jdata->jobid &&
|
||||
jdata->state == ORTE_JOB_STATE_TERMINATED) {
|
||||
(jdata->state == ORTE_JOB_STATE_TERMINATED ||
|
||||
jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) {
|
||||
/* release this object, ensuring that the
|
||||
* pointer array internal accounting
|
||||
* is maintained!
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -43,6 +44,7 @@ int main(int argc, char* argv[])
|
||||
opal_buffer_t buf, *bfptr;
|
||||
int32_t i32=1;
|
||||
struct iovec iovec_array[3];
|
||||
orte_rmcast_channel_t chan=4;
|
||||
|
||||
if (0 > (rc = orte_init(ORTE_PROC_NON_MPI))) {
|
||||
fprintf(stderr, "orte_nodename: couldn't init orte - error code %d\n", rc);
|
||||
@ -58,13 +60,17 @@ int main(int argc, char* argv[])
|
||||
|
||||
|
||||
if (0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* open a new channel */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto blast;
|
||||
}
|
||||
orte_grpcomm.barrier();
|
||||
fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid);
|
||||
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &i32, 1, OPAL_INT32);
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
|
||||
ORTE_RMCAST_TAG_WILDCARD, &buf))) {
|
||||
ORTE_RMCAST_TAG_ANNOUNCE, &buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
goto blast;
|
||||
@ -74,8 +80,8 @@ int main(int argc, char* argv[])
|
||||
bfptr = OBJ_NEW(opal_buffer_t);
|
||||
i32 = 2;
|
||||
opal_dss.pack(bfptr, &i32, 1, OPAL_INT32);
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
|
||||
ORTE_RMCAST_TAG_WILDCARD, bfptr,
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer_nb(chan,
|
||||
ORTE_RMCAST_TAG_OUTPUT, bfptr,
|
||||
cbfunc_buf_snt, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(bfptr);
|
||||
@ -98,12 +104,22 @@ int main(int argc, char* argv[])
|
||||
orte_finalize();
|
||||
return 0;
|
||||
} else {
|
||||
/* open a new channel */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(&chan, "orte_mcast", NULL, -1, NULL, ORTE_RMCAST_RECV))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
|
||||
ORTE_RMCAST_TAG_WILDCARD,
|
||||
ORTE_RMCAST_PERSISTENT,
|
||||
cbfunc, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(chan,
|
||||
ORTE_RMCAST_TAG_WILDCARD,
|
||||
ORTE_RMCAST_PERSISTENT,
|
||||
cbfunc, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(ORTE_RMCAST_APP_PUBLIC_CHANNEL,
|
||||
ORTE_RMCAST_TAG_WILDCARD,
|
||||
ORTE_RMCAST_PERSISTENT,
|
||||
@ -112,7 +128,6 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
|
||||
orte_grpcomm.barrier();
|
||||
fprintf(stderr, "%d: past barrier\n", (int)ORTE_PROC_MY_NAME->vpid);
|
||||
|
||||
}
|
||||
opal_event_dispatch();
|
||||
@ -134,10 +149,9 @@ static void cbfunc(int status,
|
||||
rc = 1;
|
||||
opal_dss.unpack(buffer, &i32, &rc, OPAL_INT32);
|
||||
|
||||
fprintf(stderr, "%s GOT BUFFER MESSAGE from %s with value %d\n",
|
||||
opal_output(0, "%s GOT BUFFER MESSAGE from %s on channel %d tag %d with value %d\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender), i32);
|
||||
fflush(stderr);
|
||||
ORTE_NAME_PRINT(sender), channel, tag, i32);
|
||||
|
||||
orte_grpcomm.barrier();
|
||||
|
||||
@ -172,9 +186,9 @@ static void cbfunc_iovec(int status,
|
||||
{
|
||||
int rc;
|
||||
|
||||
fprintf(stderr, "%s GOT IOVEC MESSAGE from %s of %d elements\n",
|
||||
opal_output(0, "%s GOT IOVEC MESSAGE from %s of %d elements\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), count);
|
||||
fflush(stderr);
|
||||
|
||||
#if 0
|
||||
if (0 != ORTE_PROC_MY_NAME->vpid) {
|
||||
/* send it back */
|
||||
@ -195,8 +209,7 @@ static void cbfunc_buf_snt(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buf, void *cbdata)
|
||||
{
|
||||
fprintf(stderr, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
fflush(stderr);
|
||||
opal_output(0, "%s BUFFERED_NB SEND COMPLETE\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
|
31
orte/test/system/orte_spin.c
Обычный файл
31
orte/test/system/orte_spin.c
Обычный файл
@ -0,0 +1,31 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A program that just spins - provides mechanism for testing user-driven
|
||||
* abnormal program termination
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
int i;
|
||||
double pi;
|
||||
|
||||
orte_init(ORTE_PROC_NON_MPI);
|
||||
|
||||
i = 0;
|
||||
while (1) {
|
||||
i++;
|
||||
pi = i / 3.14159256;
|
||||
if (i > 100) i = 0;
|
||||
}
|
||||
|
||||
orte_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
18
orte/test/system/segfault.c
Обычный файл
18
orte/test/system/segfault.c
Обычный файл
@ -0,0 +1,18 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A program that just segfaults
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/**
 * Deliberately crash by reading through a NULL pointer.
 *
 * Test helper: the program exists only to terminate abnormally so the
 * launcher's handling of a faulting process can be exercised.
 *
 * @param argc  unused
 * @param argv  unused
 * @return 0 in the (unexpected) case that the NULL read does not fault.
 */
int main(int argc, char* argv[])
{
    double pi;
    /* volatile so the compiler must keep the read below even though its
     * result is never used; without it an optimizing build may drop the
     * load and the program would exit normally.
     */
    volatile char *dum = NULL;

    (void)argc;
    (void)argv;

    /* the intentional fault */
    pi = (double)*dum;
    (void)pi;

    return 0;
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user