From f95e20e2e108b0f04f9e1cae400dba47aa351855 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Mon, 13 Nov 2006 21:51:34 +0000
Subject: [PATCH] Add another test program - an MPI app that just spins. This
 supports testing of system response to signal-terminated processes.

Add some debugger output to the ODLS default component.

Modify the orted command communication system so that it is done via
non-blocking sends. This removes the linearity of the transmission and
improves the response time.

This commit was SVN r12585.
---
Reviewer notes (this copy revises the r12585 diff; it was rebuilt from a
whitespace-collapsed text, so indentation and blank-line placement are
best-effort and the index hashes below still name the original blobs):

 * pls_base_orted_cmds.c
   - the four waits re-test orted_cmd_num_active in a while loop instead
     of a single if, so a wakeup with sends still outstanding waits again;
   - orted_cmd_num_active is incremented under orted_cmd_lock, the same
     lock the send callback holds when it decrements;
   - kill/signal/add_local_procs return ORTE_ERR_COMM_FAILURE when a send
     cannot be posted, as orted_exit does, rather than an rc that the
     send call never assigned.
 * mpi_spin.c: the header name of the first #include was missing from the
   collapsed text; <stdio.h> is assumed - confirm before applying.
 * Not addressed: the early returns destruct `cmd` while sends posted in
   earlier loop iterations may still be pending.

 orte/mca/odls/default/odls_default_module.c |   7 ++
 orte/mca/pls/base/base.h                    |   7 +-
 orte/mca/pls/base/pls_base_close.c          |   4 +
 orte/mca/pls/base/pls_base_open.c           |   4 +
 orte/mca/pls/base/pls_base_orted_cmds.c     | 114 +++++++++++++++------
 orte/test/mpi/Makefile                      |   2 +-
 orte/test/mpi/mpi_spin.c                    |  30 ++++++
 orte/tools/orted/orted.c                    |   7 +-
 8 files changed, 134 insertions(+), 41 deletions(-)
 create mode 100644 orte/test/mpi/mpi_spin.c

diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c
index 2d59f93505..5724e6714b 100644
--- a/orte/mca/odls/default/odls_default_module.c
+++ b/orte/mca/odls/default/odls_default_module.c
@@ -430,13 +430,20 @@ GOTCHILD:
             /* the abort file must exist - there is nothing in it we need. It's
              * meer existence indicates that an abnormal termination occurred
              */
+            opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
+                        ORTE_NAME_ARGS(child->name));
             aborted = true;
             free(abort_file);
+        } else {
+            opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died naturally",
+                        ORTE_NAME_ARGS(child->name));
         }
     } else {
         /* the process was terminated with a signal! That's definitely
          * abnormal, so indicate that condition
          */
+        opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by signal",
+                    ORTE_NAME_ARGS(child->name));
         aborted = true;
     }

diff --git a/orte/mca/pls/base/base.h b/orte/mca/pls/base/base.h
index 074b4006e2..053ec00cf3 100644
--- a/orte/mca/pls/base/base.h
+++ b/orte/mca/pls/base/base.h
@@ -28,6 +28,7 @@

 #include "opal/mca/mca.h"
 #include "opal/class/opal_list.h"
+#include "opal/threads/condition.h"

 #include "orte/mca/pls/pls.h"

@@ -37,7 +38,7 @@ extern "C" {
 #endif

     /**
-     * Struct to hold data globale to the pls framework
+     * Struct to hold data global to the pls framework
      */
     typedef struct orte_pls_base_t {
         /** Verbose/debug output stream */
@@ -48,6 +49,10 @@ extern "C" {
         bool selected;
         /** selected component */
         orte_pls_base_component_t selected_component;
+        /* orted cmd comm lock */
+        opal_mutex_t orted_cmd_lock;
+        /* orted cmd cond */
+        opal_condition_t orted_cmd_cond;
     } orte_pls_base_t;

     /**
diff --git a/orte/mca/pls/base/pls_base_close.c b/orte/mca/pls/base/pls_base_close.c
index f3788c502e..6080c6ae9d 100644
--- a/orte/mca/pls/base/pls_base_close.c
+++ b/orte/mca/pls/base/pls_base_close.c
@@ -62,6 +62,10 @@ int orte_pls_base_close(void)
                               &orte_pls_base.available_components, NULL);
     OBJ_DESTRUCT(&orte_pls_base.available_components);

+    /* clear out the orted cmd locks */
+    OBJ_DESTRUCT(&orte_pls_base.orted_cmd_lock);
+    OBJ_DESTRUCT(&orte_pls_base.orted_cmd_cond);
+
     return ORTE_SUCCESS;
 }

diff --git a/orte/mca/pls/base/pls_base_open.c b/orte/mca/pls/base/pls_base_open.c
index 5ecaff9455..2a6d507679 100644
--- a/orte/mca/pls/base/pls_base_open.c
+++ b/orte/mca/pls/base/pls_base_open.c
@@ -60,6 +60,10 @@ int orte_pls_base_open(void)
     /* init selected to be false */
     orte_pls_base.selected = false;

+    /* initialize the condition variables for orted comm */
+    OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_cond, opal_condition_t);
+
     /* Open up all the components that we can find */

     if (ORTE_SUCCESS !=
diff --git a/orte/mca/pls/base/pls_base_orted_cmds.c b/orte/mca/pls/base/pls_base_orted_cmds.c
index 412a021097..3eff905331 100644
--- a/orte/mca/pls/base/pls_base_orted_cmds.c
+++ b/orte/mca/pls/base/pls_base_orted_cmds.c
@@ -20,6 +20,7 @@
 #include "orte_config.h"
 #include "orte/orte_constants.h"

+#include "opal/threads/condition.h"
 #include "opal/util/output.h"
 #include "opal/util/argv.h"
 #include "opal/mca/base/mca_base_param.h"
@@ -31,19 +32,40 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/errmgr/errmgr.h"

+#include "orte/mca/pls/base/base.h"
 #include "orte/mca/pls/base/pls_private.h"

+static orte_std_cntr_t orted_cmd_num_active;
+
+/*
+ * Completion callback for the non-blocking sends issued below: counts one
+ * outstanding command as done and signals the waiter once none remain.
+ */
+static void orte_pls_base_orted_send_cb(int status,
+                                        orte_process_name_t* peer,
+                                        orte_buffer_t* req,
+                                        orte_rml_tag_t tag,
+                                        void* cbdata)
+{
+    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+    orted_cmd_num_active--;
+    if (orted_cmd_num_active == 0) {
+        opal_condition_signal(&orte_pls_base.orted_cmd_cond);
+    }
+    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
+}
+

 int orte_pls_base_orted_exit(opal_list_t *daemons)
 {
     int rc;
-    orte_buffer_t cmd, answer;
+    orte_buffer_t cmd;
     orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD;
     opal_list_item_t *item;
     orte_pls_daemon_info_t *dmn;

     OPAL_TRACE(1);
-    
+
     OBJ_CONSTRUCT(&cmd, orte_buffer_t);

     /* pack the command */
@@ -58,19 +80,24 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
          item = opal_list_get_next(item)) {
         dmn = (orte_pls_daemon_info_t*)item;

-        if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
+        if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
+                                        0, orte_pls_base_orted_send_cb, NULL)) {
             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
             OBJ_DESTRUCT(&cmd);
             return ORTE_ERR_COMM_FAILURE;
         }
-
-        OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
-            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
-        }
-        OBJ_DESTRUCT(&answer);
+        OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+        orted_cmd_num_active++;
+        OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
     }
-    
+
+    /* wait for all commands to have been received */
+    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+    while (orted_cmd_num_active > 0) {
+        opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
+    }
+    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
+
 CLEANUP:
     OBJ_DESTRUCT(&cmd);

@@ -82,7 +109,7 @@ CLEANUP:
 int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
 {
     int rc;
-    orte_buffer_t cmd, answer;
+    orte_buffer_t cmd;
     orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
     opal_list_item_t *item;
     orte_pls_daemon_info_t *dmn;
@@ -109,18 +136,25 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
          item = opal_list_get_next(item)) {
         dmn = (orte_pls_daemon_info_t*)item;

-        if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
+        if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
+                                        0, orte_pls_base_orted_send_cb, NULL)) {
             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
             OBJ_DESTRUCT(&cmd);
-            return rc;
+            return ORTE_ERR_COMM_FAILURE;
         }

-        OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
-            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
-        }
-        OBJ_DESTRUCT(&answer);
+        OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+        orted_cmd_num_active++;
+        OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
     }
+
+    /* wait for all commands to have been received */
+    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+    while (orted_cmd_num_active > 0) {
+        opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
+    }
+    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
+
 CLEANUP:
     OBJ_DESTRUCT(&cmd);

@@ -134,7 +168,7 @@ CLEANUP:
 int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
 {
     int rc;
-    orte_buffer_t cmd, answer;
+    orte_buffer_t cmd;
     orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
     opal_list_item_t *item;
     orte_pls_daemon_info_t *dmn;
@@ -161,19 +195,25 @@ int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
          item = opal_list_get_next(item)) {
         dmn = (orte_pls_daemon_info_t*)item;

-        if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
+        if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
+                                        0, orte_pls_base_orted_send_cb, NULL)) {
             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
             OBJ_DESTRUCT(&cmd);
-            return rc;
+            return ORTE_ERR_COMM_FAILURE;
         }

-        OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
-            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
-        }
-        OBJ_DESTRUCT(&answer);
+        OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+        orted_cmd_num_active++;
+        OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
     }

+    /* wait for all commands to have been received */
+    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+    while (orted_cmd_num_active > 0) {
+        opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
+    }
+    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
+
 CLEANUP:
     OBJ_DESTRUCT(&cmd);

@@ -185,7 +225,7 @@ CLEANUP:
 int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat)
 {
     int rc;
-    orte_buffer_t cmd, answer;
+    orte_buffer_t cmd;
     orte_daemon_cmd_flag_t command=ORTE_DAEMON_ADD_LOCAL_PROCS;
     opal_list_item_t *item;
     orte_pls_daemon_info_t *dmn;
@@ -212,19 +252,25 @@ int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_da
          item = opal_list_get_next(item)) {
         dmn = (orte_pls_daemon_info_t*)item;

-        if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
+        if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
+                                        0, orte_pls_base_orted_send_cb, NULL)) {
             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
             OBJ_DESTRUCT(&cmd);
-            return rc;
+            return ORTE_ERR_COMM_FAILURE;
         }

-        OBJ_CONSTRUCT(&answer, orte_buffer_t);
-        if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
-            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
-        }
-        OBJ_DESTRUCT(&answer);
+        OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+        orted_cmd_num_active++;
+        OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
     }

+    /* wait for all commands to have been received */
+    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
+    while (orted_cmd_num_active > 0) {
+        opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
+    }
+    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
+
 CLEANUP:
     OBJ_DESTRUCT(&cmd);

diff --git a/orte/test/mpi/Makefile b/orte/test/mpi/Makefile
index b348d5b316..e29b2718ab 100644
--- a/orte/test/mpi/Makefile
+++ b/orte/test/mpi/Makefile
@@ -1,4 +1,4 @@
-PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn
+PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn mpi_spin

 all: $(PROGS)

diff --git a/orte/test/mpi/mpi_spin.c b/orte/test/mpi/mpi_spin.c
new file mode 100644
index 0000000000..b4f990c4ab
--- /dev/null
+++ b/orte/test/mpi/mpi_spin.c
@@ -0,0 +1,30 @@
+/* -*- C -*-
+ *
+ * $HEADER$
+ *
+ * A program that just spins - provides mechanism for testing user-driven
+ * abnormal program termination
+ */
+
+#include <stdio.h>
+#include "mpi.h"
+
+int main(int argc, char* argv[])
+{
+
+    int i;
+    double pi;
+
+    MPI_Init(&argc, &argv);
+
+    i = 0;
+    while (1) {
+        i++;
+        pi = i / 3.14159256;
+        if (i > 100) i = 0;
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/orte/tools/orted/orted.c b/orte/tools/orted/orted.c
index 1b8db5d4c1..5284340dc8 100644
--- a/orte/tools/orted/orted.c
+++ b/orte/tools/orted/orted.c
@@ -694,12 +694,9 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
                 opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
                             ORTE_NAME_ARGS(orte_process_info.my_name));
             }
-            /* send the response before we wakeup because otherwise
-             * we'll depart before it gets out!
+            /* no response to send - the fact that we received the command
+             * is known to the HNP because the send_nb gets a callback
              */
-            if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
-                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
-            }
             orted_globals.exit_condition = true;
             opal_condition_signal(&orted_globals.condition);
             break;