1
1

Add another test program - an MPI app that just spins. This supports testing of system response to signal-terminated processes.

Add some debugger output to the ODLS default component.

Modify the orted command communication system so that it is done via non-blocking sends. This removes the linearity of the transmission and improves the response time.

This commit was SVN r12585.
Этот коммит содержится в:
Ralph Castain 2006-11-13 21:51:34 +00:00
родитель a4bdcb4faa
Коммит f95e20e2e1
8 изменённых файлов: 119 добавлений и 38 удалений

@ -430,13 +430,20 @@ GOTCHILD:
/* the abort file must exist - there is nothing in it we need. Its
* mere existence indicates that an abnormal termination occurred
*/
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
ORTE_NAME_ARGS(child->name));
aborted = true;
free(abort_file);
} else {
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died naturally",
ORTE_NAME_ARGS(child->name));
}
} else {
/* the process was terminated with a signal! That's definitely
* abnormal, so indicate that condition
*/
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by signal",
ORTE_NAME_ARGS(child->name));
aborted = true;
}

@ -28,6 +28,7 @@
#include "opal/mca/mca.h"
#include "opal/class/opal_list.h"
#include "opal/threads/condition.h"
#include "orte/mca/pls/pls.h"
@ -37,7 +38,7 @@ extern "C" {
#endif
/**
* Struct to hold data globale to the pls framework
* Struct to hold data global to the pls framework
*/
typedef struct orte_pls_base_t {
/** Verbose/debug output stream */
@ -48,6 +49,10 @@ extern "C" {
bool selected;
/** selected component */
orte_pls_base_component_t selected_component;
/* orted cmd comm lock */
opal_mutex_t orted_cmd_lock;
/* orted cmd cond */
opal_condition_t orted_cmd_cond;
} orte_pls_base_t;
/**

@ -62,6 +62,10 @@ int orte_pls_base_close(void)
&orte_pls_base.available_components, NULL);
OBJ_DESTRUCT(&orte_pls_base.available_components);
/* clear out the orted cmd locks */
OBJ_DESTRUCT(&orte_pls_base.orted_cmd_lock);
OBJ_DESTRUCT(&orte_pls_base.orted_cmd_cond);
return ORTE_SUCCESS;
}

@ -60,6 +60,10 @@ int orte_pls_base_open(void)
/* init selected to be false */
orte_pls_base.selected = false;
/* initialize the condition variables for orted comm */
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_cond, opal_condition_t);
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=

@ -20,6 +20,7 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/threads/condition.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/mca/base/mca_base_param.h"
@ -31,19 +32,36 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/pls/base/pls_private.h"
static orte_std_cntr_t orted_cmd_num_active;
/*
 * Callback handed to orte_rml.send_buffer_nb() by the orted command
 * routines in this file (presumably invoked by the RML once the
 * non-blocking send has been processed - confirm against the RML docs).
 *
 * Each invocation retires one outstanding command: the shared counter
 * orted_cmd_num_active is decremented while holding
 * orte_pls_base.orted_cmd_lock, and when it reaches zero the condition
 * orte_pls_base.orted_cmd_cond is signalled so that a sender blocked in
 * opal_condition_wait() on that condition can proceed.
 *
 * None of the callback arguments (status, peer, req, tag, cbdata) are
 * examined here.
 */
static void orte_pls_base_orted_send_cb(int status,
                                        orte_process_name_t* peer,
                                        orte_buffer_t* req,
                                        orte_rml_tag_t tag,
                                        void* cbdata)
{
    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);

    /* one fewer command in flight; wake the waiter once none remain */
    if (0 == --orted_cmd_num_active) {
        opal_condition_signal(&orte_pls_base.orted_cmd_cond);
    }

    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
}
int orte_pls_base_orted_exit(opal_list_t *daemons)
{
int rc;
orte_buffer_t cmd, answer;
orte_buffer_t cmd;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD;
opal_list_item_t *item;
orte_pls_daemon_info_t *dmn;
OPAL_TRACE(1);
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
/* pack the command */
@ -58,19 +76,22 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
item = opal_list_get_next(item)) {
dmn = (orte_pls_daemon_info_t*)item;
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
0, orte_pls_base_orted_send_cb, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
orted_cmd_num_active++;
}
/* wait for all commands to have been received */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
CLEANUP:
OBJ_DESTRUCT(&cmd);
@ -82,7 +103,7 @@ CLEANUP:
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
{
int rc;
orte_buffer_t cmd, answer;
orte_buffer_t cmd;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
opal_list_item_t *item;
orte_pls_daemon_info_t *dmn;
@ -109,18 +130,23 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
item = opal_list_get_next(item)) {
dmn = (orte_pls_daemon_info_t*)item;
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
0, orte_pls_base_orted_send_cb, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
return rc;
}
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
orted_cmd_num_active++;
}
/* wait for all commands to have been received */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
CLEANUP:
OBJ_DESTRUCT(&cmd);
@ -134,7 +160,7 @@ CLEANUP:
int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
{
int rc;
orte_buffer_t cmd, answer;
orte_buffer_t cmd;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
opal_list_item_t *item;
orte_pls_daemon_info_t *dmn;
@ -161,19 +187,23 @@ int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
item = opal_list_get_next(item)) {
dmn = (orte_pls_daemon_info_t*)item;
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
0, orte_pls_base_orted_send_cb, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
return rc;
}
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
orted_cmd_num_active++;
}
/* wait for all commands to have been received */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
CLEANUP:
OBJ_DESTRUCT(&cmd);
@ -185,7 +215,7 @@ CLEANUP:
int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat)
{
int rc;
orte_buffer_t cmd, answer;
orte_buffer_t cmd;
orte_daemon_cmd_flag_t command=ORTE_DAEMON_ADD_LOCAL_PROCS;
opal_list_item_t *item;
orte_pls_daemon_info_t *dmn;
@ -212,19 +242,23 @@ int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_da
item = opal_list_get_next(item)) {
dmn = (orte_pls_daemon_info_t*)item;
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
0, orte_pls_base_orted_send_cb, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
return rc;
}
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
OBJ_DESTRUCT(&answer);
orted_cmd_num_active++;
}
/* wait for all commands to have been received */
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
if (orted_cmd_num_active > 0) {
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
}
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
CLEANUP:
OBJ_DESTRUCT(&cmd);

@ -1,4 +1,4 @@
PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn
PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn mpi_spin
all: $(PROGS)

30
orte/test/mpi/mpi_spin.c Обычный файл

@ -0,0 +1,30 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that just spins - provides mechanism for testing user-driven
* abnormal program termination
*/
#include <stdio.h>
#include "mpi.h"
/*
 * Entry point: initialize MPI, then busy-loop forever.
 *
 * The loop has no exit condition, so the process only ends when it is
 * terminated from outside (e.g. by a signal). The arithmetic inside the
 * loop exists purely to keep the CPU busy; its result is never read.
 */
int main(int argc, char* argv[])
{
    int counter = 0;
    double pi;

    MPI_Init(&argc, &argv);

    for (;;) {
        ++counter;
        pi = counter / 3.14159256;
        /* keep the counter in a small range so it never overflows */
        if (counter > 100) {
            counter = 0;
        }
    }

    /* not reached: the loop above never terminates */
    MPI_Finalize();
    return 0;
}

@ -694,12 +694,9 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* send the response before we wakeup because otherwise
* we'll depart before it gets out!
/* no response to send - the fact that we received the command
* is known to the HNP because the send_nb gets a callback
*/
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
}
orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition);
break;