Add another test program - an MPI app that just spins. This supports testing of system response to signal-terminated processes.
Add some debugger output to the ODLS default component. Modify the orted command communication system so that it is done via non-blocking sends. This removes the linearity of the transmission and improves the response time. This commit was SVN r12585.
Этот коммит содержится в:
родитель
a4bdcb4faa
Коммит
f95e20e2e1
orte
mca
test/mpi
tools/orted
@ -430,13 +430,20 @@ GOTCHILD:
|
||||
/* the abort file must exist - there is nothing in it we need. Its
|
||||
* mere existence indicates that an abnormal termination occurred
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
free(abort_file);
|
||||
} else {
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died naturally",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
}
|
||||
} else {
|
||||
/* the process was terminated with a signal! That's definitely
|
||||
* abnormal, so indicate that condition
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by signal",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
}
|
||||
|
||||
|
@ -28,6 +28,7 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
@ -37,7 +38,7 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Struct to hold data globale to the pls framework
|
||||
* Struct to hold data global to the pls framework
|
||||
*/
|
||||
typedef struct orte_pls_base_t {
|
||||
/** Verbose/debug output stream */
|
||||
@ -48,6 +49,10 @@ extern "C" {
|
||||
bool selected;
|
||||
/** selected component */
|
||||
orte_pls_base_component_t selected_component;
|
||||
/* orted cmd comm lock */
|
||||
opal_mutex_t orted_cmd_lock;
|
||||
/* orted cmd cond */
|
||||
opal_condition_t orted_cmd_cond;
|
||||
} orte_pls_base_t;
|
||||
|
||||
/**
|
||||
|
@ -62,6 +62,10 @@ int orte_pls_base_close(void)
|
||||
&orte_pls_base.available_components, NULL);
|
||||
OBJ_DESTRUCT(&orte_pls_base.available_components);
|
||||
|
||||
/* clear out the orted cmd lock and condition variable */
|
||||
OBJ_DESTRUCT(&orte_pls_base.orted_cmd_lock);
|
||||
OBJ_DESTRUCT(&orte_pls_base.orted_cmd_cond);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -60,6 +60,10 @@ int orte_pls_base_open(void)
|
||||
/* init selected to be false */
|
||||
orte_pls_base.selected = false;
|
||||
|
||||
/* initialize the condition variables for orted comm */
|
||||
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_pls_base.orted_cmd_cond, opal_condition_t);
|
||||
|
||||
/* Open up all the components that we can find */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
@ -31,19 +32,36 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
|
||||
static orte_std_cntr_t orted_cmd_num_active;
|
||||
|
||||
static void orte_pls_base_orted_send_cb(int status,
|
||||
orte_process_name_t* peer,
|
||||
orte_buffer_t* req,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
orted_cmd_num_active--;
|
||||
if (orted_cmd_num_active == 0) {
|
||||
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
|
||||
|
||||
int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd, answer;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_EXIT_CMD;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
|
||||
|
||||
/* pack the command */
|
||||
@ -58,19 +76,22 @@ int orte_pls_base_orted_exit(opal_list_t *daemons)
|
||||
item = opal_list_get_next(item)) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
|
||||
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
|
||||
0, orte_pls_base_orted_send_cb, NULL)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
|
||||
|
||||
/* wait for all commands to have been received */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
@ -82,7 +103,7 @@ CLEANUP:
|
||||
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd, answer;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
@ -109,18 +130,23 @@ int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job)
|
||||
item = opal_list_get_next(item)) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
|
||||
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
|
||||
0, orte_pls_base_orted_send_cb, NULL)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
|
||||
/* wait for all commands to have been received */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
@ -134,7 +160,7 @@ CLEANUP:
|
||||
int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd, answer;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
@ -161,19 +187,23 @@ int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal)
|
||||
item = opal_list_get_next(item)) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
|
||||
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
|
||||
0, orte_pls_base_orted_send_cb, NULL)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
|
||||
/* wait for all commands to have been received */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
@ -185,7 +215,7 @@ CLEANUP:
|
||||
int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat)
|
||||
{
|
||||
int rc;
|
||||
orte_buffer_t cmd, answer;
|
||||
orte_buffer_t cmd;
|
||||
orte_daemon_cmd_flag_t command=ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
opal_list_item_t *item;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
@ -212,19 +242,23 @@ int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_da
|
||||
item = opal_list_get_next(item)) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
|
||||
if (0 > orte_rml.send_buffer(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0)) {
|
||||
if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
|
||||
0, orte_pls_base_orted_send_cb, NULL)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(dmn->name, &answer, ORTE_RML_TAG_PLS_ORTED_ACK)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
OBJ_DESTRUCT(&answer);
|
||||
orted_cmd_num_active++;
|
||||
}
|
||||
|
||||
/* wait for all commands to have been received */
|
||||
OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
|
||||
if (orted_cmd_num_active > 0) {
|
||||
opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
|
||||
|
||||
CLEANUP:
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn
|
||||
PROGS = mpi_no_op hello hello_nodename abort multi_abort simple_spawn mpi_spin
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
30
orte/test/mpi/mpi_spin.c
Обычный файл
30
orte/test/mpi/mpi_spin.c
Обычный файл
@ -0,0 +1,30 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A program that just spins - provides mechanism for testing user-driven
|
||||
* abnormal program termination
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "mpi.h"
|
||||
|
||||
/*
 * Initialise MPI and then busy-loop forever, so the process can only be
 * ended from outside (for example by a signal).
 */
int main(int argc, char* argv[])
{
    int counter = 0;
    double scaled;

    MPI_Init(&argc, &argv);

    /* spin: the counter is reset to 0 every time it exceeds 100 */
    for (;;) {
        ++counter;
        scaled = counter / 3.14159256;
        if (counter > 100) {
            counter = 0;
        }
    }

    /* never reached - the loop above has no exit */
    MPI_Finalize();

    return 0;
}
|
@ -694,12 +694,9 @@ static void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
|
||||
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received exit",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
/* send the response before we wakeup because otherwise
|
||||
* we'll depart before it gets out!
|
||||
/* no response to send - the fact that we received the command
|
||||
* is known to the HNP because the send_nb gets a callback
|
||||
*/
|
||||
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
orted_globals.exit_condition = true;
|
||||
opal_condition_signal(&orted_globals.condition);
|
||||
break;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user