1
1
openmpi/orte/mca/iof/hnp/iof_hnp_component.c
Ralph Castain 728a24c8ec After considerable patience and help with debugging/testing from Tim M and Jeff S, return a completed and pretty well tested patch of the IOF to the trunk. This commit includes the previously reverted r20074, r20068, and r20064, as well as changes to fix those commits.
Basically, the remaining problem turned out to be:

1. closing stdout/stderr during orte_finalize of mpirun

2. inadvertently setting up a write event on fd = -1

3. devising a scheme to more accurately track when the stdin write event was active vs closed so it only got released once

This passed prelim MTT testing by Jeff and Tim, but should soak for awhile before migrating to 1.3.

This commit was SVN r20106.

The following SVN revision numbers were found above:
  r20064 --> open-mpi/ompi@a07660aea8
  r20068 --> open-mpi/ompi@ec930d14a9
  r20074 --> open-mpi/ompi@2940309613
2008-12-10 20:40:47 +00:00

203 строки
6.3 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/event/event.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/base/base.h"
#include "iof_hnp.h"
/*
* Local functions
*/
static int orte_iof_hnp_open(void);
static int orte_iof_hnp_close(void);
static int orte_iof_hnp_query(mca_base_module_t **module, int *priority);
static void
orte_iof_hnp_exception_handler(const orte_process_name_t* peer, orte_rml_exception_t reason);
/*
* Local variables
*/
static bool initialized = false;
/*
* Public string showing the iof hnp component version number
*/
const char *mca_iof_hnp_component_version_string =
"Open MPI hnp iof MCA component version " ORTE_VERSION;
orte_iof_hnp_component_t mca_iof_hnp_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
ORTE_IOF_BASE_VERSION_2_0_0,
"hnp", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
/* Component open, close, and query functions */
orte_iof_hnp_open,
orte_iof_hnp_close,
orte_iof_hnp_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
}
};
/**
* component open/close/init function
*/
static int orte_iof_hnp_open(void)
{
/* Nothing to do */
return ORTE_SUCCESS;
}
static int orte_iof_hnp_close(void)
{
opal_list_item_t* item;
if (initialized) {
OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);
/* if the stdin event is active, delete it */
if (NULL != mca_iof_hnp_component.stdinev) {
OBJ_RELEASE(mca_iof_hnp_component.stdinev);
}
/* cleanout all registered sinks */
while ((item = opal_list_remove_first(&mca_iof_hnp_component.sinks)) != NULL) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mca_iof_hnp_component.sinks);
/* cleanout all pending proc objects holding receive events */
while ((item = opal_list_remove_first(&mca_iof_hnp_component.procs)) != NULL) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mca_iof_hnp_component.procs);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP);
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
OBJ_DESTRUCT(&mca_iof_hnp_component.lock);
}
return ORTE_SUCCESS;
}
/**
* Module query
*/
static int orte_iof_hnp_query(mca_base_module_t **module, int *priority)
{
int rc;
/* set default */
*module = NULL;
*priority = -1;
/* if we are not the HNP, then don't use this module */
if (!orte_process_info.hnp) {
return ORTE_ERROR;
}
/* post non-blocking recv to catch forwarded IO from
* the orteds
*/
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_IOF_HNP,
ORTE_RML_NON_PERSISTENT,
orte_iof_hnp_recv,
NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_rml.add_exception_handler(orte_iof_hnp_exception_handler))) {
ORTE_ERROR_LOG(rc);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP);
return rc;
}
OBJ_CONSTRUCT(&mca_iof_hnp_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_iof_hnp_component.sinks, opal_list_t);
OBJ_CONSTRUCT(&mca_iof_hnp_component.procs, opal_list_t);
mca_iof_hnp_component.stdinev = NULL;
/* we must be selected */
*priority = 100;
*module = (mca_base_module_t *) &orte_iof_hnp_module;
initialized = true;
return ORTE_SUCCESS;
}
/**
* Callback when peer is disconnected
*/
static void
orte_iof_hnp_exception_handler(const orte_process_name_t* peer, orte_rml_exception_t reason)
{
#if 0
orte_iof_base_endpoint_t *endpoint;
opal_output_verbose(1, orte_iof_base.iof_output,
"iof svc exception handler! %s\n",
ORTE_NAME_PRINT((orte_process_name_t*)peer));
/* If we detect an exception on the RML connection to a peer,
delete all of its subscriptions and publications. Note that
exceptions can be detected during a normal RML shutdown; they
are recoverable events (no need to abort). */
orte_iof_hnp_sub_delete_all(peer);
orte_iof_hnp_pub_delete_all(peer);
opal_output_verbose(1, orte_iof_base.iof_output, "deleted all pubs and subs\n");
/* Find any streams on any endpoints for this peer and close them */
while (NULL !=
(endpoint = orte_iof_base_endpoint_match(peer, ORTE_NS_CMP_ALL,
ORTE_IOF_ANY))) {
orte_iof_base_endpoint_closed(endpoint);
/* Delete the endpoint that we just matched */
orte_iof_base_endpoint_delete(peer, ORTE_NS_CMP_ALL, ORTE_IOF_ANY);
}
#endif
opal_output_verbose(1, orte_iof_base.iof_output, "done with exception handler\n");
}