1
1

Here is the major MAD-cure commit. I have written plenty about it, so I refer you here to those messages for a description of everything that was done.

This commit was SVN r11661.
Этот коммит содержится в:
Ralph Castain 2006-09-14 21:29:51 +00:00
родитель 17afe7dc9f
Коммит 37dfdb76eb
296 изменённых файлов: 13934 добавлений и 6042 удалений

Просмотреть файл

@ -373,7 +373,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
* later override this value by providing an MPI_Info value. for now, though,
* let's get the default value off the registry
*/
if (ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(orte_process_info.my_name->jobid, &apps, &num_apps))) {
if (ORTE_SUCCESS != (rc = orte_rmgr.get_app_context(orte_process_info.my_name->jobid, &apps, &num_apps))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -533,7 +533,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
if (NULL != base_prefix) free(base_prefix);
/* spawn procs */
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
ORTE_ERROR_LOG(rc);
opal_progress_event_decrement();
return MPI_ERR_SPAWN;

Просмотреть файл

@ -27,7 +27,7 @@
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/errmgr/errmgr.h"
OBJ_CLASS_INSTANCE(
mca_btl_base_selected_module_t,
@ -151,7 +151,7 @@ int mca_btl_base_select(bool enable_progress_threads,
if (0 == opal_list_get_size(&mca_btl_base_modules_initialized)) {
opal_show_help("help-mca-base.txt", "find-available:none-found", true,
"btl");
orte_abort(1, "");
orte_errmgr.error_detected(1, NULL);
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -391,7 +391,7 @@ static int mca_btl_tcp_component_create_listen(void)
{
int flags;
struct sockaddr_in inaddr;
ompi_socklen_t addrlen;
opal_socklen_t addrlen;
/* create a listen socket for incoming connections */
mca_btl_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
@ -556,7 +556,7 @@ int mca_btl_tcp_component_control(int param, void* value, size_t size)
static void mca_btl_tcp_component_accept(void)
{
while(true) {
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
struct sockaddr_in addr;
mca_btl_tcp_event_t *event;
int sd = accept(mca_btl_tcp_component.tcp_listen_sd, (struct sockaddr*)&addr, &addrlen);
@ -588,7 +588,7 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
struct sockaddr_in addr;
int retval;
mca_btl_tcp_proc_t* btl_proc;
ompi_socklen_t addr_len = sizeof(addr);
opal_socklen_t addr_len = sizeof(addr);
mca_btl_tcp_event_t *event = (mca_btl_tcp_event_t *)user;
/* accept new connections on the listen socket */

Просмотреть файл

@ -133,8 +133,8 @@ static void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, con
char dst[64];
int sndbuf,rcvbuf,nodelay,flags;
struct sockaddr_in inaddr;
ompi_socklen_t obtlen;
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
opal_socklen_t obtlen;
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen);
sprintf(src, "%s", inet_ntoa(inaddr.sin_addr));
@ -553,7 +553,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
int so_error = 0;
ompi_socklen_t so_length = sizeof(so_error);
opal_socklen_t so_length = sizeof(so_error);
/* unregister from receiving event notifications */
opal_event_del(&btl_endpoint->endpoint_send_event);

Просмотреть файл

@ -148,17 +148,16 @@ am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
$(top_srcdir)/opal/mca/timer/linux/configure.m4 \
$(top_srcdir)/opal/mca/timer/solaris/configure.m4 \
$(top_srcdir)/opal/mca/timer/windows/configure.m4 \
$(top_srcdir)/orte/mca/errmgr/bproc/configure.m4 \
$(top_srcdir)/orte/mca/odls/bproc/configure.m4 \
$(top_srcdir)/orte/mca/odls/default/configure.m4 \
$(top_srcdir)/orte/mca/oob/tcp/configure.m4 \
$(top_srcdir)/orte/mca/pls/bproc/configure.m4 \
$(top_srcdir)/orte/mca/pls/bproc_orted/configure.m4 \
$(top_srcdir)/orte/mca/pls/fork/configure.m4 \
$(top_srcdir)/orte/mca/pls/gridengine/configure.m4 \
$(top_srcdir)/orte/mca/pls/poe/configure.m4 \
$(top_srcdir)/orte/mca/pls/process/configure.m4 \
$(top_srcdir)/orte/mca/pls/rsh/configure.m4 \
$(top_srcdir)/orte/mca/pls/slurm/configure.m4 \
$(top_srcdir)/orte/mca/pls/tm/configure.m4 \
$(top_srcdir)/orte/mca/pls/xgrid/configure.m4 \
$(top_srcdir)/orte/mca/ras/bjs/configure.m4 \
$(top_srcdir)/orte/mca/ras/gridengine/configure.m4 \
$(top_srcdir)/orte/mca/ras/lsf_bproc/configure.m4 \
@ -416,6 +415,13 @@ MCA_ns_DSO_SUBDIRS = @MCA_ns_DSO_SUBDIRS@
MCA_ns_STATIC_COMPONENTS = @MCA_ns_STATIC_COMPONENTS@
MCA_ns_STATIC_LTLIBS = @MCA_ns_STATIC_LTLIBS@
MCA_ns_STATIC_SUBDIRS = @MCA_ns_STATIC_SUBDIRS@
MCA_odls_ALL_COMPONENTS = @MCA_odls_ALL_COMPONENTS@
MCA_odls_ALL_SUBDIRS = @MCA_odls_ALL_SUBDIRS@
MCA_odls_DSO_COMPONENTS = @MCA_odls_DSO_COMPONENTS@
MCA_odls_DSO_SUBDIRS = @MCA_odls_DSO_SUBDIRS@
MCA_odls_STATIC_COMPONENTS = @MCA_odls_STATIC_COMPONENTS@
MCA_odls_STATIC_LTLIBS = @MCA_odls_STATIC_LTLIBS@
MCA_odls_STATIC_SUBDIRS = @MCA_odls_STATIC_SUBDIRS@
MCA_ompi_FRAMEWORKS = @MCA_ompi_FRAMEWORKS@
MCA_ompi_FRAMEWORKS_SUBDIRS = @MCA_ompi_FRAMEWORKS_SUBDIRS@
MCA_ompi_FRAMEWORK_COMPONENT_ALL_SUBDIRS = @MCA_ompi_FRAMEWORK_COMPONENT_ALL_SUBDIRS@
@ -609,6 +615,14 @@ OMPI_BUILD_common_portals_DSO_FALSE = @OMPI_BUILD_common_portals_DSO_FALSE@
OMPI_BUILD_common_portals_DSO_TRUE = @OMPI_BUILD_common_portals_DSO_TRUE@
OMPI_BUILD_common_sm_DSO_FALSE = @OMPI_BUILD_common_sm_DSO_FALSE@
OMPI_BUILD_common_sm_DSO_TRUE = @OMPI_BUILD_common_sm_DSO_TRUE@
OMPI_BUILD_errmgr_bproc_DSO_FALSE = @OMPI_BUILD_errmgr_bproc_DSO_FALSE@
OMPI_BUILD_errmgr_bproc_DSO_TRUE = @OMPI_BUILD_errmgr_bproc_DSO_TRUE@
OMPI_BUILD_errmgr_hnp_DSO_FALSE = @OMPI_BUILD_errmgr_hnp_DSO_FALSE@
OMPI_BUILD_errmgr_hnp_DSO_TRUE = @OMPI_BUILD_errmgr_hnp_DSO_TRUE@
OMPI_BUILD_errmgr_orted_DSO_FALSE = @OMPI_BUILD_errmgr_orted_DSO_FALSE@
OMPI_BUILD_errmgr_orted_DSO_TRUE = @OMPI_BUILD_errmgr_orted_DSO_TRUE@
OMPI_BUILD_errmgr_proxy_DSO_FALSE = @OMPI_BUILD_errmgr_proxy_DSO_FALSE@
OMPI_BUILD_errmgr_proxy_DSO_TRUE = @OMPI_BUILD_errmgr_proxy_DSO_TRUE@
OMPI_BUILD_gpr_null_DSO_FALSE = @OMPI_BUILD_gpr_null_DSO_FALSE@
OMPI_BUILD_gpr_null_DSO_TRUE = @OMPI_BUILD_gpr_null_DSO_TRUE@
OMPI_BUILD_gpr_proxy_DSO_FALSE = @OMPI_BUILD_gpr_proxy_DSO_FALSE@
@ -651,6 +665,10 @@ OMPI_BUILD_ns_proxy_DSO_FALSE = @OMPI_BUILD_ns_proxy_DSO_FALSE@
OMPI_BUILD_ns_proxy_DSO_TRUE = @OMPI_BUILD_ns_proxy_DSO_TRUE@
OMPI_BUILD_ns_replica_DSO_FALSE = @OMPI_BUILD_ns_replica_DSO_FALSE@
OMPI_BUILD_ns_replica_DSO_TRUE = @OMPI_BUILD_ns_replica_DSO_TRUE@
OMPI_BUILD_odls_bproc_DSO_FALSE = @OMPI_BUILD_odls_bproc_DSO_FALSE@
OMPI_BUILD_odls_bproc_DSO_TRUE = @OMPI_BUILD_odls_bproc_DSO_TRUE@
OMPI_BUILD_odls_default_DSO_FALSE = @OMPI_BUILD_odls_default_DSO_FALSE@
OMPI_BUILD_odls_default_DSO_TRUE = @OMPI_BUILD_odls_default_DSO_TRUE@
OMPI_BUILD_oob_tcp_DSO_FALSE = @OMPI_BUILD_oob_tcp_DSO_FALSE@
OMPI_BUILD_oob_tcp_DSO_TRUE = @OMPI_BUILD_oob_tcp_DSO_TRUE@
OMPI_BUILD_osc_pt2pt_DSO_FALSE = @OMPI_BUILD_osc_pt2pt_DSO_FALSE@
@ -665,24 +683,18 @@ OMPI_BUILD_paffinity_windows_DSO_FALSE = @OMPI_BUILD_paffinity_windows_DSO_FALSE
OMPI_BUILD_paffinity_windows_DSO_TRUE = @OMPI_BUILD_paffinity_windows_DSO_TRUE@
OMPI_BUILD_pls_bproc_DSO_FALSE = @OMPI_BUILD_pls_bproc_DSO_FALSE@
OMPI_BUILD_pls_bproc_DSO_TRUE = @OMPI_BUILD_pls_bproc_DSO_TRUE@
OMPI_BUILD_pls_bproc_orted_DSO_FALSE = @OMPI_BUILD_pls_bproc_orted_DSO_FALSE@
OMPI_BUILD_pls_bproc_orted_DSO_TRUE = @OMPI_BUILD_pls_bproc_orted_DSO_TRUE@
OMPI_BUILD_pls_fork_DSO_FALSE = @OMPI_BUILD_pls_fork_DSO_FALSE@
OMPI_BUILD_pls_fork_DSO_TRUE = @OMPI_BUILD_pls_fork_DSO_TRUE@
OMPI_BUILD_pls_gridengine_DSO_FALSE = @OMPI_BUILD_pls_gridengine_DSO_FALSE@
OMPI_BUILD_pls_gridengine_DSO_TRUE = @OMPI_BUILD_pls_gridengine_DSO_TRUE@
OMPI_BUILD_pls_poe_DSO_FALSE = @OMPI_BUILD_pls_poe_DSO_FALSE@
OMPI_BUILD_pls_poe_DSO_TRUE = @OMPI_BUILD_pls_poe_DSO_TRUE@
OMPI_BUILD_pls_process_DSO_FALSE = @OMPI_BUILD_pls_process_DSO_FALSE@
OMPI_BUILD_pls_process_DSO_TRUE = @OMPI_BUILD_pls_process_DSO_TRUE@
OMPI_BUILD_pls_proxy_DSO_FALSE = @OMPI_BUILD_pls_proxy_DSO_FALSE@
OMPI_BUILD_pls_proxy_DSO_TRUE = @OMPI_BUILD_pls_proxy_DSO_TRUE@
OMPI_BUILD_pls_rsh_DSO_FALSE = @OMPI_BUILD_pls_rsh_DSO_FALSE@
OMPI_BUILD_pls_rsh_DSO_TRUE = @OMPI_BUILD_pls_rsh_DSO_TRUE@
OMPI_BUILD_pls_slurm_DSO_FALSE = @OMPI_BUILD_pls_slurm_DSO_FALSE@
OMPI_BUILD_pls_slurm_DSO_TRUE = @OMPI_BUILD_pls_slurm_DSO_TRUE@
OMPI_BUILD_pls_tm_DSO_FALSE = @OMPI_BUILD_pls_tm_DSO_FALSE@
OMPI_BUILD_pls_tm_DSO_TRUE = @OMPI_BUILD_pls_tm_DSO_TRUE@
OMPI_BUILD_pls_xgrid_DSO_FALSE = @OMPI_BUILD_pls_xgrid_DSO_FALSE@
OMPI_BUILD_pls_xgrid_DSO_TRUE = @OMPI_BUILD_pls_xgrid_DSO_TRUE@
OMPI_BUILD_pml_cm_DSO_FALSE = @OMPI_BUILD_pml_cm_DSO_FALSE@
OMPI_BUILD_pml_cm_DSO_TRUE = @OMPI_BUILD_pml_cm_DSO_TRUE@
OMPI_BUILD_pml_dr_DSO_FALSE = @OMPI_BUILD_pml_dr_DSO_FALSE@
@ -703,6 +715,8 @@ OMPI_BUILD_ras_lsf_bproc_DSO_FALSE = @OMPI_BUILD_ras_lsf_bproc_DSO_FALSE@
OMPI_BUILD_ras_lsf_bproc_DSO_TRUE = @OMPI_BUILD_ras_lsf_bproc_DSO_TRUE@
OMPI_BUILD_ras_poe_DSO_FALSE = @OMPI_BUILD_ras_poe_DSO_FALSE@
OMPI_BUILD_ras_poe_DSO_TRUE = @OMPI_BUILD_ras_poe_DSO_TRUE@
OMPI_BUILD_ras_proxy_DSO_FALSE = @OMPI_BUILD_ras_proxy_DSO_FALSE@
OMPI_BUILD_ras_proxy_DSO_TRUE = @OMPI_BUILD_ras_proxy_DSO_TRUE@
OMPI_BUILD_ras_slurm_DSO_FALSE = @OMPI_BUILD_ras_slurm_DSO_FALSE@
OMPI_BUILD_ras_slurm_DSO_TRUE = @OMPI_BUILD_ras_slurm_DSO_TRUE@
OMPI_BUILD_ras_tm_DSO_FALSE = @OMPI_BUILD_ras_tm_DSO_FALSE@
@ -715,8 +729,12 @@ OMPI_BUILD_rcache_vma_DSO_FALSE = @OMPI_BUILD_rcache_vma_DSO_FALSE@
OMPI_BUILD_rcache_vma_DSO_TRUE = @OMPI_BUILD_rcache_vma_DSO_TRUE@
OMPI_BUILD_rds_hostfile_DSO_FALSE = @OMPI_BUILD_rds_hostfile_DSO_FALSE@
OMPI_BUILD_rds_hostfile_DSO_TRUE = @OMPI_BUILD_rds_hostfile_DSO_TRUE@
OMPI_BUILD_rds_proxy_DSO_FALSE = @OMPI_BUILD_rds_proxy_DSO_FALSE@
OMPI_BUILD_rds_proxy_DSO_TRUE = @OMPI_BUILD_rds_proxy_DSO_TRUE@
OMPI_BUILD_rds_resfile_DSO_FALSE = @OMPI_BUILD_rds_resfile_DSO_FALSE@
OMPI_BUILD_rds_resfile_DSO_TRUE = @OMPI_BUILD_rds_resfile_DSO_TRUE@
OMPI_BUILD_rmaps_proxy_DSO_FALSE = @OMPI_BUILD_rmaps_proxy_DSO_FALSE@
OMPI_BUILD_rmaps_proxy_DSO_TRUE = @OMPI_BUILD_rmaps_proxy_DSO_TRUE@
OMPI_BUILD_rmaps_round_robin_DSO_FALSE = @OMPI_BUILD_rmaps_round_robin_DSO_FALSE@
OMPI_BUILD_rmaps_round_robin_DSO_TRUE = @OMPI_BUILD_rmaps_round_robin_DSO_TRUE@
OMPI_BUILD_rmgr_cnos_DSO_FALSE = @OMPI_BUILD_rmgr_cnos_DSO_FALSE@
@ -971,6 +989,9 @@ common_portals_CPPFLAGS = @common_portals_CPPFLAGS@
common_portals_LDFLAGS = @common_portals_LDFLAGS@
common_portals_LIBS = @common_portals_LIBS@
datadir = @datadir@
errmgr_bproc_CPPFLAGS = @errmgr_bproc_CPPFLAGS@
errmgr_bproc_LDFLAGS = @errmgr_bproc_LDFLAGS@
errmgr_bproc_LIBS = @errmgr_bproc_LIBS@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
@ -1017,21 +1038,19 @@ mtl_psm_CFLAGS = @mtl_psm_CFLAGS@
mtl_psm_CPPFLAGS = @mtl_psm_CPPFLAGS@
mtl_psm_LDFLAGS = @mtl_psm_LDFLAGS@
mtl_psm_LIBS = @mtl_psm_LIBS@
odls_bproc_CPPFLAGS = @odls_bproc_CPPFLAGS@
odls_bproc_LDFLAGS = @odls_bproc_LDFLAGS@
odls_bproc_LIBS = @odls_bproc_LIBS@
oldincludedir = @oldincludedir@
pls_bproc_CPPFLAGS = @pls_bproc_CPPFLAGS@
pls_bproc_LDFLAGS = @pls_bproc_LDFLAGS@
pls_bproc_LIBS = @pls_bproc_LIBS@
pls_bproc_orted_CPPFLAGS = @pls_bproc_orted_CPPFLAGS@
pls_bproc_orted_LDFLAGS = @pls_bproc_orted_LDFLAGS@
pls_bproc_orted_LIBS = @pls_bproc_orted_LIBS@
pls_slurm_CPPFLAGS = @pls_slurm_CPPFLAGS@
pls_slurm_LDFLAGS = @pls_slurm_LDFLAGS@
pls_slurm_LIBS = @pls_slurm_LIBS@
pls_tm_CPPFLAGS = @pls_tm_CPPFLAGS@
pls_tm_LDFLAGS = @pls_tm_LDFLAGS@
pls_tm_LIBS = @pls_tm_LIBS@
pls_xgrid_LDFLAGS = @pls_xgrid_LDFLAGS@
pls_xgrid_OBJCFLAGS = @pls_xgrid_OBJCFLAGS@
prefix = @prefix@
program_transform_name = @program_transform_name@
ras_bjs_CPPFLAGS = @ras_bjs_CPPFLAGS@

Просмотреть файл

@ -24,7 +24,7 @@
#include "opal/runtime/opal_progress.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
@ -117,9 +117,9 @@ int mca_pml_base_select(bool enable_progress_threads,
if( NULL == best_component ) {
opal_show_help("help-mca-base.txt", "find-available:none-found", true, "pml");
if( NULL != mca_pml_base_pml ) {
orte_abort( 1, "PML %s cannot be selected", mca_pml_base_pml );
orte_errmgr.error_detected(1, "PML %s cannot be selected", mca_pml_base_pml, NULL);
} else {
orte_abort(1, "No pml component available. This shouldn't happen.");
orte_errmgr.error_detected(2, "No pml component available. This shouldn't happen.", NULL);
}
}

Просмотреть файл

@ -40,6 +40,8 @@
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/rmgr/rmgr.h"
#include "ompi/communicator/communicator.h"
#include "ompi/proc/proc.h"
@ -50,6 +52,7 @@
#include <signal.h>
#endif
#if 0
static
int
abort_procs(ompi_proc_t **procs, int proc_count,
@ -66,14 +69,14 @@ abort_procs(ompi_proc_t **procs, int proc_count,
}
if (jobid == my_jobid) continue;
killret = orte_rmgr.terminate_job(jobid);
killret = orte_pls.terminate_job(jobid);
if (OMPI_SUCCESS != killret) ret = killret;
}
return ret;
}
#endif
int
ompi_mpi_abort(struct ompi_communicator_t* comm,
@ -143,7 +146,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
/* BWB - XXX - Should probably publish the error code somewhere */
#if 0
/* Kill everyone in the job. We may make this better someday to
actually loop over ompi_rte_kill_proc() to only kill the procs
in comm, and additionally to somehow use errorcode. */
@ -167,7 +170,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
comm->c_local_group->grp_proc_count,
my_jobid);
ret = orte_rmgr.terminate_job(my_jobid);
ret = orte_pls.terminate_job(my_jobid);
if (OMPI_SUCCESS == ret) {
while (1) {
@ -188,6 +191,12 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
just exit and let it become Somebody Elses Problem. */
exit(errcode);
}
#endif
/* tell the error manager we detected an error - OpenRTE
* will take care of cleaning up for us
*/
orte_errmgr.error_detected(errcode, "MPI_Abort has been called", NULL);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -221,7 +221,7 @@ void ompi_info::open_components()
component_map["rml"] = &orte_rml_base.rml_components;
orte_pls_base_open();
component_map["pls"] = &orte_pls_base.pls_opened;
component_map["pls"] = &orte_pls_base.available_components;
orte_sds_base_open();
component_map["sds"] = &orte_sds_base_components_available;

Просмотреть файл

@ -101,9 +101,9 @@ typedef void* ompi_iov_base_ptr_t;
*/
#if defined(HAVE_SOCKLEN_T)
typedef socklen_t ompi_socklen_t;
typedef socklen_t opal_socklen_t;
#else
typedef int ompi_socklen_t;
typedef int opal_socklen_t;
#endif

Просмотреть файл

@ -122,7 +122,7 @@ static int make_mask(unsigned int *len, unsigned long **mask)
linux_module_get_num_procs(&num_procs);
*len = num_procs / 8;
if (*len != num_procs * 8) {
if (*len != (unsigned int)num_procs * 8) {
++*len;
}

Просмотреть файл

@ -22,11 +22,12 @@
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/dss/dss_internal.h"
#include "opal/util/output.h"
static void orte_dss_arith_int(int *value, int *operand, orte_dss_arith_op_t operation);
static void orte_dss_arith_uint(uint *value, uint *operand, orte_dss_arith_op_t operation);

Просмотреть файл

@ -232,16 +232,6 @@ int orte_dss_compare_dt(orte_data_type_t *value1, orte_data_type_t *value2, orte
return ORTE_EQUAL;
}
/* ORTE_DAEMON_CMD */
/**
 * Three-way comparison of two daemon command flags.
 *
 * @param value1  pointer to the first flag (dereferenced, must not be NULL)
 * @param value2  pointer to the second flag (dereferenced, must not be NULL)
 * @param type    data type tag; not consulted by this comparison
 *
 * @return ORTE_EQUAL when both flags hold the same value,
 *         ORTE_VALUE1_GREATER when *value1 is larger,
 *         ORTE_VALUE2_GREATER when *value2 is larger.
 */
int orte_dss_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type)
{
    const orte_daemon_cmd_flag_t lhs = *value1;
    const orte_daemon_cmd_flag_t rhs = *value2;

    if (lhs == rhs) {
        return ORTE_EQUAL;
    }
    return (lhs > rhs) ? ORTE_VALUE1_GREATER : ORTE_VALUE2_GREATER;
}
/* ORTE_DATA_VALUE */
int orte_dss_compare_data_value(orte_data_value_t *value1, orte_data_value_t *value2, orte_data_type_t type)
{

Просмотреть файл

@ -114,10 +114,6 @@ int orte_dss_std_copy(void **dest, void *src, orte_data_type_t type)
datasize = sizeof(orte_data_type_t);
break;
case ORTE_DAEMON_CMD:
datasize = sizeof(orte_daemon_cmd_flag_t);
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);
return ORTE_ERR_UNKNOWN_DATA_TYPE;

Просмотреть файл

@ -112,6 +112,55 @@ extern "C" {
#error Unsupported pid_t size!
#endif
/* Unpack generic size macros */
/*
 * UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret)
 *
 * Dispatch on the integer type tag that the sender actually packed
 * (remote_type) and hand off to UNPACK_SIZE_MISMATCH_FOUND with the matching
 * fixed-width C type, so the values can be converted into unpack_type.
 *
 * An unrecognised remote_type stores ORTE_ERR_NOT_FOUND in ret and logs it.
 *
 * UNPACK_SIZE_MISMATCH_FOUND refers to variables literally named buffer,
 * dest, num_vals and ret in the enclosing scope, so the expansion site must
 * provide them (and should pass its own `ret` as the third argument).
 */
#define UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret)                  \
    do {                                                                     \
        switch (remote_type) {                                               \
        case ORTE_UINT8:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint8_t, remote_type);   \
            break;                                                           \
        case ORTE_INT8:                                                      \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int8_t, remote_type);    \
            break;                                                           \
        case ORTE_UINT16:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint16_t, remote_type);  \
            break;                                                           \
        case ORTE_INT16:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int16_t, remote_type);   \
            break;                                                           \
        case ORTE_UINT32:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint32_t, remote_type);  \
            break;                                                           \
        case ORTE_INT32:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int32_t, remote_type);   \
            break;                                                           \
        case ORTE_UINT64:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint64_t, remote_type);  \
            break;                                                           \
        case ORTE_INT64:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int64_t, remote_type);   \
            break;                                                           \
        default:                                                             \
            /* not an integer type tag we know how to convert from */        \
            ret = ORTE_ERR_NOT_FOUND;                                        \
            ORTE_ERROR_LOG(ret);                                             \
            break;                                                           \
        }                                                                    \
    } while (0)
/* NOTE: do not need to deal with endianness here, as the unpacking of
the underlying sender-side type will do that for us. Repeat: the
data in tmpbuf[] is already in host byte order. */
/*
 * UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype)
 *
 * Unpack *num_vals values of the sender-side C type `tmptype` (DSS type tag
 * `tmpdsstype`) into a scratch array, then convert each one to `unpack_type`
 * and store it in dest[].
 *
 * Relies on variables named buffer, dest, num_vals and ret in the enclosing
 * scope; the outcome is left in ret.
 *
 * - If the scratch allocation fails, ret is set to ORTE_ERR_OUT_OF_RESOURCE
 *   and logged, and nothing is unpacked. A NULL return for a zero-length
 *   request is not treated as a failure.
 * - dest[] is only written when the underlying unpack succeeded, so
 *   uninitialised scratch memory is never copied out on error.
 */
#define UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype)            \
    do {                                                                        \
        orte_std_cntr_t i;                                                      \
        tmptype *tmpbuf = (tmptype*)malloc(sizeof(tmptype) * (*num_vals));      \
        if (NULL == tmpbuf && 0 != *num_vals) {                                 \
            ret = ORTE_ERR_OUT_OF_RESOURCE;                                     \
            ORTE_ERROR_LOG(ret);                                                \
        } else {                                                                \
            ret = orte_dss_unpack_buffer(buffer, tmpbuf, num_vals, tmpdsstype); \
            if (ORTE_SUCCESS == ret) {                                          \
                for (i = 0 ; i < *num_vals ; ++i) {                             \
                    ((unpack_type*) dest)[i] = (unpack_type)(tmpbuf[i]);        \
                }                                                               \
            }                                                                   \
            free(tmpbuf);                                                       \
        }                                                                       \
    } while (0)
/**
* Internal struct used for holding registered dss functions
*/
@ -256,9 +305,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
int orte_dss_pack_data_type(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_dss_pack_daemon_cmd(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_dss_pack_data_value(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
@ -301,9 +347,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
int orte_dss_unpack_data_type(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
int orte_dss_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
int orte_dss_unpack_data_value(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
@ -360,8 +403,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
int orte_dss_compare_dt(orte_data_type_t *value1, orte_data_type_t *value2, orte_data_type_t type);
int orte_dss_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type);
int orte_dss_compare_data_value(orte_data_value_t *value1, orte_data_value_t *value2, orte_data_type_t type);
int orte_dss_compare_byte_object(orte_byte_object_t *value1, orte_byte_object_t *value2, orte_data_type_t type);
@ -405,7 +446,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
int orte_dss_print_null(char **output, char *prefix, void *src, orte_data_type_t type);
int orte_dss_print_std_cntr(char **output, char *prefix, orte_std_cntr_t *src, orte_data_type_t type);
int orte_dss_print_data_type(char **output, char *prefix, orte_data_type_t *src, orte_data_type_t type);
int orte_dss_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
int orte_dss_print_data_value(char **output, char *prefix, orte_data_value_t *src, orte_data_type_t type);
int orte_dss_print_byte_object(char **output, char *prefix, orte_byte_object_t *src, orte_data_type_t type);

Просмотреть файл

@ -426,19 +426,6 @@ int orte_dss_open(void)
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = ORTE_DAEMON_CMD;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_dss_pack_daemon_cmd,
orte_dss_unpack_daemon_cmd,
(orte_dss_copy_fn_t)orte_dss_std_copy,
(orte_dss_compare_fn_t)orte_dss_compare_daemon_cmd,
(orte_dss_size_fn_t)orte_dss_std_size,
(orte_dss_print_fn_t)orte_dss_print_daemon_cmd,
(orte_dss_release_fn_t)orte_dss_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_DATA_TYPE", &tmp))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = ORTE_BYTE_OBJECT;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_dss_pack_byte_object,
orte_dss_unpack_byte_object,

Просмотреть файл

@ -417,22 +417,6 @@ int orte_dss_pack_data_value(orte_buffer_t *buffer, void *src, orte_std_cntr_t n
return ORTE_SUCCESS;
}
/*
* ORTE_DAEMON_CMD
*/
/**
 * Pack daemon command flags into a buffer.
 *
 * The values are packed as ORTE_DAEMON_CMD_T by delegating to
 * orte_dss_pack_buffer.
 *
 * @param buffer    buffer to pack into
 * @param src       values to pack
 * @param num_vals  number of values at src
 * @param type      data type tag; not consulted here
 *
 * @return the status from orte_dss_pack_buffer (logged when it is not
 *         ORTE_SUCCESS)
 */
int orte_dss_pack_daemon_cmd(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals,
                             orte_data_type_t type)
{
    /* delegate to the packer for the real underlying type */
    const int rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_DAEMON_CMD_T);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
/*
* ORTE_BYTE_OBJECT
*/

Просмотреть файл

@ -475,28 +475,6 @@ int orte_dss_print_data_value(char **output, char *prefix, orte_data_value_t *sr
return ORTE_SUCCESS;
}
/*
* ORTE_DAEMON_CMD
*/
/**
 * Render a daemon command flag as a newly allocated, human-readable string.
 *
 * @param output  receives the string allocated by asprintf; set to NULL if
 *                the formatting call fails
 * @param prefix  text placed before the message; NULL selects a single space
 * @param src     flag to print; NULL prints a "NULL pointer" notice instead
 * @param type    data type tag; not consulted here
 *
 * @return ORTE_SUCCESS
 */
int orte_dss_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
{
    /* Use a literal for the default prefix rather than an asprintf'd copy:
     * the copy was never freed, and its allocation could itself fail. */
    const char *prefx = (NULL == prefix) ? " " : prefix;
    int written;

    if (NULL == src) {
        /* nothing to dereference - report the type only */
        written = asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: NULL pointer", prefx);
    } else {
        written = asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: %lu", prefx, (unsigned long) *src);
    }

    /* asprintf leaves *output unspecified on failure; hand back a
     * well-defined NULL instead */
    if (written < 0) {
        *output = NULL;
    }

    return ORTE_SUCCESS;
}
/*
* ORTE_BYTE_OBJECT
*/

Просмотреть файл

@ -109,10 +109,6 @@ int orte_dss_std_size(size_t *size, void *src, orte_data_type_t type)
*size = sizeof(orte_data_type_t);
break;
case ORTE_DAEMON_CMD:
*size = sizeof(orte_daemon_cmd_flag_t);
break;
default:
*size = 0;
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);

Просмотреть файл

@ -31,54 +31,6 @@
#include "orte/dss/dss_internal.h"
/*
 * UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret)
 *
 * Dispatch on the integer type tag that the sender actually packed
 * (remote_type) and hand off to UNPACK_SIZE_MISMATCH_FOUND with the matching
 * fixed-width C type, so the values can be converted into unpack_type.
 *
 * An unrecognised remote_type stores ORTE_ERR_NOT_FOUND in ret and logs it.
 *
 * UNPACK_SIZE_MISMATCH_FOUND refers to variables literally named buffer,
 * dest, num_vals and ret in the enclosing scope, so the expansion site must
 * provide them (and should pass its own `ret` as the third argument).
 */
#define UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret)                  \
    do {                                                                     \
        switch (remote_type) {                                               \
        case ORTE_UINT8:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint8_t, remote_type);   \
            break;                                                           \
        case ORTE_INT8:                                                      \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int8_t, remote_type);    \
            break;                                                           \
        case ORTE_UINT16:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint16_t, remote_type);  \
            break;                                                           \
        case ORTE_INT16:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int16_t, remote_type);   \
            break;                                                           \
        case ORTE_UINT32:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint32_t, remote_type);  \
            break;                                                           \
        case ORTE_INT32:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int32_t, remote_type);   \
            break;                                                           \
        case ORTE_UINT64:                                                    \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint64_t, remote_type);  \
            break;                                                           \
        case ORTE_INT64:                                                     \
            UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int64_t, remote_type);   \
            break;                                                           \
        default:                                                             \
            /* not an integer type tag we know how to convert from */        \
            ret = ORTE_ERR_NOT_FOUND;                                        \
            ORTE_ERROR_LOG(ret);                                             \
            break;                                                           \
        }                                                                    \
    } while (0)
/* NOTE: do not need to deal with endianness here, as the unpacking of
the underlying sender-side type will do that for us. Repeat: the
data in tmpbuf[] is already in host byte order. */
/*
 * UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype)
 *
 * Unpack *num_vals values of the sender-side C type `tmptype` (DSS type tag
 * `tmpdsstype`) into a scratch array, then convert each one to `unpack_type`
 * and store it in dest[].
 *
 * Relies on variables named buffer, dest, num_vals and ret in the enclosing
 * scope; the outcome is left in ret.
 *
 * - If the scratch allocation fails, ret is set to ORTE_ERR_OUT_OF_RESOURCE
 *   and logged, and nothing is unpacked. A NULL return for a zero-length
 *   request is not treated as a failure.
 * - dest[] is only written when the underlying unpack succeeded, so
 *   uninitialised scratch memory is never copied out on error.
 */
#define UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype)            \
    do {                                                                        \
        orte_std_cntr_t i;                                                      \
        tmptype *tmpbuf = (tmptype*)malloc(sizeof(tmptype) * (*num_vals));      \
        if (NULL == tmpbuf && 0 != *num_vals) {                                 \
            ret = ORTE_ERR_OUT_OF_RESOURCE;                                     \
            ORTE_ERROR_LOG(ret);                                                \
        } else {                                                                \
            ret = orte_dss_unpack_buffer(buffer, tmpbuf, num_vals, tmpdsstype); \
            if (ORTE_SUCCESS == ret) {                                          \
                for (i = 0 ; i < *num_vals ; ++i) {                             \
                    ((unpack_type*) dest)[i] = (unpack_type)(tmpbuf[i]);        \
                }                                                               \
            }                                                                   \
            free(tmpbuf);                                                       \
        }                                                                       \
    } while (0)
int orte_dss_unpack(orte_buffer_t *buffer, void *dst, orte_std_cntr_t *num_vals,
orte_data_type_t type)
{
@ -603,49 +555,6 @@ int orte_dss_unpack_data_type(orte_buffer_t *buffer, void *dest, orte_std_cntr_t
return ret;
}
/*
* ORTE_DAEMON_CMD
*/
/**
 * Unpack daemon command flags from a buffer.
 *
 * For a fully described buffer the packed type is peeked first: when it is
 * ORTE_DAEMON_CMD_T the values are unpacked directly, otherwise they are
 * unpacked as the packed type and converted via UNPACK_SIZE_MISMATCH.
 * For any other buffer the values are unpacked as ORTE_DAEMON_CMD_T with no
 * check - the caller must ensure the sizes match.
 *
 * @param buffer    buffer to unpack from
 * @param dest      destination array of orte_daemon_cmd_flag_t
 * @param num_vals  in/out count passed through to orte_dss_unpack_buffer
 * @param type      data type tag; not consulted here
 *
 * @return status of the last DSS operation performed
 */
int orte_dss_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals,
                               orte_data_type_t type)
{
    int ret;
    orte_data_type_t remote_type;

    /* buffer carries no type information: unpack blind using the local size */
    if (ORTE_DSS_BUFFER_FULLY_DESC != buffer->type) {
        ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
        }
        return ret;
    }

    /* fully described: find out which type was actually packed */
    ret = orte_dss_peek_type(buffer, &remote_type);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /* slow path - packed type differs, convert element by element */
    if (ORTE_DAEMON_CMD_T != remote_type) {
        UNPACK_SIZE_MISMATCH(orte_daemon_cmd_flag_t, remote_type, ret);
        return ret;
    }

    /* fast path - same type on both sides, unpack directly */
    ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }
    return ret;
}
/*
* ORTE_DATA_VALUE
*/

Просмотреть файл

@ -46,10 +46,6 @@ typedef struct {
uint8_t *bytes;
} orte_byte_object_t;
/* define the orted command flag type */
typedef uint16_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_CMD_T ORTE_UINT16
/**
* handle differences in iovec
*/

Просмотреть файл

@ -17,10 +17,12 @@
#
headers += \
base/errmgr_private.h \
base/base.h
libmca_errmgr_la_SOURCES += \
base/errmgr_base_close.c \
base/errmgr_base_receive.c \
base/errmgr_base_select.c \
base/errmgr_base_open.c \
base/errmgr_base_fns.c

Просмотреть файл

@ -48,26 +48,9 @@ extern "C" {
* function definitions
*/
ORTE_DECLSPEC int orte_errmgr_base_open(void);
ORTE_DECLSPEC int orte_errmgr_base_select(bool *allow_multi_user_threads,
bool *have_hidden_threads);
ORTE_DECLSPEC int orte_errmgr_base_select(void);
ORTE_DECLSPEC int orte_errmgr_base_close(void);
/*
* Base functions that are common to all implementations - can be overridden
*/
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC void orte_errmgr_base_proc_aborted(orte_process_name_t *proc);
ORTE_DECLSPEC void orte_errmgr_base_incomplete_start(orte_jobid_t job);
ORTE_DECLSPEC void orte_errmgr_base_error_detected(int error_code);
ORTE_DECLSPEC int orte_errmgr_base_register_job(orte_jobid_t job);
ORTE_DECLSPEC void orte_errmgr_base_abort(void);
/*
* globals that might be needed
*/
@ -78,6 +61,8 @@ ORTE_DECLSPEC extern bool orte_errmgr_initialized;
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
ORTE_DECLSPEC extern mca_errmgr_base_component_t orte_errmgr_base_selected_component;
/* make the default module available so that close can use it */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default;
/*
* external API functions will be documented in the mca/errmgr/errmgr.h file
*/

Просмотреть файл

@ -24,6 +24,8 @@
#include "opal/util/trace.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -44,6 +46,10 @@ int orte_errmgr_base_close(void)
&orte_errmgr_base_components_available, NULL);
orte_errmgr_initialized = false;
/* set the module back to the default so that error logging can continue */
orte_errmgr = orte_errmgr_default;
/* All done */
return ORTE_SUCCESS;

Просмотреть файл

@ -23,18 +23,14 @@
#endif
#include <stdlib.h>
#include "orte/orte_constants.h"
#include "orte/mca/schema/schema.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
void orte_errmgr_base_log(int error_code, char *filename, int line)
@ -49,55 +45,37 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_ERROR_NAME(error_code), filename, line);
}
/* orte_errmgr_base_error_detected(error_code); */
}
void orte_errmgr_base_proc_aborted(orte_process_name_t *proc)
int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg)
{
orte_jobid_t job;
int rc;
OPAL_TRACE(1);
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&job, proc))) {
ORTE_ERROR_LOG(rc);
return;
}
orte_rmgr.terminate_job(job);
return ORTE_ERR_NOT_AVAILABLE;
}
void orte_errmgr_base_incomplete_start(orte_jobid_t job)
int orte_errmgr_base_incomplete_start_not_avail(orte_gpr_notify_message_t *msgb)
{
OPAL_TRACE(1);
orte_rmgr.terminate_job(job);
return ORTE_ERR_NOT_AVAILABLE;
}
void orte_errmgr_base_error_detected(int error_code)
void orte_errmgr_base_error_detected(int error_code, char *fmt, ...)
{
OPAL_TRACE(1);
/* we can't know if any output is available yet, so
* we just exit */
exit(error_code);
}
void orte_errmgr_base_abort()
void orte_errmgr_base_abort(void)
{
OPAL_TRACE(1);
/* kill and reap all children */
orte_wait_kill(9);
/* abnormal exit */
orte_abort(-1, NULL);
/* guess we should exit */
exit(-1);
}
int orte_errmgr_base_register_job(orte_jobid_t job)
int orte_errmgr_base_register_job_not_avail(orte_jobid_t job)
{
/* register subscription for process_status values
* changing to abnormal termination codes
*/
OPAL_TRACE(1);
return ORTE_SUCCESS;
return ORTE_ERR_NOT_AVAILABLE;
}
int orte_errmgr_base_abort_procs_request_not_avail(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
return ORTE_ERR_NOT_AVAILABLE;
}

Просмотреть файл

@ -27,6 +27,7 @@
#include "opal/util/trace.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
@ -45,14 +46,22 @@
* Global variables
*/
int orte_errmgr_base_output = -1;
orte_errmgr_base_module_t orte_errmgr = {
/*
* we must define a default module so that the error logging
* functions can be available as early as possible
*/
orte_errmgr_base_module_t orte_errmgr_default = {
orte_errmgr_base_log,
orte_errmgr_base_proc_aborted,
orte_errmgr_base_incomplete_start,
orte_errmgr_base_proc_aborted_not_avail,
orte_errmgr_base_incomplete_start_not_avail,
orte_errmgr_base_error_detected,
orte_errmgr_base_register_job,
orte_errmgr_base_abort
orte_errmgr_base_register_job_not_avail,
orte_errmgr_base_abort,
orte_errmgr_base_abort_procs_request_not_avail
};
/* start out with a default module */
orte_errmgr_base_module_t orte_errmgr;
bool orte_errmgr_base_selected = false;
opal_list_t orte_errmgr_base_components_available;
mca_errmgr_base_component_t orte_errmgr_base_selected_component;
@ -81,6 +90,9 @@ int orte_errmgr_base_open(void)
orte_errmgr_base_output = -1;
}
/* set the default module */
orte_errmgr = orte_errmgr_default;
/* Open up all available components */
if (ORTE_SUCCESS !=

162
orte/mca/errmgr/base/errmgr_base_receive.c Обычный файл
Просмотреть файл

@ -0,0 +1,162 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/dss/dss.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
static bool recv_issued=false;
/*
 * Post the persistent, non-blocking receive that delivers ERRMGR-tagged
 * messages from any sender to orte_errmgr_base_recv().
 *
 * The call is idempotent: once the receive has been posted successfully,
 * further calls return ORTE_SUCCESS without doing anything.
 *
 * @return ORTE_SUCCESS if the receive is (already) posted, otherwise the
 *         error code reported by the RML.
 */
int orte_errmgr_base_comm_start(void)
{
    int rc;

    if (recv_issued) {
        return ORTE_SUCCESS;
    }

    rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY,
                                 ORTE_RML_TAG_ERRMGR,
                                 ORTE_RML_PERSISTENT,
                                 orte_errmgr_base_recv,
                                 NULL);
    if (ORTE_SUCCESS != rc) {
        /* leave recv_issued false so that a later call can try again
         * instead of silently reporting success with no receive posted
         */
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    recv_issued = true;
    return ORTE_SUCCESS;
}
/*
 * Cancel the ERRMGR receive posted by orte_errmgr_base_comm_start().
 *
 * Does nothing (and reports success) when no receive is outstanding.
 * The "receive issued" flag is cleared whether or not the cancel
 * succeeds.
 *
 * @return ORTE_SUCCESS, or the error code returned by the RML cancel.
 */
int orte_errmgr_base_comm_stop(void)
{
    int status = ORTE_SUCCESS;

    if (recv_issued) {
        status = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_ERRMGR);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
        }
        recv_issued = false;
    }

    return status;
}
/*
 * Handle a command message received on the ERRMGR tag.
 *
 * The command flag is unpacked from the incoming buffer and echoed back
 * as the first item of the answer, so the requester can check that the
 * reply matches its request. The answer is returned to the sender on the
 * same tag the request arrived on.
 *
 * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program.
 * DO NOT RELEASE THIS BUFFER IN THIS CODE
 *
 * @param status  receive status handed over by the RML (not examined)
 * @param sender  name of the process that sent the request
 * @param buffer  packed request: command flag followed by its arguments
 * @param tag     RML tag the request arrived on; reused for the answer
 * @param cbdata  callback data registered with the receive (not used)
 */
void orte_errmgr_base_recv(int status, orte_process_name_t* sender,
                           orte_buffer_t* buffer, orte_rml_tag_t tag,
                           void* cbdata)
{
    orte_buffer_t answer;
    orte_errmgr_cmd_flag_t command;
    orte_std_cntr_t count, nprocs;
    orte_process_name_t *procs = NULL;
    int rc;

    /* get the command */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_CMD))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* setup to return an answer */
    OBJ_CONSTRUCT(&answer, orte_buffer_t);

    /* pack the command in the answer - this is done to allow the caller to check
     * that we are talking about the same command
     */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_ERRMGR_CMD))) {
        ORTE_ERROR_LOG(rc);
        /* nothing useful can be sent, but the constructed buffer
         * must still be torn down
         */
        goto CLEANUP;
    }

    switch (command) {
        case ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD:
            /* get the number of processes */
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &nprocs, &count, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(rc);
                goto SEND_ANSWER;
            }

            /* the count comes off the wire - refuse to size an
             * allocation from a value that cannot describe an array
             */
            if (nprocs < 1) {
                ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
                goto SEND_ANSWER;
            }

            /* get the required space */
            procs = (orte_process_name_t*)malloc((size_t)nprocs * sizeof(orte_process_name_t));
            if (NULL == procs) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                goto SEND_ANSWER;
            }

            /* unpack the array of process names into the array itself
             * (not into the pointer variable that refers to it)
             */
            count = nprocs;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, procs, &count, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                goto SEND_ANSWER;
            }

            /* if we didn't get the number we requested, then something is wrong */
            if (count != nprocs) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto SEND_ANSWER;
            }

            /* process the request
             * NOTE(review): the array is released below, which assumes
             * abort_procs_request does not keep the pointer - confirm
             * for every errmgr component
             */
            if (ORTE_SUCCESS != (rc = orte_errmgr.abort_procs_request(procs, nprocs))) {
                ORTE_ERROR_LOG(rc);
                goto SEND_ANSWER;
            }
            break;

        default:
            ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
            break;
    }

SEND_ANSWER:
    if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
    }

CLEANUP:
    /* cleanup */
    free(procs);
    OBJ_DESTRUCT(&answer);
}

Просмотреть файл

@ -29,8 +29,7 @@
* Function for selecting one component from all those that are
* available.
*/
int orte_errmgr_base_select(bool *allow_multi_user_threads,
bool *have_hidden_threads)
int orte_errmgr_base_select(void)
{
opal_list_item_t *item;
mca_base_component_list_item_t *cli;
@ -71,11 +70,9 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
best_module = module;
best_component = component;
*allow_multi_user_threads = multi;
*have_hidden_threads = hidden;
/* update the best priority */
best_priority = priority;
/* update the best priority */
best_priority = priority;
}
/* If it's not the best one, finalize it */
@ -86,10 +83,10 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
}
}
/* If we didn't find one to select, that's okay - stick with default */
/* If we didn't find one to select, then we have a big problem */
if (NULL == best_component) {
return ORTE_SUCCESS;
return ORTE_ERROR;
}
/* We have happiness -- save the component and module for later
@ -98,7 +95,7 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
orte_errmgr = *best_module;
orte_errmgr_base_selected_component = *best_component;
orte_errmgr_base_selected = true;
/* all done */
return ORTE_SUCCESS;

82
orte/mca/errmgr/base/errmgr_private.h Обычный файл
Просмотреть файл

@ -0,0 +1,82 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_ERRMGR_PRIVATE_H
#define ORTE_MCA_ERRMGR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/rml/rml_types.h"
/*
* Functions for use solely within the ERRMGR framework
*/
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Define the ERRMGR command flag: a one-byte value that is the first item
 * packed into every ERRMGR request and echoed back in the answer.
 * ORTE_ERRMGR_CMD is the data type used when packing/unpacking the flag.
 */
typedef uint8_t orte_errmgr_cmd_flag_t;
#define ORTE_ERRMGR_CMD ORTE_UINT8
/* define some commands */
/* request that a list of processes be aborted; the flag is followed by the
 * number of processes and then the array of process names
 */
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
/* Internal support */
/* post the persistent receive for ERRMGR-tagged messages */
int orte_errmgr_base_comm_start(void);
/* cancel the receive posted by orte_errmgr_base_comm_start */
int orte_errmgr_base_comm_stop(void);
/* receive callback: unpacks the command flag from "buffer", acts on it and
 * sends an answer back to "sender" on "tag"
 */
void orte_errmgr_base_recv(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
/*
* Base functions
*
* NOTE(review): the "_not_avail" variants are presumably placeholders that
* report the service as unavailable until a component is selected - their
* implementations are not in this header, so confirm against the base code.
*/
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg);
ORTE_DECLSPEC int orte_errmgr_base_incomplete_start_not_avail(orte_gpr_notify_message_t *msg);
ORTE_DECLSPEC void orte_errmgr_base_error_detected(int error_code, char *fmt, ...);
ORTE_DECLSPEC int orte_errmgr_base_register_job_not_avail(orte_jobid_t job);
ORTE_DECLSPEC void orte_errmgr_base_abort(void);
ORTE_DECLSPEC int orte_errmgr_base_abort_procs_request_not_avail(orte_process_name_t *procs, orte_std_cntr_t num_procs);
/*
* external API functions will be documented in the mca/errmgr/errmgr.h file
*/
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

51
orte/mca/errmgr/bproc/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,51 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(errmgr_bproc_CPPFLAGS)
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_bproc_DSO
component_noinst =
component_install = mca_errmgr_bproc.la
else
component_noinst = libmca_errmgr_bproc.la
component_install =
endif
sources = \
errmgr_bproc.h \
errmgr_bproc.c \
errmgr_bproc_component.c
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_bproc_la_SOURCES = $(sources)
mca_errmgr_bproc_la_LIBADD = \
$(errmgr_bproc_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_errmgr_bproc_la_LDFLAGS = -module -avoid-version $(errmgr_bproc_LDFLAGS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_bproc_la_SOURCES = $(sources)
libmca_errmgr_bproc_la_LIBADD = $(errmgr_bproc_LIBS)
libmca_errmgr_bproc_la_LDFLAGS = -module -avoid-version $(errmgr_bproc_LDFLAGS)

Просмотреть файл

@ -17,22 +17,22 @@
# $HEADER$
#
# MCA_pls_bproc_orted_CONFIG([action-if-found], [action-if-not-found])
# MCA_errmgr_bproc_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_pls_bproc_orted_CONFIG],[
OMPI_CHECK_BPROC([pls_bproc_orted], [pls_bproc_orted_good=1],
[pls_bproc_orted_good=1], [pls_bproc_orted_good=0])
AC_DEFUN([MCA_errmgr_bproc_CONFIG],[
OMPI_CHECK_BPROC([errmgr_bproc], [errmgr_bproc_good=1],
[errmgr_bproc_good=1], [errmgr_bproc_good=0])
# if check worked, set wrapper flags if so.
# Evaluate succeed / fail
AS_IF([test "$pls_bproc_orted_good" = "1"],
[pls_bproc_orted_WRAPPER_EXTRA_LDFLAGS="$pls_bproc_orted_LDFLAGS"
pls_bproc_orted_WRAPPER_EXTRA_LIBS="$pls_bproc_orted_LIBS"
AS_IF([test "$errmgr_bproc_good" = "1"],
[errmgr_bproc_WRAPPER_EXTRA_LDFLAGS="$errmgr_bproc_LDFLAGS"
errmgr_bproc_WRAPPER_EXTRA_LIBS="$errmgr_bproc_LIBS"
$1],
[$2])
# set build flags to use in makefile
AC_SUBST([pls_bproc_orted_CPPFLAGS])
AC_SUBST([pls_bproc_orted_LDFLAGS])
AC_SUBST([pls_bproc_orted_LIBS])
AC_SUBST([errmgr_bproc_CPPFLAGS])
AC_SUBST([errmgr_bproc_LDFLAGS])
AC_SUBST([errmgr_bproc_LIBS])
])dnl

23
orte/mca/errmgr/bproc/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=errmgr_bproc_component.c
PARAM_CONFIG_FILES="Makefile"

223
orte/mca/errmgr/bproc/errmgr_bproc.c Обычный файл
Просмотреть файл

@ -0,0 +1,223 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/mca/errmgr/bproc/errmgr_bproc.h"
/*
 * This function gets called when the SMR updates a process state to
 * indicate that it aborted. Since the bproc component is only active on
 * non-HNP processes, this function will NEVER be called; should that ever
 * happen anyway, it takes no action and reports that the service is not
 * available.
 *
 * @param msg  the notification message (ignored)
 * @return ORTE_ERR_NOT_AVAILABLE, always
 */
int orte_errmgr_bproc_proc_aborted(orte_gpr_notify_message_t *msg)
{
return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * This function gets called when the SMR updates a process state to
 * indicate that it failed to start. Since the bproc component is only active on
 * non-HNP processes, this function will NEVER be called; should that ever
 * happen anyway, it takes no action and reports that the service is not
 * available.
 *
 * @param msg  the notification message (ignored)
 * @return ORTE_ERR_NOT_AVAILABLE, always
 */
int orte_errmgr_bproc_incomplete_start(orte_gpr_notify_message_t *msg)
{
return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * This function gets called when a process detects an internal error.
 * Bproc is unusually bad about letting us pass information that we
 * aborted as opposed to normally terminated. There is no way to locally
 * monitor the process state on a remote node, so the only thing we
 * can do is pass the info back to the Bproc PLS on the HNP and let it
 * figure out what to do.
 *
 * The optional printf-style message is printed first, then the alert is
 * sent, and finally the process aborts. Failing to deliver the alert is
 * logged but never prevents the abort: a process that has detected a
 * fatal internal error must not keep running just because the
 * notification could not be sent.
 *
 * @param error_code  exit status handed to orte_abort()
 * @param fmt         printf-style format for an error message, or NULL
 *                    when there is nothing to print
 */
void orte_errmgr_bproc_error_detected(int error_code, char *fmt, ...)
{
    orte_buffer_t* cmd;
    uint8_t command;
    int rc;

    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char* buffer = NULL;

        va_start(arglist, fmt);
        /* the contents of buffer are not defined if vasprintf fails,
         * so only print and free it on success
         */
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* the text is already formatted - never reuse it as a
             * format string, it may legitimately contain '%'
             */
            opal_output(0, "%s", buffer);
            free(buffer);
        }
        va_end(arglist);
    }

    /* Now prepare and send a message to the BProc PLS so it knows that
     * we abnormally terminated. It doesn't matter what is in the
     * message - the fact that it gets received is adequate
     */
    command = 0x01;
    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    } else {
        /* just pack something */
        if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_UINT8))) {
            ORTE_ERROR_LOG(rc);
        } else if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, cmd,
                                            ORTE_RML_TAG_BPROC_ABORT, 0)) {
            /* send the alert - failure is reported, then we abort anyway */
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        }
        OBJ_RELEASE(cmd);
    }

    /* okay, now we can truly abort. Tell the abort function not to bother writing out
     * an abort file - we can't do anything with it anyway!
     */
    orte_abort(error_code, false);
}
/*
 * This function gets called when a process desperately needs to just die.
 * Nothing can be done by definition here - this function ONLY gets
 * called as an absolute last resort.
 *
 * The definition spells out (void) so that it is a real prototype and
 * agrees with the declaration in errmgr_bproc.h.
 */
void orte_errmgr_bproc_abort(void)
{
    /* abnormal exit - no point in writing out an abort file as bproc doesn't
     * know what to do with it anyway
     */
    orte_abort(-1, false);
}
/*
 * Alternatively, some systems (e.g., OpenMPI) need to tell us to kill
 * some other subset of processes along with us. Send that info to the
 * HNP so it can kill them.
 *
 * The request (command flag, number of processes, array of names) is sent
 * to the replica on the ERRMGR tag, which is the tag the base receive
 * function listens on and answers on. The call then blocks until the
 * answer arrives and checks that it echoes the same command.
 *
 * NOTE: this function assumes that the underlying ORTE infrastructure is
 * still operational. Use of this function should therefore be restricted
 * to cases where the problem is in a higher layer (e.g., MPI) as the
 * process is likely to "hang" if an ORTE problem has been encountered.
 *
 * @param procs   array of names of the processes to be aborted
 * @param nprocs  number of entries in procs; must be at least 1
 * @return ORTE_SUCCESS once the request has been acknowledged,
 *         ORTE_ERR_BAD_PARAM for a NULL or empty list,
 *         ORTE_ERR_OUT_OF_RESOURCE if a buffer cannot be created,
 *         ORTE_ERR_COMM_FAILURE on a send/receive problem or a mismatched
 *         answer, or the error code of a failed pack/unpack.
 */
int orte_errmgr_bproc_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_errmgr_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    /* protect us against error - an empty or negative count cannot
     * describe an array and would only confuse the receiver
     */
    if (NULL == procs || nprocs < 1) {
        return ORTE_ERR_BAD_PARAM;
    }

    command = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;

    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* pack the command */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_ERRMGR_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    /* pack the number of procs we are requesting be aborted */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &nprocs, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    /* pack the array of proc names */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, procs, nprocs, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    /* send the request on the error manager's own tag - that is where
     * the base receive function is listening
     */
    if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, cmd, ORTE_RML_TAG_ERRMGR, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* setup a buffer for the answer */
    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* enter a blocking receive until we hear back - the answer comes
     * back on the same tag the request was delivered on
     */
    if (0 > orte_rml.recv_buffer(orte_errmgr_bproc_globals.replica, answer, ORTE_RML_TAG_ERRMGR)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_ERRMGR_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* check that this is the right command */
    if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* clean up and leave */
    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}
/*
 * It is imperative that ONLY an HNP perform this registration!
 * The bproc component is never selected on an HNP, so this function
 * registers nothing and simply reports success.
 *
 * @param job  id of the job (ignored)
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_bproc_register_job(orte_jobid_t job)
{
return ORTE_SUCCESS;
}

81
orte/mca/errmgr/bproc/errmgr_bproc.h Обычный файл
Просмотреть файл

@ -0,0 +1,81 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef ORTE_ERRMGR_BPROC_H
#define ORTE_ERRMGR_BPROC_H
#include "orte_config.h"
#include "orte/orte_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/errmgr/errmgr.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* Module open / close
*/
int orte_errmgr_bproc_open(void);
int orte_errmgr_bproc_close(void);
/*
* Startup / Shutdown
*/
orte_errmgr_base_module_t*
orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
int orte_errmgr_bproc_finalize(void);
/*
 * globals used within the component
 */
typedef struct {
/* non-zero when debug output is enabled; set when the component is
 * opened from the "errmgr" / "bproc" / "debug" MCA parameter
 */
int debug;
/* name of the process that errmgr messages are sent to; set to the
 * name service replica when the component is initialized
 */
orte_process_name_t *replica;
} orte_errmgr_bproc_globals_t;
/* the single instance, defined in the component source file */
extern orte_errmgr_bproc_globals_t orte_errmgr_bproc_globals;
/*
* Component API functions
*/
int orte_errmgr_bproc_proc_aborted(orte_gpr_notify_message_t *msg);
int orte_errmgr_bproc_incomplete_start(orte_gpr_notify_message_t *msg);
void orte_errmgr_bproc_error_detected(int error_code, char *fmt, ...);
void orte_errmgr_bproc_abort(void);
int orte_errmgr_bproc_register_job(orte_jobid_t job);
int orte_errmgr_bproc_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -0,0 +1,164 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open RTE Error Manager - BProc component
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_bproc.h"
/*
* Struct of function pointers that need to be initialized
*/
/* Public component descriptor for the "bproc" error manager. The
 * initializer is positional: MCA version/identification data first, then
 * the component data, then the errmgr init and finalize entry points.
 */
mca_errmgr_base_component_t mca_errmgr_bproc_component = {
{
/* the errmgr framework version this component is built against */
ORTE_ERRMGR_BASE_VERSION_1_3_0,
"bproc", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_errmgr_bproc_open, /* module open */
orte_errmgr_bproc_close /* module close */
},
{
false /* checkpoint / restart */
},
orte_errmgr_bproc_component_init, /* module init */
orte_errmgr_bproc_finalize /* module shutdown */
};
/*
* setup the function pointers for the module
*/
/* Module handed back by orte_errmgr_bproc_component_init(). The entries
 * are positional and must stay in the order of the fields of
 * orte_errmgr_base_module_t.
 */
static orte_errmgr_base_module_t orte_errmgr_bproc = {
/* log: the base implementation is reused */
orte_errmgr_base_log,
/* proc_aborted */
orte_errmgr_bproc_proc_aborted,
/* incomplete_start */
orte_errmgr_bproc_incomplete_start,
/* error_detected */
orte_errmgr_bproc_error_detected,
/* register_job */
orte_errmgr_bproc_register_job,
/* abort */
orte_errmgr_bproc_abort,
/* abort_procs_request */
orte_errmgr_bproc_abort_procs_request
};
/*
* Whether or not we allowed this component to be selected
*/
static bool initialized = false;
/* local globals */
orte_errmgr_bproc_globals_t orte_errmgr_bproc_globals;
/*
 * Open the component: register the "errmgr" / "bproc" / "debug" MCA
 * parameter (default 0) and record whether debug output was requested.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_bproc_open(void)
{
    int id;
    /* start from "debug off" so that a failed lookup, which may leave
     * the value untouched, never results in reading an uninitialized
     * variable
     */
    int tmp = 0;

    id = mca_base_param_register_int("errmgr", "bproc", "debug", NULL, 0);
    mca_base_param_lookup_int(id, &tmp);

    orte_errmgr_bproc_globals.debug = (0 != tmp) ? true : false;

    return ORTE_SUCCESS;
}
/*
 * Close the component. There is nothing to release, so this only
 * reports success.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_bproc_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Decide whether this component is available for selection and, if so,
 * hand back its module.
 *
 * HNPs and daemons are turned away (NULL is returned and the output
 * arguments are left untouched). Every other process receives the bproc
 * module together with its priority and thread flags.
 *
 * @param allow_multi_user_threads  out: set to false when selected
 * @param have_hidden_threads       out: set to false when selected
 * @param priority                  out: selection priority when selected
 * @return pointer to the bproc module, or NULL when this component must
 *         not be used in the calling process
 */
orte_errmgr_base_module_t*
orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
                                 int *priority)
{
    if (orte_errmgr_bproc_globals.debug) {
        opal_output(0, "errmgr_bproc_init called");
    }

    /* only processes that are neither an HNP nor an orted may pick us */
    if (!orte_process_info.seed && !orte_process_info.daemon) {
        /* no part of OpenRTE allows or has threads */
        *have_hidden_threads = false;
        *allow_multi_user_threads = false;

        /* an arbitrary, positive priority -- it absolutely must be
         * higher than that of the proxy component
         */
        *priority = 100;

        /* define the replica for us to use - for now, just point
         * to the name service replica
         */
        orte_errmgr_bproc_globals.replica = orte_process_info.ns_replica;

        initialized = true;
        return &orte_errmgr_bproc;
    }

    /* don't take me! */
    return NULL;
}
/*
 * Shut the component down: mark it as no longer initialized and, when
 * debugging is enabled, announce the call together with this process's
 * name.
 *
 * @return ORTE_SUCCESS, always
 */
int orte_errmgr_bproc_finalize(void)
{
    const int announce = orte_errmgr_bproc_globals.debug;

    initialized = false;

    if (announce) {
        opal_output(0, "[%lu,%lu,%lu] errmgr_bproc_finalize called",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
    }

    /* All done */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -30,7 +30,11 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/schema/schema.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/ns/ns_types.h"
#include "opal/mca/mca.h"
@ -58,8 +62,7 @@ extern "C" {
/**
* Log an error
* Log an error that occurred in the runtime environment, and call the "error_detected"
* interface to see if further action is required.
* Log an error that occurred in the runtime environment
*
* @code
* orte_errmgr.log("this is an error", __FILE__, __LINE__);
@ -70,70 +73,110 @@ typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename,
/**
* Alert - process aborted
* This function is called when a remote process aborts during execution. Note that local
* process errors should always be reported through the error_detected interface and
* NOT here. The function is called when a message is received from the universe daemon
* indicating that another process in the job failed. For now, this function will
* simply cause the local process to gracefully finalize and terminate.
* This function is called when a remote process aborts during execution. The function
* is called via the GPR's trigger notification system. Actions taken in response
* to the abnormal termination of a remote application process will vary across
* the various errmgr components.
* NOTE: Local process errors should always be reported through the error_detected interface and
* NOT here.
*/
typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *proc);
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_gpr_notify_message_t *msg);
/**
* Alert - incomplete start of a job
* This function is called when an attempted launch of a job encounters failure of
* one or more processes to start. The function decides on the strategy for dealing
* with this "incomplete start" situation - for now, it simply orders the resource
* manager to terminate the entire job.
* one or more processes to start. The strategy for dealing
* with this "incomplete start" situation varies across the various errmgr components.
*
* This function is only called by the respective process launcher, which is responsible
* for detecting incomplete starts.
* for detecting incomplete starts. If on a daemon, the function simply updates the
* process state to indicate failure to launch - this initiates a trigger that goes to
* the respective HNP for response.
*
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
* from taking any action to this function call. Instead, they are restricted to simply
* returning.
*/
typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job);
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_gpr_notify_message_t *msg);
/**
* Alert - internal error detected
* This function is called when an internal error is detected within the local process.
* It decides what to do about the error - for now, it simply orders the local process
* to finalize and terminate.
* This function is called when an internal error is detected within a local process.
* It decides what to do about the error. In the case of application processes, it simply
* orders the local process to finalize and terminate. The abnormal termination will be
* detected and dealt with by the daemon/HNP system.
*
* HNPs, of course, cannot simply exit - they must first cleanup their running jobs if at
* all possible. In some cases, this cannot be done - e.g., if the error detected would
* prevent operation of the registry or has corrupted memory. In these extreme cases,
* nothing can really be done.
*
* Likewise, orteds have responsibility towards their local application processes and
* must make some attempt to clean them up before exiting.
*
* The function pretty prints an error message if possible. Error message should be
* specified using the standard \code printf() format.
*/
typedef void (*orte_errmgr_base_module_error_detected_fn_t)(int error_code);
typedef void (*orte_errmgr_base_module_error_detected_fn_t)(int error_code, char *fmt, ...);
/*
* Register a job with the error manager
* When a job is launched, this function is called so the error manager can register
* subscriptions on the job segment so that the error manager will be notified when
* problems occur - i.e., when process status entries change to abnormal termination
* values. Process status entries are changed by the appropriate state-of-health monitor
* values. Process status entries are changed by the appropriate state monitor
* and/or the process launcher, depending upon the stage at which the problem occurs.
*
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
* to that time, failure of processes to start are the responsibility of the respective
* process launcher - which is expected to call the error manager via the "incomplete
* start" interface to report any problems prior to the job beginning "execution".
*
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
* MUST do nothing but return ORTE_SUCCESS.
*/
typedef int (*orte_errmgr_base_module_register_job_fn_t)(orte_jobid_t job);
/**
* Alert - self aborting
* This function is called when a process is aborting. The routine will kill
* any child processes and terminate the calling process.
* This function is called when a process is aborting. It will finalize the process
* itself, and then exits - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef void (*orte_errmgr_base_module_abort_fn_t)(void);
/*
* Request that the system abort processes other than myself
* The possibility exists that a process will decide that ONLY a small subset of a job
* must be aborted. This function allows a process to request that the identified
* processes be aborted. The "request" portion of the function's name is not
* by accident - this function specifically does NOT perform the abort process
* itself, but simply requests that it be done.
*
* NOTE: Please ensure that you do NOT include your own process name in the
* array or else you will be ordered to "die" before you complete this function
* (i.e., you will be held in a blocking receive pending an answer from the
* HNP, which won't come before you receive your own "die" command). If you need
* to die too, then call "abort" after completing this function call.
*/
typedef int (*orte_errmgr_base_module_abort_procs_request_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs);
/*
* Ver 1.0.0
*/
struct orte_errmgr_base_module_1_0_0_t {
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_module_error_detected_fn_t error_detected;
orte_errmgr_base_module_register_job_fn_t register_job;
orte_errmgr_base_module_abort_fn_t abort;
struct orte_errmgr_base_module_1_3_0_t {
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_module_error_detected_fn_t error_detected;
orte_errmgr_base_module_register_job_fn_t register_job;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_procs_request_fn_t abort_procs_request;
};
typedef struct orte_errmgr_base_module_1_0_0_t orte_errmgr_base_module_1_0_0_t;
typedef orte_errmgr_base_module_1_0_0_t orte_errmgr_base_module_t;
typedef struct orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_1_3_0_t;
typedef orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_t;
/*
* ERRMGR Component
@ -150,26 +193,26 @@ typedef int (*orte_errmgr_base_component_finalize_fn_t)(void);
* the standard component data structure
*/
struct mca_errmgr_base_component_1_0_0_t {
struct mca_errmgr_base_component_1_3_0_t {
mca_base_component_t errmgr_version;
mca_base_component_data_1_0_0_t errmgr_data;
orte_errmgr_base_component_init_fn_t errmgr_init;
orte_errmgr_base_component_finalize_fn_t errmgr_finalize;
};
typedef struct mca_errmgr_base_component_1_0_0_t mca_errmgr_base_component_1_0_0_t;
typedef mca_errmgr_base_component_1_0_0_t mca_errmgr_base_component_t;
typedef struct mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_1_3_0_t;
typedef mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_t;
/*
* Macro for use in components that are of type errmgr v1.0.0
*/
#define ORTE_ERRMGR_BASE_VERSION_1_0_0 \
/* ns v1.0 is chained to MCA v1.0 */ \
#define ORTE_ERRMGR_BASE_VERSION_1_3_0 \
/* errmgr v1.3 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* errmgr v1.0 */ \
"errmgr", 1, 0, 0
/* errmgr v1.3 */ \
"errmgr", 1, 3, 0
/* Global structure for accessing error manager functions
*/

46
orte/mca/errmgr/hnp/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,46 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
errmgr_hnp.h \
errmgr_hnp_component.c \
errmgr_hnp.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_hnp_DSO
component_noinst =
component_install = mca_errmgr_hnp.la
else
component_noinst = libmca_errmgr_hnp.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_hnp_la_SOURCES = $(sources)
mca_errmgr_hnp_la_LDFLAGS = -module -avoid-version
mca_errmgr_hnp_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_hnp_la_SOURCES =$(sources)
libmca_errmgr_hnp_la_LDFLAGS = -module -avoid-version

23
orte/mca/errmgr/hnp/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# PARAM_INIT_FILE names a source file of this component
# (errmgr_hnp_component.c is part of the component's source list).
PARAM_INIT_FILE=errmgr_hnp_component.c
# Files this component asks configure to generate.
PARAM_CONFIG_FILES="Makefile"

205
orte/mca/errmgr/hnp/errmgr_hnp.c Обычный файл
Просмотреть файл

@ -0,0 +1,205 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <stdlib.h>
#include <stdarg.h>
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/hnp/errmgr_hnp.h"
/*
 * Callback invoked when someone updates a process state to indicate
 * that the process has aborted. That update fires a registry trigger,
 * which delivers a minimal notification message here. The only part of
 * the message used is msg->target (the trigger name), from which the
 * jobid is extracted.
 *
 * Other components may follow their own strategy for this situation.
 * This component simply marks the job as aborted and asks the PLS to
 * terminate it.
 *
 * Returns ORTE_SUCCESS, or the error code of the first step that failed
 * (each failure is also logged via ORTE_ERROR_LOG).
 */
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t jobid;
    int status;

    OPAL_TRACE(1);

    opal_output(orte_errmgr_base_output, "errmgr:hnp: proc abort has been detected");

    /* the trigger is a named one, so the jobid comes straight out of
     * the trigger name */
    status = orte_schema.extract_jobid_from_std_trigger_name(&jobid, msg->target);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* record the new job state */
    status = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_ABORTED);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* have the pls terminate the job; a failure here is logged and
     * handed back to the caller */
    status = orte_pls.terminate_job(jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/*
 * Callback invoked when someone updates a process state to indicate
 * that the process failed to start. That update fires a registry
 * trigger, which delivers a minimal notification message here. The only
 * part of the message used is msg->target (the trigger name), from
 * which the jobid is extracted.
 *
 * Other components may follow their own strategy for this situation.
 * This component simply marks the job as failed-to-start and asks the
 * PLS to terminate it.
 *
 * Returns ORTE_SUCCESS, or the error code of the first step that failed
 * (each failure is also logged via ORTE_ERROR_LOG).
 */
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t jobid;
    int status;

    OPAL_TRACE(1);

    /* the trigger is a named one, so the jobid comes straight out of
     * the trigger name */
    status = orte_schema.extract_jobid_from_std_trigger_name(&jobid, msg->target);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* record the new job state */
    status = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* have the pls terminate the job; a failure here is logged and
     * handed back to the caller */
    status = orte_pls.terminate_job(jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/*
 * This function gets called when the HNP itself detects an internal error!
 * Ideally, we would find some way to tell all the active jobs to die before
 * we depart ourselves. Unfortunately, at this time, we aren't sure we can do
 * this - later, we'll add some more intelligence by, for example, checking
 * the error code to see if it's something that would allow us to alert
 * the remote orteds.
 *
 * For now, we'll just depart!
 *
 * error_code  value handed to orte_abort()
 * fmt         optional printf-style format for a message to output
 *             first; may be NULL, in which case nothing is printed
 * ...         arguments consumed by fmt
 */
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        /* Only use the buffer when formatting succeeded: on failure its
         * contents are not guaranteed, so it must be neither printed
         * nor freed. */
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* The formatted text is data, not a format string - any '%'
             * in it must not be interpreted a second time. */
            opal_output(0, "%s", buffer);
            free(buffer);
        }
        va_end(arglist);
    }

    /* abnormal exit */
    orte_abort(error_code, false);
}
/*
 * Last-resort exit for the HNP: called when the HNP desperately needs
 * to just die. By definition nothing else can be done here, so this
 * goes directly to orte_abort() with an exit status of -1.
 */
void orte_errmgr_hnp_abort(void)
{
    OPAL_TRACE(1);

    /* abnormal exit */
    orte_abort(-1, false);
}
/*
 * This function gets called when a process wants to request that the HNP
 * abort some set of processes for it. Since this component IS for the HNP,
 * the request is meant to be executed here, calling upon the PLS as
 * needed.
 *
 * NOTE: the body is currently a placeholder - neither procs nor nprocs
 * is examined, nothing is aborted, and ORTE_SUCCESS is always returned.
 */
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    return ORTE_SUCCESS;
}
/*
 * Register the HNP's errmgr functions to be called when the job encounters
 * certain pre-identified problem states.
 *
 * Two alert monitors are defined for the job: one that calls
 * orte_errmgr_hnp_proc_aborted when a process aborts, and one that calls
 * orte_errmgr_hnp_incomplete_start when something fails to launch.
 *
 * NOTE: It is imperative that ONLY the HNP perform this registration!
 *
 * Returns ORTE_SUCCESS, or the (logged) error code of the first monitor
 * definition that failed.
 */
int orte_errmgr_hnp_register_job(orte_jobid_t job)
{
    int status;

    /* ABORT trigger: fires when any process aborts */
    status = orte_smr.define_alert_monitor(job, ORTE_NUM_ABORTED_TRIGGER,
                                           ORTE_PROC_NUM_ABORTED, 0, 1, true,
                                           orte_errmgr_hnp_proc_aborted, NULL);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* FAILED_LAUNCH trigger: fires when the launch fails */
    status = orte_smr.define_alert_monitor(job, ORTE_FAILED_TO_START_TRIGGER,
                                           ORTE_PROC_NUM_FAILED_START, 0, 1, true,
                                           orte_errmgr_hnp_incomplete_start, NULL);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}

79
orte/mca/errmgr/hnp/errmgr_hnp.h Обычный файл
Просмотреть файл

@ -0,0 +1,79 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef ORTE_ERRMGR_HNP_H
#define ORTE_ERRMGR_HNP_H

#include "orte_config.h"
#include "orte/orte_types.h"
#include "orte/mca/ns/ns_types.h"
/* orte_gpr_notify_message_t is used in the prototypes below; include its
 * header directly (as the orted and proxy errmgr headers do) instead of
 * relying on another header to pull it in. */
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/errmgr/errmgr.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/*
 * Module open / close
 */
int orte_errmgr_hnp_open(void);
int orte_errmgr_hnp_close(void);

/*
 * Startup / Shutdown
 */
orte_errmgr_base_module_t*
orte_errmgr_hnp_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
int orte_errmgr_hnp_finalize(void);

/*
 * globals used within the component
 */
typedef struct {
    int debug;   /* non-zero enables the component's debug output */
} orte_errmgr_hnp_globals_t;

extern orte_errmgr_hnp_globals_t orte_errmgr_hnp_globals;

/*
 * Component API functions
 */
/* trigger callback: a process aborted */
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg);
/* trigger callback: a process failed to start */
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg);
/* print the optional printf-style message, then orte_abort(error_code) */
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...);
/* last-resort abort */
void orte_errmgr_hnp_abort(void);
/* define the abort / failed-to-start alert monitors for a job */
int orte_errmgr_hnp_register_job(orte_jobid_t job);
/* request that the given processes be aborted */
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif

174
orte/mca/errmgr/hnp/errmgr_hnp_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,174 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open RTE Error Manager (errmgr) - HNP component
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/*
 * Component descriptor for the errmgr "hnp" component: version data and
 * open/close hooks, followed by the component's init and finalize
 * entry points. All initializers are positional.
 */
mca_errmgr_base_component_t mca_errmgr_hnp_component = {
{
ORTE_ERRMGR_BASE_VERSION_1_3_0, /* errmgr framework version this component is written against */
"hnp", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_errmgr_hnp_open, /* component open */
orte_errmgr_hnp_close /* component close */
},
{
false /* checkpoint / restart */
},
orte_errmgr_hnp_component_init, /* component init - returns the module or NULL */
orte_errmgr_hnp_finalize /* component shutdown */
};
/*
 * Function-pointer table for the hnp module. The initializers are
 * positional, so their order must match the field order of
 * orte_errmgr_base_module_t. Logging uses the base implementation
 * (orte_errmgr_base_log); every other entry is HNP-specific.
 */
static orte_errmgr_base_module_t orte_errmgr_hnp = {
orte_errmgr_base_log,
orte_errmgr_hnp_proc_aborted,
orte_errmgr_hnp_incomplete_start,
orte_errmgr_hnp_error_detected,
orte_errmgr_hnp_register_job,
orte_errmgr_hnp_abort,
orte_errmgr_hnp_abort_procs_request
};
/*
 * Whether or not we allowed this component to be selected.
 * Set to true at the end of a successful component_init and back to
 * false in finalize; nothing in this file reads it.
 */
static bool initialized = false;
/* local globals - the debug flag is filled in by orte_errmgr_hnp_open() */
orte_errmgr_hnp_globals_t orte_errmgr_hnp_globals;
/*
 * Open the component.
 *
 * Registers the integer MCA parameter ("errmgr", "hnp", "debug") with a
 * default of 0 and stores whether it is non-zero in
 * orte_errmgr_hnp_globals.debug. Always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_open(void)
{
    int id;
    /* Start from the parameter's default so that a failed lookup (whose
     * return value is not examined) cannot leave us reading an
     * uninitialized value. */
    int value = 0;

    id = mca_base_param_register_int("errmgr", "hnp", "debug", NULL, 0);
    mca_base_param_lookup_int(id, &value);

    if (value) {
        orte_errmgr_hnp_globals.debug = true;
    } else {
        orte_errmgr_hnp_globals.debug = false;
    }

    return ORTE_SUCCESS;
}
/*
 * Close the component. There is nothing to release, so this simply
 * reports success.
 */
int orte_errmgr_hnp_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component init: decide whether this component should be used and, if
 * so, hand back its module.
 *
 * Returns NULL when this process is not the HNP (orte_process_info.seed
 * is false) or when the errmgr receive function could not be started.
 * Otherwise sets *priority to 10, reports no thread support through the
 * two bool out-parameters, and returns the hnp module.
 */
orte_errmgr_base_module_t*
orte_errmgr_hnp_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
                               int *priority)
{
    int status;

    if (orte_errmgr_hnp_globals.debug) {
        opal_output(0, "errmgr_hnp_init called");
    }

    /* only the HNP may select this component */
    if (!orte_process_info.seed) {
        return NULL;
    }

    /* an arbitrary, positive priority - it only matters relative to
     * the other components */
    *priority = 10;

    /* no part of OpenRTE allows or has threads */
    *allow_multi_user_threads = false;
    *have_hidden_threads = false;

    /* start the receive function */
    status = orte_errmgr_base_comm_start();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return NULL;
    }

    initialized = true;
    return &orte_errmgr_hnp;
}
/*
 * Finalize routine: stops the errmgr receive function and clears the
 * "initialized" flag. A failure to stop the receive is logged but not
 * propagated - the function always returns ORTE_SUCCESS.
 */
int orte_errmgr_hnp_finalize(void)
{
    int status;

    if (orte_errmgr_hnp_globals.debug) {
        opal_output(0, "[%lu,%lu,%lu] errmgr_hnp_finalize called",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
    }

    /* stop the receive function */
    status = orte_errmgr_base_comm_stop();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    initialized = false;

    /* All done */
    return ORTE_SUCCESS;
}

46
orte/mca/errmgr/orted/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,46 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Source files that make up the errmgr "orted" component; the same list
# is used for both the DSO and the static build below.
sources = \
errmgr_orted.h \
errmgr_orted_component.c \
errmgr_orted.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_errmgr_orted_DSO conditional.
if OMPI_BUILD_errmgr_orted_DSO
component_noinst =
component_install = mca_errmgr_orted.la
else
component_noinst = libmca_errmgr_orted.la
component_install =
endif
# DSO build: installed under $(libdir)/openmpi and linked against
# liborte and libopal.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_orted_la_SOURCES = $(sources)
mca_errmgr_orted_la_LDFLAGS = -module -avoid-version
mca_errmgr_orted_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
# Static build: built here but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_orted_la_SOURCES =$(sources)
libmca_errmgr_orted_la_LDFLAGS = -module -avoid-version

23
orte/mca/errmgr/orted/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# PARAM_INIT_FILE names a source file of this component
# (errmgr_orted_component.c is part of the component's source list).
PARAM_INIT_FILE=errmgr_orted_component.c
# Files this component asks configure to generate.
PARAM_CONFIG_FILES="Makefile"

192
orte/mca/errmgr/orted/errmgr_orted.c Обычный файл
Просмотреть файл

@ -0,0 +1,192 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <stdlib.h>
#include <stdarg.h>
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/mca/errmgr/orted/errmgr_orted.h"
/*
 * Proc-aborted callback. This only gets called on HNP components -
 * orteds learn about a proc aborting from the HNP - so the orted
 * version does nothing and reports ORTE_ERR_NOT_AVAILABLE.
 */
int orte_errmgr_orted_proc_aborted(orte_gpr_notify_message_t *msg)
{
    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * Incomplete-start callback. This only gets called on HNP components -
 * orteds learn about an incomplete start from the HNP - so the orted
 * version does nothing and reports ORTE_ERR_NOT_AVAILABLE.
 */
int orte_errmgr_orted_incomplete_start(orte_gpr_notify_message_t *msg)
{
    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * This function gets called when the orted itself detects an internal error!
 * At some point in future, to be polite, we tell any of our own local
 * processes to die before we abandon them.
 *
 * error_code  value handed to orte_abort()
 * fmt         optional printf-style format for a message to output
 *             first; may be NULL, in which case nothing is printed
 * ...         arguments consumed by fmt
 */
void orte_errmgr_orted_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        /* Only use the buffer when formatting succeeded: on failure its
         * contents are not guaranteed, so it must be neither printed
         * nor freed. */
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* The formatted text is data, not a format string - any '%'
             * in it must not be interpreted a second time. */
            opal_output(0, "%s", buffer);
            free(buffer);
        }
        va_end(arglist);
    }

    /* cleanup my session directory */
    orte_session_dir_finalize(orte_process_info.my_name);

    /* abnormal exit */
    orte_abort(error_code, false);
}
/*
 * Last-resort exit for the orted: called when we desperately need to
 * just die. The session directory is cleaned up first, then the process
 * leaves through orte_abort() with an exit status of -1.
 */
void orte_errmgr_orted_abort(void)
{
    /* cleanup my session directory */
    orte_session_dir_finalize(orte_process_info.my_name);

    /* abnormal exit */
    orte_abort(-1, false);
}
/*
 * Called by the orted to request that some set of processes be aborted
 * by the HNP. This would likely be an unusual request, as the orted has
 * no knowledge of other processes or real reason to order them killed;
 * the capability is provided anyway.
 *
 * The request buffer carries, in order: the ABORT_PROCS_REQUEST command
 * flag, nprocs, and the nprocs process names. It is sent to
 * orte_errmgr_orted_globals.replica, after which this function blocks
 * until an answer arrives and checks that the answer echoes the same
 * command flag.
 *
 * NOTE(review): both the send and the blocking receive use
 * ORTE_RML_TAG_RDS. That looks like it may have been carried over from
 * another framework - confirm which tag the HNP-side errmgr receive
 * listens on.
 *
 * Returns ORTE_ERR_BAD_PARAM if procs is NULL,
 * ORTE_ERR_OUT_OF_RESOURCE if a buffer cannot be created, the
 * pack/unpack error code if (un)packing fails, ORTE_ERR_COMM_FAILURE on
 * a send/receive failure or an unexpected answer, else ORTE_SUCCESS.
 */
int orte_errmgr_orted_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    orte_buffer_t *request;
    orte_buffer_t *reply;
    orte_errmgr_cmd_flag_t flag = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
    orte_std_cntr_t num_vals;
    int status;

    /* protect us against error */
    if (NULL == procs) {
        return ORTE_ERR_BAD_PARAM;
    }

    request = OBJ_NEW(orte_buffer_t);
    if (NULL == request) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* pack the command */
    status = orte_dss.pack(request, &flag, 1, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* pack the number of procs we are requesting be aborted */
    status = orte_dss.pack(request, &nprocs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* pack the array of proc names */
    status = orte_dss.pack(request, procs, nprocs, ORTE_NAME);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* send the request */
    if (0 > orte_rml.send_buffer(orte_errmgr_orted_globals.replica, request, ORTE_RML_TAG_RDS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(request);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(request);

    /* setup a buffer for the answer */
    reply = OBJ_NEW(orte_buffer_t);
    if (NULL == reply) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* enter a blocking receive until we hear back */
    if (0 > orte_rml.recv_buffer(orte_errmgr_orted_globals.replica, reply, ORTE_RML_TAG_RDS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(reply);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the answer starts with a single command flag */
    num_vals = 1;
    status = orte_dss.unpack(reply, &flag, &num_vals, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(reply);
        return status;
    }

    /* check that this is the right command */
    if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != flag) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(reply);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* clean up and leave */
    OBJ_RELEASE(reply);
    return ORTE_SUCCESS;
}
/*
 * Job registration. It is imperative that ONLY an HNP perform this
 * registration, so the orted version does nothing and reports
 * ORTE_ERR_NOT_AVAILABLE.
 */
int orte_errmgr_orted_register_job(orte_jobid_t job)
{
    return ORTE_ERR_NOT_AVAILABLE;
}

81
orte/mca/errmgr/orted/errmgr_orted.h Обычный файл
Просмотреть файл

@ -0,0 +1,81 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef ORTE_ERRMGR_ORTED_H
#define ORTE_ERRMGR_ORTED_H
#include "orte_config.h"
#include "orte/orte_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/errmgr/errmgr.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Module open / close
 */
int orte_errmgr_orted_open(void);
int orte_errmgr_orted_close(void);
/*
 * Startup / Shutdown
 */
orte_errmgr_base_module_t*
orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
int orte_errmgr_orted_finalize(void);
/*
 * globals used within the component
 */
typedef struct {
/* non-zero enables the component's debug output */
int debug;
/* process that abort requests are sent to and answers received from */
orte_process_name_t *replica;
} orte_errmgr_orted_globals_t;
extern orte_errmgr_orted_globals_t orte_errmgr_orted_globals;
/*
 * Component API functions
 */
/* HNP-only callback; the orted version returns ORTE_ERR_NOT_AVAILABLE */
int orte_errmgr_orted_proc_aborted(orte_gpr_notify_message_t *msg);
/* HNP-only callback; the orted version returns ORTE_ERR_NOT_AVAILABLE */
int orte_errmgr_orted_incomplete_start(orte_gpr_notify_message_t *msg);
/* output the optional printf-style message, clean up the session
 * directory, then orte_abort(error_code) */
void orte_errmgr_orted_error_detected(int error_code, char *fmt, ...);
/* last-resort abort: clean up the session directory and orte_abort(-1) */
void orte_errmgr_orted_abort(void);
/* HNP-only operation; the orted version returns ORTE_ERR_NOT_AVAILABLE */
int orte_errmgr_orted_register_job(orte_jobid_t job);
/* send an abort request for the given processes and wait for the answer */
int orte_errmgr_orted_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -0,0 +1,164 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open RTE Error Manager (errmgr) - orted (daemon) component
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_orted.h"
/*
 * Component descriptor for the errmgr "orted" component: version data
 * and open/close hooks, followed by the component's init and finalize
 * entry points. All initializers are positional.
 */
mca_errmgr_base_component_t mca_errmgr_orted_component = {
{
ORTE_ERRMGR_BASE_VERSION_1_3_0, /* errmgr framework version this component is written against */
"orted", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_errmgr_orted_open, /* component open */
orte_errmgr_orted_close /* component close */
},
{
false /* checkpoint / restart */
},
orte_errmgr_orted_component_init, /* component init - returns the module or NULL */
orte_errmgr_orted_finalize /* component shutdown */
};
/*
 * Function-pointer table for the orted module. The initializers are
 * positional, so their order must match the field order of
 * orte_errmgr_base_module_t. Logging uses the base implementation
 * (orte_errmgr_base_log); every other entry is orted-specific.
 */
static orte_errmgr_base_module_t orte_errmgr_orted = {
orte_errmgr_base_log,
orte_errmgr_orted_proc_aborted,
orte_errmgr_orted_incomplete_start,
orte_errmgr_orted_error_detected,
orte_errmgr_orted_register_job,
orte_errmgr_orted_abort,
orte_errmgr_orted_abort_procs_request
};
/*
 * Whether or not we allowed this component to be selected.
 * Set to true at the end of a successful component_init and back to
 * false in finalize; nothing in this file reads it.
 */
static bool initialized = false;
/* local globals - debug is filled in by orte_errmgr_orted_open(),
 * replica by orte_errmgr_orted_component_init() */
orte_errmgr_orted_globals_t orte_errmgr_orted_globals;
/*
 * Open the component.
 *
 * Registers the integer MCA parameter ("errmgr", "orted", "debug") with
 * a default of 0 and stores whether it is non-zero in
 * orte_errmgr_orted_globals.debug. Always returns ORTE_SUCCESS.
 */
int orte_errmgr_orted_open(void)
{
    int id;
    /* Start from the parameter's default so that a failed lookup (whose
     * return value is not examined) cannot leave us reading an
     * uninitialized value. */
    int value = 0;

    id = mca_base_param_register_int("errmgr", "orted", "debug", NULL, 0);
    mca_base_param_lookup_int(id, &value);

    if (value) {
        orte_errmgr_orted_globals.debug = true;
    } else {
        orte_errmgr_orted_globals.debug = false;
    }

    return ORTE_SUCCESS;
}
/*
 * Close the component. There is nothing to release, so this simply
 * reports success.
 */
int orte_errmgr_orted_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component init: decide whether this component should be used and, if
 * so, hand back its module.
 *
 * Returns NULL when this process is not a daemon
 * (orte_process_info.daemon is false). Otherwise sets *priority to 10,
 * reports no thread support through the two bool out-parameters,
 * records the NS replica as the process to talk to, and returns the
 * orted module.
 */
orte_errmgr_base_module_t*
orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
                                 int *priority)
{
    if (orte_errmgr_orted_globals.debug) {
        opal_output(0, "errmgr_orted_init called");
    }

    /* only a daemon may select this component */
    if (!orte_process_info.daemon) {
        return NULL;
    }

    /* an arbitrary, positive priority - it only matters relative to
     * the other components */
    *priority = 10;

    /* no part of OpenRTE allows or has threads */
    *allow_multi_user_threads = false;
    *have_hidden_threads = false;

    /* define the HNP we should be talking to - for now,
     * just use the NS replica */
    orte_errmgr_orted_globals.replica = orte_process_info.ns_replica;

    initialized = true;
    return &orte_errmgr_orted;
}
/*
 * Finalize routine: optionally emits a debug line, clears the
 * "initialized" flag and returns ORTE_SUCCESS.
 */
int orte_errmgr_orted_finalize(void)
{
    if (orte_errmgr_orted_globals.debug) {
        opal_output(0, "[%lu,%lu,%lu] errmgr_orted_finalize called",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
    }

    initialized = false;

    /* All done */
    return ORTE_SUCCESS;
}

46
orte/mca/errmgr/proxy/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,46 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Source files that make up the errmgr "proxy" component; the same list
# is used for both the DSO and the static build below.
sources = \
errmgr_proxy.h \
errmgr_proxy_component.c \
errmgr_proxy.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_errmgr_proxy_DSO conditional.
if OMPI_BUILD_errmgr_proxy_DSO
component_noinst =
component_install = mca_errmgr_proxy.la
else
component_noinst = libmca_errmgr_proxy.la
component_install =
endif
# DSO build: installed under $(libdir)/openmpi and linked against
# liborte and libopal.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_proxy_la_SOURCES = $(sources)
mca_errmgr_proxy_la_LDFLAGS = -module -avoid-version
mca_errmgr_proxy_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
# Static build: built here but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_proxy_la_SOURCES =$(sources)
libmca_errmgr_proxy_la_LDFLAGS = -module -avoid-version

23
orte/mca/errmgr/proxy/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# PARAM_INIT_FILE names a source file of this component
# (errmgr_proxy_component.c is part of the component's source list).
PARAM_INIT_FILE=errmgr_proxy_component.c
# Files this component asks configure to generate.
PARAM_CONFIG_FILES="Makefile"

187
orte/mca/errmgr/proxy/errmgr_proxy.c Обычный файл
Просмотреть файл

@ -0,0 +1,187 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/mca/errmgr/proxy/errmgr_proxy.h"
/*
 * Proc-aborted callback: meant to run when the SMR updates a process
 * state to indicate that it aborted. The proxy component is only active
 * on non-HNP processes, so this function will NEVER be called; it does
 * nothing and reports ORTE_ERR_NOT_AVAILABLE.
 */
int orte_errmgr_proxy_proc_aborted(orte_gpr_notify_message_t *msg)
{
    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * Incomplete-start callback: meant to run when the SMR updates a
 * process state to indicate that it failed to start. The proxy
 * component is only active on non-HNP processes, so this function will
 * NEVER be called; it does nothing and reports ORTE_ERR_NOT_AVAILABLE.
 */
int orte_errmgr_proxy_incomplete_start(orte_gpr_notify_message_t *msg)
{
    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * This function gets called when a process detects an internal error.
 * Various non-HNP/non-orted errmgr components will deal with this in various
 * ways - for now, we simply abort and provide the error_code as our
 * exit status.
 *
 * error_code  value handed to orte_abort()
 * fmt         optional printf-style format for a message to output
 *             first; may be NULL, in which case nothing is printed
 * ...         arguments consumed by fmt
 */
void orte_errmgr_proxy_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        /* Only use the buffer when formatting succeeded: on failure its
         * contents are not guaranteed, so it must be neither printed
         * nor freed. */
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* The formatted text is data, not a format string - any '%'
             * in it must not be interpreted a second time. */
            opal_output(0, "%s", buffer);
            free(buffer);
        }
        va_end(arglist);
    }

    orte_abort(error_code, true);
}
/*
 * This function gets called when a process desperately needs to just die.
 * Nothing can be done by definition here - this function ONLY gets
 * called as an absolute last resort.
 *
 * Defined with an explicit (void) parameter list so the definition is a
 * proper prototype and matches the declaration in errmgr_proxy.h.
 */
void orte_errmgr_proxy_abort(void)
{
    /* abnormal exit */
    orte_abort(-1, true);
}
/*
 * Some systems (e.g., OpenMPI) need to tell us to kill some other
 * subset of processes along with us. Send that info to the HNP so it
 * can kill them.
 *
 * The request buffer carries, in order: the ABORT_PROCS_REQUEST command
 * flag, nprocs, and the nprocs process names. It is sent to
 * orte_errmgr_proxy_globals.replica, after which this function blocks
 * until an answer arrives and checks that the answer echoes the same
 * command flag.
 *
 * NOTE: this function assumes that the underlying ORTE infrastructure is
 * still operational. Use of this function should therefore be restricted
 * to cases where the problem is in a higher layer (e.g., MPI) as the
 * process is likely to "hang" if an ORTE problem has been encountered.
 *
 * NOTE(review): both the send and the blocking receive use
 * ORTE_RML_TAG_RDS. That looks like it may have been carried over from
 * another framework - confirm which tag the HNP-side errmgr receive
 * listens on.
 *
 * Returns ORTE_ERR_BAD_PARAM if procs is NULL,
 * ORTE_ERR_OUT_OF_RESOURCE if a buffer cannot be created, the
 * pack/unpack error code if (un)packing fails, ORTE_ERR_COMM_FAILURE on
 * a send/receive failure or an unexpected answer, else ORTE_SUCCESS.
 */
int orte_errmgr_proxy_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    orte_buffer_t *request;
    orte_buffer_t *reply;
    orte_errmgr_cmd_flag_t flag = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
    orte_std_cntr_t num_vals;
    int status;

    /* protect us against error */
    if (NULL == procs) {
        return ORTE_ERR_BAD_PARAM;
    }

    request = OBJ_NEW(orte_buffer_t);
    if (NULL == request) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* pack the command */
    status = orte_dss.pack(request, &flag, 1, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* pack the number of procs we are requesting be aborted */
    status = orte_dss.pack(request, &nprocs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* pack the array of proc names */
    status = orte_dss.pack(request, procs, nprocs, ORTE_NAME);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(request);
        return status;
    }

    /* send the request */
    if (0 > orte_rml.send_buffer(orte_errmgr_proxy_globals.replica, request, ORTE_RML_TAG_RDS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(request);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(request);

    /* setup a buffer for the answer */
    reply = OBJ_NEW(orte_buffer_t);
    if (NULL == reply) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* enter a blocking receive until we hear back */
    if (0 > orte_rml.recv_buffer(orte_errmgr_proxy_globals.replica, reply, ORTE_RML_TAG_RDS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(reply);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the answer starts with a single command flag */
    num_vals = 1;
    status = orte_dss.unpack(reply, &flag, &num_vals, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(reply);
        return status;
    }

    /* check that this is the right command */
    if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != flag) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(reply);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* clean up and leave */
    OBJ_RELEASE(reply);
    return ORTE_SUCCESS;
}
/*
 * Job registration. It is imperative that ONLY an HNP perform this
 * registration, so the proxy version registers nothing. It reports
 * ORTE_SUCCESS rather than an error.
 */
int orte_errmgr_proxy_register_job(orte_jobid_t job)
{
    return ORTE_SUCCESS;
}

81
orte/mca/errmgr/proxy/errmgr_proxy.h Обычный файл
Просмотреть файл

@ -0,0 +1,81 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef ORTE_ERRMGR_PROXY_H
#define ORTE_ERRMGR_PROXY_H
#include "orte_config.h"
#include "orte/orte_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/errmgr/errmgr.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* Module open / close
*/
int orte_errmgr_proxy_open(void);
int orte_errmgr_proxy_close(void);
/*
* Startup / Shutdown
*/
orte_errmgr_base_module_t*
orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
int orte_errmgr_proxy_finalize(void);
/*
 * Globals used within the errmgr proxy component.
 */
typedef struct {
/* non-zero when debug output from this component is enabled */
int debug;
/* name of the process that error manager requests are sent to */
orte_process_name_t *replica;
} orte_errmgr_proxy_globals_t;
/* the single shared instance of the component globals */
extern orte_errmgr_proxy_globals_t orte_errmgr_proxy_globals;
/*
* Component API functions
*/
int orte_errmgr_proxy_proc_aborted(orte_gpr_notify_message_t *msg);
int orte_errmgr_proxy_incomplete_start(orte_gpr_notify_message_t *msg);
void orte_errmgr_proxy_error_detected(int error_code, char *fmt, ...);
void orte_errmgr_proxy_abort(void);
int orte_errmgr_proxy_register_job(orte_jobid_t job);
int orte_errmgr_proxy_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -0,0 +1,163 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The OpenRTE Error Manager - Proxy component
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_proxy.h"
/*
 * Public component structure for the errmgr proxy: version and name
 * information, the component open/close functions, component meta-data,
 * and the init/finalize entry points. The initializer is positional, so
 * the order of the entries below must not be changed.
 */
mca_errmgr_base_component_t mca_errmgr_proxy_component = {
{
/* errmgr framework version this component is written against */
ORTE_ERRMGR_BASE_VERSION_1_3_0,
"proxy", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_errmgr_proxy_open, /* module open */
orte_errmgr_proxy_close /* module close */
},
{
false /* checkpoint / restart */
},
orte_errmgr_proxy_component_init, /* module init */
orte_errmgr_proxy_finalize /* module shutdown */
};
/*
 * Function pointer table for the proxy module. This is the module that
 * orte_errmgr_proxy_component_init() hands back when the component agrees
 * to be used. Only the first entry uses the base implementation
 * (orte_errmgr_base_log); every other entry is a proxy-specific function.
 * The initializer is positional - keep the order as is.
 */
static orte_errmgr_base_module_t orte_errmgr_proxy = {
orte_errmgr_base_log,
orte_errmgr_proxy_proc_aborted,
orte_errmgr_proxy_incomplete_start,
orte_errmgr_proxy_error_detected,
orte_errmgr_proxy_register_job,
orte_errmgr_proxy_abort,
orte_errmgr_proxy_abort_procs_request
};
/*
* Whether or not we allowed this component to be selected
*/
static bool initialized = false;
/* local globals */
orte_errmgr_proxy_globals_t orte_errmgr_proxy_globals;
/*
 * Open the component.
 *
 * Registers the integer "debug" parameter of the errmgr "proxy" component
 * (default 0), looks up its current value and records the result as a
 * true/false flag in orte_errmgr_proxy_globals.debug.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_proxy_open(void)
{
    int id;
    /* start from the registered default so that a lookup that fails to
     * write a value cannot leave us testing an indeterminate variable
     */
    int tmp = 0;

    id = mca_base_param_register_int("errmgr", "proxy", "debug", NULL, 0);
    mca_base_param_lookup_int(id, &tmp);

    /* normalize whatever integer was supplied to a plain on/off flag */
    orte_errmgr_proxy_globals.debug = tmp ? true : false;

    return ORTE_SUCCESS;
}
/*
 * Close the component.
 *
 * Nothing is released or torn down here; the function exists to fill the
 * component's close slot and always returns ORTE_SUCCESS.
 */
int orte_errmgr_proxy_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component init function.
 *
 * Declines (returns NULL, leaving all output arguments untouched) when the
 * calling process is an HNP or an orted. Otherwise reports the component
 * priority and threading characteristics, records the replica that
 * requests will be sent to, and returns the proxy module.
 *
 * allow_multi_user_threads - out: set to false
 * have_hidden_threads      - out: set to false
 * priority                 - out: set to 10
 */
orte_errmgr_base_module_t*
orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
                                 int *priority)
{
    if (orte_errmgr_proxy_globals.debug) {
        opal_output(0, "errmgr_proxy_init called");
    }

    /* Only processes that are neither an HNP nor an orted may use us */
    if (!orte_process_info.seed && !orte_process_info.daemon) {
        /* An arbitrary, positive priority - it only matters relative
         * to the priorities of other components
         */
        *priority = 10;

        /* no part of OpenRTE allows or has threads */
        *allow_multi_user_threads = false;
        *have_hidden_threads = false;

        /* for now, the replica we talk to is simply the name service
         * replica
         */
        orte_errmgr_proxy_globals.replica = orte_process_info.ns_replica;

        initialized = true;
        return &orte_errmgr_proxy;
    }

    /* HNP or orted - don't take me! */
    return NULL;
}
/*
 * Finalize routine.
 *
 * Emits a trace line tagged with this process' name when debugging is
 * enabled, then marks the component as no longer initialized.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_errmgr_proxy_finalize(void)
{
    if (0 != orte_errmgr_proxy_globals.debug) {
        opal_output(0, "[%lu,%lu,%lu] errmgr_proxy_finalize called",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
    }

    /* nothing else to tear down - just drop the flag */
    initialized = false;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -278,6 +278,7 @@ int orte_gpr_replica_purge_subscriptions(orte_process_name_t *proc);
int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
orte_gpr_notify_message_t *msg,
char *sub_name,
orte_std_cntr_t cnt,
orte_gpr_value_t **values);

Просмотреть файл

@ -213,7 +213,7 @@ int orte_gpr_replica_register_callback(orte_gpr_replica_subscription_t *sub,
* subscription id, combining data where the id's match
*/
if (ORTE_SUCCESS != (rc = orte_gpr_replica_store_value_in_msg(reqs[i],
cb->message, cnt, values))) {
cb->message, sub->name, cnt, values))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
@ -436,6 +436,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type,
int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
orte_gpr_notify_message_t *msg,
char *sub_name,
orte_std_cntr_t cnt,
orte_gpr_value_t **values)
{
@ -482,6 +483,10 @@ int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* set the name of the subscription, if provided */
if (NULL != sub_name) {
dptr->target = strdup(sub_name);
}
dptr->id = req->idtag;
if (0 > orte_pointer_array_add(&index, msg->data, dptr)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -526,7 +531,7 @@ static int orte_gpr_replica_store_value_in_trigger_msg(orte_gpr_replica_subscrip
if (NULL != data[i]) {
k++;
if ((NULL == data[i]->target && NULL == sub) ||
(NULL != data[i]->target &&
(NULL != data[i]->target && NULL != sub->name &&
0 == strcmp(data[i]->target, sub->name))) { /* going to the same place */
for (j=0; j < cnt; j++) {
if (0 > orte_pointer_array_add(&index, data[i]->values, values[j])) {
@ -557,7 +562,7 @@ static int orte_gpr_replica_store_value_in_trigger_msg(orte_gpr_replica_subscrip
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL != sub) {
if (NULL != sub && NULL != sub->name) {
dptr->target = strdup(sub->name);
}
if (0 > orte_pointer_array_add(&index, msg->data, dptr)) {

Просмотреть файл

@ -42,28 +42,79 @@ int orte_ns_base_compare_name(orte_process_name_t *value1,
return ORTE_VALUE1_GREATER;
}
/* for this generic compare, go through the progression */
if (value1->cellid < value2->cellid) {
return ORTE_VALUE2_GREATER;
} else if (value1->cellid > value2->cellid) {
return ORTE_VALUE1_GREATER;
/** we have to take care of the special case where one of the
* values is ORTE_NAME_WILDCARD. If any of the fields are wildcard,
* then we want to just ignore that one field. However, in the case
* of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this
* would automatically result in ORTE_EQUAL for any name in the other
* value - a totally useless result.
*
* Instead, what we want to know in this case is if the value actually
* *is* ORTE_NAME_WILDCARD. So, we need to detect if one of the values
* is ORTE_NAME_WILDCARD, and then specifically check the other one
* to see if it matches
*/
if (value2->cellid == ORTE_CELLID_WILDCARD &&
value2->jobid == ORTE_JOBID_WILDCARD &&
value2->vpid == ORTE_VPID_WILDCARD) {
if (value1->cellid == ORTE_CELLID_WILDCARD &&
value1->jobid == ORTE_JOBID_WILDCARD &&
value1->vpid == ORTE_VPID_WILDCARD) {
return ORTE_EQUAL;
} else {
return ORTE_VALUE1_GREATER;
}
} else if (value1->cellid == ORTE_CELLID_WILDCARD &&
value1->jobid == ORTE_JOBID_WILDCARD &&
value1->vpid == ORTE_VPID_WILDCARD) {
if (value2->cellid == ORTE_CELLID_WILDCARD &&
value2->jobid == ORTE_JOBID_WILDCARD &&
value2->vpid == ORTE_VPID_WILDCARD) {
return ORTE_EQUAL;
} else {
return ORTE_VALUE2_GREATER;
}
}
/* get here if jobid's are equal - now check process group */
if (value1->jobid < value2->jobid) {
return ORTE_VALUE2_GREATER;
} else if (value1->jobid > value2->jobid) {
return ORTE_VALUE1_GREATER;
/** now that the special cases are done, go through the progression */
/** check the cellids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
if (value1->cellid != ORTE_CELLID_WILDCARD &&
value2->cellid != ORTE_CELLID_WILDCARD) {
if (value1->cellid < value2->cellid) {
return ORTE_VALUE2_GREATER;
} else if (value1->cellid > value2->cellid) {
return ORTE_VALUE1_GREATER;
}
}
/* get here if cellid's and jobid's are equal - now check vpid */
if (value1->vpid < value2->vpid) {
return ORTE_VALUE2_GREATER;
} else if (value1->vpid > value2->vpid) {
return ORTE_VALUE1_GREATER;
/** check the jobids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
if (value1->jobid != ORTE_JOBID_WILDCARD &&
value2->jobid != ORTE_JOBID_WILDCARD) {
if (value1->jobid < value2->jobid) {
return ORTE_VALUE2_GREATER;
} else if (value1->jobid > value2->jobid) {
return ORTE_VALUE1_GREATER;
}
}
/* only way to get here is if all fields are equal */
/** check the vpids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
if (value1->vpid != ORTE_VPID_WILDCARD &&
value2->vpid != ORTE_VPID_WILDCARD) {
if (value1->vpid < value2->vpid) {
return ORTE_VALUE2_GREATER;
} else if (value1->vpid > value2->vpid) {
return ORTE_VALUE1_GREATER;
}
}
/** only way to get here is if all fields are equal or WILDCARD */
return ORTE_EQUAL;
}
@ -72,10 +123,14 @@ int orte_ns_base_compare_vpid(orte_vpid_t *value1,
orte_data_type_t type)
{
/** if either value is WILDCARD, then return equal */
if (*value1 == ORTE_VPID_WILDCARD ||
*value2 == ORTE_VPID_WILDCARD) return ORTE_EQUAL;
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
return ORTE_EQUAL;
}
@ -83,10 +138,14 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
orte_jobid_t *value2,
orte_data_type_t type)
{
/** if either value is WILDCARD, then return equal */
if (*value1 == ORTE_JOBID_WILDCARD ||
*value2 == ORTE_JOBID_WILDCARD) return ORTE_EQUAL;
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
return ORTE_EQUAL;
}
@ -94,9 +153,13 @@ int orte_ns_base_compare_cellid(orte_cellid_t *value1,
orte_cellid_t *value2,
orte_data_type_t type)
{
/** if either value is WILDCARD, then return equal */
if (*value1 == ORTE_CELLID_WILDCARD ||
*value2 == ORTE_CELLID_WILDCARD) return ORTE_EQUAL;
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
return ORTE_EQUAL;
}

Просмотреть файл

@ -100,13 +100,13 @@ mca_ns_base_component_t mca_ns_base_selected_component;
/* constructor - used to initialize namelist instance */
static void orte_name_services_namelist_construct(orte_name_services_namelist_t* list)
static void orte_namelist_construct(orte_namelist_t* list)
{
list->name = NULL;
}
/* destructor - used to free any resources held by instance */
static void orte_name_services_namelist_destructor(orte_name_services_namelist_t* list)
static void orte_namelist_destructor(orte_namelist_t* list)
{
if (NULL != list->name) {
free(list->name);
@ -115,10 +115,10 @@ static void orte_name_services_namelist_destructor(orte_name_services_namelist_t
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_name_services_namelist_t, /* type name */
orte_namelist_t, /* type name */
opal_list_item_t, /* parent "class" name */
orte_name_services_namelist_construct, /* constructor */
orte_name_services_namelist_destructor); /* destructor */
orte_namelist_construct, /* constructor */
orte_namelist_destructor); /* destructor */

Просмотреть файл

@ -91,6 +91,20 @@ typedef struct orte_process_name_t orte_process_name_t;
#define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX
#define ORTE_VPID_MAX ORTE_STD_CNTR_MAX
/*
* define invalid values
*/
#define ORTE_CELLID_INVALID -999
#define ORTE_JOBID_INVALID -999
#define ORTE_VPID_INVALID -999
/*
* define wildcard values
*/
#define ORTE_CELLID_WILDCARD -1
#define ORTE_JOBID_WILDCARD -1
#define ORTE_VPID_WILDCARD -1
ORTE_DECLSPEC extern orte_process_name_t orte_name_all;
#define ORTE_NAME_ALL &orte_name_all
@ -117,13 +131,13 @@ ORTE_DECLSPEC extern orte_process_name_t orte_name_all;
/** List of names for general use
*/
struct orte_name_services_namelist_t {
struct orte_namelist_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
orte_process_name_t *name; /**< Name of a process */
};
typedef struct orte_name_services_namelist_t orte_name_services_namelist_t;
typedef struct orte_namelist_t orte_namelist_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_name_services_namelist_t);
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_namelist_t);
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -79,7 +79,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -202,7 +202,7 @@ int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
@ -803,7 +803,7 @@ int orte_ns_proxy_create_my_name(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -839,7 +839,7 @@ int orte_ns_proxy_dump_cells(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
@ -898,7 +898,7 @@ int orte_ns_proxy_dump_jobs(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
@ -947,7 +947,7 @@ int orte_ns_proxy_dump_tags(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
@ -1008,7 +1008,7 @@ int orte_ns_proxy_dump_datatypes(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);

42
orte/mca/odls/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_odls.la
libmca_odls_la_SOURCES =
# header setup
nobase_orte_HEADERS =
dist_pkgdata_DATA =
# local files
headers = odls.h odls_types.h
libmca_odls_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/odls
else
ortedir = $(includedir)
endif
include base/Makefile.am
# static-components.h is created by configure, so it has to be removed
# explicitly when the tree is distcleaned
distclean-local:
rm -f base/static-components.h

33
orte/mca/odls/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,33 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/odls_private.h \
base/base.h
libmca_odls_la_SOURCES += \
base/odls_base_close.c \
base/odls_base_open.c \
base/odls_base_select.c \
base/data_type_support/odls_compare_fns.c \
base/data_type_support/odls_copy_fns.c \
base/data_type_support/odls_packing_fns.c \
base/data_type_support/odls_print_fns.c \
base/data_type_support/odls_release_fns.c \
base/data_type_support/odls_size_fns.c \
base/data_type_support/odls_unpacking_fns.c

80
orte/mca/odls/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,80 @@
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_ODLS_BASE_H
#define MCA_ODLS_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "opal/class/opal_list.h"
#include "orte/mca/odls/odls.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * Struct to hold globals for the odls framework
 */
typedef struct orte_odls_base_t {
/* components are available - set by orte_odls_base_open(): false when the
 * process is not a daemon, true once the components have been opened */
bool components_available;
/* component has been selected - checked by orte_odls_base_close() before
 * the selected component's finalize function is called */
bool selected;
/** List of opened components */
opal_list_t available_components;
/** selected component (held by value; its finalize entry is used on close) */
orte_odls_base_component_t selected_component;
} orte_odls_base_t;
/**
* Global instance of odls-wide framework data
*/
ORTE_DECLSPEC extern orte_odls_base_t orte_odls_base;
/*
* Global functions for MCA overall collective open and close
*/
/**
* Open the odls framework
*/
ORTE_DECLSPEC int orte_odls_base_open(void);
/**
* Select an odls module
*/
ORTE_DECLSPEC int orte_odls_base_select(void);
/**
* Close the odls framework
*/
ORTE_DECLSPEC int orte_odls_base_finalize(void);
ORTE_DECLSPEC int orte_odls_base_close(void);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

31
orte/mca/odls/base/data_type_support/odls_compare_fns.c Исполняемый файл
Просмотреть файл

@ -0,0 +1,31 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD comparison
 *
 * Orders two daemon command flags by value and reports which one is the
 * larger, or ORTE_EQUAL when neither is. The type argument is not used.
 */
int orte_odls_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type)
{
    const orte_daemon_cmd_flag_t lhs = *value1;
    const orte_daemon_cmd_flag_t rhs = *value2;

    if (lhs > rhs) {
        return ORTE_VALUE1_GREATER;
    }
    if (lhs < rhs) {
        return ORTE_VALUE2_GREATER;
    }
    return ORTE_EQUAL;
}

40
orte/mca/odls/base/data_type_support/odls_copy_fns.c Исполняемый файл
Просмотреть файл

@ -0,0 +1,40 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD copy
 *
 * Allocates storage for one daemon command flag, duplicates *src into it
 * and hands the new storage back through *dest. If the allocation fails,
 * *dest is left NULL, the error is logged and ORTE_ERR_OUT_OF_RESOURCE is
 * returned. The type argument is not used.
 */
int orte_odls_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
{
    const size_t nbytes = sizeof(orte_daemon_cmd_flag_t);
    orte_daemon_cmd_flag_t *copy = (orte_daemon_cmd_flag_t*)malloc(nbytes);

    /* publish the result first so the caller sees NULL on failure */
    *dest = copy;

    if (NULL != copy) {
        memcpy(copy, src, nbytes);
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}

Просмотреть файл

@ -0,0 +1,42 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD pack
 *
 * Packs num_vals daemon command flags from src into buffer by delegating
 * to the packer for the underlying type, ORTE_DAEMON_CMD_T. A failure is
 * logged and its code handed back to the caller. The type argument is
 * not used.
 */
int orte_odls_pack_daemon_cmd(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals,
                              orte_data_type_t type)
{
    /* Turn around and pack the real type */
    int ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_DAEMON_CMD_T);

    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }

    return ret;
}

45
orte/mca/odls/base/data_type_support/odls_print_fns.c Исполняемый файл
Просмотреть файл

@ -0,0 +1,45 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD print
 *
 * Produces a newly allocated, human readable description of a daemon
 * command flag in *output. The caller owns the returned string and must
 * free() it.
 *
 * output - out: receives the allocated string (NULL if allocation fails)
 * prefix - text to place in front of the description; NULL means a
 *          single blank
 * src    - value to print; NULL is reported as "NULL pointer"
 * type   - not used
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the string could
 * not be allocated.
 */
int orte_odls_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
{
    /* deal with NULL prefix - point at a constant instead of allocating a
     * temporary copy, so there is nothing to release on any return path
     */
    const char *prefx = (NULL == prefix) ? " " : prefix;
    int rc;

    if (NULL == src) {
        /* if src is NULL, just print data type */
        rc = asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: NULL pointer", prefx);
    } else {
        rc = asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: %lu", prefx, (unsigned long) *src);
    }

    /* asprintf leaves *output undefined on failure - never hand that back */
    if (0 > rc) {
        *output = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,30 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/dss/dss_types.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * STANDARD RELEASE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
 *
 * Frees the storage referenced by value->data and leaves the field NULL
 * so that the stale pointer cannot be used again.
 */
void orte_odls_std_release(orte_data_value_t *value)
{
    void *payload = value->data;

    value->data = NULL;
    free(payload);
}

30
orte/mca/odls/base/data_type_support/odls_size_fns.c Исполняемый файл
Просмотреть файл

@ -0,0 +1,30 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD size
 *
 * Reports the number of bytes occupied by one daemon command flag. The
 * value pointed to by src is never read, and the type argument is not
 * used. Always returns ORTE_SUCCESS.
 */
int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
{
    /* sizeof does not evaluate its operand, so src may be anything */
    *size = sizeof *src;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,70 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_types.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * ORTE_DAEMON_CMD unpack
 *
 * Unpacks daemon command flags from buffer into dest. num_vals is passed
 * straight through to the underlying unpack routine. The type argument is
 * not used. Returns the code produced by the underlying unpack step; a
 * failure of the peek or of the direct unpack is logged before returning.
 */
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals,
orte_data_type_t type)
{
int ret;
orte_data_type_t remote_type;
/* if the buffer is fully described, then we can do some magic to handle
* the heterogeneous case. if not, then we can only shoot blind - it is the
* user's responsibility to ensure we are in a homogeneous environment.
*/
if (ORTE_DSS_BUFFER_FULLY_DESC == buffer->type) {
/* see what type was actually packed */
if (ORTE_SUCCESS != (ret = orte_dss_peek_type(buffer, &remote_type))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (remote_type == ORTE_DAEMON_CMD_T) {
/* fast path it if the sizes are the same */
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
ORTE_ERROR_LOG(ret);
}
} else {
/* slow path - types are different sizes */
/* NOTE(review): the macro is defined elsewhere; it presumably converts
* from remote_type into orte_daemon_cmd_flag_t and stores the result code
* in ret, and it may rely on the local names buffer, dest, num_vals and
* ret - confirm before renaming anything in this function
*/
UNPACK_SIZE_MISMATCH(orte_daemon_cmd_flag_t, remote_type, ret);
}
return ret;
}
/* if we get here, then this buffer is NOT fully described. just unpack it
* using the local size - user gets the pain if it's wrong
*/
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}

56
orte/mca/odls/base/odls_base_close.c Обычный файл
Просмотреть файл

@ -0,0 +1,56 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include "orte/orte_constants.h"
#include "opal/util/trace.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
/*
 * Close the odls framework.
 *
 * Does nothing when no components were made available. Otherwise the
 * selected component (if there is one) is finalized and all available
 * components are closed. Always returns ORTE_SUCCESS.
 */
int orte_odls_base_close(void)
{
    OPAL_TRACE(5);

    /* only do work if components were actually opened */
    if (orte_odls_base.components_available) {
        /* finalize the selected component, if one was chosen */
        if (orte_odls_base.selected) {
            orte_odls_base.selected_component.finalize();
        }

        /* Close all available components (only one in this case) */
        mca_base_components_close(orte_odls_globals.output,
                                  &orte_odls_base.available_components, NULL);
    }

    /* All done */
    return ORTE_SUCCESS;
}

114
orte/mca/odls/base/odls_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,114 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/odls/base/static-components.h"
/*
* Instantiate globals
*/
orte_odls_base_module_t orte_odls;
/*
* Framework global variables
*/
orte_odls_base_t orte_odls_base;
orte_odls_globals_t orte_odls_globals;
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 *
 * Steps performed:
 *  - register the "verbose" parameter of odls_base and open an output
 *    stream when it is non-zero (otherwise the output id is -1)
 *  - register the ORTE_DAEMON_CMD data type with the DSS
 *  - if this process is a daemon, open all available odls components
 *
 * Returns ORTE_SUCCESS, the code from a failed data type registration, or
 * ORTE_ERROR if the components could not be opened.
 */
int orte_odls_base_open(void)
{
    /* default to "not verbose" so that a registration that fails to fill
     * in the current value cannot leave us testing an indeterminate value
     */
    int value = 0;
    int rc;
    orte_data_type_t tmp;

    OPAL_TRACE(5);

    /* Debugging / verbose output - the parameter index that is returned
     * is not needed, only the current value
     */
    (void) mca_base_param_reg_int_name("odls_base", "verbose",
                                       "Verbosity level for the odls framework",
                                       false, false, 0, &value);
    if (value != 0) {
        orte_odls_globals.output = opal_output_open(NULL);
    } else {
        orte_odls_globals.output = -1;
    }

    /* register the daemon cmd data type */
    tmp = ORTE_DAEMON_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_odls_pack_daemon_cmd,
                                                     orte_odls_unpack_daemon_cmd,
                                                     (orte_dss_copy_fn_t)orte_odls_copy_daemon_cmd,
                                                     (orte_dss_compare_fn_t)orte_odls_compare_daemon_cmd,
                                                     (orte_dss_size_fn_t)orte_odls_size_daemon_cmd,
                                                     (orte_dss_print_fn_t)orte_odls_print_daemon_cmd,
                                                     (orte_dss_release_fn_t)orte_odls_std_release,
                                                     ORTE_DSS_UNSTRUCTURED,
                                                     "ORTE_DAEMON_CMD", &tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* if we are NOT a daemon, then that is ALL we do! We just needed to ensure
     * that the data type(s) got registered so we can send messages to the daemons
     */
    if (!orte_process_info.daemon) {
        orte_odls_base.components_available = false;
        return ORTE_SUCCESS;
    }

    /* Open up all available components */
    if (ORTE_SUCCESS !=
        mca_base_components_open("odls", orte_odls_globals.output,
                                 mca_odls_base_static_components,
                                 &orte_odls_base.available_components, true)) {
        return ORTE_ERROR;
    }
    orte_odls_base.components_available = true;

    /* All done */
    return ORTE_SUCCESS;
}

113
orte/mca/odls/base/odls_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,113 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/base.h"
/**
* Function for selecting one component from all those that are
* available.
*/
int orte_odls_base_select(void)
{
opal_list_item_t *item;
mca_base_component_list_item_t *cli;
orte_odls_base_component_t *component, *best_component = NULL;
orte_odls_base_module_t *module, *best_module = NULL;
int priority, best_priority = -1;
/* if no components are available (e.g., we are not in a daemon), then
* there is nothing to do - so just return
*/
if (!orte_odls_base.components_available) {
orte_odls_base.selected = false;
return ORTE_SUCCESS;
}
/* Iterate through all the available components */
for (item = opal_list_get_first(&orte_odls_base.available_components);
item != opal_list_get_end(&orte_odls_base.available_components);
item = opal_list_get_next(item)) {
cli = (mca_base_component_list_item_t *) item;
component = (orte_odls_base_component_t *) cli->cli_component;
/* Call the component's init function and see if it wants to be
selected */
module = component->init(&priority);
/* If we got a non-NULL module back, then the component wants to
be selected. So save its multi/hidden values and save the
module with the highest priority */
if (NULL != module) {
/* If this is the best one, save it */
if (priority > best_priority) {
/* If there was a previous best one, finalize */
if (NULL != best_component) {
best_component->finalize();
}
/* Save the new best one */
best_module = module;
best_component = component;
/* update the best priority */
best_priority = priority;
}
/* If it's not the best one, finalize it */
else {
component->finalize();
}
}
}
/* If we didn't find one to select, then we have a big problem */
if (NULL == best_component) {
orte_odls_base.selected = false;
return ORTE_ERROR;
}
/* We have happiness -- save the component and module for later
usage */
orte_odls = *best_module;
orte_odls_base.selected_component = *best_component;
orte_odls_base.selected = true;
/* all done */
return ORTE_SUCCESS;
}

80
orte/mca/odls/base/odls_private.h Обычный файл
Просмотреть файл

@ -0,0 +1,80 @@
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_ODLS_PRIVATE_H
#define MCA_ODLS_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/dss/dss_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/rmgr/rmgr_types.h"
#include "orte/mca/odls/odls_types.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* General ODLS types
*/
/**
 * Framework-wide settings for the odls framework.
 *
 * Note: the extern "C" block opened earlier in this header already covers
 * this declaration; it must not be opened a second time, because the
 * header closes it only once.
 */
typedef struct orte_odls_globals_t {
    /** Verbose/debug output stream */
    int output;
    /** Time to allow process to forcibly die */
    int timeout_before_sigkill;
} orte_odls_globals_t;
extern orte_odls_globals_t orte_odls_globals;
/*
* Data type functions for ORTE_DAEMON_CMD.
*
* orte_odls_base_open() hands these to orte_dss.register_type() as the
* pack, unpack, copy, compare, size, print and release handlers of that
* type (several of them through casts to the corresponding orte_dss_*_fn_t
* type).
*/
int orte_odls_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type);
int orte_odls_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
int orte_odls_pack_daemon_cmd(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_odls_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
void orte_odls_std_release(orte_data_value_t *value);
int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -9,6 +9,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -16,37 +17,35 @@
# $HEADER$
#
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(pls_bproc_orted_CPPFLAGS)
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(odls_bproc_CPPFLAGS)
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_pls_bproc_orted_DSO
if OMPI_BUILD_odls_bproc_DSO
component_noinst =
component_install = mca_pls_bproc_orted.la
component_install = mca_odls_bproc.la
else
component_noinst = libmca_pls_bproc_orted.la
component_noinst = libmca_odls_bproc.la
component_install =
endif
sources = \
pls_bproc_orted.h \
pls_bproc_orted.c \
pls_bproc_orted_component.c
odls_bproc.h \
odls_bproc.c \
odls_bproc_component.c
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_bproc_orted_la_SOURCES = $(sources)
mca_pls_bproc_orted_la_LIBADD = \
$(pls_bproc_orted_LIBS) \
mca_odls_bproc_la_SOURCES = $(sources)
mca_odls_bproc_la_LIBADD = \
$(odls_bproc_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_pls_bproc_orted_la_LDFLAGS = -module -avoid-version $(pls_bproc_orted_LDFLAGS)
mca_odls_bproc_la_LDFLAGS = -module -avoid-version $(odls_bproc_LDFLAGS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_bproc_orted_la_SOURCES = $(sources)
libmca_pls_bproc_orted_la_LIBADD = $(pls_bproc_orted_LIBS)
libmca_pls_bproc_orted_la_LDFLAGS = -module -avoid-version $(pls_bproc_orted_LDFLAGS)
libmca_odls_bproc_la_SOURCES = $(sources)
libmca_odls_bproc_la_LIBADD = $(odls_bproc_LIBS)
libmca_odls_bproc_la_LDFLAGS = -module -avoid-version $(odls_bproc_LDFLAGS)

38
orte/mca/odls/bproc/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_odls_bproc_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_odls_bproc_CONFIG],[
OMPI_CHECK_BPROC([odls_bproc], [odls_bproc_good=1],
[odls_bproc_good=1], [odls_bproc_good=0])
# Two of the three outcomes above set odls_bproc_good to 1 and
# only the last one sets it to 0.
# If the check worked then export the wrapper flags and run the
# caller-supplied success action. Otherwise run the failure action.
AS_IF([test "$odls_bproc_good" = "1"],
[odls_bproc_WRAPPER_EXTRA_LDFLAGS="$odls_bproc_LDFLAGS"
odls_bproc_WRAPPER_EXTRA_LIBS="$odls_bproc_LIBS"
$1],
[$2])
# Substitute the build flags so that the component Makefile can use them
AC_SUBST([odls_bproc_CPPFLAGS])
AC_SUBST([odls_bproc_LDFLAGS])
AC_SUBST([odls_bproc_LIBS])
])dnl

23
orte/mca/odls/bproc/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,23 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=odls_bproc_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -18,7 +18,7 @@
/**
* @file:
* Part of the bproc launcher.
* See pls_bproc_orted.h for an overview of how it works.
* See odls_bproc.h for an overview of how it works.
*/
#include "orte_config.h"
#include <stdlib.h>
@ -44,35 +44,32 @@
#include "orte/mca/iof/base/iof_base_setup.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/session_dir.h"
#include "orte/util/univ_info.h"
#include "pls_bproc_orted.h"
#include "odls_bproc.h"
/**
* Initialization of the bproc_orted module with all the needed function pointers
*/
orte_pls_base_module_1_0_0_t orte_pls_bproc_orted_module = {
orte_pls_bproc_orted_launch,
orte_pls_bproc_orted_terminate_job,
orte_pls_bproc_orted_terminate_proc,
orte_pls_bproc_orted_signal_job,
orte_pls_bproc_orted_signal_proc,
orte_pls_bproc_orted_finalize
orte_odls_base_module_t orte_odls_bproc_module = {
orte_odls_bproc_subscribe_launch_data,
orte_odls_bproc_launch_local_procs,
orte_odls_bproc_kill_local_procs,
orte_odls_bproc_signal_local_procs
};
static int pls_bproc_orted_make_dir(char *directory);
static char * pls_bproc_orted_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
size_t app_context);
static void pls_bproc_orted_delete_dir_tree(char * path);
static int pls_bproc_orted_remove_dir(void);
static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
static int odls_bproc_make_dir(char *directory);
static char * odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
orte_std_cntr_t app_context);
static void odls_bproc_delete_dir_tree(char * path);
static int odls_bproc_remove_dir(void);
static void odls_bproc_send_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata);
static int pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name,
static int odls_bproc_setup_stdio(orte_process_name_t *proc_name,
int proc_rank, orte_jobid_t jobid,
size_t app_context, bool connect_stdin);
orte_std_cntr_t app_context, bool connect_stdin);
/**
@ -83,13 +80,13 @@ static int pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name,
* @retval error
*/
static int
pls_bproc_orted_make_dir(char *directory)
odls_bproc_make_dir(char *directory)
{
struct stat buf;
mode_t my_mode = S_IRWXU; /* at the least, I need to be able to do anything */
if (0 == stat(directory, &buf)) { /* exists - delete it and its contents */
pls_bproc_orted_delete_dir_tree(directory);
odls_bproc_delete_dir_tree(directory);
}
/* try to create it with proper mode */
return(opal_os_dirpath_create(directory, my_mode));
@ -108,8 +105,8 @@ pls_bproc_orted_make_dir(char *directory)
* @retval path
*/
static char *
pls_bproc_orted_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
size_t app_context)
odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
orte_std_cntr_t app_context)
{
char *path = NULL, *user = NULL, *job = NULL;
int rc;
@ -141,6 +138,9 @@ static char *
ORTE_ERROR_LOG(ORTE_ERROR);
path = NULL;
}
if(0 < mca_odls_bproc_component.debug) {
opal_output(0, "odls bproc io setup. Path: %s\n", path);
}
free(user);
free(job);
return path;
@ -152,7 +152,7 @@ static char *
* @param path the path to the base directory to delete
*/
static void
pls_bproc_orted_delete_dir_tree(char * path)
odls_bproc_delete_dir_tree(char * path)
{
DIR *dp;
struct dirent *ep;
@ -170,7 +170,7 @@ pls_bproc_orted_delete_dir_tree(char * path)
filenm = opal_os_path(false, path, ep->d_name, NULL);
ret = stat(filenm, &buf);
if (ret < 0 || S_ISDIR(buf.st_mode)) {
pls_bproc_orted_delete_dir_tree(filenm);
odls_bproc_delete_dir_tree(filenm);
free(filenm);
continue;
}
@ -190,7 +190,7 @@ pls_bproc_orted_delete_dir_tree(char * path)
* @retval error
*/
static int
pls_bproc_orted_remove_dir()
odls_bproc_remove_dir()
{
char *frontend = NULL, *user = NULL, *filename = NULL;
int id;
@ -213,7 +213,7 @@ pls_bproc_orted_remove_dir()
return ORTE_ERROR;
}
/* we do our best to clean up the directory tree, but we ignore errors*/
pls_bproc_orted_delete_dir_tree(frontend);
odls_bproc_delete_dir_tree(frontend);
free(frontend);
return ORTE_SUCCESS;
}
@ -228,7 +228,7 @@ pls_bproc_orted_remove_dir()
* @param cbdata
*/
static void
pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
odls_bproc_send_cb(int status, orte_process_name_t * peer,
orte_buffer_t* buffer, int tag, void* cbdata)
{
OBJ_RELEASE(buffer);
@ -257,9 +257,9 @@ pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
* @retval error
*/
static int
pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
odls_bproc_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
orte_jobid_t jobid,
size_t app_context, bool connect_stdin)
orte_std_cntr_t app_context, bool connect_stdin)
{
char *path_prefix, *fd_link_path = NULL;
int rc = ORTE_SUCCESS, fd;
@ -269,7 +269,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
struct termios term_attrs;
#endif
path_prefix = pls_bproc_orted_get_base_dir_name(proc_rank, jobid, app_context);
path_prefix = odls_bproc_get_base_dir_name(proc_rank, jobid, (size_t)app_context);
if (NULL == path_prefix) {
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
@ -277,7 +277,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
}
/* check for existence and access, or create it */
if (ORTE_SUCCESS != (rc = pls_bproc_orted_make_dir(path_prefix))) {
if (ORTE_SUCCESS != (rc = odls_bproc_make_dir(path_prefix))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -294,7 +294,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
if (connect_stdin) {
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
perror("pls_bproc_orted mkfifo failed");
perror("odls_bproc mkfifo failed");
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -302,7 +302,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
fd = open(fd_link_path, O_RDWR);
if (-1 == fd) {
perror("pls_bproc_orted open failed");
perror("odls_bproc open failed");
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -312,7 +312,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
ORTE_IOF_STDIN, fd);
} else {
if(0 != symlink("/dev/null", fd_link_path)) {
perror("pls_bproc_orted could not create symlink");
perror("odls_bproc could not create symlink");
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -332,7 +332,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
#if defined(HAVE_OPENPTY) && (OMPI_ENABLE_PTY_SUPPORT != 0)
if (0 != openpty(&amaster, &aslave, pty_name, NULL, NULL)) {
opal_output(0, "pls_bproc_orted: openpty failed, using pipes instead");
opal_output(0, "odls_bproc: openpty failed, using pipes instead");
goto stdout_fifo_setup;
}
@ -366,14 +366,14 @@ stdout_fifo_setup:
#endif
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
perror("pls_bproc_orted mkfifo failed");
perror("odls_bproc mkfifo failed");
rc = ORTE_ERROR;
goto cleanup;
}
fd = open(fd_link_path, O_RDWR);
if (-1 == fd) {
perror("pls_bproc_orted open failed");
perror("odls_bproc open failed");
rc = ORTE_ERROR;
goto cleanup;
}
@ -395,14 +395,14 @@ stderr_fifo_setup:
}
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
perror("pls_bproc_orted mkfifo failed");
perror("odls_bproc mkfifo failed");
rc = ORTE_ERROR;
goto cleanup;
}
fd = open(fd_link_path, O_RDWR);
if (-1 == fd) {
perror("pls_bproc_orted open failed");
perror("odls_bproc open failed");
rc = ORTE_ERROR;
goto cleanup;
}
@ -421,28 +421,118 @@ cleanup:
}
/* This entire function gets called within a GPR compound command,
 * so the subscription actually doesn't get done until the orted
 * executes the compound command.
 *
 * It builds one subscription on the job's segment asking for the proc
 * name, app_context index and node name keys, attaches it to the standard
 * orted launch stage gate trigger for the job, and registers cbfunc as the
 * callback. Every string and object created along the way is released
 * before returning, on both the success and the error paths.
 *
 * Returns ORTE_SUCCESS, or the error code of the first call that failed.
 */
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc)
{
    char *job_segment;
    orte_gpr_value_t *value_list[1];
    orte_gpr_subscription_t subscription = ORTE_GPR_SUBSCRIPTION_EMPTY;
    orte_gpr_subscription_t *sub_ptr = &subscription;
    orte_gpr_trigger_t trigger = ORTE_GPR_TRIGGER_EMPTY;
    orte_gpr_trigger_t *trig_ptr = &trigger;
    char *wanted_keys[] = {
        ORTE_PROC_NAME_KEY,
        ORTE_PROC_APP_CONTEXT_KEY,
        ORTE_NODE_NAME_KEY
    };
    const int key_count = (int) (sizeof(wanted_keys) / sizeof(wanted_keys[0]));
    int k, rc;

    /* get the job segment name */
    rc = orte_schema.get_job_segment_name(&job_segment, job);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* attach ourselves to the "standard" orted trigger */
    rc = orte_schema.get_std_trigger_name(&(trigger.name),
                                          ORTED_LAUNCH_STAGE_GATE_TRIGGER, job);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto free_segment;
    }

    /* ask for return of all data required for launching local processes */
    subscription.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
    rc = orte_schema.get_std_subscription_name(&(subscription.name),
                                               ORTED_LAUNCH_STG_SUB, job);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto free_trigger_name;
    }

    subscription.cnt = 1;
    subscription.values = value_list;
    rc = orte_gpr.create_value(&(value_list[0]),
                               ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR,
                               job_segment, key_count, 0);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto free_sub_name;
    }

    /* one typeless keyval per key we want reported back */
    for (k = 0; k < key_count; k++) {
        rc = orte_gpr.create_keyval(&(value_list[0]->keyvals[k]),
                                    wanted_keys[k], ORTE_UNDEF, NULL);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto release_value;
        }
    }

    subscription.cbfunc = cbfunc;

    /* do the subscription; a failure is logged and reported via rc */
    rc = orte_gpr.subscribe(1, &sub_ptr, 1, &trig_ptr);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

release_value:
    OBJ_RELEASE(value_list[0]);
free_sub_name:
    free(subscription.name);
free_trigger_name:
    free(trigger.name);
free_segment:
    free(job_segment);
    return rc;
}
/**
* Setup io for the current node, then tell orterun we are ready for the actual
* processes.
* @param jobid The jobid of the job to launch
* @retval ORTE_SUCCESS
* @retval error
*/
int
orte_pls_bproc_orted_launch(orte_jobid_t jobid)
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data)
{
opal_list_t map;
orte_rmaps_base_map_t * mapping;
orte_rmaps_base_proc_t * proc;
odls_bproc_child_t *child;
opal_list_item_t* item;
orte_gpr_value_t *value, **values;
orte_gpr_keyval_t *kval;
char *node_name;
int rc;
int num_procs = 0;
size_t i;
orte_std_cntr_t i, j, kv, kv2, *sptr;
int src = 0;
orte_buffer_t *ack;
char * param;
bool connect_stdin;
char * pty_name = NULL;
orte_jobid_t jobid;
/* first, retrieve the job number we are to launch from the
* returned data - we can extract the jobid directly from the
* subscription name we created
*/
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, data->target))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/**
* hack for bproc4, change process group so that we do not receive signals
@ -451,55 +541,87 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
*/
setpgid(0,0);
/* get current node number */
rc = bproc_currnode();
if(0 > rc) {
opal_output(0, "pls_bproc_orted component running on invalid node");
}
if(0 > asprintf(&param, "%d", rc)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto cleanup;
}
/* query the allocation for this node */
OBJ_CONSTRUCT(&map, opal_list_t);
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid, jobid,
param, &map);
free(param);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
/* loop through the returned data to find the global info and
* the info for processes going onto this node
*/
values = (orte_gpr_value_t**)(data->values)->addr;
for (j=0, i=0; i < data->cnt && j < (data->values)->size; j++) { /* loop through all returned values */
if (NULL != values[j]) {
i++;
value = values[j];
/* this must have come from one of the process containers, so it must
* contain data for a proc structure - see if it belongs to this node
*/
for (kv=0; kv < value->cnt; kv++) {
kval = value->keyvals[kv];
if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) {
/* Most C-compilers will bark if we try to directly compare the string in the
* kval data area against a regular string, so we need to "get" the data
* so we can access it */
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if this is our node...must also protect against a zero-length string */
if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
/* ...harvest the info into a new child structure */
child = OBJ_NEW(odls_bproc_child_t);
for (kv2 = 0; kv2 < value->cnt; kv2++) {
kval = value->keyvals[kv2];
if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) {
/* copy the name into the child object */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}
continue;
}
if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
child->app_idx = *sptr; /* save the index into the app_context objects */
continue;
}
} /* kv2 */
/* protect operation on the global list of children */
OPAL_THREAD_LOCK(&mca_odls_bproc_component.mutex);
opal_list_append(&mca_odls_bproc_component.children, &child->super);
opal_condition_signal(&mca_odls_bproc_component.cond);
OPAL_THREAD_UNLOCK(&mca_odls_bproc_component.mutex);
}
}
} /* for kv */
} /* for j */
}
/* figure out what processes will be on this node and set up the io files */
for(item = opal_list_get_first(&map);
item != opal_list_get_end(&map);
/* set up the io files for our children */
for(item = opal_list_get_first(&mca_odls_bproc_component.children);
item != opal_list_get_end(&mca_odls_bproc_component.children);
item = opal_list_get_next(item)) {
mapping = (orte_rmaps_base_map_t *) item;
num_procs = 0;
for(i = mapping->num_procs; i > 0; i--) {
proc = mapping->procs[i - 1];
if(0 < mca_pls_bproc_orted_component.debug) {
opal_output(0, "orte_pls_bproc_orted_launch: setting up io for "
"[%lu,%lu,%lu] proc rank %lu\n",
ORTE_NAME_ARGS((&proc->proc_name)),
proc->proc_rank);
}
/* only setup to forward stdin if it is rank 0, otherwise connect
* to /dev/null */
if(0 == proc->proc_rank) {
connect_stdin = true;
} else {
connect_stdin = false;
}
child = (odls_bproc_child_t *) item;
if(0 < mca_odls_bproc_component.debug) {
opal_output(0, "orte_odls_bproc_launch: setting up io for "
"[%lu,%lu,%lu] proc rank %lu\n",
ORTE_NAME_ARGS((child->name)),
child->name->vpid);
}
/* only setup to forward stdin if it is rank 0, otherwise connect
* to /dev/null */
if(0 == child->name->vpid) {
connect_stdin = true;
} else {
connect_stdin = false;
}
rc = pls_bproc_orted_setup_stdio(&proc->proc_name, num_procs,
jobid, mapping->app->idx,
connect_stdin);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
num_procs++;
rc = odls_bproc_setup_stdio(child->name, (int)child->name->vpid,
jobid, child->app_idx,
connect_stdin);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
@ -509,8 +631,8 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
rc = mca_oob_send_packed_nb(MCA_OOB_NAME_SEED, ack, MCA_OOB_TAG_BPROC, 0,
pls_bproc_orted_send_cb, NULL);
rc = mca_oob_send_packed_nb(ORTE_RML_NAME_SEED, ack, ORTE_RML_TAG_BPROC, 0,
odls_bproc_send_cb, NULL);
if (0 > rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -518,13 +640,7 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
rc = ORTE_SUCCESS;
cleanup:
while(NULL != (item = opal_list_remove_first(&map))) {
OBJ_RELEASE(item);
}
if(NULL != pty_name) {
free(pty_name);
}
OBJ_DESTRUCT(&map);
return rc;
}
@ -532,37 +648,8 @@ cleanup:
* Function to terminate a job. Since this component only runs on remote nodes
* and doesn't actually launch any processes, this function is not needed
* so is a noop.
* @param jobid The job to terminate
* @retval ORTE_SUCCESS
*/
int orte_pls_bproc_orted_terminate_job(orte_jobid_t jobid)
{
orte_iof.iof_flush();
return ORTE_SUCCESS;
}
/**
* Function to terminate a process. Since this component only runs on remote nodes
* and doesn't actually launch any processes, this function is not needed
* so is a noop.
* @param proc the process's name
* @retval ORTE_SUCCESS
*/
int orte_pls_bproc_orted_terminate_proc(const orte_process_name_t* proc)
{
orte_iof.iof_flush();
return ORTE_SUCCESS;
}
/**
* Function to signal a job. Since this component only runs on remote nodes
* and doesn't actually launch any processes, this function is not needed
* so is a noop.
* @param jobid The job to signal
* @param signal The signal to send
* @retval ORTE_SUCCESS
*/
int orte_pls_bproc_orted_signal_job(orte_jobid_t jobid, int32_t signal)
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state)
{
orte_iof.iof_flush();
return ORTE_SUCCESS;
@ -576,7 +663,7 @@ int orte_pls_bproc_orted_signal_job(orte_jobid_t jobid, int32_t signal)
* @param signal The signal to send
* @retval ORTE_SUCCESS
*/
int orte_pls_bproc_orted_signal_proc(const orte_process_name_t* proc, int32_t signal)
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc, int32_t signal)
{
orte_iof.iof_flush();
return ORTE_SUCCESS;
@ -584,14 +671,14 @@ int orte_pls_bproc_orted_signal_proc(const orte_process_name_t* proc, int32_t si
/**
* Finalizes the bproc_orted module. Cleanup tmp directory/files
* Finalizes the bproc module. Cleanup tmp directory/files
* used for I/O forwarding.
* @retval ORTE_SUCCESS
*/
int orte_pls_bproc_orted_finalize(void)
int orte_odls_bproc_finalize(void)
{
orte_iof.iof_flush();
pls_bproc_orted_remove_dir();
odls_bproc_remove_dir();
orte_session_dir_finalize(orte_process_info.my_name);
return ORTE_SUCCESS;
}

108
orte/mca/odls/bproc/odls_bproc.h Обычный файл
Просмотреть файл

@ -0,0 +1,108 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
* Part of the bproc launching system. This launching system is broken into 2
* parts: one runs under the PLS on the head node to launch the orteds, and the
* other serves as the orted's local launcher.
*
* The main job of this component is to setup ptys/pipes for IO forwarding.
* See pls_bproc.h for an overview of how the entire bproc launching system works.
*/
#ifndef ORTE_ODLS_BPROC_H_
#define ORTE_ODLS_BPROC_H_
#include "orte_config.h"
#include <sys/bproc.h>
#include "opal/mca/mca.h"
#include "opal/threads/condition.h"
#include "orte/mca/odls/odls.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Component open / close and module query
 */
int orte_odls_bproc_component_open(void);
int orte_odls_bproc_component_close(void);
orte_odls_base_module_t* orte_odls_bproc_init(int *priority);

/*
 * Startup / Shutdown
 *
 * Declared once only; this is the function the component structure
 * lists as its finalize entry point.
 */
int orte_odls_bproc_finalize(void);

/*
 * Interface: the four entry points placed in orte_odls_bproc_module
 */
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data);
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc_name, int32_t signal);
/**
* ODLS bproc component
*
* NOTE(review): odls_bproc.c locks and unlocks
* mca_odls_bproc_component.mutex around its updates of the children list,
* whereas the mutex member declared here is named lock - confirm which
* name is intended and make the two agree.
*/
struct orte_odls_bproc_component_t {
orte_odls_base_component_t super;
/**< The base class */
int debug;
/**< If greater than 0 print debugging information */
int priority;
/**< The priority of this component. This will be returned if we determine
* that bproc is available and running on this node. */
opal_mutex_t lock;
/**< Lock used to prevent some race conditions */
opal_condition_t cond;
/**< Condition used to wake up waiting threads */
opal_list_t children;
/**< List of children (odls_bproc_child_t items) on this node */
};
/**
* Convenience typedef
*/
typedef struct orte_odls_bproc_component_t orte_odls_bproc_component_t;
/*
* List object to locally store the process names and pids of
* our children. This can subsequently be used to order termination
* or pass signals without looking the info up again.
*
* Items of this type are appended to mca_odls_bproc_component.children.
* The class destructor releases name with free().
*/
typedef struct odls_bproc_child_t {
opal_list_item_t super; /* required to place this on a list */
orte_process_name_t *name; /* the OpenRTE name of the proc */
pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */
} odls_bproc_child_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_bproc_child_t);
ORTE_DECLSPEC orte_odls_bproc_component_t mca_odls_bproc_component;
ORTE_DECLSPEC orte_odls_base_module_t orte_odls_bproc_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_ODLS_BPROC_H_ */

Просмотреть файл

@ -21,32 +21,51 @@
* Takes care of the component stuff for the MCA.
*/
#include "orte_config.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/orte_constants.h"
#include "orte/mca/pls/pls.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "pls_bproc_orted.h"
#include "orte/mca/odls/odls.h"
#include "odls_bproc.h"
/* instance the child list object */
/**
 * Constructor for odls_bproc_child_t: puts every field of a freshly
 * created child record into a known state.
 *
 * @param ptr the child object being constructed
 */
static void odls_bproc_child_constructor(odls_bproc_child_t *ptr)
{
    ptr->name = NULL;
    /* pid was previously left unset; give it a defined value so a record
     * never carries whatever happened to be in memory */
    ptr->pid = 0;
    ptr->app_idx = -1;
    ptr->alive = false;
}
/*
 * Destructor for odls_bproc_child_t: gives back the memory holding the
 * process name, if a name was ever attached to this child record.
 */
static void odls_bproc_child_destructor(odls_bproc_child_t *ptr)
{
    if (NULL == ptr->name) {
        return;     /* nothing was attached, nothing to release */
    }
    free(ptr->name);
}
OBJ_CLASS_INSTANCE(odls_bproc_child_t,
opal_list_item_t,
odls_bproc_child_constructor,
odls_bproc_child_destructor);
/**
* The bproc_orted component data structure used to store all the relevent data
* The bproc component data structure used to store all the relevant data
* about this component.
*/
orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
orte_odls_bproc_component_t mca_odls_bproc_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a pls v1.0.0 component (which also
/* Indicate that we are a odls v1.3.0 component (which also
implies a specific MCA version) */
ORTE_PLS_BASE_VERSION_1_0_0,
ORTE_ODLS_BASE_VERSION_1_3_0,
/* Component name and version */
"bproc_orted",
"bproc",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_pls_bproc_orted_component_open,
orte_pls_bproc_orted_component_close
orte_odls_bproc_component_open,
orte_odls_bproc_component_close
},
/* Next the MCA v1.0.0 component meta data */
{
@ -54,7 +73,8 @@ orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
false
},
/* Initialization / querying functions */
orte_pls_bproc_orted_init
orte_odls_bproc_init,
orte_odls_bproc_finalize
}
};
@ -62,18 +82,20 @@ orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
* Opens the odls_bproc component, setting all the needed mca parameters and
* finishes setting up the component struct.
*/
int orte_pls_bproc_orted_component_open(void)
int orte_odls_bproc_component_open(void)
{
/* initialize globals */
OBJ_CONSTRUCT(&mca_pls_bproc_orted_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_odls_bproc_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_odls_bproc_component.cond, opal_condition_t);
OBJ_CONSTRUCT(&mca_odls_bproc_component.children, opal_list_t);
/* lookup parameters */
mca_base_param_reg_int(&mca_pls_bproc_orted_component.super.pls_version,
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
"priority", NULL, false, false, 100,
&mca_pls_bproc_orted_component.priority);
mca_base_param_reg_int(&mca_pls_bproc_orted_component.super.pls_version,
&mca_odls_bproc_component.priority);
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
"debug", "If > 0 prints library debugging information",
false, false, 0, &mca_pls_bproc_orted_component.debug);
false, false, 0, &mca_odls_bproc_component.debug);
return ORTE_SUCCESS;
}
@ -81,16 +103,16 @@ int orte_pls_bproc_orted_component_open(void)
* Initializes the module. We do not want to run unless we are not the seed,
* bproc is running, and we are not on the master node.
*/
orte_pls_base_module_t *orte_pls_bproc_orted_init(int *priority)
orte_odls_base_module_t *orte_odls_bproc_init(int *priority)
{
int ret;
struct bproc_version_t version;
/* are we the seed */
if(orte_process_info.seed == true)
return NULL;
/* okay, we are in a daemon - now check to see if BProc is running here */
/* the base open/select logic protects us against operation when
* we are NOT in a daemon, so we don't have to check that here
*/
/* check to see if BProc is running here */
ret = bproc_version(&version);
if (ret != 0) {
return NULL;
@ -101,16 +123,29 @@ orte_pls_base_module_t *orte_pls_bproc_orted_init(int *priority)
return NULL;
}
*priority = mca_pls_bproc_orted_component.priority;
return &orte_pls_bproc_orted_module;
*priority = mca_odls_bproc_component.priority;
return &orte_odls_bproc_module;
}
/**
* Component close function.
*/
int orte_pls_bproc_orted_component_close(void)
int orte_odls_bproc_component_close(void)
{
OBJ_DESTRUCT(&mca_pls_bproc_orted_component.lock);
OBJ_DESTRUCT(&mca_odls_bproc_component.lock);
OBJ_DESTRUCT(&mca_odls_bproc_component.cond);
OBJ_DESTRUCT(&mca_odls_bproc_component.children);
return ORTE_SUCCESS;
}
int orte_odls_bproc_component_finalize(void)
{
opal_list_item_t *item;
/* cleanup state */
while (NULL != (item = opal_list_remove_first(&mca_odls_bproc_component.children))) {
OBJ_RELEASE(item);
}
return ORTE_SUCCESS;
}

48
orte/mca/odls/default/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,48 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-odls-default.txt
sources = \
odls_default.h \
odls_default_component.c \
odls_default_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_odls_default_DSO
component_noinst =
component_install = mca_odls_default.la
else
component_noinst = libmca_odls_default.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_odls_default_la_SOURCES = $(sources)
mca_odls_default_la_LDFLAGS = -module -avoid-version
mca_odls_default_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(component_noinst)
libmca_odls_default_la_SOURCES =$(sources)
libmca_odls_default_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -17,8 +17,9 @@
# $HEADER$
#
# MCA_pls_fork_CONFIG([action-if-found], [action-if-not-found])
# MCA_odls_default_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_pls_fork_CONFIG],[
AC_DEFUN([MCA_odls_default_CONFIG],[
AC_CHECK_FUNC([fork], [$1], [$2])
])dnl

Просмотреть файл

@ -17,5 +17,5 @@
# $HEADER$
#
PARAM_INIT_FILE=pls_fork_component.c
PARAM_INIT_FILE=odls_default_component.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -16,36 +16,23 @@
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
# This is the US/English general help file for Open RTE's orted launcher.
#
[orte-pls-fork:chdir-error]
[odls-default:chdir-error]
Failed to change to the working directory:
Host: %s
Directory: %s
The error returned was "%s". Execution will now abort.
[orte-pls-fork:argv0-not-found]
Failed to find the following executable:
Host: %s
Executable: %s
Cannot continue.
[orte-pls-fork:argv0-not-accessible]
[odls-default:argv0-not-accessible]
Failed to find or execute the following executable:
Host: %s
Executable: %s
Cannot continue.
[orte-pls-fork:execv-error]
Could not execute the executable "%s": %s
This could mean that your PATH or executable name is wrong, or that you do not
have the necessary permissions. Please ensure that the executable is able to be
found and executed.
[orte-pls-fork:could-not-kill]
[odls-default:could-not-kill]
WARNING: A process refused to die!
Host: %s
@ -53,7 +40,7 @@ PID: %d
This process may still be running and/or consuming resources.
[orte-pls-fork:could-not-kill]
[odls-default:could-not-send-kill]
WARNING: A process refused the kill SIGTERM signal!
This should never happen unless the application is changing the
parent/child relationship permissions.

105
orte/mca/odls/default/odls_default.h Обычный файл
Просмотреть файл

@ -0,0 +1,105 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
*/
#ifndef ORTE_ODLS_H
#define ORTE_ODLS_H
#include "orte_config.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/mca/mca.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/odls/odls.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* Module open / close
*/
int orte_odls_default_component_open(void);
int orte_odls_default_component_close(void);
orte_odls_base_module_t* orte_odls_default_component_init(int *priority);
/*
* Startup / Shutdown
*/
int orte_odls_default_finalize(void);
/*
* Interface
*/
int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data);
int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
int orte_odls_default_signal_local_procs(orte_process_name_t *proc,
int32_t signal);
/**
* ODLS Default globals
*/
typedef struct orte_odls_default_globals_t {
/* Shared state of the "default" ODLS component; a single instance is
 * exported as orte_odls_default. */
opal_mutex_t mutex; /* NOTE(review): presumably guards the children list - confirm in the module code */
opal_condition_t cond; /* condition variable; presumably paired with mutex - confirm in the module code */
opal_list_t children; /* list of child records (see odls_default_child_t) */
} orte_odls_default_globals_t;
extern orte_odls_default_globals_t orte_odls_default;
/*
* List object to locally store the process names and pids of
* our children. This can subsequently be used to order termination
* or pass signals without looking the info up again.
*/
typedef struct odls_default_child_t {
opal_list_item_t super; /* required to place this on a list (must be the first member) */
orte_process_name_t *name; /* the OpenRTE name of the proc; NULL until one is attached */
pid_t pid; /* local pid of the proc */
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
bool alive; /* is this proc alive? */
} odls_default_child_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_default_child_t);
/*
* List object to locally store app_contexts returned by the
* registry subscription. Since we don't know how many app_contexts will
* be returned, we need to store them on a list.
*/
typedef struct odls_default_app_context_t {
opal_list_item_t super; /* required to place this on a list (must be the first member) */
orte_app_context_t *app_context; /* the stored app_context; the class is instanced without a
                                  * constructor or destructor, so this pointer is neither
                                  * initialized nor released automatically */
} odls_default_app_context_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_default_app_context_t);
/*
* ODLS Default module
*/
extern orte_odls_base_module_t orte_odls_default_module;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* ORTE_ODLS_H */

Просмотреть файл

@ -0,0 +1,163 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <stdlib.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/path.h"
#include "opal/util/basename.h"
#include "opal/util/show_help.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/pls/base/pls_private.h"
#include "orte/mca/odls/default/odls_default.h"
/* Instantiate the component globals */
orte_odls_default_globals_t orte_odls_default;
/* instance the child list object */
/*
 * Constructor for odls_default_child_t: start each new child record in
 * an "unassigned, not running" state.
 */
static void odls_default_child_constructor(odls_default_child_t *ptr)
{
    /* not running, and not tied to any app_context yet */
    ptr->alive   = false;
    ptr->app_idx = -1;

    /* no local pid and no OpenRTE name assigned yet */
    ptr->pid  = 0;
    ptr->name = NULL;
}
/*
 * Destructor for odls_default_child_t: release the process name if the
 * record carries one.
 */
static void odls_default_child_destructor(odls_default_child_t *ptr)
{
    if (NULL == ptr->name) {
        return;     /* no name attached - nothing to free */
    }
    free(ptr->name);
}
OBJ_CLASS_INSTANCE(odls_default_child_t,
opal_list_item_t,
odls_default_child_constructor,
odls_default_child_destructor);
/* instance the app_context list object */
OBJ_CLASS_INSTANCE(odls_default_app_context_t,
opal_list_item_t,
NULL, NULL);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
orte_odls_base_component_t mca_odls_default_component = {
/* NOTE: this is a positional initializer - the entries below must stay in
   the same order as the members of the odls base component struct
   (version, component data, init, finalize). */
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a odls v1.3.0 component (which also
implies a specific MCA version) */
ORTE_ODLS_BASE_VERSION_1_3_0,
/* Component name and version */
"default",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_odls_default_component_open,
orte_odls_default_component_close
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
true
},
/* Initialization / querying functions: init hands back the module and
   its priority, finalize releases the stored child records */
orte_odls_default_component_init,
orte_odls_default_finalize
};
int orte_odls_default_component_open(void)
{
/* initialize globals */
OBJ_CONSTRUCT(&orte_odls_default.mutex, opal_mutex_t);
OBJ_CONSTRUCT(&orte_odls_default.cond, opal_condition_t);
OBJ_CONSTRUCT(&orte_odls_default.children, opal_list_t);
return ORTE_SUCCESS;
}
/*
 * Component init: offer the default module for selection.
 *
 * No run-time checks are made here. The base open/select logic already
 * keeps us from operating outside of a daemon, and configure.m4 only
 * builds this component when "fork" support exists - so if this code is
 * present at all, it is a valid candidate.
 *
 * @param priority (OUT) Set to 1, so that any other component can
 *                       override us - we are the default.
 * @return Pointer to the default ODLS module.
 */
orte_odls_base_module_t *orte_odls_default_component_init(int *priority)
{
    enum { ODLS_DEFAULT_PRIORITY = 1 };
    orte_odls_base_module_t *module = &orte_odls_default_module;

    *priority = ODLS_DEFAULT_PRIORITY;
    return module;
}
int orte_odls_default_component_close(void)
{
OBJ_DESTRUCT(&orte_odls_default.mutex);
OBJ_DESTRUCT(&orte_odls_default.cond);
OBJ_DESTRUCT(&orte_odls_default.children);
return ORTE_SUCCESS;
}
int orte_odls_default_finalize(void)
{
opal_list_item_t *item;
/* cleanup state */
while (NULL != (item = opal_list_remove_first(&orte_odls_default.children))) {
OBJ_RELEASE(item);
}
return ORTE_SUCCESS;
}

1037
orte/mca/odls/default/odls_default_module.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

130
orte/mca/odls/odls.h Обычный файл
Просмотреть файл

@ -0,0 +1,130 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* The OpenRTE Daemon's Local Launch Subsystem
*
*/
#ifndef ORTE_MCA_ODLS_H
#define ORTE_MCA_ODLS_H
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "opal/class/opal_list.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/odls/odls_types.h"
/*
* odls module functions
*/
/**
* Subscribe to receive the launch data for local processes
*/
typedef int (*orte_odls_base_module_subscribe_launch_data_fn_t)(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
/**
* Locally launch the provided processes
*/
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data);
/**
* Kill the local processes on this node
*/
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(orte_jobid_t job, bool set_state);
/**
* Signal local processes
*/
typedef int (*orte_pls_base_module_signal_local_process_fn_t)(orte_process_name_t *proc,
int32_t signal);
/**
* odls module version 1.3.0
*/
struct orte_odls_base_module_1_3_0_t {
/** Subscribe to receive the launch data for local processes */
orte_odls_base_module_subscribe_launch_data_fn_t subscribe_launch_data;
/** Locally launch the provided processes */
orte_odls_base_module_launch_local_processes_fn_t launch_local_procs;
/** Kill the local processes on this node */
orte_odls_base_module_kill_local_processes_fn_t kill_local_procs;
/** Signal local processes.
 * NOTE(review): the function-pointer typedef carries an "orte_pls_" prefix
 * although it belongs to the odls framework - looks like a leftover name */
orte_pls_base_module_signal_local_process_fn_t signal_local_procs;
};
/** shorten orte_odls_base_module_1_3_0_t declaration */
typedef struct orte_odls_base_module_1_3_0_t orte_odls_base_module_1_3_0_t;
/** shorten orte_odls_base_module_t declaration */
typedef struct orte_odls_base_module_1_3_0_t orte_odls_base_module_t;
/**
* odls initialization function
*
* Called by the MCA framework to initialize the component. Invoked
* exactly once per process.
*
* @param priority (OUT) Relative priority or ranking use by MCA to
* select a module.
*/
typedef struct orte_odls_base_module_1_3_0_t*
(*orte_odls_base_component_init_fn_t)(int *priority);
/**
* Cleanup all resources held by the component
*/
typedef int (*orte_odls_base_component_finalize_fn_t)(void);
/**
* odls component v1.3.0
*/
struct orte_odls_base_component_1_3_0_t {
/** component version */
mca_base_component_t version;
/** component data */
mca_base_component_data_1_0_0_t odls_data;
/** Function called when component is initialized; returns the module and
 * reports its selection priority through the OUT argument */
orte_odls_base_component_init_fn_t init;
/** Function called when component is finalized; cleans up all resources
 * held by the component */
orte_odls_base_component_finalize_fn_t finalize;
};
/** Convenience typedef */
typedef struct orte_odls_base_component_1_3_0_t orte_odls_base_component_1_3_0_t;
/** Convenience typedef */
typedef orte_odls_base_component_1_3_0_t orte_odls_base_component_t;
/**
* Macro for use in modules that are of type odls v1.3.0
*/
#define ORTE_ODLS_BASE_VERSION_1_3_0 \
/* odls v1.3 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* odls v1.3 */ \
"odls", 1, 3, 0
/* Global structure for accessing ODLS functions
*/
ORTE_DECLSPEC extern orte_odls_base_module_t orte_odls; /* holds selected module's function pointers */
#endif /* MCA_ODLS_H */

51
orte/mca/odls/odls_types.h Обычный файл
Просмотреть файл

@ -0,0 +1,51 @@
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_ODLS_TYPES_H
#define ORTE_MCA_ODLS_TYPES_H
#include "orte_config.h"
#include "orte/orte_types.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* define the orted command flag type */
typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_CMD_T ORTE_UINT8
/*
 * Definitions needed for communication
 *
 * Command flags understood by the orted. Each replacement list is fully
 * parenthesized so the macro behaves as a single primary expression
 * wherever it is used (e.g. "sizeof ORTE_DAEMON_EXIT_CMD" or next to a
 * postfix/unary operator); a bare "(type) value" cast does not.
 */
#define ORTE_DAEMON_HOSTFILE_CMD        ((orte_daemon_cmd_flag_t) 1)
#define ORTE_DAEMON_SCRIPTFILE_CMD      ((orte_daemon_cmd_flag_t) 2)
#define ORTE_DAEMON_CONTACT_QUERY_CMD   ((orte_daemon_cmd_flag_t) 3)
#define ORTE_DAEMON_KILL_LOCAL_PROCS    ((orte_daemon_cmd_flag_t) 4)
#define ORTE_DAEMON_SIGNAL_LOCAL_PROCS  ((orte_daemon_cmd_flag_t) 5)
#define ORTE_DAEMON_ADD_LOCAL_PROCS     ((orte_daemon_cmd_flag_t) 6)
/* the two values at the top of the uint8_t range */
#define ORTE_DAEMON_HEARTBEAT_CMD       ((orte_daemon_cmd_flag_t) 254)
#define ORTE_DAEMON_EXIT_CMD            ((orte_daemon_cmd_flag_t) 255)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

0
orte/mca/odls/windows/.ompi_ignore Обычный файл
Просмотреть файл

0
orte/mca/pls/process/Makefile.am → orte/mca/odls/windows/Makefile.am Обычный файл → Исполняемый файл
Просмотреть файл

4
orte/mca/pls/process/configure.m4 → orte/mca/odls/windows/configure.m4 Обычный файл → Исполняемый файл
Просмотреть файл

@ -10,8 +10,8 @@
# $HEADER$
#
# MCA_pls_process_CONFIG([action-if-found], [action-if-not-found])
# MCA_odls_windows_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_pls_process_CONFIG],[
AC_DEFUN([MCA_odls_windows_CONFIG],[
AC_CHECK_FUNC([CreateProcess], [$1], [$2])
])dnl

2
orte/mca/pls/process/configure.params → orte/mca/odls/windows/configure.params Обычный файл → Исполняемый файл
Просмотреть файл

@ -10,5 +10,5 @@
# $HEADER$
#
PARAM_INIT_FILE=pls_process_component.c
PARAM_INIT_FILE=odls_windows_component.c
PARAM_CONFIG_FILES="Makefile"

0
orte/mca/pls/process/help-orte-pls-process.txt → orte/mca/odls/windows/help-odls-windows.txt Обычный файл → Исполняемый файл
Просмотреть файл

0
orte/mca/pls/process/pls_process.h → orte/mca/odls/windows/odls_windows.h Обычный файл → Исполняемый файл
Просмотреть файл

0
orte/mca/pls/process/pls_process_component.c → orte/mca/odls/windows/odls_windows_component.c Обычный файл → Исполняемый файл
Просмотреть файл

0
orte/mca/pls/process/pls_process_module.c → orte/mca/odls/windows/odls_windows_module.c Обычный файл → Исполняемый файл
Просмотреть файл

Просмотреть файл

@ -91,7 +91,7 @@ int mca_oob_xcast(
{
orte_std_cntr_t i;
int rc;
int tag = MCA_OOB_TAG_XCAST;
int tag = ORTE_RML_TAG_XCAST;
int status;
orte_proc_state_t state;

Просмотреть файл

@ -32,23 +32,6 @@
* Other constants
*/
/**
* Service tags
*/
#define MCA_OOB_TAG_NS (orte_rml_tag_t) 1
#define MCA_OOB_TAG_GPR (orte_rml_tag_t) 2
#define MCA_OOB_TAG_GPR_NOTIFY (orte_rml_tag_t) 3
#define MCA_OOB_TAG_RTE (orte_rml_tag_t) 4
#define MCA_OOB_TAG_EXEC (orte_rml_tag_t) 5
#define MCA_OOB_TAG_DAEMON (orte_rml_tag_t) 6
#define MCA_OOB_TAG_STDIO (orte_rml_tag_t) 7
#define MCA_OOB_TAG_SCHED (orte_rml_tag_t) 8
#define MCA_OOB_TAG_PCM_KILL (orte_rml_tag_t) 9
#define MCA_OOB_TAG_XCAST (orte_rml_tag_t) 10
#define MCA_OOB_TAG_PCM_KILL_ACK (orte_rml_tag_t) 11
#define MCA_OOB_TAG_BPROC (orte_rml_tag_t) 12
#define ORTE_OOB_TAG_START_LIST (orte_rml_tag_t) 100 /* starting point for tag server assignments */
/**
* The wildcard for receives from any peer.
*/

Просмотреть файл

@ -83,6 +83,7 @@ OBJ_CLASS_INSTANCE(
*/
static int mca_oob_tcp_create_listen(void);
static int mca_oob_tcp_create_listen_thread(void);
static void mca_oob_tcp_recv_handler(int sd, short flags, void* user);
static void mca_oob_tcp_accept(void);
@ -100,6 +101,12 @@ OBJ_CLASS_INSTANCE(
NULL,
NULL);
OBJ_CLASS_INSTANCE(
mca_oob_tcp_pending_connection_t,
opal_free_list_item_t,
NULL,
NULL);
/*
@ -169,6 +176,9 @@ static inline char* mca_oob_tcp_param_register_str(
*/
int mca_oob_tcp_component_open(void)
{
char *listen_type;
int tmp;
#ifdef __WINDOWS__
WSADATA win_sock_data;
if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
@ -190,6 +200,12 @@ int mca_oob_tcp_component_open(void)
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msg_completed, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_cond, opal_condition_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_listen_thread, opal_thread_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_fl, opal_free_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_out_connections, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_in_connections, opal_list_t);
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_lock, opal_mutex_t);
/* register oob module parameters */
mca_oob_tcp_component.tcp_peer_limit =
@ -207,9 +223,60 @@ int mca_oob_tcp_component_open(void)
mca_oob_tcp_component.tcp_rcvbuf =
mca_oob_tcp_param_register_int("rcvbuf", 128*1024);
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
"listen_mode",
"Mode for HNP to accept incoming connections: event, listen_thread",
false,
false,
"event",
&listen_type);
if ((0 == strcmp(listen_type, "event")) || NULL == getenv("I_AM_MPIRUN")) {
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
} else if (0 == strcmp(listen_type, "listen_thread")) {
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_LISTEN_THREAD;
} else {
opal_output(0, "Invalid value for oob_tcp_listen_mode parameter: %s",
listen_type);
return ORTE_ERROR;
}
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
"listen_thread_max_queue",
"High water mark for queued accepted socket list size",
false,
false,
10,
&mca_oob_tcp_component.tcp_copy_max_size);
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
"listen_thread_max_time",
"Maximum amount of time (in milliseconds) to wait between processing accepted socket list",
false,
false,
10,
&tmp);
#if OPAL_TIMER_USEC_NATIVE
mca_oob_tcp_component.tcp_copy_delta = tmp * 1000;
#else
mca_oob_tcp_component.tcp_copy_delta = tmp *
opal_timer_base_get_freq() / 1000;
#endif
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
"accept_spin_count",
"Number of times to let accept return EWOULDBLOCK before updating accepted socket list",
false,
false,
10,
&mca_oob_tcp_component.tcp_copy_spin_count);
/* initialize state */
mca_oob_tcp_component.tcp_shutdown = false;
mca_oob_tcp_component.tcp_listen_sd = -1;
mca_oob_tcp_component.tcp_match_count = 0;
mca_oob_tcp_component.tcp_last_copy_time = 0;
return ORTE_SUCCESS;
}
@ -251,7 +318,7 @@ int mca_oob_tcp_component_close(void)
static void mca_oob_tcp_accept(void)
{
while(true) {
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
struct sockaddr_in addr;
mca_oob_tcp_event_t* event;
int sd;
@ -291,7 +358,7 @@ static int mca_oob_tcp_create_listen(void)
{
int flags;
struct sockaddr_in inaddr;
ompi_socklen_t addrlen;
opal_socklen_t addrlen;
/* create a listen socket for incoming connections */
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
@ -352,6 +419,206 @@ static int mca_oob_tcp_create_listen(void)
}
/*
 * Body of the dedicated accept thread used when the listen mode is
 * "listen_thread".
 *
 * Until tcp_shutdown is set, the thread:
 *   1. waits up to 10 ms in select() for the listen socket to become
 *      readable,
 *   2. accepts connections into items taken from the pending-connection
 *      free list, collecting them on the thread-private
 *      tcp_copy_in_connections list, until accept() has come back empty
 *      tcp_copy_spin_count times or the private list has reached
 *      tcp_copy_max_size entries,
 *   3. splices the private list onto tcp_pending_connections under
 *      tcp_pending_connections_lock.
 *
 * @param obj  Thread object argument (unused).
 * @return NULL, both on shutdown and on an unrecoverable accept() error.
 */
static void* mca_oob_tcp_listen_thread(opal_object_t *obj)
{
    int rc, count;
    opal_socklen_t addrlen;
    opal_free_list_item_t *fl_item;
    mca_oob_tcp_pending_connection_t *item;
    struct timeval timeout;
    fd_set readfds;

    while (false == mca_oob_tcp_component.tcp_shutdown) {
        count = 0;
        FD_ZERO(&readfds);
        FD_SET(mca_oob_tcp_component.tcp_listen_sd, &readfds);
        timeout.tv_sec = 0;
        timeout.tv_usec = 10000;

        rc = select(mca_oob_tcp_component.tcp_listen_sd + 1, &readfds,
                    NULL, NULL, &timeout);
        if (rc < 0) {
            if (EAGAIN != opal_socket_errno && EINTR != opal_socket_errno) {
                perror("select");
            }
            continue;
        }

        while (count < mca_oob_tcp_component.tcp_copy_spin_count &&
               opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections) <
               (size_t) mca_oob_tcp_component.tcp_copy_max_size) {
            OPAL_FREE_LIST_WAIT(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                fl_item, rc);
            item = (mca_oob_tcp_pending_connection_t*) fl_item;

            /* accept() treats the length as a value-result argument, so it
             * has to be reset before every call */
            addrlen = sizeof(struct sockaddr_in);
            item->fd = accept(mca_oob_tcp_component.tcp_listen_sd,
                              (struct sockaddr*)&(item->addr), &addrlen);
            if (item->fd < 0) {
                /* capture the error code before any other call can change
                 * it, and do not touch the item once it is back on the
                 * free list */
                const int accept_err = opal_socket_errno;

                OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                      fl_item);
                if (mca_oob_tcp_component.tcp_shutdown) return NULL;

                /* "nothing to accept right now" is reported as either
                 * EAGAIN or EWOULDBLOCK; only a code that is neither of
                 * them is a real failure (the previous "||" test was true
                 * for every code on platforms where the two differ) */
                if (EAGAIN != accept_err && EWOULDBLOCK != accept_err) {
                    opal_output(0, "mca_oob_tcp_accept: accept() failed with errno %d.", accept_err);
                    /* no descriptor was created, so there is nothing to close */
                    return NULL;
                }

                count++;
                continue;
            }

            if (mca_oob_tcp_component.tcp_debug) {
                opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
                            ORTE_NAME_ARGS(orte_process_info.my_name),
                            item->fd, opal_socket_errno,
                            inet_ntoa(item->addr.sin_addr),
                            item->addr.sin_port);
            }

            opal_list_append(&mca_oob_tcp_component.tcp_copy_in_connections,
                             (opal_list_item_t*) item);
        }

        /* hand everything accepted in this round over to the progress
         * function in a single locked splice */
        if (0 < opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections)) {
            opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
            opal_list_join(&mca_oob_tcp_component.tcp_pending_connections,
                           opal_list_get_end(&mca_oob_tcp_component.tcp_pending_connections),
                           &mca_oob_tcp_component.tcp_copy_in_connections);
            opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
        }
    }

    return NULL;
}
/* called from opal_progress() to create the oob contact information
for the file descriptors accepted() by the accept thread. */
static int mca_oob_tcp_listen_progress(void)
{
/* number of pending connections turned into events by this call;
   this is the function's return value */
int count = 0;
mca_oob_tcp_pending_connection_t *item;
mca_oob_tcp_event_t* event;
/* "now" is in microseconds when the timer is usec-native and in timer
   cycles otherwise */
#if OPAL_TIMER_USEC_NATIVE
opal_timer_t now = opal_timer_base_get_usec();
#else
opal_timer_t now = opal_timer_base_get_cycles();
#endif /* OPAL_TIMER_USEC_NATIVE */
/* if we've not pulled pending connections for a while OR we've
hit the high water mark of pending connections, grab all the
pending connections.
NOTE(review): the size of tcp_pending_connections is read here
without holding tcp_pending_connections_lock - confirm that an
approximate value is acceptable for this check */
if ((now - mca_oob_tcp_component.tcp_last_copy_time >
mca_oob_tcp_component.tcp_copy_delta) ||
((size_t) mca_oob_tcp_component.tcp_copy_max_size <
opal_list_get_size(&mca_oob_tcp_component.tcp_pending_connections))) {
/* move the pending connections from the list the accept
thread is inserting into onto a temporary list for us to
process from. This is an O(1) operation, so we minimize
the lock time */
opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
opal_list_join(&mca_oob_tcp_component.tcp_copy_out_connections,
opal_list_get_end(&mca_oob_tcp_component.tcp_copy_out_connections),
&mca_oob_tcp_component.tcp_pending_connections);
opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
/* process all the connections */
while (NULL != (item = (mca_oob_tcp_pending_connection_t*)
opal_list_remove_first(&mca_oob_tcp_component.
tcp_copy_out_connections))) {
/* setup socket options */
mca_oob_tcp_set_socket_options(item->fd);
/* log the accept */
if(mca_oob_tcp_component.tcp_debug) {
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_progress: %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
inet_ntoa(item->addr.sin_addr),
item->addr.sin_port);
}
/* wait for receipt of the peer's process identifier to
complete this connection: register a read event on the
accepted socket, handled by mca_oob_tcp_recv_handler */
event = OBJ_NEW(mca_oob_tcp_event_t);
opal_event_set(&event->event, item->fd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event);
opal_event_add(&event->event, 0);
/* the descriptor now belongs to the event; hand the list item
back to the free list the accept thread draws from */
OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl,
(opal_free_list_item_t *) item);
count++;
}
/* remember when we last drained the pending list */
mca_oob_tcp_component.tcp_last_copy_time = now;
}
return count;
}
static int mca_oob_tcp_create_listen_thread(void)
{
struct sockaddr_in inaddr;
opal_socklen_t addrlen;
int flags;
/* create a listen socket for incoming connections */
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
if(mca_oob_tcp_component.tcp_listen_sd < 0) {
opal_output(0,"mca_oob_tcp_component_init: socket() failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
}
/* setup socket options */
mca_oob_tcp_set_socket_options(mca_oob_tcp_component.tcp_listen_sd);
/* bind address */
memset(&inaddr, 0, sizeof(inaddr));
inaddr.sin_family = AF_INET;
inaddr.sin_addr.s_addr = INADDR_ANY;
inaddr.sin_port = 0;
if(bind(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, sizeof(inaddr)) < 0) {
opal_output(0,"mca_oob_tcp_create_listen: bind() failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
}
/* resolve system assigned port */
addrlen = sizeof(struct sockaddr_in);
if(getsockname(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
opal_output(0, "mca_oob_tcp_create_listen: getsockname() failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
}
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
/* setup listen backlog to maximum allowed by kernel */
if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) {
opal_output(0, "mca_oob_tcp_component_init: listen() failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
}
/* set socket up to be non-blocking, otherwise accept could block */
if((flags = fcntl(mca_oob_tcp_component.tcp_listen_sd, F_GETFL, 0)) < 0) {
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_GETFL) failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
} else {
flags |= O_NONBLOCK;
if(fcntl(mca_oob_tcp_component.tcp_listen_sd, F_SETFL, flags) < 0) {
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_SETFL) failed with errno=%d", opal_socket_errno);
return ORTE_ERROR;
}
}
/* start the listen thread */
mca_oob_tcp_component.tcp_listen_thread.t_run = mca_oob_tcp_listen_thread;
mca_oob_tcp_component.tcp_listen_thread.t_arg = NULL;
return opal_thread_start(&mca_oob_tcp_component.tcp_listen_thread);
}
/*
* Handle probe
*/
@ -537,9 +804,23 @@ mca_oob_t* mca_oob_tcp_component_init(int* priority)
memset(&mca_oob_tcp_component.tcp_send_event, 0, sizeof(opal_event_t));
/* create a listen socket */
if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) {
opal_output(0, "mca_oob_tcp_init: unable to create listen socket\n");
return NULL;
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) {
opal_output(0, "mca_oob_tcp_init: unable to create listen socket");
return NULL;
}
} else if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) {
if (mca_oob_tcp_create_listen_thread() != ORTE_SUCCESS) {
opal_output(0, "mca_oob_tcp_init: unable to create listen thread");
return NULL;
}
opal_free_list_init(&mca_oob_tcp_component.tcp_pending_connections_fl,
sizeof(mca_oob_tcp_pending_connection_t),
OBJ_CLASS(mca_oob_tcp_pending_connection_t),
16, /* initial number */
-1, /* maximum number */
16); /* increment to grow by */
opal_progress_register(mca_oob_tcp_listen_progress);
}
return &mca_oob_tcp;
}
@ -932,8 +1213,16 @@ int mca_oob_tcp_fini(void)
/* close listen socket */
if (mca_oob_tcp_component.tcp_listen_sd >= 0) {
opal_event_del(&mca_oob_tcp_component.tcp_recv_event);
CLOSE_THE_SOCKET(mca_oob_tcp_component.tcp_listen_sd);
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
opal_event_del(&mca_oob_tcp_component.tcp_recv_event);
close(mca_oob_tcp_component.tcp_listen_sd);
} else if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) {
void *data;
mca_oob_tcp_component.tcp_shutdown = true;
close(mca_oob_tcp_component.tcp_listen_sd);
opal_thread_join(&mca_oob_tcp_component.tcp_listen_thread, &data);
opal_progress_unregister(mca_oob_tcp_listen_progress);
}
mca_oob_tcp_component.tcp_listen_sd = -1;
}

Просмотреть файл

@ -34,6 +34,7 @@
#include "opal/threads/condition.h"
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
#include "orte/mca/oob/tcp/oob_tcp_msg.h"
#include "opal/mca/timer/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
@ -223,11 +224,6 @@ void mca_oob_tcp_registry_callback(
void mca_oob_tcp_set_socket_options(int sd);
/**
 * Selects how the OOB TCP component services its listening socket.
 * The component stores the chosen value in tcp_listen_type and branches
 * on it in both component init and mca_oob_tcp_fini().
 */
typedef enum {
    /* Component init calls mca_oob_tcp_create_listen(); fini removes
     * tcp_recv_event and closes the listen socket. */
    OOB_TCP_EVENT = 0,
    /* Component init calls mca_oob_tcp_create_listen_thread() and registers
     * mca_oob_tcp_listen_progress; fini joins tcp_listen_thread and
     * unregisters the progress function. */
    OOB_TCP_LISTEN_THREAD = 1
} mca_oob_tcp_listen_type_t;
/**
* OOB TCP Component
*/
@ -258,6 +254,19 @@ struct mca_oob_tcp_component_t {
opal_condition_t tcp_match_cond; /**< condition variable used in finalize */
int tcp_match_count; /**< number of matched recvs in progress */
int tcp_debug; /**< debug level */
bool tcp_shutdown;
enum { OOB_TCP_EVENT, OOB_TCP_LISTEN_THREAD } tcp_listen_type;
opal_thread_t tcp_listen_thread;
opal_free_list_t tcp_pending_connections_fl;
opal_list_t tcp_pending_connections;
opal_list_t tcp_copy_out_connections;
opal_list_t tcp_copy_in_connections;
opal_mutex_t tcp_pending_connections_lock;
opal_timer_t tcp_last_copy_time;
opal_timer_t tcp_copy_delta;
int tcp_copy_max_size;
int tcp_copy_spin_count;
};
/**
@ -273,6 +282,14 @@ ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
#define CLOSE_THE_SOCKET(socket) close(socket)
#endif /* defined(__WINDOWS__) */
/**
 * One pending-connection record.
 *
 * Objects of this class are the element type of
 * mca_oob_tcp_component.tcp_pending_connections_fl, which component init
 * sets up with opal_free_list_init() using this type's size and class.
 */
typedef struct mca_oob_tcp_pending_connection_t {
    /* Base free-list item; lets the record live on the free list above. */
    opal_free_list_item_t super;
    /* NOTE(review): presumably the socket descriptor of the accepted
     * connection -- confirm against the listen thread code. */
    int fd;
    /* NOTE(review): presumably the remote peer's IPv4 address as filled
     * in when the connection was accepted -- confirm. */
    struct sockaddr_in addr;
} mca_oob_tcp_pending_connection_t;
OBJ_CLASS_DECLARATION(mca_oob_tcp_pending_connection_t);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -379,7 +379,7 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer)
{
int so_error = 0;
ompi_socklen_t so_length = sizeof(so_error);
opal_socklen_t so_length = sizeof(so_error);
/* unregister from receiving event notifications */
opal_event_del(&peer->peer_send_event);
@ -467,7 +467,7 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
* get stuck in the orte_wait_kill when receiving messages in the
* tcp OOB. */
OPAL_THREAD_UNLOCK(&peer->peer_lock);
orte_errmgr.abort();
orte_errmgr.error_detected(1, "OOB: Connection to HNP lost", NULL);
}
}
@ -787,8 +787,8 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
char buff[255];
int sndbuf,rcvbuf,nodelay,flags;
struct sockaddr_in inaddr;
ompi_socklen_t optlen;
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
opal_socklen_t optlen;
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
getsockname(peer->peer_sd, (struct sockaddr*)&inaddr, &addrlen);
sprintf(src, "%s", inet_ntoa(inaddr.sin_addr));

Просмотреть файл

@ -25,7 +25,7 @@ nobase_orte_HEADERS =
dist_pkgdata_DATA =
# local files
headers = pls.h
headers = pls.h pls_types.h
libmca_pls_la_SOURCES += $(headers)
# Conditionally install the header files

Просмотреть файл

@ -19,12 +19,14 @@
dist_pkgdata_DATA += base/help-pls-base.txt
headers += \
base/pls_private.h \
base/base.h
libmca_pls_la_SOURCES += \
base/pls_base_close.c \
base/pls_base_context.c \
base/pls_base_general_support_fns.c \
base/pls_base_open.c \
base/pls_base_receive.c \
base/pls_base_select.c \
base/pls_base_state.c \
base/pls_base_proxy.c
base/pls_base_dmn_registry_fns.c \
base/pls_base_orted_cmds.c

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше