Here is the major MAD-cure commit. I have written plenty about it elsewhere, so I refer you to those messages for a description of everything that was done.
This commit was SVN r11661.
Этот коммит содержится в:
родитель
17afe7dc9f
Коммит
37dfdb76eb
@ -373,7 +373,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
|
||||
* later override this value by providing an MPI_Info value. for now, though,
|
||||
* let's get the default value off the registry
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(orte_process_info.my_name->jobid, &apps, &num_apps))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.get_app_context(orte_process_info.my_name->jobid, &apps, &num_apps))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -533,7 +533,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
|
||||
if (NULL != base_prefix) free(base_prefix);
|
||||
|
||||
/* spawn procs */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_progress_event_decrement();
|
||||
return MPI_ERR_SPAWN;
|
||||
|
@ -27,7 +27,7 @@
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_base_selected_module_t,
|
||||
@ -151,7 +151,7 @@ int mca_btl_base_select(bool enable_progress_threads,
|
||||
if (0 == opal_list_get_size(&mca_btl_base_modules_initialized)) {
|
||||
opal_show_help("help-mca-base.txt", "find-available:none-found", true,
|
||||
"btl");
|
||||
orte_abort(1, "");
|
||||
orte_errmgr.error_detected(1, NULL);
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -391,7 +391,7 @@ static int mca_btl_tcp_component_create_listen(void)
|
||||
{
|
||||
int flags;
|
||||
struct sockaddr_in inaddr;
|
||||
ompi_socklen_t addrlen;
|
||||
opal_socklen_t addrlen;
|
||||
|
||||
/* create a listen socket for incoming connections */
|
||||
mca_btl_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
@ -556,7 +556,7 @@ int mca_btl_tcp_component_control(int param, void* value, size_t size)
|
||||
static void mca_btl_tcp_component_accept(void)
|
||||
{
|
||||
while(true) {
|
||||
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
struct sockaddr_in addr;
|
||||
mca_btl_tcp_event_t *event;
|
||||
int sd = accept(mca_btl_tcp_component.tcp_listen_sd, (struct sockaddr*)&addr, &addrlen);
|
||||
@ -588,7 +588,7 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
|
||||
struct sockaddr_in addr;
|
||||
int retval;
|
||||
mca_btl_tcp_proc_t* btl_proc;
|
||||
ompi_socklen_t addr_len = sizeof(addr);
|
||||
opal_socklen_t addr_len = sizeof(addr);
|
||||
mca_btl_tcp_event_t *event = (mca_btl_tcp_event_t *)user;
|
||||
|
||||
/* accept new connections on the listen socket */
|
||||
|
@ -133,8 +133,8 @@ static void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, con
|
||||
char dst[64];
|
||||
int sndbuf,rcvbuf,nodelay,flags;
|
||||
struct sockaddr_in inaddr;
|
||||
ompi_socklen_t obtlen;
|
||||
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
opal_socklen_t obtlen;
|
||||
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
|
||||
getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen);
|
||||
sprintf(src, "%s", inet_ntoa(inaddr.sin_addr));
|
||||
@ -553,7 +553,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
|
||||
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
|
||||
{
|
||||
int so_error = 0;
|
||||
ompi_socklen_t so_length = sizeof(so_error);
|
||||
opal_socklen_t so_length = sizeof(so_error);
|
||||
|
||||
/* unregister from receiving event notifications */
|
||||
opal_event_del(&btl_endpoint->endpoint_send_event);
|
||||
|
@ -148,17 +148,16 @@ am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
|
||||
$(top_srcdir)/opal/mca/timer/linux/configure.m4 \
|
||||
$(top_srcdir)/opal/mca/timer/solaris/configure.m4 \
|
||||
$(top_srcdir)/opal/mca/timer/windows/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/errmgr/bproc/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/odls/bproc/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/odls/default/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/oob/tcp/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/bproc/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/bproc_orted/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/fork/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/gridengine/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/poe/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/process/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/rsh/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/slurm/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/tm/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/pls/xgrid/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/ras/bjs/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/ras/gridengine/configure.m4 \
|
||||
$(top_srcdir)/orte/mca/ras/lsf_bproc/configure.m4 \
|
||||
@ -416,6 +415,13 @@ MCA_ns_DSO_SUBDIRS = @MCA_ns_DSO_SUBDIRS@
|
||||
MCA_ns_STATIC_COMPONENTS = @MCA_ns_STATIC_COMPONENTS@
|
||||
MCA_ns_STATIC_LTLIBS = @MCA_ns_STATIC_LTLIBS@
|
||||
MCA_ns_STATIC_SUBDIRS = @MCA_ns_STATIC_SUBDIRS@
|
||||
MCA_odls_ALL_COMPONENTS = @MCA_odls_ALL_COMPONENTS@
|
||||
MCA_odls_ALL_SUBDIRS = @MCA_odls_ALL_SUBDIRS@
|
||||
MCA_odls_DSO_COMPONENTS = @MCA_odls_DSO_COMPONENTS@
|
||||
MCA_odls_DSO_SUBDIRS = @MCA_odls_DSO_SUBDIRS@
|
||||
MCA_odls_STATIC_COMPONENTS = @MCA_odls_STATIC_COMPONENTS@
|
||||
MCA_odls_STATIC_LTLIBS = @MCA_odls_STATIC_LTLIBS@
|
||||
MCA_odls_STATIC_SUBDIRS = @MCA_odls_STATIC_SUBDIRS@
|
||||
MCA_ompi_FRAMEWORKS = @MCA_ompi_FRAMEWORKS@
|
||||
MCA_ompi_FRAMEWORKS_SUBDIRS = @MCA_ompi_FRAMEWORKS_SUBDIRS@
|
||||
MCA_ompi_FRAMEWORK_COMPONENT_ALL_SUBDIRS = @MCA_ompi_FRAMEWORK_COMPONENT_ALL_SUBDIRS@
|
||||
@ -609,6 +615,14 @@ OMPI_BUILD_common_portals_DSO_FALSE = @OMPI_BUILD_common_portals_DSO_FALSE@
|
||||
OMPI_BUILD_common_portals_DSO_TRUE = @OMPI_BUILD_common_portals_DSO_TRUE@
|
||||
OMPI_BUILD_common_sm_DSO_FALSE = @OMPI_BUILD_common_sm_DSO_FALSE@
|
||||
OMPI_BUILD_common_sm_DSO_TRUE = @OMPI_BUILD_common_sm_DSO_TRUE@
|
||||
OMPI_BUILD_errmgr_bproc_DSO_FALSE = @OMPI_BUILD_errmgr_bproc_DSO_FALSE@
|
||||
OMPI_BUILD_errmgr_bproc_DSO_TRUE = @OMPI_BUILD_errmgr_bproc_DSO_TRUE@
|
||||
OMPI_BUILD_errmgr_hnp_DSO_FALSE = @OMPI_BUILD_errmgr_hnp_DSO_FALSE@
|
||||
OMPI_BUILD_errmgr_hnp_DSO_TRUE = @OMPI_BUILD_errmgr_hnp_DSO_TRUE@
|
||||
OMPI_BUILD_errmgr_orted_DSO_FALSE = @OMPI_BUILD_errmgr_orted_DSO_FALSE@
|
||||
OMPI_BUILD_errmgr_orted_DSO_TRUE = @OMPI_BUILD_errmgr_orted_DSO_TRUE@
|
||||
OMPI_BUILD_errmgr_proxy_DSO_FALSE = @OMPI_BUILD_errmgr_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_errmgr_proxy_DSO_TRUE = @OMPI_BUILD_errmgr_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_gpr_null_DSO_FALSE = @OMPI_BUILD_gpr_null_DSO_FALSE@
|
||||
OMPI_BUILD_gpr_null_DSO_TRUE = @OMPI_BUILD_gpr_null_DSO_TRUE@
|
||||
OMPI_BUILD_gpr_proxy_DSO_FALSE = @OMPI_BUILD_gpr_proxy_DSO_FALSE@
|
||||
@ -651,6 +665,10 @@ OMPI_BUILD_ns_proxy_DSO_FALSE = @OMPI_BUILD_ns_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_ns_proxy_DSO_TRUE = @OMPI_BUILD_ns_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_ns_replica_DSO_FALSE = @OMPI_BUILD_ns_replica_DSO_FALSE@
|
||||
OMPI_BUILD_ns_replica_DSO_TRUE = @OMPI_BUILD_ns_replica_DSO_TRUE@
|
||||
OMPI_BUILD_odls_bproc_DSO_FALSE = @OMPI_BUILD_odls_bproc_DSO_FALSE@
|
||||
OMPI_BUILD_odls_bproc_DSO_TRUE = @OMPI_BUILD_odls_bproc_DSO_TRUE@
|
||||
OMPI_BUILD_odls_default_DSO_FALSE = @OMPI_BUILD_odls_default_DSO_FALSE@
|
||||
OMPI_BUILD_odls_default_DSO_TRUE = @OMPI_BUILD_odls_default_DSO_TRUE@
|
||||
OMPI_BUILD_oob_tcp_DSO_FALSE = @OMPI_BUILD_oob_tcp_DSO_FALSE@
|
||||
OMPI_BUILD_oob_tcp_DSO_TRUE = @OMPI_BUILD_oob_tcp_DSO_TRUE@
|
||||
OMPI_BUILD_osc_pt2pt_DSO_FALSE = @OMPI_BUILD_osc_pt2pt_DSO_FALSE@
|
||||
@ -665,24 +683,18 @@ OMPI_BUILD_paffinity_windows_DSO_FALSE = @OMPI_BUILD_paffinity_windows_DSO_FALSE
|
||||
OMPI_BUILD_paffinity_windows_DSO_TRUE = @OMPI_BUILD_paffinity_windows_DSO_TRUE@
|
||||
OMPI_BUILD_pls_bproc_DSO_FALSE = @OMPI_BUILD_pls_bproc_DSO_FALSE@
|
||||
OMPI_BUILD_pls_bproc_DSO_TRUE = @OMPI_BUILD_pls_bproc_DSO_TRUE@
|
||||
OMPI_BUILD_pls_bproc_orted_DSO_FALSE = @OMPI_BUILD_pls_bproc_orted_DSO_FALSE@
|
||||
OMPI_BUILD_pls_bproc_orted_DSO_TRUE = @OMPI_BUILD_pls_bproc_orted_DSO_TRUE@
|
||||
OMPI_BUILD_pls_fork_DSO_FALSE = @OMPI_BUILD_pls_fork_DSO_FALSE@
|
||||
OMPI_BUILD_pls_fork_DSO_TRUE = @OMPI_BUILD_pls_fork_DSO_TRUE@
|
||||
OMPI_BUILD_pls_gridengine_DSO_FALSE = @OMPI_BUILD_pls_gridengine_DSO_FALSE@
|
||||
OMPI_BUILD_pls_gridengine_DSO_TRUE = @OMPI_BUILD_pls_gridengine_DSO_TRUE@
|
||||
OMPI_BUILD_pls_poe_DSO_FALSE = @OMPI_BUILD_pls_poe_DSO_FALSE@
|
||||
OMPI_BUILD_pls_poe_DSO_TRUE = @OMPI_BUILD_pls_poe_DSO_TRUE@
|
||||
OMPI_BUILD_pls_process_DSO_FALSE = @OMPI_BUILD_pls_process_DSO_FALSE@
|
||||
OMPI_BUILD_pls_process_DSO_TRUE = @OMPI_BUILD_pls_process_DSO_TRUE@
|
||||
OMPI_BUILD_pls_proxy_DSO_FALSE = @OMPI_BUILD_pls_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_pls_proxy_DSO_TRUE = @OMPI_BUILD_pls_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_pls_rsh_DSO_FALSE = @OMPI_BUILD_pls_rsh_DSO_FALSE@
|
||||
OMPI_BUILD_pls_rsh_DSO_TRUE = @OMPI_BUILD_pls_rsh_DSO_TRUE@
|
||||
OMPI_BUILD_pls_slurm_DSO_FALSE = @OMPI_BUILD_pls_slurm_DSO_FALSE@
|
||||
OMPI_BUILD_pls_slurm_DSO_TRUE = @OMPI_BUILD_pls_slurm_DSO_TRUE@
|
||||
OMPI_BUILD_pls_tm_DSO_FALSE = @OMPI_BUILD_pls_tm_DSO_FALSE@
|
||||
OMPI_BUILD_pls_tm_DSO_TRUE = @OMPI_BUILD_pls_tm_DSO_TRUE@
|
||||
OMPI_BUILD_pls_xgrid_DSO_FALSE = @OMPI_BUILD_pls_xgrid_DSO_FALSE@
|
||||
OMPI_BUILD_pls_xgrid_DSO_TRUE = @OMPI_BUILD_pls_xgrid_DSO_TRUE@
|
||||
OMPI_BUILD_pml_cm_DSO_FALSE = @OMPI_BUILD_pml_cm_DSO_FALSE@
|
||||
OMPI_BUILD_pml_cm_DSO_TRUE = @OMPI_BUILD_pml_cm_DSO_TRUE@
|
||||
OMPI_BUILD_pml_dr_DSO_FALSE = @OMPI_BUILD_pml_dr_DSO_FALSE@
|
||||
@ -703,6 +715,8 @@ OMPI_BUILD_ras_lsf_bproc_DSO_FALSE = @OMPI_BUILD_ras_lsf_bproc_DSO_FALSE@
|
||||
OMPI_BUILD_ras_lsf_bproc_DSO_TRUE = @OMPI_BUILD_ras_lsf_bproc_DSO_TRUE@
|
||||
OMPI_BUILD_ras_poe_DSO_FALSE = @OMPI_BUILD_ras_poe_DSO_FALSE@
|
||||
OMPI_BUILD_ras_poe_DSO_TRUE = @OMPI_BUILD_ras_poe_DSO_TRUE@
|
||||
OMPI_BUILD_ras_proxy_DSO_FALSE = @OMPI_BUILD_ras_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_ras_proxy_DSO_TRUE = @OMPI_BUILD_ras_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_ras_slurm_DSO_FALSE = @OMPI_BUILD_ras_slurm_DSO_FALSE@
|
||||
OMPI_BUILD_ras_slurm_DSO_TRUE = @OMPI_BUILD_ras_slurm_DSO_TRUE@
|
||||
OMPI_BUILD_ras_tm_DSO_FALSE = @OMPI_BUILD_ras_tm_DSO_FALSE@
|
||||
@ -715,8 +729,12 @@ OMPI_BUILD_rcache_vma_DSO_FALSE = @OMPI_BUILD_rcache_vma_DSO_FALSE@
|
||||
OMPI_BUILD_rcache_vma_DSO_TRUE = @OMPI_BUILD_rcache_vma_DSO_TRUE@
|
||||
OMPI_BUILD_rds_hostfile_DSO_FALSE = @OMPI_BUILD_rds_hostfile_DSO_FALSE@
|
||||
OMPI_BUILD_rds_hostfile_DSO_TRUE = @OMPI_BUILD_rds_hostfile_DSO_TRUE@
|
||||
OMPI_BUILD_rds_proxy_DSO_FALSE = @OMPI_BUILD_rds_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_rds_proxy_DSO_TRUE = @OMPI_BUILD_rds_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_rds_resfile_DSO_FALSE = @OMPI_BUILD_rds_resfile_DSO_FALSE@
|
||||
OMPI_BUILD_rds_resfile_DSO_TRUE = @OMPI_BUILD_rds_resfile_DSO_TRUE@
|
||||
OMPI_BUILD_rmaps_proxy_DSO_FALSE = @OMPI_BUILD_rmaps_proxy_DSO_FALSE@
|
||||
OMPI_BUILD_rmaps_proxy_DSO_TRUE = @OMPI_BUILD_rmaps_proxy_DSO_TRUE@
|
||||
OMPI_BUILD_rmaps_round_robin_DSO_FALSE = @OMPI_BUILD_rmaps_round_robin_DSO_FALSE@
|
||||
OMPI_BUILD_rmaps_round_robin_DSO_TRUE = @OMPI_BUILD_rmaps_round_robin_DSO_TRUE@
|
||||
OMPI_BUILD_rmgr_cnos_DSO_FALSE = @OMPI_BUILD_rmgr_cnos_DSO_FALSE@
|
||||
@ -971,6 +989,9 @@ common_portals_CPPFLAGS = @common_portals_CPPFLAGS@
|
||||
common_portals_LDFLAGS = @common_portals_LDFLAGS@
|
||||
common_portals_LIBS = @common_portals_LIBS@
|
||||
datadir = @datadir@
|
||||
errmgr_bproc_CPPFLAGS = @errmgr_bproc_CPPFLAGS@
|
||||
errmgr_bproc_LDFLAGS = @errmgr_bproc_LDFLAGS@
|
||||
errmgr_bproc_LIBS = @errmgr_bproc_LIBS@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
@ -1017,21 +1038,19 @@ mtl_psm_CFLAGS = @mtl_psm_CFLAGS@
|
||||
mtl_psm_CPPFLAGS = @mtl_psm_CPPFLAGS@
|
||||
mtl_psm_LDFLAGS = @mtl_psm_LDFLAGS@
|
||||
mtl_psm_LIBS = @mtl_psm_LIBS@
|
||||
odls_bproc_CPPFLAGS = @odls_bproc_CPPFLAGS@
|
||||
odls_bproc_LDFLAGS = @odls_bproc_LDFLAGS@
|
||||
odls_bproc_LIBS = @odls_bproc_LIBS@
|
||||
oldincludedir = @oldincludedir@
|
||||
pls_bproc_CPPFLAGS = @pls_bproc_CPPFLAGS@
|
||||
pls_bproc_LDFLAGS = @pls_bproc_LDFLAGS@
|
||||
pls_bproc_LIBS = @pls_bproc_LIBS@
|
||||
pls_bproc_orted_CPPFLAGS = @pls_bproc_orted_CPPFLAGS@
|
||||
pls_bproc_orted_LDFLAGS = @pls_bproc_orted_LDFLAGS@
|
||||
pls_bproc_orted_LIBS = @pls_bproc_orted_LIBS@
|
||||
pls_slurm_CPPFLAGS = @pls_slurm_CPPFLAGS@
|
||||
pls_slurm_LDFLAGS = @pls_slurm_LDFLAGS@
|
||||
pls_slurm_LIBS = @pls_slurm_LIBS@
|
||||
pls_tm_CPPFLAGS = @pls_tm_CPPFLAGS@
|
||||
pls_tm_LDFLAGS = @pls_tm_LDFLAGS@
|
||||
pls_tm_LIBS = @pls_tm_LIBS@
|
||||
pls_xgrid_LDFLAGS = @pls_xgrid_LDFLAGS@
|
||||
pls_xgrid_OBJCFLAGS = @pls_xgrid_OBJCFLAGS@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
ras_bjs_CPPFLAGS = @ras_bjs_CPPFLAGS@
|
||||
|
@ -24,7 +24,7 @@
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
@ -117,9 +117,9 @@ int mca_pml_base_select(bool enable_progress_threads,
|
||||
if( NULL == best_component ) {
|
||||
opal_show_help("help-mca-base.txt", "find-available:none-found", true, "pml");
|
||||
if( NULL != mca_pml_base_pml ) {
|
||||
orte_abort( 1, "PML %s cannot be selected", mca_pml_base_pml );
|
||||
orte_errmgr.error_detected(1, "PML %s cannot be selected", mca_pml_base_pml, NULL);
|
||||
} else {
|
||||
orte_abort(1, "No pml component available. This shouldn't happen.");
|
||||
orte_errmgr.error_detected(2, "No pml component available. This shouldn't happen.", NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -40,6 +40,8 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
@ -50,6 +52,7 @@
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
static
|
||||
int
|
||||
abort_procs(ompi_proc_t **procs, int proc_count,
|
||||
@ -66,14 +69,14 @@ abort_procs(ompi_proc_t **procs, int proc_count,
|
||||
}
|
||||
if (jobid == my_jobid) continue;
|
||||
|
||||
killret = orte_rmgr.terminate_job(jobid);
|
||||
killret = orte_pls.terminate_job(jobid);
|
||||
|
||||
if (OMPI_SUCCESS != killret) ret = killret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int
|
||||
ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
@ -143,7 +146,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
|
||||
/* BWB - XXX - Should probably publish the error code somewhere */
|
||||
|
||||
#if 0
|
||||
/* Kill everyone in the job. We may make this better someday to
|
||||
actually loop over ompi_rte_kill_proc() to only kill the procs
|
||||
in comm, and additionally to somehow use errorcode. */
|
||||
@ -167,7 +170,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
comm->c_local_group->grp_proc_count,
|
||||
my_jobid);
|
||||
|
||||
ret = orte_rmgr.terminate_job(my_jobid);
|
||||
ret = orte_pls.terminate_job(my_jobid);
|
||||
|
||||
if (OMPI_SUCCESS == ret) {
|
||||
while (1) {
|
||||
@ -188,6 +191,12 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
just exit and let it become Somebody Elses Problem. */
|
||||
exit(errcode);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* tell the error manager we detected an error - OpenRTE
|
||||
* will take care of cleaning up for us
|
||||
*/
|
||||
orte_errmgr.error_detected(errcode, "MPI_Abort has been called", NULL);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -221,7 +221,7 @@ void ompi_info::open_components()
|
||||
component_map["rml"] = &orte_rml_base.rml_components;
|
||||
|
||||
orte_pls_base_open();
|
||||
component_map["pls"] = &orte_pls_base.pls_opened;
|
||||
component_map["pls"] = &orte_pls_base.available_components;
|
||||
|
||||
orte_sds_base_open();
|
||||
component_map["sds"] = &orte_sds_base_components_available;
|
||||
|
@ -101,9 +101,9 @@ typedef void* ompi_iov_base_ptr_t;
|
||||
*/
|
||||
|
||||
#if defined(HAVE_SOCKLEN_T)
|
||||
typedef socklen_t ompi_socklen_t;
|
||||
typedef socklen_t opal_socklen_t;
|
||||
#else
|
||||
typedef int ompi_socklen_t;
|
||||
typedef int opal_socklen_t;
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -122,7 +122,7 @@ static int make_mask(unsigned int *len, unsigned long **mask)
|
||||
|
||||
linux_module_get_num_procs(&num_procs);
|
||||
*len = num_procs / 8;
|
||||
if (*len != num_procs * 8) {
|
||||
if (*len != (unsigned int)num_procs * 8) {
|
||||
++*len;
|
||||
}
|
||||
|
||||
|
@ -22,11 +22,12 @@
|
||||
#if HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/dss/dss_internal.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
static void orte_dss_arith_int(int *value, int *operand, orte_dss_arith_op_t operation);
|
||||
static void orte_dss_arith_uint(uint *value, uint *operand, orte_dss_arith_op_t operation);
|
||||
|
@ -232,16 +232,6 @@ int orte_dss_compare_dt(orte_data_type_t *value1, orte_data_type_t *value2, orte
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
/* ORTE_DAEMON_CMD */
|
||||
int orte_dss_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type)
|
||||
{
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
/* ORTE_DATA_VALUE */
|
||||
int orte_dss_compare_data_value(orte_data_value_t *value1, orte_data_value_t *value2, orte_data_type_t type)
|
||||
{
|
||||
|
@ -114,10 +114,6 @@ int orte_dss_std_copy(void **dest, void *src, orte_data_type_t type)
|
||||
datasize = sizeof(orte_data_type_t);
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_CMD:
|
||||
datasize = sizeof(orte_daemon_cmd_flag_t);
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);
|
||||
return ORTE_ERR_UNKNOWN_DATA_TYPE;
|
||||
|
@ -112,6 +112,55 @@ extern "C" {
|
||||
#error Unsupported pid_t size!
|
||||
#endif
|
||||
|
||||
/* Unpack generic size macros */
|
||||
#define UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret) \
|
||||
do { \
|
||||
switch(remote_type) { \
|
||||
case ORTE_UINT8: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint8_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT8: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int8_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT16: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint16_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT16: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int16_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT32: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint32_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT32: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int32_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT64: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint64_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT64: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int64_t, remote_type); \
|
||||
break; \
|
||||
default: \
|
||||
ret = ORTE_ERR_NOT_FOUND; \
|
||||
ORTE_ERROR_LOG(ret); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* NOTE: do not need to deal with endianness here, as the unpacking of
|
||||
the underlying sender-side type will do that for us. Repeat: the
|
||||
data in tmpbuf[] is already in host byte order. */
|
||||
#define UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype) \
|
||||
do { \
|
||||
orte_std_cntr_t i; \
|
||||
tmptype *tmpbuf = (tmptype*)malloc(sizeof(tmptype) * (*num_vals)); \
|
||||
ret = orte_dss_unpack_buffer(buffer, tmpbuf, num_vals, tmpdsstype); \
|
||||
for (i = 0 ; i < *num_vals ; ++i) { \
|
||||
((unpack_type*) dest)[i] = (unpack_type)(tmpbuf[i]); \
|
||||
} \
|
||||
free(tmpbuf); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
* Internal struct used for holding registered dss functions
|
||||
*/
|
||||
@ -256,9 +305,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
|
||||
int orte_dss_pack_data_type(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_dss_pack_daemon_cmd(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_dss_pack_data_value(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
@ -301,9 +347,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
|
||||
int orte_dss_unpack_data_type(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_dss_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_dss_unpack_data_value(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
@ -360,8 +403,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
|
||||
|
||||
int orte_dss_compare_dt(orte_data_type_t *value1, orte_data_type_t *value2, orte_data_type_t type);
|
||||
|
||||
int orte_dss_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type);
|
||||
|
||||
int orte_dss_compare_data_value(orte_data_value_t *value1, orte_data_value_t *value2, orte_data_type_t type);
|
||||
|
||||
int orte_dss_compare_byte_object(orte_byte_object_t *value1, orte_byte_object_t *value2, orte_data_type_t type);
|
||||
@ -405,7 +446,6 @@ extern orte_data_type_t orte_dss_num_reg_types;
|
||||
int orte_dss_print_null(char **output, char *prefix, void *src, orte_data_type_t type);
|
||||
int orte_dss_print_std_cntr(char **output, char *prefix, orte_std_cntr_t *src, orte_data_type_t type);
|
||||
int orte_dss_print_data_type(char **output, char *prefix, orte_data_type_t *src, orte_data_type_t type);
|
||||
int orte_dss_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
|
||||
int orte_dss_print_data_value(char **output, char *prefix, orte_data_value_t *src, orte_data_type_t type);
|
||||
int orte_dss_print_byte_object(char **output, char *prefix, orte_byte_object_t *src, orte_data_type_t type);
|
||||
|
||||
|
@ -426,19 +426,6 @@ int orte_dss_open(void)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
tmp = ORTE_DAEMON_CMD;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_dss_pack_daemon_cmd,
|
||||
orte_dss_unpack_daemon_cmd,
|
||||
(orte_dss_copy_fn_t)orte_dss_std_copy,
|
||||
(orte_dss_compare_fn_t)orte_dss_compare_daemon_cmd,
|
||||
(orte_dss_size_fn_t)orte_dss_std_size,
|
||||
(orte_dss_print_fn_t)orte_dss_print_daemon_cmd,
|
||||
(orte_dss_release_fn_t)orte_dss_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_DATA_TYPE", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
tmp = ORTE_BYTE_OBJECT;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_dss_pack_byte_object,
|
||||
orte_dss_unpack_byte_object,
|
||||
|
@ -417,22 +417,6 @@ int orte_dss_pack_data_value(orte_buffer_t *buffer, void *src, orte_std_cntr_t n
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_dss_pack_daemon_cmd(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Turn around and pack the real type */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_BYTE_OBJECT
|
||||
*/
|
||||
|
@ -475,28 +475,6 @@ int orte_dss_print_data_value(char **output, char *prefix, orte_data_value_t *sr
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_dss_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
|
||||
{
|
||||
char *prefx;
|
||||
|
||||
/* deal with NULL prefix */
|
||||
if (NULL == prefix) asprintf(&prefx, " ");
|
||||
else prefx = prefix;
|
||||
|
||||
/* if src is NULL, just print data type and return */
|
||||
if (NULL == src) {
|
||||
asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: NULL pointer", prefx);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: %lu", prefx, (unsigned long) *src);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_BYTE_OBJECT
|
||||
*/
|
||||
|
@ -109,10 +109,6 @@ int orte_dss_std_size(size_t *size, void *src, orte_data_type_t type)
|
||||
*size = sizeof(orte_data_type_t);
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_CMD:
|
||||
*size = sizeof(orte_daemon_cmd_flag_t);
|
||||
break;
|
||||
|
||||
default:
|
||||
*size = 0;
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);
|
||||
|
@ -31,54 +31,6 @@
|
||||
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#define UNPACK_SIZE_MISMATCH(unpack_type, remote_type, ret) \
|
||||
do { \
|
||||
switch(remote_type) { \
|
||||
case ORTE_UINT8: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint8_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT8: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int8_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT16: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint16_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT16: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int16_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT32: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint32_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT32: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int32_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_UINT64: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, uint64_t, remote_type); \
|
||||
break; \
|
||||
case ORTE_INT64: \
|
||||
UNPACK_SIZE_MISMATCH_FOUND(unpack_type, int64_t, remote_type); \
|
||||
break; \
|
||||
default: \
|
||||
ret = ORTE_ERR_NOT_FOUND; \
|
||||
ORTE_ERROR_LOG(ret); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/* NOTE: do not need to deal with endianness here, as the unpacking of
|
||||
the underlying sender-side type will do that for us. Repeat: the
|
||||
data in tmpbuf[] is already in host byte order. */
|
||||
#define UNPACK_SIZE_MISMATCH_FOUND(unpack_type, tmptype, tmpdsstype) \
|
||||
do { \
|
||||
orte_std_cntr_t i; \
|
||||
tmptype *tmpbuf = (tmptype*)malloc(sizeof(tmptype) * (*num_vals)); \
|
||||
ret = orte_dss_unpack_buffer(buffer, tmpbuf, num_vals, tmpdsstype); \
|
||||
for (i = 0 ; i < *num_vals ; ++i) { \
|
||||
((unpack_type*) dest)[i] = (unpack_type)(tmpbuf[i]); \
|
||||
} \
|
||||
free(tmpbuf); \
|
||||
} while (0)
|
||||
|
||||
|
||||
int orte_dss_unpack(orte_buffer_t *buffer, void *dst, orte_std_cntr_t *num_vals,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
@ -603,49 +555,6 @@ int orte_dss_unpack_data_type(orte_buffer_t *buffer, void *dest, orte_std_cntr_t
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_dss_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
orte_data_type_t remote_type;
|
||||
|
||||
/* if the buffer is fully described, then we can do some magic to handle
|
||||
* the heterogeneous case. if not, then we can only shoot blind - it is the
|
||||
* user's responsibility to ensure we are in a homogeneous environment.
|
||||
*/
|
||||
if (ORTE_DSS_BUFFER_FULLY_DESC == buffer->type) {
|
||||
/* see what type was actually packed */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_peek_type(buffer, &remote_type))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (remote_type == ORTE_DAEMON_CMD_T) {
|
||||
/* fast path it if the sizes are the same */
|
||||
/* Turn around and unpack the real type */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
} else {
|
||||
/* slow path - types are different sizes */
|
||||
UNPACK_SIZE_MISMATCH(orte_daemon_cmd_flag_t, remote_type, ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* if we get here, then this buffer is NOT fully described. just unpack it
|
||||
* using the local size - user gets the pain if it's wrong
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ORTE_DATA_VALUE
|
||||
*/
|
||||
|
@ -46,10 +46,6 @@ typedef struct {
|
||||
uint8_t *bytes;
|
||||
} orte_byte_object_t;
|
||||
|
||||
/* define the orted command flag type */
|
||||
typedef uint16_t orte_daemon_cmd_flag_t;
|
||||
#define ORTE_DAEMON_CMD_T ORTE_UINT16
|
||||
|
||||
/**
|
||||
* handle differences in iovec
|
||||
*/
|
||||
|
@ -17,10 +17,12 @@
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/errmgr_private.h \
|
||||
base/base.h
|
||||
|
||||
libmca_errmgr_la_SOURCES += \
|
||||
base/errmgr_base_close.c \
|
||||
base/errmgr_base_receive.c \
|
||||
base/errmgr_base_select.c \
|
||||
base/errmgr_base_open.c \
|
||||
base/errmgr_base_fns.c
|
||||
|
@ -48,26 +48,9 @@ extern "C" {
|
||||
* function definitions
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_errmgr_base_open(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_select(bool *allow_multi_user_threads,
|
||||
bool *have_hidden_threads);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_select(void);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_close(void);
|
||||
|
||||
/*
|
||||
* Base functions that are common to all implementations - can be overridden
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_proc_aborted(orte_process_name_t *proc);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_incomplete_start(orte_jobid_t job);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_error_detected(int error_code);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_register_job(orte_jobid_t job);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_abort(void);
|
||||
|
||||
/*
|
||||
* globals that might be needed
|
||||
*/
|
||||
@ -78,6 +61,8 @@ ORTE_DECLSPEC extern bool orte_errmgr_initialized;
|
||||
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
|
||||
ORTE_DECLSPEC extern mca_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
|
||||
/* make the default module available so that close can use it */
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default;
|
||||
/*
|
||||
* external API functions will be documented in the mca/errmgr/errmgr.h file
|
||||
*/
|
||||
|
@ -24,6 +24,8 @@
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
|
||||
|
||||
@ -44,6 +46,10 @@ int orte_errmgr_base_close(void)
|
||||
&orte_errmgr_base_components_available, NULL);
|
||||
|
||||
orte_errmgr_initialized = false;
|
||||
|
||||
/* set the module back to the default so that error logging can continue */
|
||||
orte_errmgr = orte_errmgr_default;
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -23,18 +23,14 @@
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
@ -49,55 +45,37 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
}
|
||||
/* orte_errmgr_base_error_detected(error_code); */
|
||||
}
|
||||
|
||||
void orte_errmgr_base_proc_aborted(orte_process_name_t *proc)
|
||||
int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
orte_jobid_t job;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&job, proc))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
orte_rmgr.terminate_job(job);
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
void orte_errmgr_base_incomplete_start(orte_jobid_t job)
|
||||
int orte_errmgr_base_incomplete_start_not_avail(orte_gpr_notify_message_t *msgb)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
orte_rmgr.terminate_job(job);
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
void orte_errmgr_base_error_detected(int error_code)
|
||||
void orte_errmgr_base_error_detected(int error_code, char *fmt, ...)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* we can't know if any output is available yet, so
|
||||
* we just exit */
|
||||
exit(error_code);
|
||||
}
|
||||
|
||||
void orte_errmgr_base_abort()
|
||||
void orte_errmgr_base_abort(void)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* kill and reap all children */
|
||||
orte_wait_kill(9);
|
||||
|
||||
/* abnormal exit */
|
||||
orte_abort(-1, NULL);
|
||||
/* guess we should exit */
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int orte_errmgr_base_register_job(orte_jobid_t job)
|
||||
int orte_errmgr_base_register_job_not_avail(orte_jobid_t job)
|
||||
{
|
||||
/* register subscription for process_status values
|
||||
* changing to abnormal termination codes
|
||||
*/
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_abort_procs_request_not_avail(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
|
||||
/*
|
||||
@ -45,14 +46,22 @@
|
||||
* Global variables
|
||||
*/
|
||||
int orte_errmgr_base_output = -1;
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
/*
|
||||
* we must define a default module so that the error logging
|
||||
* functions can be available as early as possible
|
||||
*/
|
||||
orte_errmgr_base_module_t orte_errmgr_default = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_proc_aborted,
|
||||
orte_errmgr_base_incomplete_start,
|
||||
orte_errmgr_base_proc_aborted_not_avail,
|
||||
orte_errmgr_base_incomplete_start_not_avail,
|
||||
orte_errmgr_base_error_detected,
|
||||
orte_errmgr_base_register_job,
|
||||
orte_errmgr_base_abort
|
||||
orte_errmgr_base_register_job_not_avail,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_procs_request_not_avail
|
||||
};
|
||||
/* start out with a default module */
|
||||
orte_errmgr_base_module_t orte_errmgr;
|
||||
|
||||
bool orte_errmgr_base_selected = false;
|
||||
opal_list_t orte_errmgr_base_components_available;
|
||||
mca_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
@ -81,6 +90,9 @@ int orte_errmgr_base_open(void)
|
||||
orte_errmgr_base_output = -1;
|
||||
}
|
||||
|
||||
/* set the default module */
|
||||
orte_errmgr = orte_errmgr_default;
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
|
162
orte/mca/errmgr/base/errmgr_base_receive.c
Обычный файл
162
orte/mca/errmgr/base/errmgr_base_receive.c
Обычный файл
@ -0,0 +1,162 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
static bool recv_issued=false;
|
||||
|
||||
int orte_errmgr_base_comm_start(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (recv_issued) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY,
|
||||
ORTE_RML_TAG_ERRMGR,
|
||||
ORTE_RML_PERSISTENT,
|
||||
orte_errmgr_base_recv,
|
||||
NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
recv_issued = true;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_comm_stop(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (!recv_issued) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_ERRMGR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
recv_issued = false;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* handle message from proxies
|
||||
* NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program.
|
||||
* DO NOT RELEASE THIS BUFFER IN THIS CODE
|
||||
*/
|
||||
|
||||
void orte_errmgr_base_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
orte_buffer_t answer;
|
||||
orte_errmgr_cmd_flag_t command;
|
||||
orte_std_cntr_t count, nprocs;
|
||||
orte_process_name_t *procs;
|
||||
int rc;
|
||||
|
||||
/* get the command */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
/* setup to return an answer */
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
|
||||
/* pack the command in the answer - this is done to allow the caller to check
|
||||
* that we are talking about the same command
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD:
|
||||
/* get the number of processes */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &nprocs, &count, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
/* get the required space */
|
||||
procs = (orte_process_name_t*)malloc(nprocs * sizeof(orte_process_name_t));
|
||||
if (NULL == procs) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
/* unpack the array of process names */
|
||||
count = nprocs;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &procs, &count, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
/* if we didn't get the number we requested, then something is wrong */
|
||||
if (count != nprocs) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
|
||||
/* process the request */
|
||||
if (ORTE_SUCCESS != (rc = orte_errmgr.abort_procs_request(procs, nprocs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto SEND_ANSWER;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
}
|
||||
|
||||
SEND_ANSWER:
|
||||
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
OBJ_DESTRUCT(&answer);
|
||||
}
|
||||
|
@ -29,8 +29,7 @@
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
*/
|
||||
int orte_errmgr_base_select(bool *allow_multi_user_threads,
|
||||
bool *have_hidden_threads)
|
||||
int orte_errmgr_base_select(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
mca_base_component_list_item_t *cli;
|
||||
@ -71,11 +70,9 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
|
||||
|
||||
best_module = module;
|
||||
best_component = component;
|
||||
*allow_multi_user_threads = multi;
|
||||
*have_hidden_threads = hidden;
|
||||
|
||||
/* update the best priority */
|
||||
best_priority = priority;
|
||||
/* update the best priority */
|
||||
best_priority = priority;
|
||||
}
|
||||
|
||||
/* If it's not the best one, finalize it */
|
||||
@ -86,10 +83,10 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
|
||||
}
|
||||
}
|
||||
|
||||
/* If we didn't find one to select, that's okay - stick with default */
|
||||
/* If we didn't find one to select, then we have a big problem */
|
||||
|
||||
if (NULL == best_component) {
|
||||
return ORTE_SUCCESS;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* We have happiness -- save the component and module for later
|
||||
@ -98,7 +95,7 @@ int orte_errmgr_base_select(bool *allow_multi_user_threads,
|
||||
orte_errmgr = *best_module;
|
||||
orte_errmgr_base_selected_component = *best_component;
|
||||
orte_errmgr_base_selected = true;
|
||||
|
||||
|
||||
/* all done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
82
orte/mca/errmgr/base/errmgr_private.h
Обычный файл
82
orte/mca/errmgr/base/errmgr_private.h
Обычный файл
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_ERRMGR_PRIVATE_H
|
||||
#define ORTE_MCA_ERRMGR_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
|
||||
|
||||
/*
|
||||
* Functions for use solely within the ERRMGR framework
|
||||
*/
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Define the ERRMGR command flag */
|
||||
typedef uint8_t orte_errmgr_cmd_flag_t;
|
||||
#define ORTE_ERRMGR_CMD ORTE_UINT8
|
||||
|
||||
/* define some commands */
|
||||
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
|
||||
|
||||
/* Internal support */
|
||||
int orte_errmgr_base_comm_start(void);
|
||||
int orte_errmgr_base_comm_stop(void);
|
||||
void orte_errmgr_base_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
|
||||
/*
|
||||
* Base functions
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_incomplete_start_not_avail(orte_gpr_notify_message_t *msg);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_error_detected(int error_code, char *fmt, ...);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_register_job_not_avail(orte_jobid_t job);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_abort(void);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_abort_procs_request_not_avail(orte_process_name_t *procs, orte_std_cntr_t num_procs);
|
||||
|
||||
/*
|
||||
* external API functions will be documented in the mca/errmgr/errmgr.h file
|
||||
*/
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
51
orte/mca/errmgr/bproc/Makefile.am
Обычный файл
51
orte/mca/errmgr/bproc/Makefile.am
Обычный файл
@ -0,0 +1,51 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(errmgr_bproc_CPPFLAGS)
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_bproc_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_bproc.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_bproc.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
sources = \
|
||||
errmgr_bproc.h \
|
||||
errmgr_bproc.c \
|
||||
errmgr_bproc_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_bproc_la_SOURCES = $(sources)
|
||||
mca_errmgr_bproc_la_LIBADD = \
|
||||
$(errmgr_bproc_LIBS) \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_errmgr_bproc_la_LDFLAGS = -module -avoid-version $(errmgr_bproc_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_bproc_la_SOURCES = $(sources)
|
||||
libmca_errmgr_bproc_la_LIBADD = $(errmgr_bproc_LIBS)
|
||||
libmca_errmgr_bproc_la_LDFLAGS = -module -avoid-version $(errmgr_bproc_LDFLAGS)
|
@ -17,22 +17,22 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_pls_bproc_orted_CONFIG([action-if-found], [action-if-not-found])
|
||||
# MCA_errmgr_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_bproc_orted_CONFIG],[
|
||||
OMPI_CHECK_BPROC([pls_bproc_orted], [pls_bproc_orted_good=1],
|
||||
[pls_bproc_orted_good=1], [pls_bproc_orted_good=0])
|
||||
AC_DEFUN([MCA_errmgr_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([errmgr_bproc], [errmgr_bproc_good=1],
|
||||
[errmgr_bproc_good=1], [errmgr_bproc_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$pls_bproc_orted_good" = "1"],
|
||||
[pls_bproc_orted_WRAPPER_EXTRA_LDFLAGS="$pls_bproc_orted_LDFLAGS"
|
||||
pls_bproc_orted_WRAPPER_EXTRA_LIBS="$pls_bproc_orted_LIBS"
|
||||
AS_IF([test "$errmgr_bproc_good" = "1"],
|
||||
[errmgr_bproc_WRAPPER_EXTRA_LDFLAGS="$errmgr_bproc_LDFLAGS"
|
||||
errmgr_bproc_WRAPPER_EXTRA_LIBS="$errmgr_bproc_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([pls_bproc_orted_CPPFLAGS])
|
||||
AC_SUBST([pls_bproc_orted_LDFLAGS])
|
||||
AC_SUBST([pls_bproc_orted_LIBS])
|
||||
AC_SUBST([errmgr_bproc_CPPFLAGS])
|
||||
AC_SUBST([errmgr_bproc_LDFLAGS])
|
||||
AC_SUBST([errmgr_bproc_LIBS])
|
||||
])dnl
|
23
orte/mca/errmgr/bproc/configure.params
Обычный файл
23
orte/mca/errmgr/bproc/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=errmgr_bproc_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
223
orte/mca/errmgr/bproc/errmgr_bproc.c
Обычный файл
223
orte/mca/errmgr/bproc/errmgr_bproc.c
Обычный файл
@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "orte/mca/errmgr/bproc/errmgr_bproc.h"
|
||||
|
||||
/*
|
||||
* This function gets called when the SMR updates a process state to
|
||||
* indicate that it aborted. Since the bproc component is only active on
|
||||
* non-HNP processes, this function will NEVER be called
|
||||
*/
|
||||
int orte_errmgr_bproc_proc_aborted(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when the SMR updates a process state to
|
||||
* indicate that it failed to start. Since the bproc component is only active on
|
||||
* non-HNP processes, this function will NEVER be called
|
||||
*/
|
||||
int orte_errmgr_bproc_incomplete_start(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when a process detects an internal error.
|
||||
* Bproc is unusually bad about letting us pass information that we
|
||||
* aborted as opposed to normally terminated. There is no way to locally
|
||||
* monitor the process state on a remote node, so the only thing we
|
||||
* can do is pass the info back to the Bproc PLS on the HNP and let it
|
||||
* figure out what to do.
|
||||
*/
|
||||
void orte_errmgr_bproc_error_detected(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
orte_buffer_t* cmd;
|
||||
uint8_t command;
|
||||
int rc;
|
||||
|
||||
/* If there was a message, output it */
|
||||
va_start(arglist, fmt);
|
||||
if( NULL != fmt ) {
|
||||
char* buffer = NULL;
|
||||
vasprintf( &buffer, fmt, arglist );
|
||||
opal_output( 0, buffer );
|
||||
free( buffer );
|
||||
}
|
||||
va_end(arglist);
|
||||
|
||||
/* Now prepare and send a message to the BProc PLS so it knows that
|
||||
* we abnormally terminated. It doesn't matter what is in the
|
||||
* message - the fact that it gets received is adequate
|
||||
*/
|
||||
command = 0x01;
|
||||
|
||||
cmd = OBJ_NEW(orte_buffer_t);
|
||||
if (cmd == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
|
||||
/* just pack something */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return;
|
||||
}
|
||||
|
||||
/* send the alert */
|
||||
if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, cmd, ORTE_RML_TAG_BPROC_ABORT, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(cmd);
|
||||
|
||||
/* okay, now we can truly abort. Tell the abort function not to bother writing out
|
||||
* an abort file - we can't do anything with it anyway!
|
||||
*/
|
||||
orte_abort(error_code, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when a process desperately needs to just die.
|
||||
* Nothing can be done by definition here - this function ONLY gets
|
||||
* called as an absolute last resort.
|
||||
*/
|
||||
void orte_errmgr_bproc_abort()
|
||||
{
|
||||
/* abnormal exit - no point in writing out an abort file as bproc doesn't
|
||||
* know what to do with it anyway
|
||||
*/
|
||||
orte_abort(-1, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Alternatively, some systems (e.g., OpenMPI) need to tell us to kill
|
||||
* some other subset of processes along with us. Send that info to the
|
||||
* HNP so it can kill them.
|
||||
*
|
||||
* NOTE: this function assumes that the underlying ORTE infrastructure is
|
||||
* still operational. Use of this function should therefore be restricted
|
||||
* to cases where the problem is in a higher layer (e.g., MPI) as the
|
||||
* process is likely to "hang" if an ORTE problem has been encountered.
|
||||
*/
|
||||
int orte_errmgr_bproc_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_errmgr_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
/* protect us against error */
|
||||
if (NULL == procs) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
command = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
|
||||
|
||||
cmd = OBJ_NEW(orte_buffer_t);
|
||||
if (cmd == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of procs we are requesting be aborted */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &nprocs, 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the array of proc names */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, procs, nprocs, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send the request */
|
||||
if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, cmd, ORTE_RML_TAG_RDS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_RELEASE(cmd);
|
||||
|
||||
/* setup a buffer for the answer */
|
||||
answer = OBJ_NEW(orte_buffer_t);
|
||||
if(answer == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* enter a blocking receive until we hear back */
|
||||
if (0 > orte_rml.recv_buffer(orte_errmgr_bproc_globals.replica, answer, ORTE_RML_TAG_RDS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* check that this is the right command */
|
||||
if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != command) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
/* clean up and leave */
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is imperative that ONLY an HNP perform this registration!
|
||||
*/
|
||||
int orte_errmgr_bproc_register_job(orte_jobid_t job)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
81
orte/mca/errmgr/bproc/errmgr_bproc.h
Обычный файл
81
orte/mca/errmgr/bproc/errmgr_bproc.h
Обычный файл
@ -0,0 +1,81 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef ORTE_ERRMGR_BPROC_H
|
||||
#define ORTE_ERRMGR_BPROC_H
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_errmgr_bproc_open(void);
|
||||
int orte_errmgr_bproc_close(void);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
|
||||
|
||||
int orte_errmgr_bproc_finalize(void);
|
||||
|
||||
/*
|
||||
* globals used within the component
|
||||
*/
|
||||
typedef struct {
|
||||
int debug;
|
||||
orte_process_name_t *replica;
|
||||
} orte_errmgr_bproc_globals_t;
|
||||
|
||||
|
||||
extern orte_errmgr_bproc_globals_t orte_errmgr_bproc_globals;
|
||||
|
||||
/*
|
||||
* Component API functions
|
||||
*/
|
||||
int orte_errmgr_bproc_proc_aborted(orte_gpr_notify_message_t *msg);
|
||||
|
||||
int orte_errmgr_bproc_incomplete_start(orte_gpr_notify_message_t *msg);
|
||||
|
||||
void orte_errmgr_bproc_error_detected(int error_code, char *fmt, ...);
|
||||
|
||||
void orte_errmgr_bproc_abort(void);
|
||||
|
||||
int orte_errmgr_bproc_register_job(orte_jobid_t job);
|
||||
|
||||
int orte_errmgr_bproc_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
164
orte/mca/errmgr/bproc/errmgr_bproc_component.c
Обычный файл
164
orte/mca/errmgr/bproc/errmgr_bproc_component.c
Обычный файл
@ -0,0 +1,164 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
* The OpenRTE Error Manager (ERRMGR) - BProc component
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_bproc.h"
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
mca_errmgr_base_component_t mca_errmgr_bproc_component = {
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_1_3_0,
|
||||
|
||||
"bproc", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_errmgr_bproc_open, /* module open */
|
||||
orte_errmgr_bproc_close /* module close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_errmgr_bproc_component_init, /* module init */
|
||||
orte_errmgr_bproc_finalize /* module shutdown */
|
||||
};
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
static orte_errmgr_base_module_t orte_errmgr_bproc = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_bproc_proc_aborted,
|
||||
orte_errmgr_bproc_incomplete_start,
|
||||
orte_errmgr_bproc_error_detected,
|
||||
orte_errmgr_bproc_register_job,
|
||||
orte_errmgr_bproc_abort,
|
||||
orte_errmgr_bproc_abort_procs_request
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Whether or not we allowed this component to be selected
|
||||
*/
|
||||
static bool initialized = false;
|
||||
|
||||
/* local globals */
|
||||
orte_errmgr_bproc_globals_t orte_errmgr_bproc_globals;
|
||||
|
||||
/*
|
||||
* Open the component
|
||||
*/
|
||||
int orte_errmgr_bproc_open(void)
|
||||
{
|
||||
int id, tmp;
|
||||
|
||||
id = mca_base_param_register_int("errmgr", "bproc", "debug", NULL, 0);
|
||||
mca_base_param_lookup_int(id, &tmp);
|
||||
if (tmp) {
|
||||
orte_errmgr_bproc_globals.debug = true;
|
||||
} else {
|
||||
orte_errmgr_bproc_globals.debug = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the component
|
||||
*/
|
||||
int orte_errmgr_bproc_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
|
||||
int *priority)
|
||||
{
|
||||
if (orte_errmgr_bproc_globals.debug) {
|
||||
opal_output(0, "errmgr_bproc_init called");
|
||||
}
|
||||
|
||||
/* If we are an HNP or an orted, then don't pick us! */
|
||||
if (orte_process_info.seed || orte_process_info.daemon) {
|
||||
/* don't take me! */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Return a module (choose an arbitrary, positive priority --
|
||||
absolutely must be higher than the proxy component
|
||||
*/
|
||||
|
||||
*priority = 100;
|
||||
|
||||
/* no part of OpenRTE allows or has threads */
|
||||
|
||||
*allow_multi_user_threads = false;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
/* define the replica for us to use - for now, just point
|
||||
* to the name service replica
|
||||
*/
|
||||
orte_errmgr_bproc_globals.replica = orte_process_info.ns_replica;
|
||||
|
||||
initialized = true;
|
||||
return &orte_errmgr_bproc;
|
||||
}
|
||||
|
||||
/*
|
||||
* finalize routine
|
||||
*/
|
||||
int orte_errmgr_bproc_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_bproc_globals.debug) {
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_bproc_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -30,7 +30,11 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
@ -58,8 +62,7 @@ extern "C" {
|
||||
|
||||
/**
|
||||
* Log an error
|
||||
* Log an error that occurred in the runtime environment, and call the "error_detected"
|
||||
* interface to see if further action is required.
|
||||
* Log an error that occurred in the runtime environment
|
||||
*
|
||||
* @code
|
||||
* orte_errmgr.log("this is an error", __FILE__, __LINE__);
|
||||
@ -70,70 +73,110 @@ typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename,
|
||||
|
||||
/**
|
||||
* Alert - process aborted
|
||||
* This function is called when a remote process aborts during execution. Note that local
|
||||
* process errors should always be reported through the error_detected interface and
|
||||
* NOT here. The function is called when a message is received from the universe daemon
|
||||
* indicating that another process in the job failed. For now, this function will
|
||||
* simply cause the local process to gracefully finalize and terminate.
|
||||
* This function is called when a remote process aborts during execution. The function
|
||||
* is called via the GPR's trigger notification system. Actions taken in response
|
||||
* to the abnormal termination of a remote application process will vary across
|
||||
* the various errmgr components.
|
||||
|
||||
* NOTE: Local process errors should always be reported through the error_detected interface and
|
||||
* NOT here.
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *proc);
|
||||
typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_gpr_notify_message_t *msg);
|
||||
|
||||
/**
|
||||
* Alert - incomplete start of a job
|
||||
* This function is called when an attempted launch of a job encounters failure of
|
||||
* one or more processes to start. The function decides on the strategy for dealing
|
||||
* with this "incomplete start" situation - for now, it simply orders the resource
|
||||
* manager to terminate the entire job.
|
||||
* one or more processes to start. The strategy for dealing
|
||||
* with this "incomplete start" situation varies across the various errmgr components.
|
||||
*
|
||||
* This function is only called by the respective process launcher, which is responsible
|
||||
* for detecting incomplete starts.
|
||||
* for detecting incomplete starts. If on a daemon, the function simply updates the
|
||||
* process state to indicate failure to launch - this initiates a trigger that goes to
|
||||
* the respective HNP for response.
|
||||
*
|
||||
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
|
||||
* from taking any action to this function call. Instead, they are restricted to simply
|
||||
* returning.
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job);
|
||||
typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_gpr_notify_message_t *msg);
|
||||
|
||||
/**
|
||||
* Alert - internal error detected
|
||||
* This function is called when an internal error is detected within the local process.
|
||||
* It decides what to do about the error - for now, it simply orders the local process
|
||||
* to finalize and terminate.
|
||||
* This function is called when an internal error is detected within a local process.
|
||||
* It decides what to do about the error. In the case of application processes, it simply
|
||||
* orders the local process to finalize and terminate. The abnormal termination will be
|
||||
* detected and dealt with by the daemon/HNP system.
|
||||
*
|
||||
* HNPs, of course, cannot simply exit - they must first cleanup their running jobs if at
|
||||
* all possible. In some cases, this cannot be done - e.g., if the error detected would
|
||||
* prevent operation of the registry or has corrupted memory. In these extreme cases,
|
||||
* nothing can really be done.
|
||||
*
|
||||
* Likewise, orteds have responsibility towards their local application processes and
|
||||
* must make some attempt to clean them up before exiting.
|
||||
*
|
||||
* The function pretty prints an error message if possible. Error message should be
|
||||
* specified using the standard \code printf() format.
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_error_detected_fn_t)(int error_code);
|
||||
typedef void (*orte_errmgr_base_module_error_detected_fn_t)(int error_code, char *fmt, ...);
|
||||
|
||||
/*
|
||||
* Register a job with the error manager
|
||||
* When a job is launched, this function is called so the error manager can register
|
||||
* subscriptions on the job segment so that the error manager will be notified when
|
||||
* problems occur - i.e., when process status entries change to abnormal termination
|
||||
* values. Process status entries are changed by the appropriate state-of-health monitor
|
||||
* values. Process status entries are changed by the appropriate state monitor
|
||||
* and/or the process launcher, depending upon the stage at which the problem occurs.
|
||||
*
|
||||
* Monitoring of the job begins once the job has reached the "executing" stage. Prior
|
||||
* to that time, failure of processes to start are the responsibility of the respective
|
||||
* process launcher - which is expected to call the error manager via the "incomplete
|
||||
* start" interface to report any problems prior to the job beginning "execution".
|
||||
*
|
||||
* NOTE: ONLY HNPs are allowed to register for trigger reports. All other components
|
||||
* MUST do nothing but return ORTE_SUCCESS.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_register_job_fn_t)(orte_jobid_t job);
|
||||
|
||||
/**
|
||||
* Alert - self aborting
|
||||
* This function is called when a process is aborting. The routine will kill
|
||||
* any child processes and terminate the calling process.
|
||||
* This function is called when a process is aborting. It will finalize the process
|
||||
* itself, and then exits - it takes no other actions. The intent here is to provide
|
||||
* a last-ditch exit procedure that attempts to clean up a little.
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_abort_fn_t)(void);
|
||||
|
||||
/*
|
||||
* Request that the system abort processes other than myself
|
||||
* The possibility exists that a process will decide that ONLY a small subset of a job
|
||||
* must be aborted. This function allows a process to request that the identified
|
||||
* processes be aborted. The "request" portion of the function's name is not
|
||||
* by accident - this function specifically does NOT perform the abort process
|
||||
* itself, but simply requests that it be done.
|
||||
*
|
||||
* NOTE: Please ensure that you do NOT include your own process name in the
|
||||
* array or else you will be ordered to "die" before you complete this function
|
||||
* (i.e., you will be held in a blocking receive pending an answer from the
|
||||
* HNP, which won't come before you receive your own "die" command). If you need
|
||||
* to die too, then call "abort" after completing this function call.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_abort_procs_request_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs);
|
||||
|
||||
/*
|
||||
* Ver 1.0.0
|
||||
*/
|
||||
struct orte_errmgr_base_module_1_0_0_t {
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
|
||||
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
|
||||
orte_errmgr_base_module_error_detected_fn_t error_detected;
|
||||
orte_errmgr_base_module_register_job_fn_t register_job;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
struct orte_errmgr_base_module_1_3_0_t {
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_proc_aborted_fn_t proc_aborted;
|
||||
orte_errmgr_base_module_incomplete_start_fn_t incomplete_start;
|
||||
orte_errmgr_base_module_error_detected_fn_t error_detected;
|
||||
orte_errmgr_base_module_register_job_fn_t register_job;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_procs_request_fn_t abort_procs_request;
|
||||
};
|
||||
|
||||
typedef struct orte_errmgr_base_module_1_0_0_t orte_errmgr_base_module_1_0_0_t;
|
||||
typedef orte_errmgr_base_module_1_0_0_t orte_errmgr_base_module_t;
|
||||
typedef struct orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_1_3_0_t;
|
||||
typedef orte_errmgr_base_module_1_3_0_t orte_errmgr_base_module_t;
|
||||
|
||||
/*
|
||||
* ERRMGR Component
|
||||
@ -150,26 +193,26 @@ typedef int (*orte_errmgr_base_component_finalize_fn_t)(void);
|
||||
* the standard component data structure
|
||||
*/
|
||||
|
||||
struct mca_errmgr_base_component_1_0_0_t {
|
||||
struct mca_errmgr_base_component_1_3_0_t {
|
||||
mca_base_component_t errmgr_version;
|
||||
mca_base_component_data_1_0_0_t errmgr_data;
|
||||
|
||||
orte_errmgr_base_component_init_fn_t errmgr_init;
|
||||
orte_errmgr_base_component_finalize_fn_t errmgr_finalize;
|
||||
};
|
||||
typedef struct mca_errmgr_base_component_1_0_0_t mca_errmgr_base_component_1_0_0_t;
|
||||
typedef mca_errmgr_base_component_1_0_0_t mca_errmgr_base_component_t;
|
||||
typedef struct mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_1_3_0_t;
|
||||
typedef mca_errmgr_base_component_1_3_0_t mca_errmgr_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
 * Macro for use in components that are of type errmgr v1.3.0
|
||||
*/
|
||||
#define ORTE_ERRMGR_BASE_VERSION_1_0_0 \
|
||||
/* ns v1.0 is chained to MCA v1.0 */ \
|
||||
#define ORTE_ERRMGR_BASE_VERSION_1_3_0 \
|
||||
/* errmgr v1.3 is chained to MCA v1.0 */ \
|
||||
MCA_BASE_VERSION_1_0_0, \
|
||||
/* errmgr v1.0 */ \
|
||||
"errmgr", 1, 0, 0
|
||||
/* errmgr v1.3 */ \
|
||||
"errmgr", 1, 3, 0
|
||||
|
||||
/* Global structure for accessing error manager functions
|
||||
*/
|
||||
|
46
orte/mca/errmgr/hnp/Makefile.am
Обычный файл
46
orte/mca/errmgr/hnp/Makefile.am
Обычный файл
@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
errmgr_hnp.h \
|
||||
errmgr_hnp_component.c \
|
||||
errmgr_hnp.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_hnp_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_hnp.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_hnp.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_hnp_la_SOURCES = $(sources)
|
||||
mca_errmgr_hnp_la_LDFLAGS = -module -avoid-version
|
||||
mca_errmgr_hnp_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_hnp_la_SOURCES =$(sources)
|
||||
libmca_errmgr_hnp_la_LDFLAGS = -module -avoid-version
|
23
orte/mca/errmgr/hnp/configure.params
Обычный файл
23
orte/mca/errmgr/hnp/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=errmgr_hnp_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
205
orte/mca/errmgr/hnp/errmgr_hnp.c
Обычный файл
205
orte/mca/errmgr/hnp/errmgr_hnp.c
Обычный файл
@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/hnp/errmgr_hnp.h"
|
||||
|
||||
/*
|
||||
* This function gets called when the someone updates a process
|
||||
* state to indicate it has aborted. That action results in
|
||||
* the firing of a registry trigger that passes a minimal
|
||||
* data message here. The only part of that message we need
|
||||
* is the segment name so we can extract the jobid from it
|
||||
*
|
||||
* Various components will follow their own strategy for dealing with
|
||||
* this situation. For this component, we simply kill the job.
|
||||
*/
|
||||
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
orte_jobid_t job;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
opal_output(orte_errmgr_base_output, "errmgr:hnp: proc abort has been detected");
|
||||
|
||||
/* This trigger is named, so we can extract the jobid
|
||||
* directly from the trigger name
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* set the job state */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_ABORTED))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* tell the pls to terminate the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when someone updates a process
|
||||
* state to indicate it failed to start. That action results in
|
||||
* the firing of a registry trigger that passes a minimal
|
||||
* data message here. The only part of that message we need
|
||||
* is the segment name so we can extract the jobid from it
|
||||
*
|
||||
* Various components will follow their own strategy for dealing with
|
||||
* this situation. For this component, we simply kill the job.
|
||||
*/
|
||||
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
orte_jobid_t job;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* This trigger is named, so we can extract the jobid
|
||||
* directly from the trigger name
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* set the job state */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FAILED_TO_START))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* tell the pls to terminate the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * This function gets called when the HNP itself detects an internal error!
 * Ideally, we would find some way to tell all the active jobs to die before
 * we depart ourselves. Unfortunately, at this time, we aren't sure we can do
 * this - later, we'll add some more intelligence by, for example, checking
 * the error code to see if it's something that would allow us to alert
 * the remote orteds.
 *
 * For now, we'll just depart!
 *
 * error_code  value handed to orte_abort()
 * fmt         optional printf-style format for a message to output before
 *             aborting; may be NULL, in which case nothing is printed
 * ...         arguments consumed by fmt
 */
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        /* On failure vasprintf() returns a negative value and leaves the
         * buffer contents undefined, so only use (and free) it on success. */
        if (0 <= vasprintf(&buffer, fmt, arglist) && NULL != buffer) {
            /* The expanded message is data, not a format: any '%' it
             * contains must not be interpreted a second time. */
            opal_output(0, "%s", buffer);
            free(buffer);
        }
        va_end(arglist);
    }

    /* abnormal exit */
    orte_abort(error_code, false);
}
|
||||
|
||||
/*
 * This function gets called when the HNP desperately needs to just die.
 * By definition nothing else can be done here - it is ONLY called as an
 * absolute last resort, and simply hands off to orte_abort() with an exit
 * code of -1.
 */
void orte_errmgr_hnp_abort(void)
{
    OPAL_TRACE(1);

    /* abnormal exit */
    orte_abort(-1, false);
}
|
||||
|
||||
/*
|
||||
* This function gets called when a process wants to request that the HNP
|
||||
* abort some set of processes for it. Since this component IS for the HNP,
|
||||
* that means we need to actually execute this request! Call upon the PLS
|
||||
* as needed to execute the abort requests
|
||||
*/
|
||||
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
|
||||
{
|
||||
int rc;
|
||||
|
||||
rc = ORTE_SUCCESS;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Register the HNP's errmgr functions to be called when the job encounters
|
||||
* certain pre-identified problem states.
|
||||
*
|
||||
* NOTE: It is imperative that ONLY the HNP perform this registration!
|
||||
*/
|
||||
int orte_errmgr_hnp_register_job(orte_jobid_t job)
|
||||
{
|
||||
/* we need to setup two counters and their corresponding triggers - one
|
||||
* to alert us when something fails to launch, and another for when
|
||||
* someone aborts
|
||||
*/
|
||||
int rc;
|
||||
|
||||
/* define the ABORT trigger to fire when any process aborts */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.define_alert_monitor(job, ORTE_NUM_ABORTED_TRIGGER,
|
||||
ORTE_PROC_NUM_ABORTED, 0, 1, true,
|
||||
orte_errmgr_hnp_proc_aborted, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* define the FAILED_LAUNCH trigger to fire when the launch fails */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.define_alert_monitor(job, ORTE_FAILED_TO_START_TRIGGER,
|
||||
ORTE_PROC_NUM_FAILED_START, 0, 1, true,
|
||||
orte_errmgr_hnp_incomplete_start, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
79
orte/mca/errmgr/hnp/errmgr_hnp.h
Обычный файл
79
orte/mca/errmgr/hnp/errmgr_hnp.h
Обычный файл
@ -0,0 +1,79 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef ORTE_ERRMGR_HNP_H
|
||||
#define ORTE_ERRMGR_HNP_H
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_errmgr_hnp_open(void);
|
||||
int orte_errmgr_hnp_close(void);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_hnp_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
|
||||
|
||||
int orte_errmgr_hnp_finalize(void);
|
||||
|
||||
/*
 * Globals used within the component.
 */
typedef struct {
    int debug;   /* debug flag for the component; non-zero means enabled */
} orte_errmgr_hnp_globals_t;
|
||||
|
||||
|
||||
extern orte_errmgr_hnp_globals_t orte_errmgr_hnp_globals;
|
||||
|
||||
/*
|
||||
* Component API functions
|
||||
*/
|
||||
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg);
|
||||
|
||||
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg);
|
||||
|
||||
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...);
|
||||
|
||||
void orte_errmgr_hnp_abort(void);
|
||||
|
||||
int orte_errmgr_hnp_register_job(orte_jobid_t job);
|
||||
|
||||
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
174
orte/mca/errmgr/hnp/errmgr_hnp_component.c
Обычный файл
174
orte/mca/errmgr/hnp/errmgr_hnp_component.c
Обычный файл
@ -0,0 +1,174 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
 * The OpenRTE Error Manager (errmgr) - HNP component
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
mca_errmgr_base_component_t mca_errmgr_hnp_component = {
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_1_3_0,
|
||||
|
||||
"hnp", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_errmgr_hnp_open, /* module open */
|
||||
orte_errmgr_hnp_close /* module close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_errmgr_hnp_component_init, /* module init */
|
||||
orte_errmgr_hnp_finalize /* module shutdown */
|
||||
};
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
static orte_errmgr_base_module_t orte_errmgr_hnp = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_hnp_proc_aborted,
|
||||
orte_errmgr_hnp_incomplete_start,
|
||||
orte_errmgr_hnp_error_detected,
|
||||
orte_errmgr_hnp_register_job,
|
||||
orte_errmgr_hnp_abort,
|
||||
orte_errmgr_hnp_abort_procs_request
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Whether or not we allowed this component to be selected
|
||||
*/
|
||||
static bool initialized = false;
|
||||
|
||||
/* local globals */
|
||||
orte_errmgr_hnp_globals_t orte_errmgr_hnp_globals;
|
||||
|
||||
|
||||
/*
|
||||
* Open the component
|
||||
*/
|
||||
int orte_errmgr_hnp_open(void)
|
||||
{
|
||||
int id, tmp;
|
||||
|
||||
id = mca_base_param_register_int("errmgr", "hnp", "debug", NULL, 0);
|
||||
mca_base_param_lookup_int(id, &tmp);
|
||||
if (tmp) {
|
||||
orte_errmgr_hnp_globals.debug = true;
|
||||
} else {
|
||||
orte_errmgr_hnp_globals.debug = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the component
|
||||
*/
|
||||
int orte_errmgr_hnp_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_hnp_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
|
||||
int *priority)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (orte_errmgr_hnp_globals.debug) {
|
||||
opal_output(0, "errmgr_hnp_init called");
|
||||
}
|
||||
|
||||
/* If we are not an HNP, then don't pick us! */
|
||||
if (!orte_process_info.seed) {
|
||||
/* don't take me! */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Return a module (choose an arbitrary, positive priority --
|
||||
it's only relevant compared to other components). */
|
||||
|
||||
*priority = 10;
|
||||
|
||||
/* no part of OpenRTE allows or has threads */
|
||||
|
||||
*allow_multi_user_threads = false;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
/* start the receive function */
|
||||
if (ORTE_SUCCESS != (rc = orte_errmgr_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
return &orte_errmgr_hnp;
|
||||
}
|
||||
|
||||
/*
|
||||
* finalize routine
|
||||
*/
|
||||
int orte_errmgr_hnp_finalize(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (orte_errmgr_hnp_globals.debug) {
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_hnp_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
/* stop the receive function */
|
||||
if (ORTE_SUCCESS != (rc = orte_errmgr_base_comm_stop())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
46
orte/mca/errmgr/orted/Makefile.am
Обычный файл
46
orte/mca/errmgr/orted/Makefile.am
Обычный файл
@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
errmgr_orted.h \
|
||||
errmgr_orted_component.c \
|
||||
errmgr_orted.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_orted_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_orted.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_orted.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_orted_la_SOURCES = $(sources)
|
||||
mca_errmgr_orted_la_LDFLAGS = -module -avoid-version
|
||||
mca_errmgr_orted_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_orted_la_SOURCES =$(sources)
|
||||
libmca_errmgr_orted_la_LDFLAGS = -module -avoid-version
|
23
orte/mca/errmgr/orted/configure.params
Обычный файл
23
orte/mca/errmgr/orted/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=errmgr_orted_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
192
orte/mca/errmgr/orted/errmgr_orted.c
Обычный файл
192
orte/mca/errmgr/orted/errmgr_orted.c
Обычный файл
@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "orte/mca/errmgr/orted/errmgr_orted.h"
|
||||
|
||||
/*
|
||||
* This function only gets called on HNP components! Orteds learn about
|
||||
* a proc aborting from the HNP.
|
||||
*/
|
||||
int orte_errmgr_orted_proc_aborted(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/* This function only gets called on HNP components! Orteds learn about
|
||||
* an incomplete start from the HNP.
|
||||
*/
|
||||
int orte_errmgr_orted_incomplete_start(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when the orted itself detects an internal error!
|
||||
* At some point in future, to be polite, we tell any of our own local
|
||||
* processes to die before we abandon them
|
||||
*/
|
||||
void orte_errmgr_orted_error_detected(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
||||
/* If there was a message, output it */
|
||||
|
||||
va_start(arglist, fmt);
|
||||
if( NULL != fmt ) {
|
||||
char* buffer = NULL;
|
||||
vasprintf( &buffer, fmt, arglist );
|
||||
opal_output( 0, buffer );
|
||||
free( buffer );
|
||||
}
|
||||
va_end(arglist);
|
||||
|
||||
/* cleanup my session directory */
|
||||
orte_session_dir_finalize(orte_process_info.my_name);
|
||||
|
||||
/* abnormal exit */
|
||||
orte_abort(error_code, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when we desperately need to just die.
|
||||
* Nothing can be done by definition here - this function ONLY gets
|
||||
* called as an absolute last resort
|
||||
*/
|
||||
void orte_errmgr_orted_abort(void)
|
||||
{
|
||||
/* cleanup my session directory */
|
||||
orte_session_dir_finalize(orte_process_info.my_name);
|
||||
|
||||
/* abnormal exit */
|
||||
orte_abort(-1, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called by the orted to request that some set of processes
|
||||
* be aborted by the HNP. This would likely be an unusual request as the orted
|
||||
* would have no knowledge of other processes or real reason to order them killed.
|
||||
* Still, the capability is provided here.
|
||||
*/
|
||||
int orte_errmgr_orted_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_errmgr_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
/* protect us against error */
|
||||
if (NULL == procs) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
command = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
|
||||
|
||||
cmd = OBJ_NEW(orte_buffer_t);
|
||||
if (cmd == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of procs we are requesting be aborted */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &nprocs, 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the array of proc names */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, procs, nprocs, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send the request */
|
||||
if (0 > orte_rml.send_buffer(orte_errmgr_orted_globals.replica, cmd, ORTE_RML_TAG_RDS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_RELEASE(cmd);
|
||||
|
||||
/* setup a buffer for the answer */
|
||||
answer = OBJ_NEW(orte_buffer_t);
|
||||
if(answer == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* enter a blocking receive until we hear back */
|
||||
if (0 > orte_rml.recv_buffer(orte_errmgr_orted_globals.replica, answer, ORTE_RML_TAG_RDS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* check that this is the right command */
|
||||
if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != command) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
/* clean up and leave */
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is imperative that ONLY an HNP perform this registration!
|
||||
*/
|
||||
int orte_errmgr_orted_register_job(orte_jobid_t job)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
81
orte/mca/errmgr/orted/errmgr_orted.h
Обычный файл
81
orte/mca/errmgr/orted/errmgr_orted.h
Обычный файл
@ -0,0 +1,81 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef ORTE_ERRMGR_ORTED_H
|
||||
#define ORTE_ERRMGR_ORTED_H
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_errmgr_orted_open(void);
|
||||
int orte_errmgr_orted_close(void);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
|
||||
|
||||
int orte_errmgr_orted_finalize(void);
|
||||
|
||||
/*
|
||||
* globals used within the component
|
||||
*/
|
||||
typedef struct {
|
||||
/* debug switch: set to true/false by orte_errmgr_orted_open() from the registered "debug" parameter */
int debug;
|
||||
/* peer that abort requests are sent to; assigned from orte_process_info.ns_replica in component init */
orte_process_name_t *replica;
|
||||
} orte_errmgr_orted_globals_t;
|
||||
|
||||
|
||||
extern orte_errmgr_orted_globals_t orte_errmgr_orted_globals;
|
||||
|
||||
/*
|
||||
* Component API functions
|
||||
*/
|
||||
int orte_errmgr_orted_proc_aborted(orte_gpr_notify_message_t *msg);
|
||||
|
||||
int orte_errmgr_orted_incomplete_start(orte_gpr_notify_message_t *msg);
|
||||
|
||||
void orte_errmgr_orted_error_detected(int error_code, char *fmt, ...);
|
||||
|
||||
void orte_errmgr_orted_abort(void);
|
||||
|
||||
int orte_errmgr_orted_register_job(orte_jobid_t job);
|
||||
|
||||
int orte_errmgr_orted_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
164
orte/mca/errmgr/orted/errmgr_orted_component.c
Обычный файл
164
orte/mca/errmgr/orted/errmgr_orted_component.c
Обычный файл
@ -0,0 +1,164 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
* The OpenRTE Error Manager (errmgr) - orted component
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_orted.h"
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
mca_errmgr_base_component_t mca_errmgr_orted_component = {
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_1_3_0,
|
||||
|
||||
"orted", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_errmgr_orted_open, /* module open */
|
||||
orte_errmgr_orted_close /* module close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_errmgr_orted_component_init, /* module init */
|
||||
orte_errmgr_orted_finalize /* module shutdown */
|
||||
};
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
static orte_errmgr_base_module_t orte_errmgr_orted = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_orted_proc_aborted,
|
||||
orte_errmgr_orted_incomplete_start,
|
||||
orte_errmgr_orted_error_detected,
|
||||
orte_errmgr_orted_register_job,
|
||||
orte_errmgr_orted_abort,
|
||||
orte_errmgr_orted_abort_procs_request
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Whether or not we allowed this component to be selected
|
||||
*/
|
||||
static bool initialized = false;
|
||||
|
||||
/* local globals */
|
||||
orte_errmgr_orted_globals_t orte_errmgr_orted_globals;
|
||||
|
||||
|
||||
/*
|
||||
* Open the component
|
||||
*/
|
||||
int orte_errmgr_orted_open(void)
|
||||
{
|
||||
int id, tmp;
|
||||
|
||||
id = mca_base_param_register_int("errmgr", "orted", "debug", NULL, 0);
|
||||
mca_base_param_lookup_int(id, &tmp);
|
||||
if (tmp) {
|
||||
orte_errmgr_orted_globals.debug = true;
|
||||
} else {
|
||||
orte_errmgr_orted_globals.debug = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the component
|
||||
*/
|
||||
int orte_errmgr_orted_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
|
||||
int *priority)
|
||||
{
|
||||
if (orte_errmgr_orted_globals.debug) {
|
||||
opal_output(0, "errmgr_orted_init called");
|
||||
}
|
||||
|
||||
/* If we are not a daemon, then this component is not for us! */
|
||||
if (!orte_process_info.daemon) {
|
||||
/* don't take me! */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Return a module (choose an arbitrary, positive priority --
|
||||
it's only relevant compared to other components). */
|
||||
|
||||
*priority = 10;
|
||||
|
||||
/* no part of OpenRTE allows or has threads */
|
||||
|
||||
*allow_multi_user_threads = false;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
/* define the HNP we should be talking to - for now,
|
||||
* just use the NS replica
|
||||
*/
|
||||
orte_errmgr_orted_globals.replica = orte_process_info.ns_replica;
|
||||
|
||||
initialized = true;
|
||||
return &orte_errmgr_orted;
|
||||
}
|
||||
|
||||
/*
|
||||
* finalize routine
|
||||
*/
|
||||
int orte_errmgr_orted_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_orted_globals.debug) {
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_orted_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
46
orte/mca/errmgr/proxy/Makefile.am
Обычный файл
46
orte/mca/errmgr/proxy/Makefile.am
Обычный файл
@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
errmgr_proxy.h \
|
||||
errmgr_proxy_component.c \
|
||||
errmgr_proxy.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_errmgr_proxy_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_proxy.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_proxy.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_proxy_la_SOURCES = $(sources)
|
||||
mca_errmgr_proxy_la_LDFLAGS = -module -avoid-version
|
||||
mca_errmgr_proxy_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_proxy_la_SOURCES =$(sources)
|
||||
libmca_errmgr_proxy_la_LDFLAGS = -module -avoid-version
|
23
orte/mca/errmgr/proxy/configure.params
Обычный файл
23
orte/mca/errmgr/proxy/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=errmgr_proxy_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
187
orte/mca/errmgr/proxy/errmgr_proxy.c
Обычный файл
187
orte/mca/errmgr/proxy/errmgr_proxy.c
Обычный файл
@ -0,0 +1,187 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "orte/mca/errmgr/proxy/errmgr_proxy.h"
|
||||
|
||||
/*
|
||||
* This function gets called when the SMR updates a process state to
|
||||
* indicate that it aborted. Since the proxy component is only active on
|
||||
* non-HNP processes, this function will NEVER be called
|
||||
*/
|
||||
int orte_errmgr_proxy_proc_aborted(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when the SMR updates a process state to
|
||||
* indicate that it failed to start. Since the proxy component is only active on
|
||||
* non-HNP processes, this function will NEVER be called
|
||||
*/
|
||||
int orte_errmgr_proxy_incomplete_start(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when a process detects an internal error.
|
||||
* Various non-HNP/non-orted errmgr components will deal with this in various
|
||||
* ways - for now, we simply abort and provide the error_code as our
|
||||
* exit status
|
||||
*/
|
||||
void orte_errmgr_proxy_error_detected(int error_code, char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
||||
/* If there was a message, output it */
|
||||
|
||||
va_start(arglist, fmt);
|
||||
if( NULL != fmt ) {
|
||||
char* buffer = NULL;
|
||||
vasprintf( &buffer, fmt, arglist );
|
||||
opal_output( 0, buffer );
|
||||
free( buffer );
|
||||
}
|
||||
va_end(arglist);
|
||||
|
||||
orte_abort(error_code, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function gets called when a process desperately needs to just die.
|
||||
* Nothing can be done by definition here - this function ONLY gets
|
||||
* called as an absolute last resort.
|
||||
*/
|
||||
void orte_errmgr_proxy_abort()
|
||||
{
|
||||
/* abnormal exit */
|
||||
orte_abort(-1, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Alternatively, some systems (e.g., OpenMPI) need to tell us to kill
|
||||
* some other subset of processes along with us. Send that info to the
|
||||
* HNP so it can kill them.
|
||||
*
|
||||
* NOTE: this function assumes that the underlying ORTE infrastructure is
|
||||
* still operational. Use of this function should therefore be restricted
|
||||
* to cases where the problem is in a higher layer (e.g., MPI) as the
|
||||
* process is likely to "hang" if an ORTE problem has been encountered.
|
||||
*/
|
||||
int orte_errmgr_proxy_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
orte_errmgr_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
/* protect us against error */
|
||||
if (NULL == procs) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
command = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
|
||||
|
||||
cmd = OBJ_NEW(orte_buffer_t);
|
||||
if (cmd == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pack the command */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the number of procs we are requesting be aborted */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &nprocs, 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the array of proc names */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, procs, nprocs, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send the request */
|
||||
if (0 > orte_rml.send_buffer(orte_errmgr_proxy_globals.replica, cmd, ORTE_RML_TAG_RDS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_RELEASE(cmd);
|
||||
|
||||
/* setup a buffer for the answer */
|
||||
answer = OBJ_NEW(orte_buffer_t);
|
||||
if(answer == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* enter a blocking receive until we hear back */
|
||||
if (0 > orte_rml.recv_buffer(orte_errmgr_proxy_globals.replica, answer, ORTE_RML_TAG_RDS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_ERRMGR_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* check that this is the right command */
|
||||
if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != command) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
/* clean up and leave */
|
||||
OBJ_RELEASE(answer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* It is imperative that ONLY an HNP perform this registration!
|
||||
*/
|
||||
int orte_errmgr_proxy_register_job(orte_jobid_t job)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
81
orte/mca/errmgr/proxy/errmgr_proxy.h
Обычный файл
81
orte/mca/errmgr/proxy/errmgr_proxy.h
Обычный файл
@ -0,0 +1,81 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef ORTE_ERRMGR_PROXY_H
|
||||
#define ORTE_ERRMGR_PROXY_H
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_errmgr_proxy_open(void);
|
||||
int orte_errmgr_proxy_close(void);
|
||||
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads, int *priority);
|
||||
|
||||
int orte_errmgr_proxy_finalize(void);
|
||||
|
||||
/*
|
||||
* globals used within the component
|
||||
*/
|
||||
typedef struct {
|
||||
/* debug switch: set to true/false by orte_errmgr_proxy_open() from the registered "debug" parameter */
int debug;
|
||||
/* peer that abort requests are sent to; assigned from orte_process_info.ns_replica in component init */
orte_process_name_t *replica;
|
||||
} orte_errmgr_proxy_globals_t;
|
||||
|
||||
|
||||
extern orte_errmgr_proxy_globals_t orte_errmgr_proxy_globals;
|
||||
|
||||
/*
|
||||
* Component API functions
|
||||
*/
|
||||
int orte_errmgr_proxy_proc_aborted(orte_gpr_notify_message_t *msg);
|
||||
|
||||
int orte_errmgr_proxy_incomplete_start(orte_gpr_notify_message_t *msg);
|
||||
|
||||
void orte_errmgr_proxy_error_detected(int error_code, char *fmt, ...);
|
||||
|
||||
void orte_errmgr_proxy_abort(void);
|
||||
|
||||
int orte_errmgr_proxy_register_job(orte_jobid_t job);
|
||||
|
||||
int orte_errmgr_proxy_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
163
orte/mca/errmgr/proxy/errmgr_proxy_component.c
Обычный файл
163
orte/mca/errmgr/proxy/errmgr_proxy_component.c
Обычный файл
@ -0,0 +1,163 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
* The OpenRTE Error Manager (errmgr) - proxy component
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_proxy.h"
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
mca_errmgr_base_component_t mca_errmgr_proxy_component = {
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_1_3_0,
|
||||
|
||||
"proxy", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_errmgr_proxy_open, /* module open */
|
||||
orte_errmgr_proxy_close /* module close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_errmgr_proxy_component_init, /* module init */
|
||||
orte_errmgr_proxy_finalize /* module shutdown */
|
||||
};
|
||||
|
||||
/*
|
||||
* setup the function pointers for the module
|
||||
*/
|
||||
static orte_errmgr_base_module_t orte_errmgr_proxy = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_proxy_proc_aborted,
|
||||
orte_errmgr_proxy_incomplete_start,
|
||||
orte_errmgr_proxy_error_detected,
|
||||
orte_errmgr_proxy_register_job,
|
||||
orte_errmgr_proxy_abort,
|
||||
orte_errmgr_proxy_abort_procs_request
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Whether or not we allowed this component to be selected
|
||||
*/
|
||||
static bool initialized = false;
|
||||
|
||||
/* local globals */
|
||||
orte_errmgr_proxy_globals_t orte_errmgr_proxy_globals;
|
||||
|
||||
/*
|
||||
* Open the component
|
||||
*/
|
||||
int orte_errmgr_proxy_open(void)
|
||||
{
|
||||
int id, tmp;
|
||||
|
||||
id = mca_base_param_register_int("errmgr", "proxy", "debug", NULL, 0);
|
||||
mca_base_param_lookup_int(id, &tmp);
|
||||
if (tmp) {
|
||||
orte_errmgr_proxy_globals.debug = true;
|
||||
} else {
|
||||
orte_errmgr_proxy_globals.debug = false;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the component
|
||||
*/
|
||||
int orte_errmgr_proxy_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_errmgr_base_module_t*
|
||||
orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_threads,
|
||||
int *priority)
|
||||
{
|
||||
if (orte_errmgr_proxy_globals.debug) {
|
||||
opal_output(0, "errmgr_proxy_init called");
|
||||
}
|
||||
|
||||
/* If we are an HNP or an orted, then don't pick us! */
|
||||
if (orte_process_info.seed || orte_process_info.daemon) {
|
||||
/* don't take me! */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Return a module (choose an arbitrary, positive priority --
|
||||
it's only relevant compared to other components). */
|
||||
|
||||
*priority = 10;
|
||||
|
||||
/* no part of OpenRTE allows or has threads */
|
||||
|
||||
*allow_multi_user_threads = false;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
/* define the replica for us to use - for now, just point
|
||||
* to the name service replica
|
||||
*/
|
||||
orte_errmgr_proxy_globals.replica = orte_process_info.ns_replica;
|
||||
|
||||
initialized = true;
|
||||
return &orte_errmgr_proxy;
|
||||
}
|
||||
|
||||
/*
|
||||
* finalize routine
|
||||
*/
|
||||
int orte_errmgr_proxy_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_proxy_globals.debug) {
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_proxy_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -278,6 +278,7 @@ int orte_gpr_replica_purge_subscriptions(orte_process_name_t *proc);
|
||||
|
||||
int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
|
||||
orte_gpr_notify_message_t *msg,
|
||||
char *sub_name,
|
||||
orte_std_cntr_t cnt,
|
||||
orte_gpr_value_t **values);
|
||||
|
||||
|
@ -213,7 +213,7 @@ int orte_gpr_replica_register_callback(orte_gpr_replica_subscription_t *sub,
|
||||
* subscription id, combining data where the id's match
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr_replica_store_value_in_msg(reqs[i],
|
||||
cb->message, cnt, values))) {
|
||||
cb->message, sub->name, cnt, values))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -436,6 +436,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type,
|
||||
|
||||
int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
|
||||
orte_gpr_notify_message_t *msg,
|
||||
char *sub_name,
|
||||
orte_std_cntr_t cnt,
|
||||
orte_gpr_value_t **values)
|
||||
{
|
||||
@ -482,6 +483,10 @@ int orte_gpr_replica_store_value_in_msg(orte_gpr_replica_requestor_t *req,
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* set the name of the subscription, if provided */
|
||||
if (NULL != sub_name) {
|
||||
dptr->target = strdup(sub_name);
|
||||
}
|
||||
dptr->id = req->idtag;
|
||||
if (0 > orte_pointer_array_add(&index, msg->data, dptr)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
@ -526,7 +531,7 @@ static int orte_gpr_replica_store_value_in_trigger_msg(orte_gpr_replica_subscrip
|
||||
if (NULL != data[i]) {
|
||||
k++;
|
||||
if ((NULL == data[i]->target && NULL == sub) ||
|
||||
(NULL != data[i]->target &&
|
||||
(NULL != data[i]->target && NULL != sub->name &&
|
||||
0 == strcmp(data[i]->target, sub->name))) { /* going to the same place */
|
||||
for (j=0; j < cnt; j++) {
|
||||
if (0 > orte_pointer_array_add(&index, data[i]->values, values[j])) {
|
||||
@ -557,7 +562,7 @@ static int orte_gpr_replica_store_value_in_trigger_msg(orte_gpr_replica_subscrip
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL != sub) {
|
||||
if (NULL != sub && NULL != sub->name) {
|
||||
dptr->target = strdup(sub->name);
|
||||
}
|
||||
if (0 > orte_pointer_array_add(&index, msg->data, dptr)) {
|
||||
|
@ -42,28 +42,79 @@ int orte_ns_base_compare_name(orte_process_name_t *value1,
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
|
||||
/* for this generic compare, go through the progression */
|
||||
if (value1->cellid < value2->cellid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->cellid > value2->cellid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
/** we have to take care of the special case where one of the
|
||||
* values is ORTE_NAME_WILDCARD. If any of the fields are wildcard,
|
||||
* then we want to just ignore that one field. However, in the case
|
||||
* of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this
|
||||
* would automatically result in ORTE_EQUAL for any name in the other
|
||||
* value - a totally useless result.
|
||||
*
|
||||
* Instead, what we want to know in this case is if the value actually
|
||||
* *is* ORTE_NAME_WILDCARD. So, we need to detect if one of the values
|
||||
* is ORTE_NAME_WILDCARD, and then specifically check the other one
|
||||
* to see if it matches
|
||||
*/
|
||||
if (value2->cellid == ORTE_CELLID_WILDCARD &&
|
||||
value2->jobid == ORTE_JOBID_WILDCARD &&
|
||||
value2->vpid == ORTE_VPID_WILDCARD) {
|
||||
if (value1->cellid == ORTE_CELLID_WILDCARD &&
|
||||
value1->jobid == ORTE_JOBID_WILDCARD &&
|
||||
value1->vpid == ORTE_VPID_WILDCARD) {
|
||||
return ORTE_EQUAL;
|
||||
} else {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
} else if (value1->cellid == ORTE_CELLID_WILDCARD &&
|
||||
value1->jobid == ORTE_JOBID_WILDCARD &&
|
||||
value1->vpid == ORTE_VPID_WILDCARD) {
|
||||
if (value2->cellid == ORTE_CELLID_WILDCARD &&
|
||||
value2->jobid == ORTE_JOBID_WILDCARD &&
|
||||
value2->vpid == ORTE_VPID_WILDCARD) {
|
||||
return ORTE_EQUAL;
|
||||
} else {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/* get here if jobid's are equal - now check process group */
|
||||
if (value1->jobid < value2->jobid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->jobid > value2->jobid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
|
||||
/** now that the special cases are done, go through the progression */
|
||||
|
||||
/** check the cellids - if one of them is WILDCARD, then ignore
|
||||
* this field since anything is okay
|
||||
*/
|
||||
if (value1->cellid != ORTE_CELLID_WILDCARD &&
|
||||
value2->cellid != ORTE_CELLID_WILDCARD) {
|
||||
if (value1->cellid < value2->cellid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->cellid > value2->cellid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/* get here if cellid's and jobid's are equal - now check vpid */
|
||||
if (value1->vpid < value2->vpid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->vpid > value2->vpid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
|
||||
/** check the jobids - if one of them is WILDCARD, then ignore
|
||||
* this field since anything is okay
|
||||
*/
|
||||
if (value1->jobid != ORTE_JOBID_WILDCARD &&
|
||||
value2->jobid != ORTE_JOBID_WILDCARD) {
|
||||
if (value1->jobid < value2->jobid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->jobid > value2->jobid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/* only way to get here is if all fields are equal */
|
||||
|
||||
/** check the vpids - if one of them is WILDCARD, then ignore
|
||||
* this field since anything is okay
|
||||
*/
|
||||
if (value1->vpid != ORTE_VPID_WILDCARD &&
|
||||
value2->vpid != ORTE_VPID_WILDCARD) {
|
||||
if (value1->vpid < value2->vpid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->vpid > value2->vpid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/** only way to get here is if all fields are equal or WILDCARD */
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
@ -72,10 +123,14 @@ int orte_ns_base_compare_vpid(orte_vpid_t *value1,
|
||||
orte_data_type_t type)
|
||||
|
||||
{
|
||||
/** if either value is WILDCARD, then return equal */
|
||||
if (*value1 == ORTE_VPID_WILDCARD ||
|
||||
*value2 == ORTE_VPID_WILDCARD) return ORTE_EQUAL;
|
||||
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
@ -83,10 +138,14 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
|
||||
orte_jobid_t *value2,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
/** if either value is WILDCARD, then return equal */
|
||||
if (*value1 == ORTE_JOBID_WILDCARD ||
|
||||
*value2 == ORTE_JOBID_WILDCARD) return ORTE_EQUAL;
|
||||
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
@ -94,9 +153,13 @@ int orte_ns_base_compare_cellid(orte_cellid_t *value1,
|
||||
orte_cellid_t *value2,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
/** if either value is WILDCARD, then return equal */
|
||||
if (*value1 == ORTE_CELLID_WILDCARD ||
|
||||
*value2 == ORTE_CELLID_WILDCARD) return ORTE_EQUAL;
|
||||
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
@ -100,13 +100,13 @@ mca_ns_base_component_t mca_ns_base_selected_component;
|
||||
|
||||
|
||||
/* constructor - used to initialize namelist instance */
|
||||
static void orte_name_services_namelist_construct(orte_name_services_namelist_t* list)
|
||||
static void orte_namelist_construct(orte_namelist_t* list)
|
||||
{
|
||||
list->name = NULL;
|
||||
}
|
||||
|
||||
/* destructor - used to free any resources held by instance */
|
||||
static void orte_name_services_namelist_destructor(orte_name_services_namelist_t* list)
|
||||
static void orte_namelist_destructor(orte_namelist_t* list)
|
||||
{
|
||||
if (NULL != list->name) {
|
||||
free(list->name);
|
||||
@ -115,10 +115,10 @@ static void orte_name_services_namelist_destructor(orte_name_services_namelist_t
|
||||
|
||||
/* define instance of opal_class_t */
|
||||
OBJ_CLASS_INSTANCE(
|
||||
orte_name_services_namelist_t, /* type name */
|
||||
orte_namelist_t, /* type name */
|
||||
opal_list_item_t, /* parent "class" name */
|
||||
orte_name_services_namelist_construct, /* constructor */
|
||||
orte_name_services_namelist_destructor); /* destructor */
|
||||
orte_namelist_construct, /* constructor */
|
||||
orte_namelist_destructor); /* destructor */
|
||||
|
||||
|
||||
|
||||
|
@ -91,6 +91,20 @@ typedef struct orte_process_name_t orte_process_name_t;
|
||||
#define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX
|
||||
#define ORTE_VPID_MAX ORTE_STD_CNTR_MAX
|
||||
|
||||
/*
 * Define invalid values.
 *
 * Each replacement list is parenthesized so that the negative literal
 * always expands as a single operand, whatever expression the macro
 * is used in.
 */
#define ORTE_CELLID_INVALID     (-999)
#define ORTE_JOBID_INVALID      (-999)
#define ORTE_VPID_INVALID       (-999)

/*
 * Define wildcard values (parenthesized for the same reason as the
 * invalid values above).
 */
#define ORTE_CELLID_WILDCARD    (-1)
#define ORTE_JOBID_WILDCARD     (-1)
#define ORTE_VPID_WILDCARD      (-1)
|
||||
|
||||
ORTE_DECLSPEC extern orte_process_name_t orte_name_all;
|
||||
#define ORTE_NAME_ALL &orte_name_all
|
||||
|
||||
@ -117,13 +131,13 @@ ORTE_DECLSPEC extern orte_process_name_t orte_name_all;
|
||||
|
||||
/** List of names for general use
|
||||
*/
|
||||
struct orte_name_services_namelist_t {
|
||||
struct orte_namelist_t {
|
||||
opal_list_item_t item; /**< Allows this item to be placed on a list */
|
||||
orte_process_name_t *name; /**< Name of a process */
|
||||
};
|
||||
typedef struct orte_name_services_namelist_t orte_name_services_namelist_t;
|
||||
typedef struct orte_namelist_t orte_namelist_t;
|
||||
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_name_services_namelist_t);
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_namelist_t);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
|
@ -79,7 +79,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
@ -202,7 +202,7 @@ int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
@ -803,7 +803,7 @@ int orte_ns_proxy_create_my_name(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
@ -839,7 +839,7 @@ int orte_ns_proxy_dump_cells(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
@ -898,7 +898,7 @@ int orte_ns_proxy_dump_jobs(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
@ -947,7 +947,7 @@ int orte_ns_proxy_dump_tags(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
@ -1008,7 +1008,7 @@ int orte_ns_proxy_dump_datatypes(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
|
||||
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
|
42
orte/mca/odls/Makefile.am
Обычный файл
42
orte/mca/odls/Makefile.am
Обычный файл
@ -0,0 +1,42 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_odls.la
|
||||
libmca_odls_la_SOURCES =
|
||||
|
||||
# header setup
|
||||
nobase_orte_HEADERS =
|
||||
dist_pkgdata_DATA =
|
||||
|
||||
# local files
|
||||
headers = odls.h odls_types.h
|
||||
libmca_odls_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
nobase_orte_HEADERS += $(headers)
|
||||
ortedir = $(includedir)/openmpi/orte/mca/odls
|
||||
else
|
||||
ortedir = $(includedir)
|
||||
endif
|
||||
|
||||
include base/Makefile.am
|
||||
|
||||
distclean-local:
|
||||
rm -f base/static-components.h
|
33
orte/mca/odls/base/Makefile.am
Обычный файл
33
orte/mca/odls/base/Makefile.am
Обычный файл
@ -0,0 +1,33 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/odls_private.h \
|
||||
base/base.h
|
||||
|
||||
libmca_odls_la_SOURCES += \
|
||||
base/odls_base_close.c \
|
||||
base/odls_base_open.c \
|
||||
base/odls_base_select.c \
|
||||
base/data_type_support/odls_compare_fns.c \
|
||||
base/data_type_support/odls_copy_fns.c \
|
||||
base/data_type_support/odls_packing_fns.c \
|
||||
base/data_type_support/odls_print_fns.c \
|
||||
base/data_type_support/odls_release_fns.c \
|
||||
base/data_type_support/odls_size_fns.c \
|
||||
base/data_type_support/odls_unpacking_fns.c
|
80
orte/mca/odls/base/base.h
Обычный файл
80
orte/mca/odls/base/base.h
Обычный файл
@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_ODLS_BASE_H
|
||||
#define MCA_ODLS_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Struct to hold globals for the odls framework
|
||||
*/
|
||||
typedef struct orte_odls_base_t {
|
||||
/* components are available */
|
||||
bool components_available;
|
||||
/* component has been selected */
|
||||
bool selected;
|
||||
/** List of opened components */
|
||||
opal_list_t available_components;
|
||||
/** selected component */
|
||||
orte_odls_base_component_t selected_component;
|
||||
} orte_odls_base_t;
|
||||
|
||||
/**
|
||||
* Global instance of odls-wide framework data
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_odls_base_t orte_odls_base;
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
|
||||
/**
|
||||
* Open the odls framework
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_odls_base_open(void);
|
||||
/**
|
||||
* Select an odls module
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_odls_base_select(void);
|
||||
|
||||
/**
|
||||
* Close the odls framework
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_odls_base_finalize(void);
|
||||
ORTE_DECLSPEC int orte_odls_base_close(void);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
31
orte/mca/odls/base/data_type_support/odls_compare_fns.c
Исполняемый файл
31
orte/mca/odls/base/data_type_support/odls_compare_fns.c
Исполняемый файл
@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/* ORTE_DAEMON_CMD */
|
||||
int orte_odls_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type)
|
||||
{
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
40
orte/mca/odls/base/data_type_support/odls_copy_fns.c
Исполняемый файл
40
orte/mca/odls/base/data_type_support/odls_copy_fns.c
Исполняемый файл
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/**
 * Duplicate an ORTE_DAEMON_CMD flag into storage obtained from malloc().
 *
 * @param dest  receives the address of the new copy (NULL if the
 *              allocation fails)
 * @param src   flag to copy
 * @param type  data type tag; not consulted by this function
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE (after logging it)
 *         when malloc() fails.
 */
int orte_odls_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
{
    orte_daemon_cmd_flag_t *copy;

    copy = (orte_daemon_cmd_flag_t*)malloc(sizeof(orte_daemon_cmd_flag_t));
    *dest = copy;
    if (NULL == copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    memcpy(copy, src, sizeof(orte_daemon_cmd_flag_t));

    return ORTE_SUCCESS;
}
|
||||
|
42
orte/mca/odls/base/data_type_support/odls_packing_fns.c
Обычный файл
42
orte/mca/odls/base/data_type_support/odls_packing_fns.c
Обычный файл
@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_odls_pack_daemon_cmd(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Turn around and pack the real type */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
45
orte/mca/odls/base/data_type_support/odls_print_fns.c
Исполняемый файл
45
orte/mca/odls/base/data_type_support/odls_print_fns.c
Исполняемый файл
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_odls_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
|
||||
{
|
||||
char *prefx;
|
||||
|
||||
/* deal with NULL prefix */
|
||||
if (NULL == prefix) asprintf(&prefx, " ");
|
||||
else prefx = prefix;
|
||||
|
||||
/* if src is NULL, just print data type and return */
|
||||
if (NULL == src) {
|
||||
asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: NULL pointer", prefx);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
asprintf(output, "%sData type: ORTE_DAEMON_CMD\tValue: %lu", prefx, (unsigned long) *src);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
30
orte/mca/odls/base/data_type_support/odls_release_fns.c
Обычный файл
30
orte/mca/odls/base/data_type_support/odls_release_fns.c
Обычный файл
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/dss/dss_types.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/*
|
||||
* STANDARD RELEASE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
|
||||
*/
|
||||
void orte_odls_std_release(orte_data_value_t *value)
|
||||
{
|
||||
free(value->data);
|
||||
value->data = NULL;
|
||||
}
|
30
orte/mca/odls/base/data_type_support/odls_size_fns.c
Исполняемый файл
30
orte/mca/odls/base/data_type_support/odls_size_fns.c
Исполняемый файл
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/*
|
||||
* STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
|
||||
*/
|
||||
int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_data_type_t type)
|
||||
{
|
||||
*size = sizeof(orte_daemon_cmd_flag_t);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
70
orte/mca/odls/base/data_type_support/odls_unpacking_fns.c
Обычный файл
70
orte/mca/odls/base/data_type_support/odls_unpacking_fns.c
Обычный файл
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
/*
|
||||
* ORTE_DAEMON_CMD
|
||||
*/
|
||||
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest, orte_std_cntr_t *num_vals,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
orte_data_type_t remote_type;
|
||||
|
||||
/* if the buffer is fully described, then we can do some magic to handle
|
||||
* the heterogeneous case. if not, then we can only shoot blind - it is the
|
||||
* user's responsibility to ensure we are in a homogeneous environment.
|
||||
*/
|
||||
if (ORTE_DSS_BUFFER_FULLY_DESC == buffer->type) {
|
||||
/* see what type was actually packed */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_peek_type(buffer, &remote_type))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (remote_type == ORTE_DAEMON_CMD_T) {
|
||||
/* fast path it if the sizes are the same */
|
||||
/* Turn around and unpack the real type */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
} else {
|
||||
/* slow path - types are different sizes */
|
||||
UNPACK_SIZE_MISMATCH(orte_daemon_cmd_flag_t, remote_type, ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* if we get here, then this buffer is NOT fully described. just unpack it
|
||||
* using the local size - user gets the pain if it's wrong
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
56
orte/mca/odls/base/odls_base_close.c
Обычный файл
56
orte/mca/odls/base/odls_base_close.c
Обычный файл
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
|
||||
int orte_odls_base_close(void)
|
||||
{
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* if no components are available, then punt */
|
||||
if (!orte_odls_base.components_available) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* If we have a selected component and module, then finalize it */
|
||||
|
||||
if (orte_odls_base.selected) {
|
||||
orte_odls_base.selected_component.finalize();
|
||||
}
|
||||
|
||||
/* Close all available components (only one in this case) */
|
||||
|
||||
mca_base_components_close(orte_odls_globals.output,
|
||||
&orte_odls_base.available_components, NULL);
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
114
orte/mca/odls/base/odls_base_open.c
Обычный файл
114
orte/mca/odls/base/odls_base_open.c
Обычный файл
@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/odls/base/static-components.h"
|
||||
|
||||
/*
|
||||
* Instantiate globals
|
||||
*/
|
||||
orte_odls_base_module_t orte_odls;
|
||||
|
||||
/*
|
||||
* Framework global variables
|
||||
*/
|
||||
orte_odls_base_t orte_odls_base;
|
||||
orte_odls_globals_t orte_odls_globals;
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
int orte_odls_base_open(void)
|
||||
{
|
||||
int param, value, rc;
|
||||
orte_data_type_t tmp;
|
||||
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Debugging / verbose output */
|
||||
|
||||
param = mca_base_param_reg_int_name("odls_base", "verbose",
|
||||
"Verbosity level for the odls framework",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
orte_odls_globals.output = opal_output_open(NULL);
|
||||
} else {
|
||||
orte_odls_globals.output = -1;
|
||||
}
|
||||
|
||||
/* register the daemon cmd data type */
|
||||
tmp = ORTE_DAEMON_CMD;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_odls_pack_daemon_cmd,
|
||||
orte_odls_unpack_daemon_cmd,
|
||||
(orte_dss_copy_fn_t)orte_odls_copy_daemon_cmd,
|
||||
(orte_dss_compare_fn_t)orte_odls_compare_daemon_cmd,
|
||||
(orte_dss_size_fn_t)orte_odls_size_daemon_cmd,
|
||||
(orte_dss_print_fn_t)orte_odls_print_daemon_cmd,
|
||||
(orte_dss_release_fn_t)orte_odls_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_DAEMON_CMD", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are NOT a daemon, then that is ALL we do! We just needed to ensure
|
||||
* that the data type(s) got registered so we can send messages to the daemons
|
||||
*/
|
||||
if (!orte_process_info.daemon) {
|
||||
orte_odls_base.components_available = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("odls", orte_odls_globals.output,
|
||||
mca_odls_base_static_components,
|
||||
&orte_odls_base.available_components, true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
orte_odls_base.components_available = true;
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
113
orte/mca/odls/base/odls_base_select.c
Обычный файл
113
orte/mca/odls/base/odls_base_select.c
Обычный файл
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
|
||||
|
||||
/**
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
*/
|
||||
int orte_odls_base_select(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
mca_base_component_list_item_t *cli;
|
||||
orte_odls_base_component_t *component, *best_component = NULL;
|
||||
orte_odls_base_module_t *module, *best_module = NULL;
|
||||
int priority, best_priority = -1;
|
||||
|
||||
/* if no components are available (e.g., we are not in a daemon), then
|
||||
* there is nothing to do - so just return
|
||||
*/
|
||||
if (!orte_odls_base.components_available) {
|
||||
orte_odls_base.selected = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Iterate through all the available components */
|
||||
|
||||
for (item = opal_list_get_first(&orte_odls_base.available_components);
|
||||
item != opal_list_get_end(&orte_odls_base.available_components);
|
||||
item = opal_list_get_next(item)) {
|
||||
cli = (mca_base_component_list_item_t *) item;
|
||||
component = (orte_odls_base_component_t *) cli->cli_component;
|
||||
|
||||
/* Call the component's init function and see if it wants to be
|
||||
selected */
|
||||
|
||||
module = component->init(&priority);
|
||||
|
||||
/* If we got a non-NULL module back, then the component wants to
|
||||
be selected. So save its multi/hidden values and save the
|
||||
module with the highest priority */
|
||||
|
||||
if (NULL != module) {
|
||||
/* If this is the best one, save it */
|
||||
|
||||
if (priority > best_priority) {
|
||||
|
||||
/* If there was a previous best one, finalize */
|
||||
|
||||
if (NULL != best_component) {
|
||||
best_component->finalize();
|
||||
}
|
||||
|
||||
/* Save the new best one */
|
||||
|
||||
best_module = module;
|
||||
best_component = component;
|
||||
|
||||
/* update the best priority */
|
||||
best_priority = priority;
|
||||
}
|
||||
|
||||
/* If it's not the best one, finalize it */
|
||||
|
||||
else {
|
||||
component->finalize();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If we didn't find one to select, then we have a big problem */
|
||||
|
||||
if (NULL == best_component) {
|
||||
orte_odls_base.selected = false;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* We have happiness -- save the component and module for later
|
||||
usage */
|
||||
|
||||
orte_odls = *best_module;
|
||||
orte_odls_base.selected_component = *best_component;
|
||||
orte_odls_base.selected = true;
|
||||
|
||||
/* all done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
80
orte/mca/odls/base/odls_private.h
Обычный файл
80
orte/mca/odls/base/odls_private.h
Обычный файл
@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_ODLS_PRIVATE_H
|
||||
#define MCA_ODLS_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/rmgr/rmgr_types.h"
|
||||
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* General ODLS types
|
||||
*/
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct orte_odls_globals_t {
|
||||
/** Verbose/debug output stream */
|
||||
int output;
|
||||
/** Time to allow process to forcibly die */
|
||||
int timeout_before_sigkill;
|
||||
} orte_odls_globals_t;
|
||||
|
||||
extern orte_odls_globals_t orte_odls_globals;
|
||||
|
||||
/*
|
||||
* data type functions
|
||||
*/
|
||||
|
||||
int orte_odls_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, orte_data_type_t type);
|
||||
|
||||
int orte_odls_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_odls_pack_daemon_cmd(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_odls_print_daemon_cmd(char **output, char *prefix, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
|
||||
|
||||
void orte_odls_std_release(orte_data_value_t *value);
|
||||
|
||||
int orte_odls_size_daemon_cmd(size_t *size, orte_daemon_cmd_flag_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_odls_unpack_daemon_cmd(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
@ -9,6 +9,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -16,37 +17,35 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
|
||||
|
||||
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(pls_bproc_orted_CPPFLAGS)
|
||||
AM_CPPFLAGS = -I$(top_ompi_builddir)/src/include $(odls_bproc_CPPFLAGS)
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_pls_bproc_orted_DSO
|
||||
if OMPI_BUILD_odls_bproc_DSO
|
||||
component_noinst =
|
||||
component_install = mca_pls_bproc_orted.la
|
||||
component_install = mca_odls_bproc.la
|
||||
else
|
||||
component_noinst = libmca_pls_bproc_orted.la
|
||||
component_noinst = libmca_odls_bproc.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
sources = \
|
||||
pls_bproc_orted.h \
|
||||
pls_bproc_orted.c \
|
||||
pls_bproc_orted_component.c
|
||||
odls_bproc.h \
|
||||
odls_bproc.c \
|
||||
odls_bproc_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_pls_bproc_orted_la_SOURCES = $(sources)
|
||||
mca_pls_bproc_orted_la_LIBADD = \
|
||||
$(pls_bproc_orted_LIBS) \
|
||||
mca_odls_bproc_la_SOURCES = $(sources)
|
||||
mca_odls_bproc_la_LIBADD = \
|
||||
$(odls_bproc_LIBS) \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_pls_bproc_orted_la_LDFLAGS = -module -avoid-version $(pls_bproc_orted_LDFLAGS)
|
||||
|
||||
mca_odls_bproc_la_LDFLAGS = -module -avoid-version $(odls_bproc_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_pls_bproc_orted_la_SOURCES = $(sources)
|
||||
libmca_pls_bproc_orted_la_LIBADD = $(pls_bproc_orted_LIBS)
|
||||
libmca_pls_bproc_orted_la_LDFLAGS = -module -avoid-version $(pls_bproc_orted_LDFLAGS)
|
||||
libmca_odls_bproc_la_SOURCES = $(sources)
|
||||
libmca_odls_bproc_la_LIBADD = $(odls_bproc_LIBS)
|
||||
libmca_odls_bproc_la_LDFLAGS = -module -avoid-version $(odls_bproc_LDFLAGS)
|
38
orte/mca/odls/bproc/configure.m4
Обычный файл
38
orte/mca/odls/bproc/configure.m4
Обычный файл
@ -0,0 +1,38 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_odls_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_odls_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([odls_bproc], [odls_bproc_good=1],
|
||||
[odls_bproc_good=1], [odls_bproc_good=0])
|
||||
|
||||
# If the check succeeded, set the wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$odls_bproc_good" = "1"],
|
||||
[odls_bproc_WRAPPER_EXTRA_LDFLAGS="$odls_bproc_LDFLAGS"
|
||||
odls_bproc_WRAPPER_EXTRA_LIBS="$odls_bproc_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([odls_bproc_CPPFLAGS])
|
||||
AC_SUBST([odls_bproc_LDFLAGS])
|
||||
AC_SUBST([odls_bproc_LIBS])
|
||||
])dnl
|
23
orte/mca/odls/bproc/configure.params
Обычный файл
23
orte/mca/odls/bproc/configure.params
Обычный файл
@ -0,0 +1,23 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=odls_bproc_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -18,7 +18,7 @@
|
||||
/**
|
||||
* @file:
|
||||
* Part of the bproc launcher.
|
||||
* See pls_bproc_orted.h for an overview of how it works.
|
||||
* See odls_bproc.h for an overview of how it works.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <stdlib.h>
|
||||
@ -44,35 +44,32 @@
|
||||
#include "orte/mca/iof/base/iof_base_setup.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/univ_info.h"
|
||||
|
||||
#include "pls_bproc_orted.h"
|
||||
#include "odls_bproc.h"
|
||||
|
||||
/**
|
||||
* Initialization of the bproc_orted module with all the needed function pointers
|
||||
*/
|
||||
orte_pls_base_module_1_0_0_t orte_pls_bproc_orted_module = {
|
||||
orte_pls_bproc_orted_launch,
|
||||
orte_pls_bproc_orted_terminate_job,
|
||||
orte_pls_bproc_orted_terminate_proc,
|
||||
orte_pls_bproc_orted_signal_job,
|
||||
orte_pls_bproc_orted_signal_proc,
|
||||
orte_pls_bproc_orted_finalize
|
||||
orte_odls_base_module_t orte_odls_bproc_module = {
|
||||
orte_odls_bproc_subscribe_launch_data,
|
||||
orte_odls_bproc_launch_local_procs,
|
||||
orte_odls_bproc_kill_local_procs,
|
||||
orte_odls_bproc_signal_local_procs
|
||||
};
|
||||
|
||||
static int pls_bproc_orted_make_dir(char *directory);
|
||||
static char * pls_bproc_orted_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
size_t app_context);
|
||||
static void pls_bproc_orted_delete_dir_tree(char * path);
|
||||
static int pls_bproc_orted_remove_dir(void);
|
||||
static void pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
|
||||
static int odls_bproc_make_dir(char *directory);
|
||||
static char * odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context);
|
||||
static void odls_bproc_delete_dir_tree(char * path);
|
||||
static int odls_bproc_remove_dir(void);
|
||||
static void odls_bproc_send_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata);
|
||||
static int pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name,
|
||||
static int odls_bproc_setup_stdio(orte_process_name_t *proc_name,
|
||||
int proc_rank, orte_jobid_t jobid,
|
||||
size_t app_context, bool connect_stdin);
|
||||
orte_std_cntr_t app_context, bool connect_stdin);
|
||||
|
||||
|
||||
/**
|
||||
@ -83,13 +80,13 @@ static int pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name,
|
||||
* @retval error
|
||||
*/
|
||||
static int
|
||||
pls_bproc_orted_make_dir(char *directory)
|
||||
odls_bproc_make_dir(char *directory)
|
||||
{
|
||||
struct stat buf;
|
||||
mode_t my_mode = S_IRWXU; /* at the least, I need to be able to do anything */
|
||||
|
||||
if (0 == stat(directory, &buf)) { /* exists - delete it and its contents */
|
||||
pls_bproc_orted_delete_dir_tree(directory);
|
||||
odls_bproc_delete_dir_tree(directory);
|
||||
}
|
||||
/* try to create it with proper mode */
|
||||
return(opal_os_dirpath_create(directory, my_mode));
|
||||
@ -108,8 +105,8 @@ pls_bproc_orted_make_dir(char *directory)
|
||||
* @retval path
|
||||
*/
|
||||
static char *
|
||||
pls_bproc_orted_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
size_t app_context)
|
||||
odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context)
|
||||
{
|
||||
char *path = NULL, *user = NULL, *job = NULL;
|
||||
int rc;
|
||||
@ -141,6 +138,9 @@ static char *
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
path = NULL;
|
||||
}
|
||||
if(0 < mca_odls_bproc_component.debug) {
|
||||
opal_output(0, "odls bproc io setup. Path: %s\n", path);
|
||||
}
|
||||
free(user);
|
||||
free(job);
|
||||
return path;
|
||||
@ -152,7 +152,7 @@ static char *
|
||||
* @param path the path to the base directory to delete
|
||||
*/
|
||||
static void
|
||||
pls_bproc_orted_delete_dir_tree(char * path)
|
||||
odls_bproc_delete_dir_tree(char * path)
|
||||
{
|
||||
DIR *dp;
|
||||
struct dirent *ep;
|
||||
@ -170,7 +170,7 @@ pls_bproc_orted_delete_dir_tree(char * path)
|
||||
filenm = opal_os_path(false, path, ep->d_name, NULL);
|
||||
ret = stat(filenm, &buf);
|
||||
if (ret < 0 || S_ISDIR(buf.st_mode)) {
|
||||
pls_bproc_orted_delete_dir_tree(filenm);
|
||||
odls_bproc_delete_dir_tree(filenm);
|
||||
free(filenm);
|
||||
continue;
|
||||
}
|
||||
@ -190,7 +190,7 @@ pls_bproc_orted_delete_dir_tree(char * path)
|
||||
* @retval error
|
||||
*/
|
||||
static int
|
||||
pls_bproc_orted_remove_dir()
|
||||
odls_bproc_remove_dir()
|
||||
{
|
||||
char *frontend = NULL, *user = NULL, *filename = NULL;
|
||||
int id;
|
||||
@ -213,7 +213,7 @@ pls_bproc_orted_remove_dir()
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* we do our best to clean up the directory tree, but we ignore errors*/
|
||||
pls_bproc_orted_delete_dir_tree(frontend);
|
||||
odls_bproc_delete_dir_tree(frontend);
|
||||
free(frontend);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -228,7 +228,7 @@ pls_bproc_orted_remove_dir()
|
||||
* @param cbdata
|
||||
*/
|
||||
static void
|
||||
pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
|
||||
odls_bproc_send_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
@ -257,9 +257,9 @@ pls_bproc_orted_send_cb(int status, orte_process_name_t * peer,
|
||||
* @retval error
|
||||
*/
|
||||
static int
|
||||
pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
odls_bproc_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
orte_jobid_t jobid,
|
||||
size_t app_context, bool connect_stdin)
|
||||
orte_std_cntr_t app_context, bool connect_stdin)
|
||||
{
|
||||
char *path_prefix, *fd_link_path = NULL;
|
||||
int rc = ORTE_SUCCESS, fd;
|
||||
@ -269,7 +269,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
struct termios term_attrs;
|
||||
#endif
|
||||
|
||||
path_prefix = pls_bproc_orted_get_base_dir_name(proc_rank, jobid, app_context);
|
||||
path_prefix = odls_bproc_get_base_dir_name(proc_rank, jobid, (size_t)app_context);
|
||||
if (NULL == path_prefix) {
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -277,7 +277,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
}
|
||||
|
||||
/* check for existence and access, or create it */
|
||||
if (ORTE_SUCCESS != (rc = pls_bproc_orted_make_dir(path_prefix))) {
|
||||
if (ORTE_SUCCESS != (rc = odls_bproc_make_dir(path_prefix))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
@ -294,7 +294,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
|
||||
if (connect_stdin) {
|
||||
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
|
||||
perror("pls_bproc_orted mkfifo failed");
|
||||
perror("odls_bproc mkfifo failed");
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -302,7 +302,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
|
||||
fd = open(fd_link_path, O_RDWR);
|
||||
if (-1 == fd) {
|
||||
perror("pls_bproc_orted open failed");
|
||||
perror("odls_bproc open failed");
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -312,7 +312,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
ORTE_IOF_STDIN, fd);
|
||||
} else {
|
||||
if(0 != symlink("/dev/null", fd_link_path)) {
|
||||
perror("pls_bproc_orted could not create symlink");
|
||||
perror("odls_bproc could not create symlink");
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -332,7 +332,7 @@ pls_bproc_orted_setup_stdio(orte_process_name_t *proc_name, int proc_rank,
|
||||
|
||||
#if defined(HAVE_OPENPTY) && (OMPI_ENABLE_PTY_SUPPORT != 0)
|
||||
if (0 != openpty(&amaster, &aslave, pty_name, NULL, NULL)) {
|
||||
opal_output(0, "pls_bproc_orted: openpty failed, using pipes instead");
|
||||
opal_output(0, "odls_bproc: openpty failed, using pipes instead");
|
||||
goto stdout_fifo_setup;
|
||||
}
|
||||
|
||||
@ -366,14 +366,14 @@ stdout_fifo_setup:
|
||||
#endif
|
||||
|
||||
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
|
||||
perror("pls_bproc_orted mkfifo failed");
|
||||
perror("odls_bproc mkfifo failed");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
fd = open(fd_link_path, O_RDWR);
|
||||
if (-1 == fd) {
|
||||
perror("pls_bproc_orted open failed");
|
||||
perror("odls_bproc open failed");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -395,14 +395,14 @@ stderr_fifo_setup:
|
||||
}
|
||||
|
||||
if (0 != mkfifo(fd_link_path, S_IRWXU)) {
|
||||
perror("pls_bproc_orted mkfifo failed");
|
||||
perror("odls_bproc mkfifo failed");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
fd = open(fd_link_path, O_RDWR);
|
||||
if (-1 == fd) {
|
||||
perror("pls_bproc_orted open failed");
|
||||
perror("odls_bproc open failed");
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -421,28 +421,118 @@ cleanup:
|
||||
}
|
||||
|
||||
|
||||
/* this entire function gets called within a GPR compound command,
|
||||
* so the subscription actually doesn't get done until the orted
|
||||
* executes the compound command
|
||||
*/
|
||||
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc)
|
||||
{
|
||||
char *segment;
|
||||
orte_gpr_value_t *values[1];
|
||||
orte_gpr_subscription_t *subs, sub=ORTE_GPR_SUBSCRIPTION_EMPTY;
|
||||
orte_gpr_trigger_t *trigs, trig=ORTE_GPR_TRIGGER_EMPTY;
|
||||
char* keys[] = {
|
||||
ORTE_PROC_NAME_KEY,
|
||||
ORTE_PROC_APP_CONTEXT_KEY,
|
||||
ORTE_NODE_NAME_KEY,
|
||||
};
|
||||
int num_keys = 3;
|
||||
int i, rc;
|
||||
|
||||
/* get the job segment name */
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* attach ourselves to the "standard" orted trigger */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_schema.get_std_trigger_name(&(trig.name),
|
||||
ORTED_LAUNCH_STAGE_GATE_TRIGGER, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* ask for return of all data required for launching local processes */
|
||||
subs = ⊂
|
||||
sub.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&(sub.name),
|
||||
ORTED_LAUNCH_STG_SUB,
|
||||
job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(trig.name);
|
||||
return rc;
|
||||
}
|
||||
sub.cnt = 1;
|
||||
sub.values = values;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[0]), ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR,
|
||||
segment, num_keys, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
return rc;
|
||||
}
|
||||
for (i=0; i < num_keys; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[0]->keyvals[i]),
|
||||
keys[i], ORTE_UNDEF, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
OBJ_RELEASE(values[0]);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
sub.cbfunc = cbfunc;
|
||||
|
||||
trigs = &trig;
|
||||
|
||||
/* do the subscription */
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.subscribe(1, &subs, 1, &trigs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
OBJ_RELEASE(values[0]);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup io for the current node, then tell orterun we are ready for the actual
|
||||
* processes.
|
||||
* @param jobid The jobid of the job to launch
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
int
|
||||
orte_pls_bproc_orted_launch(orte_jobid_t jobid)
|
||||
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
{
|
||||
opal_list_t map;
|
||||
orte_rmaps_base_map_t * mapping;
|
||||
orte_rmaps_base_proc_t * proc;
|
||||
odls_bproc_child_t *child;
|
||||
opal_list_item_t* item;
|
||||
orte_gpr_value_t *value, **values;
|
||||
orte_gpr_keyval_t *kval;
|
||||
char *node_name;
|
||||
int rc;
|
||||
int num_procs = 0;
|
||||
size_t i;
|
||||
orte_std_cntr_t i, j, kv, kv2, *sptr;
|
||||
int src = 0;
|
||||
orte_buffer_t *ack;
|
||||
char * param;
|
||||
bool connect_stdin;
|
||||
char * pty_name = NULL;
|
||||
orte_jobid_t jobid;
|
||||
|
||||
/* first, retrieve the job number we are to launch from the
|
||||
* returned data - we can extract the jobid directly from the
|
||||
* subscription name we created
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, data->target))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* hack for bproc4, change process group so that we do not receive signals
|
||||
@ -451,55 +541,87 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
|
||||
*/
|
||||
setpgid(0,0);
|
||||
|
||||
/* get current node number */
|
||||
rc = bproc_currnode();
|
||||
if(0 > rc) {
|
||||
opal_output(0, "pls_bproc_orted component running on invalid node");
|
||||
}
|
||||
if(0 > asprintf(&param, "%d", rc)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
/* query the allocation for this node */
|
||||
OBJ_CONSTRUCT(&map, opal_list_t);
|
||||
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid, jobid,
|
||||
param, &map);
|
||||
free(param);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
/* loop through the returned data to find the global info and
|
||||
* the info for processes going onto this node
|
||||
*/
|
||||
values = (orte_gpr_value_t**)(data->values)->addr;
|
||||
for (j=0, i=0; i < data->cnt && j < (data->values)->size; j++) { /* loop through all returned values */
|
||||
if (NULL != values[j]) {
|
||||
i++;
|
||||
value = values[j];
|
||||
/* this must have come from one of the process containers, so it must
|
||||
* contain data for a proc structure - see if it belongs to this node
|
||||
*/
|
||||
for (kv=0; kv < value->cnt; kv++) {
|
||||
kval = value->keyvals[kv];
|
||||
if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||
/* Most C-compilers will bark if we try to directly compare the string in the
|
||||
* kval data area against a regular string, so we need to "get" the data
|
||||
* so we can access it */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* if this is our node...must also protect against a zero-length string */
|
||||
if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
|
||||
/* ...harvest the info into a new child structure */
|
||||
child = OBJ_NEW(odls_bproc_child_t);
|
||||
for (kv2 = 0; kv2 < value->cnt; kv2++) {
|
||||
kval = value->keyvals[kv2];
|
||||
if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) {
|
||||
/* copy the name into the child object */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
child->app_idx = *sptr; /* save the index into the app_context objects */
|
||||
continue;
|
||||
}
|
||||
} /* kv2 */
|
||||
/* protect operation on the global list of children */
|
||||
OPAL_THREAD_LOCK(&mca_odls_bproc_component.mutex);
|
||||
opal_list_append(&mca_odls_bproc_component.children, &child->super);
|
||||
opal_condition_signal(&mca_odls_bproc_component.cond);
|
||||
OPAL_THREAD_UNLOCK(&mca_odls_bproc_component.mutex);
|
||||
|
||||
}
|
||||
}
|
||||
} /* for kv */
|
||||
} /* for j */
|
||||
}
|
||||
|
||||
/* figure out what processes will be on this node and set up the io files */
|
||||
for(item = opal_list_get_first(&map);
|
||||
item != opal_list_get_end(&map);
|
||||
/* set up the io files for our children */
|
||||
for(item = opal_list_get_first(&mca_odls_bproc_component.children);
|
||||
item != opal_list_get_end(&mca_odls_bproc_component.children);
|
||||
item = opal_list_get_next(item)) {
|
||||
mapping = (orte_rmaps_base_map_t *) item;
|
||||
num_procs = 0;
|
||||
for(i = mapping->num_procs; i > 0; i--) {
|
||||
proc = mapping->procs[i - 1];
|
||||
if(0 < mca_pls_bproc_orted_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc_orted_launch: setting up io for "
|
||||
"[%lu,%lu,%lu] proc rank %lu\n",
|
||||
ORTE_NAME_ARGS((&proc->proc_name)),
|
||||
proc->proc_rank);
|
||||
}
|
||||
/* only setup to forward stdin if it is rank 0, otherwise connect
|
||||
* to /dev/null */
|
||||
if(0 == proc->proc_rank) {
|
||||
connect_stdin = true;
|
||||
} else {
|
||||
connect_stdin = false;
|
||||
}
|
||||
child = (odls_bproc_child_t *) item;
|
||||
if(0 < mca_odls_bproc_component.debug) {
|
||||
opal_output(0, "orte_odls_bproc_launch: setting up io for "
|
||||
"[%lu,%lu,%lu] proc rank %lu\n",
|
||||
ORTE_NAME_ARGS((child->name)),
|
||||
child->name->vpid);
|
||||
}
|
||||
/* only setup to forward stdin if it is rank 0, otherwise connect
|
||||
* to /dev/null */
|
||||
if(0 == child->name->vpid) {
|
||||
connect_stdin = true;
|
||||
} else {
|
||||
connect_stdin = false;
|
||||
}
|
||||
|
||||
rc = pls_bproc_orted_setup_stdio(&proc->proc_name, num_procs,
|
||||
jobid, mapping->app->idx,
|
||||
connect_stdin);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
num_procs++;
|
||||
rc = odls_bproc_setup_stdio(child->name, (int)child->name->vpid,
|
||||
jobid, child->app_idx,
|
||||
connect_stdin);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
@ -509,8 +631,8 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
rc = mca_oob_send_packed_nb(MCA_OOB_NAME_SEED, ack, MCA_OOB_TAG_BPROC, 0,
|
||||
pls_bproc_orted_send_cb, NULL);
|
||||
rc = mca_oob_send_packed_nb(ORTE_RML_NAME_SEED, ack, ORTE_RML_TAG_BPROC, 0,
|
||||
odls_bproc_send_cb, NULL);
|
||||
if (0 > rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -518,13 +640,7 @@ orte_pls_bproc_orted_launch(orte_jobid_t jobid)
|
||||
rc = ORTE_SUCCESS;
|
||||
|
||||
cleanup:
|
||||
while(NULL != (item = opal_list_remove_first(&map))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
if(NULL != pty_name) {
|
||||
free(pty_name);
|
||||
}
|
||||
OBJ_DESTRUCT(&map);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -532,37 +648,8 @@ cleanup:
|
||||
* Function to terminate a job. Since this component only runs on remote nodes
|
||||
* and doesn't actually launch any processes, this function is not needed
|
||||
* so is a noop.
|
||||
* @param jobid The job to terminate
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_terminate_job(orte_jobid_t jobid)
|
||||
{
|
||||
orte_iof.iof_flush();
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Function to terminate a process. Since this component only runs on remote nodes
|
||||
* and doesn't actually launch any processes, this function is not needed
|
||||
* so is a noop.
|
||||
* @param proc the process's name
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_terminate_proc(const orte_process_name_t* proc)
|
||||
{
|
||||
orte_iof.iof_flush();
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Function to signal a job. Since this component only runs on remote nodes
|
||||
* and doesn't actually launch any processes, this function is not needed
|
||||
* so is a noop.
|
||||
* @param jobid The job to signal
|
||||
* @param signal The signal to send
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
{
|
||||
orte_iof.iof_flush();
|
||||
return ORTE_SUCCESS;
|
||||
@ -576,7 +663,7 @@ int orte_pls_bproc_orted_signal_job(orte_jobid_t jobid, int32_t signal)
|
||||
* @param signal The signal to send
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_signal_proc(const orte_process_name_t* proc, int32_t signal)
|
||||
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc, int32_t signal)
|
||||
{
|
||||
orte_iof.iof_flush();
|
||||
return ORTE_SUCCESS;
|
||||
@ -584,14 +671,14 @@ int orte_pls_bproc_orted_signal_proc(const orte_process_name_t* proc, int32_t si
|
||||
|
||||
|
||||
/**
|
||||
* Finalizes the bproc_orted module. Cleanup tmp directory/files
|
||||
* Finalizes the bproc module. Cleanup tmp directory/files
|
||||
* used for I/O forwarding.
|
||||
* @retval ORTE_SUCCESS
|
||||
*/
|
||||
int orte_pls_bproc_orted_finalize(void)
|
||||
int orte_odls_bproc_finalize(void)
|
||||
{
|
||||
orte_iof.iof_flush();
|
||||
pls_bproc_orted_remove_dir();
|
||||
odls_bproc_remove_dir();
|
||||
orte_session_dir_finalize(orte_process_info.my_name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
108
orte/mca/odls/bproc/odls_bproc.h
Обычный файл
108
orte/mca/odls/bproc/odls_bproc.h
Обычный файл
@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Part of the bproc launching system. This launching system is broken into 2
|
||||
* parts: one runs under the PLS on the head node to launch the orteds, and the
|
||||
* other serves as the orted's local launcher.
|
||||
*
|
||||
* The main job of this component is to setup ptys/pipes for IO forwarding.
|
||||
* See pls_bproc.h for an overview of how the entire bproc launching system works.
|
||||
*/
|
||||
#ifndef ORTE_ODLS_BPROC_H_
|
||||
#define ORTE_ODLS_BPROC_H_
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_odls_bproc_component_open(void);
|
||||
int orte_odls_bproc_component_close(void);
|
||||
int orte_odls_bproc_finalize(void);
|
||||
orte_odls_base_module_t* orte_odls_bproc_init(int *priority);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
int orte_odls_bproc_finalize(void);
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
|
||||
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data);
|
||||
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc_name, int32_t signal);
|
||||
|
||||
/**
|
||||
* ODLS bproc component
|
||||
*/
|
||||
struct orte_odls_bproc_component_t {
|
||||
orte_odls_base_component_t super;
|
||||
/**< The base class */
|
||||
int debug;
|
||||
/**< If greater than 0 print debugging information */
|
||||
int priority;
|
||||
/**< The priority of this component. This will be returned if we determine
|
||||
* that bproc is available and running on this node, */
|
||||
opal_mutex_t lock;
|
||||
/**< Lock used to prevent some race conditions */
|
||||
opal_condition_t cond;
|
||||
/**< Condition used to wake up waiting threads */
|
||||
opal_list_t children;
|
||||
/**< list of children on this node */
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct orte_odls_bproc_component_t orte_odls_bproc_component_t;
|
||||
|
||||
/*
|
||||
* List object to locally store the process names and pids of
|
||||
* our children. This can subsequently be used to order termination
|
||||
* or pass signals without looking the info up again.
|
||||
*/
|
||||
typedef struct odls_bproc_child_t {
|
||||
opal_list_item_t super; /* required to place this on a list */
|
||||
orte_process_name_t *name; /* the OpenRTE name of the proc */
|
||||
pid_t pid; /* local pid of the proc */
|
||||
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
|
||||
bool alive; /* is this proc alive? */
|
||||
} odls_bproc_child_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_bproc_child_t);
|
||||
|
||||
ORTE_DECLSPEC orte_odls_bproc_component_t mca_odls_bproc_component;
|
||||
ORTE_DECLSPEC orte_odls_base_module_t orte_odls_bproc_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif /* ORTE_ODLS_BPROC_H_ */
|
||||
|
@ -21,32 +21,51 @@
|
||||
* Takes care of the component stuff for the MCA.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "pls_bproc_orted.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "odls_bproc.h"
|
||||
|
||||
/* instance the child list object */
|
||||
/**
 * Constructor for odls_bproc_child_t list items.
 *
 * Puts every bookkeeping field into a defined "no process recorded yet"
 * state so a freshly created item can be inspected before it is filled in.
 *
 * @param ptr The child object being constructed
 */
static void odls_bproc_child_constructor(odls_bproc_child_t *ptr)
{
    ptr->name = NULL;
    /* pid used to be left uninitialized here; zero it the same way the
     * default odls component's child constructor does */
    ptr->pid = 0;
    ptr->app_idx = -1;
    ptr->alive = false;
}

/**
 * Destructor for odls_bproc_child_t list items.
 *
 * Frees the process name held by the child, if any.
 *
 * @param ptr The child object being destroyed
 */
static void odls_bproc_child_destructor(odls_bproc_child_t *ptr)
{
    /* free(NULL) is a no-op, so no guard is needed */
    free(ptr->name);
}

OBJ_CLASS_INSTANCE(odls_bproc_child_t,
                   opal_list_item_t,
                   odls_bproc_child_constructor,
                   odls_bproc_child_destructor);
|
||||
|
||||
/**
|
||||
* The bproc_orted component data structure used to store all the relevent data
|
||||
* The bproc component data structure used to store all the relevant data
|
||||
* about this component.
|
||||
*/
|
||||
orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
|
||||
orte_odls_bproc_component_t mca_odls_bproc_component = {
|
||||
{
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
{
|
||||
/* Indicate that we are a pls v1.0.0 component (which also
|
||||
/* Indicate that we are an odls v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
ORTE_PLS_BASE_VERSION_1_0_0,
|
||||
ORTE_ODLS_BASE_VERSION_1_3_0,
|
||||
/* Component name and version */
|
||||
"bproc_orted",
|
||||
"bproc",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
/* Component open and close functions */
|
||||
orte_pls_bproc_orted_component_open,
|
||||
orte_pls_bproc_orted_component_close
|
||||
orte_odls_bproc_component_open,
|
||||
orte_odls_bproc_component_close
|
||||
},
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
@ -54,7 +73,8 @@ orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
|
||||
false
|
||||
},
|
||||
/* Initialization / querying functions */
|
||||
orte_pls_bproc_orted_init
|
||||
orte_odls_bproc_init,
|
||||
orte_odls_bproc_finalize
|
||||
}
|
||||
};
|
||||
|
||||
@ -62,18 +82,20 @@ orte_pls_bproc_orted_component_t mca_pls_bproc_orted_component = {
|
||||
* Opens the odls_bproc component, setting all the needed mca parameters and
|
||||
* finishes setting up the component struct.
|
||||
*/
|
||||
int orte_pls_bproc_orted_component_open(void)
|
||||
int orte_odls_bproc_component_open(void)
|
||||
{
|
||||
/* initialize globals */
|
||||
OBJ_CONSTRUCT(&mca_pls_bproc_orted_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.children, opal_list_t);
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(&mca_pls_bproc_orted_component.super.pls_version,
|
||||
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
|
||||
"priority", NULL, false, false, 100,
|
||||
&mca_pls_bproc_orted_component.priority);
|
||||
mca_base_param_reg_int(&mca_pls_bproc_orted_component.super.pls_version,
|
||||
&mca_odls_bproc_component.priority);
|
||||
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
|
||||
"debug", "If > 0 prints library debugging information",
|
||||
false, false, 0, &mca_pls_bproc_orted_component.debug);
|
||||
false, false, 0, &mca_odls_bproc_component.debug);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -81,16 +103,16 @@ int orte_pls_bproc_orted_component_open(void)
|
||||
* Initializes the module. We do not want to run unless we are not the seed,
|
||||
* bproc is running, and we are not on the master node.
|
||||
*/
|
||||
orte_pls_base_module_t *orte_pls_bproc_orted_init(int *priority)
|
||||
orte_odls_base_module_t *orte_odls_bproc_init(int *priority)
|
||||
{
|
||||
int ret;
|
||||
struct bproc_version_t version;
|
||||
|
||||
/* are we the seed */
|
||||
if(orte_process_info.seed == true)
|
||||
return NULL;
|
||||
|
||||
/* okay, we are in a daemon - now check to see if BProc is running here */
|
||||
/* the base open/select logic protects us against operation when
|
||||
* we are NOT in a daemon, so we don't have to check that here
|
||||
*/
|
||||
|
||||
/* check to see if BProc is running here */
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
return NULL;
|
||||
@ -101,16 +123,29 @@ orte_pls_base_module_t *orte_pls_bproc_orted_init(int *priority)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_pls_bproc_orted_component.priority;
|
||||
return &orte_pls_bproc_orted_module;
|
||||
*priority = mca_odls_bproc_component.priority;
|
||||
return &orte_odls_bproc_module;
|
||||
}
|
||||
|
||||
/**
|
||||
* Component close function.
|
||||
*/
|
||||
int orte_pls_bproc_orted_component_close(void)
|
||||
int orte_odls_bproc_component_close(void)
|
||||
{
|
||||
OBJ_DESTRUCT(&mca_pls_bproc_orted_component.lock);
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.lock);
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.cond);
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.children);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_odls_bproc_component_finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup state */
|
||||
while (NULL != (item = opal_list_remove_first(&mca_odls_bproc_component.children))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
48
orte/mca/odls/default/Makefile.am
Обычный файл
48
orte/mca/odls/default/Makefile.am
Обычный файл
@ -0,0 +1,48 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-odls-default.txt
|
||||
|
||||
sources = \
|
||||
odls_default.h \
|
||||
odls_default_component.c \
|
||||
odls_default_module.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_odls_default_DSO
|
||||
component_noinst =
|
||||
component_install = mca_odls_default.la
|
||||
else
|
||||
component_noinst = libmca_odls_default.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_odls_default_la_SOURCES = $(sources)
|
||||
mca_odls_default_la_LDFLAGS = -module -avoid-version
|
||||
mca_odls_default_la_LIBADD = \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_odls_default_la_SOURCES =$(sources)
|
||||
libmca_odls_default_la_LDFLAGS = -module -avoid-version
|
@ -17,8 +17,9 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_pls_fork_CONFIG([action-if-found], [action-if-not-found])
|
||||
# MCA_odls_default_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_fork_CONFIG],[
|
||||
AC_DEFUN([MCA_odls_default_CONFIG],[
|
||||
AC_CHECK_FUNC([fork], [$1], [$2])
|
||||
])dnl
|
||||
|
@ -17,5 +17,5 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=pls_fork_component.c
|
||||
PARAM_INIT_FILE=odls_default_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -16,36 +16,23 @@
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open RTE's orterun.
|
||||
# This is the US/English general help file for Open RTE's orted launcher.
|
||||
#
|
||||
[orte-pls-fork:chdir-error]
|
||||
[odls-default:chdir-error]
|
||||
Failed to change to the working directory:
|
||||
|
||||
Host: %s
|
||||
Directory: %s
|
||||
|
||||
The error returned was "%s". Execution will now abort.
|
||||
[orte-pls-fork:argv0-not-found]
|
||||
Failed to find the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[orte-pls-fork:argv0-not-accessible]
|
||||
[odls-default:argv0-not-accessible]
|
||||
Failed to find or execute the following executable:
|
||||
|
||||
Host: %s
|
||||
Executable: %s
|
||||
|
||||
Cannot continue.
|
||||
[orte-pls-fork:execv-error]
|
||||
Could not execute the executable "%s": %s
|
||||
|
||||
This could mean that your PATH or executable name is wrong, or that you do not
|
||||
have the necessary permissions. Please ensure that the executable is able to be
|
||||
found and executed.
|
||||
[orte-pls-fork:could-not-kill]
|
||||
[odls-default:could-not-kill]
|
||||
WARNING: A process refused to die!
|
||||
|
||||
Host: %s
|
||||
@ -53,7 +40,7 @@ PID: %d
|
||||
|
||||
This process may still be running and/or consuming resources.
|
||||
|
||||
[orte-pls-fork:could-not-kill]
|
||||
[odls-default:could-not-send-kill]
|
||||
WARNING: A process refused the kill SIGTERM signal!
|
||||
This should never happen unless the application is changing the
|
||||
parent/child relationship permissions.
|
105
orte/mca/odls/default/odls_default.h
Обычный файл
105
orte/mca/odls/default/odls_default.h
Обычный файл
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_ODLS_H
|
||||
#define ORTE_ODLS_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_odls_default_component_open(void);
|
||||
int orte_odls_default_component_close(void);
|
||||
orte_odls_base_module_t* orte_odls_default_component_init(int *priority);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
int orte_odls_default_finalize(void);
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
|
||||
int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data);
|
||||
int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
int orte_odls_default_signal_local_procs(orte_process_name_t *proc,
|
||||
int32_t signal);
|
||||
|
||||
/**
|
||||
* ODLS Default globals
|
||||
*/
|
||||
typedef struct orte_odls_default_globals_t {
|
||||
opal_mutex_t mutex;
|
||||
opal_condition_t cond;
|
||||
opal_list_t children;
|
||||
} orte_odls_default_globals_t;
|
||||
|
||||
extern orte_odls_default_globals_t orte_odls_default;
|
||||
|
||||
/*
|
||||
* List object to locally store the process names and pids of
|
||||
* our children. This can subsequently be used to order termination
|
||||
* or pass signals without looking the info up again.
|
||||
*/
|
||||
typedef struct odls_default_child_t {
|
||||
opal_list_item_t super; /* required to place this on a list */
|
||||
orte_process_name_t *name; /* the OpenRTE name of the proc */
|
||||
pid_t pid; /* local pid of the proc */
|
||||
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
|
||||
bool alive; /* is this proc alive? */
|
||||
} odls_default_child_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_default_child_t);
|
||||
|
||||
/*
|
||||
* List object to locally store app_contexts returned by the
|
||||
* registry subscription. Since we don't know how many app_contexts will
|
||||
* be returned, we need to store them on a list.
|
||||
*/
|
||||
typedef struct odls_default_app_context_t {
|
||||
opal_list_item_t super; /* required to place this on a list */
|
||||
orte_app_context_t *app_context;
|
||||
} odls_default_app_context_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(odls_default_app_context_t);
|
||||
|
||||
/*
|
||||
* ODLS Default module
|
||||
*/
|
||||
extern orte_odls_base_module_t orte_odls_default_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif /* ORTE_ODLS_H */
|
163
orte/mca/odls/default/odls_default_component.c
Обычный файл
163
orte/mca/odls/default/odls_default_component.c
Обычный файл
@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
|
||||
#include "orte/mca/odls/default/odls_default.h"
|
||||
|
||||
/* Instantiate the component globals */
|
||||
orte_odls_default_globals_t orte_odls_default;
|
||||
|
||||
|
||||
/* instance the child list object */
|
||||
/**
 * Constructor for odls_default_child_t list items: resets every field to
 * its "no process recorded yet" value.
 *
 * @param ptr The child object being constructed
 */
static void odls_default_child_constructor(odls_default_child_t *ptr)
{
    ptr->alive = false;
    ptr->app_idx = -1;
    ptr->pid = 0;
    ptr->name = NULL;
}

/**
 * Destructor for odls_default_child_t list items: frees the process name
 * held by the child, if any.
 *
 * @param ptr The child object being destroyed
 */
static void odls_default_child_destructor(odls_default_child_t *ptr)
{
    /* free(NULL) is a no-op, so an unset name needs no special case */
    free(ptr->name);
}

OBJ_CLASS_INSTANCE(odls_default_child_t,
                   opal_list_item_t,
                   odls_default_child_constructor,
                   odls_default_child_destructor);
|
||||
|
||||
/* instance the app_context list object */
|
||||
OBJ_CLASS_INSTANCE(odls_default_app_context_t,
|
||||
opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
|
||||
orte_odls_base_component_t mca_odls_default_component = {
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
{
|
||||
/* Indicate that we are an odls v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_ODLS_BASE_VERSION_1_3_0,
|
||||
/* Component name and version */
|
||||
|
||||
"default",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
|
||||
orte_odls_default_component_open,
|
||||
orte_odls_default_component_close
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
|
||||
true
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
|
||||
orte_odls_default_component_init,
|
||||
orte_odls_default_finalize
|
||||
};
|
||||
|
||||
|
||||
|
||||
int orte_odls_default_component_open(void)
|
||||
{
|
||||
/* initialize globals */
|
||||
OBJ_CONSTRUCT(&orte_odls_default.mutex, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&orte_odls_default.cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&orte_odls_default.children, opal_list_t);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_odls_base_module_t *orte_odls_default_component_init(int *priority)
|
||||
{
|
||||
/* the base open/select logic protects us against operation when
|
||||
* we are NOT in a daemon, so we don't have to check that here
|
||||
*/
|
||||
|
||||
/* we have built some logic into the configure.m4 file that checks
|
||||
* to see if we have "fork" support and only builds this component
|
||||
* if we do. Hence, we only get here if we CAN build - in which
|
||||
* case, we definitely should be considered for selection
|
||||
*/
|
||||
*priority = 1; /* let others override us - we are the default */
|
||||
|
||||
return &orte_odls_default_module;
|
||||
}
|
||||
|
||||
|
||||
int orte_odls_default_component_close(void)
|
||||
{
|
||||
OBJ_DESTRUCT(&orte_odls_default.mutex);
|
||||
OBJ_DESTRUCT(&orte_odls_default.cond);
|
||||
OBJ_DESTRUCT(&orte_odls_default.children);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_odls_default_finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup state */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_odls_default.children))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
1037
orte/mca/odls/default/odls_default_module.c
Обычный файл
1037
orte/mca/odls/default/odls_default_module.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
130
orte/mca/odls/odls.h
Обычный файл
130
orte/mca/odls/odls.h
Обычный файл
@ -0,0 +1,130 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* The OpenRTE Daemon's Local Launch Subsystem
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_ODLS_H
|
||||
#define ORTE_MCA_ODLS_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
/*
|
||||
* odls module functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* Subscribe to receive the launch data for local processes
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_subscribe_launch_data_fn_t)(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
|
||||
|
||||
/**
|
||||
* Locally launch the provided processes
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(orte_gpr_notify_data_t *data);
|
||||
|
||||
/**
|
||||
* Kill the local processes on this node
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(orte_jobid_t job, bool set_state);
|
||||
|
||||
/**
|
||||
* Signal local processes
|
||||
*/
|
||||
typedef int (*orte_pls_base_module_signal_local_process_fn_t)(orte_process_name_t *proc,
|
||||
int32_t signal);
|
||||
|
||||
/**
|
||||
* odls module version 1.3.0
|
||||
*/
|
||||
struct orte_odls_base_module_1_3_0_t {
|
||||
orte_odls_base_module_subscribe_launch_data_fn_t subscribe_launch_data;
|
||||
orte_odls_base_module_launch_local_processes_fn_t launch_local_procs;
|
||||
orte_odls_base_module_kill_local_processes_fn_t kill_local_procs;
|
||||
orte_pls_base_module_signal_local_process_fn_t signal_local_procs;
|
||||
};
|
||||
|
||||
/** shorten orte_odls_base_module_1_3_0_t declaration */
|
||||
typedef struct orte_odls_base_module_1_3_0_t orte_odls_base_module_1_3_0_t;
|
||||
/** shorten orte_odls_base_module_t declaration */
|
||||
typedef struct orte_odls_base_module_1_3_0_t orte_odls_base_module_t;
|
||||
|
||||
/**
|
||||
* odls initialization function
|
||||
*
|
||||
* Called by the MCA framework to initialize the component. Invoked
|
||||
* exactly once per process.
|
||||
*
|
||||
* @param priority (OUT) Relative priority or ranking used by MCA to
|
||||
* select a module.
|
||||
*/
|
||||
typedef struct orte_odls_base_module_1_3_0_t*
|
||||
(*orte_odls_base_component_init_fn_t)(int *priority);
|
||||
|
||||
/**
|
||||
* Cleanup all resources held by the component
|
||||
*/
|
||||
typedef int (*orte_odls_base_component_finalize_fn_t)(void);
|
||||
|
||||
|
||||
/**
|
||||
* odls component v1.3.0
|
||||
*/
|
||||
struct orte_odls_base_component_1_3_0_t {
|
||||
/** component version */
|
||||
mca_base_component_t version;
|
||||
/** component data */
|
||||
mca_base_component_data_1_0_0_t odls_data;
|
||||
/** Function called when component is initialized */
|
||||
orte_odls_base_component_init_fn_t init;
|
||||
/* Function called when component is finalized */
|
||||
orte_odls_base_component_finalize_fn_t finalize;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_odls_base_component_1_3_0_t orte_odls_base_component_1_3_0_t;
|
||||
/** Convenience typedef */
|
||||
typedef orte_odls_base_component_1_3_0_t orte_odls_base_component_t;
|
||||
|
||||
|
||||
/**
|
||||
* Macro for use in modules that are of type odls v1.3.0
|
||||
*/
|
||||
#define ORTE_ODLS_BASE_VERSION_1_3_0 \
|
||||
/* odls v1.3 is chained to MCA v1.0 */ \
|
||||
MCA_BASE_VERSION_1_0_0, \
|
||||
/* odls v1.3 */ \
|
||||
"odls", 1, 3, 0
|
||||
|
||||
/* Global structure for accessing ODLS functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_odls_base_module_t orte_odls; /* holds selected module's function pointers */
|
||||
|
||||
|
||||
#endif /* ORTE_MCA_ODLS_H */
|
51
orte/mca/odls/odls_types.h
Обычный файл
51
orte/mca/odls/odls_types.h
Обычный файл
@ -0,0 +1,51 @@
|
||||
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_ODLS_TYPES_H
|
||||
#define ORTE_MCA_ODLS_TYPES_H
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* define the orted command flag type */
typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_CMD_T ORTE_UINT8


/*
 * Definitions needed for communication
 *
 * Command flag values. Each expansion carries an outer pair of parentheses
 * so that the cast and its operand always act as one primary expression at
 * the point of use (for example as the operand of sizeof written without
 * parentheses).
 */
#define ORTE_DAEMON_HOSTFILE_CMD        ((orte_daemon_cmd_flag_t) 1)
#define ORTE_DAEMON_SCRIPTFILE_CMD      ((orte_daemon_cmd_flag_t) 2)
#define ORTE_DAEMON_CONTACT_QUERY_CMD   ((orte_daemon_cmd_flag_t) 3)
#define ORTE_DAEMON_KILL_LOCAL_PROCS    ((orte_daemon_cmd_flag_t) 4)
#define ORTE_DAEMON_SIGNAL_LOCAL_PROCS  ((orte_daemon_cmd_flag_t) 5)
#define ORTE_DAEMON_ADD_LOCAL_PROCS     ((orte_daemon_cmd_flag_t) 6)
#define ORTE_DAEMON_HEARTBEAT_CMD       ((orte_daemon_cmd_flag_t) 254)
#define ORTE_DAEMON_EXIT_CMD            ((orte_daemon_cmd_flag_t) 255)
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
0
orte/mca/odls/windows/.ompi_ignore
Обычный файл
0
orte/mca/odls/windows/.ompi_ignore
Обычный файл
0
orte/mca/pls/process/Makefile.am → orte/mca/odls/windows/Makefile.am
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/Makefile.am → orte/mca/odls/windows/Makefile.am
Обычный файл → Исполняемый файл
4
orte/mca/pls/process/configure.m4 → orte/mca/odls/windows/configure.m4
Обычный файл → Исполняемый файл
4
orte/mca/pls/process/configure.m4 → orte/mca/odls/windows/configure.m4
Обычный файл → Исполняемый файл
@ -10,8 +10,8 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_pls_process_CONFIG([action-if-found], [action-if-not-found])
|
||||
# MCA_odls_windows_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_process_CONFIG],[
|
||||
AC_DEFUN([MCA_odls_windows_CONFIG],[
|
||||
AC_CHECK_FUNC([CreateProcess], [$1], [$2])
|
||||
])dnl
|
2
orte/mca/pls/process/configure.params → orte/mca/odls/windows/configure.params
Обычный файл → Исполняемый файл
2
orte/mca/pls/process/configure.params → orte/mca/odls/windows/configure.params
Обычный файл → Исполняемый файл
@ -10,5 +10,5 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
PARAM_INIT_FILE=pls_process_component.c
|
||||
PARAM_INIT_FILE=odls_windows_component.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
0
orte/mca/pls/process/help-orte-pls-process.txt → orte/mca/odls/windows/help-odls-windows.txt
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/help-orte-pls-process.txt → orte/mca/odls/windows/help-odls-windows.txt
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process.h → orte/mca/odls/windows/odls_windows.h
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process.h → orte/mca/odls/windows/odls_windows.h
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process_component.c → orte/mca/odls/windows/odls_windows_component.c
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process_component.c → orte/mca/odls/windows/odls_windows_component.c
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process_module.c → orte/mca/odls/windows/odls_windows_module.c
Обычный файл → Исполняемый файл
0
orte/mca/pls/process/pls_process_module.c → orte/mca/odls/windows/odls_windows_module.c
Обычный файл → Исполняемый файл
@ -91,7 +91,7 @@ int mca_oob_xcast(
|
||||
{
|
||||
orte_std_cntr_t i;
|
||||
int rc;
|
||||
int tag = MCA_OOB_TAG_XCAST;
|
||||
int tag = ORTE_RML_TAG_XCAST;
|
||||
int status;
|
||||
orte_proc_state_t state;
|
||||
|
||||
|
@ -32,23 +32,6 @@
|
||||
* Other constants
|
||||
*/
|
||||
|
||||
/**
|
||||
* Service tags
|
||||
*/
|
||||
#define MCA_OOB_TAG_NS (orte_rml_tag_t) 1
|
||||
#define MCA_OOB_TAG_GPR (orte_rml_tag_t) 2
|
||||
#define MCA_OOB_TAG_GPR_NOTIFY (orte_rml_tag_t) 3
|
||||
#define MCA_OOB_TAG_RTE (orte_rml_tag_t) 4
|
||||
#define MCA_OOB_TAG_EXEC (orte_rml_tag_t) 5
|
||||
#define MCA_OOB_TAG_DAEMON (orte_rml_tag_t) 6
|
||||
#define MCA_OOB_TAG_STDIO (orte_rml_tag_t) 7
|
||||
#define MCA_OOB_TAG_SCHED (orte_rml_tag_t) 8
|
||||
#define MCA_OOB_TAG_PCM_KILL (orte_rml_tag_t) 9
|
||||
#define MCA_OOB_TAG_XCAST (orte_rml_tag_t) 10
|
||||
#define MCA_OOB_TAG_PCM_KILL_ACK (orte_rml_tag_t) 11
|
||||
#define MCA_OOB_TAG_BPROC (orte_rml_tag_t) 12
|
||||
#define ORTE_OOB_TAG_START_LIST (orte_rml_tag_t) 100 /* starting point for tag server assignments */
|
||||
|
||||
/**
|
||||
* The wildcard for receives from any peer.
|
||||
*/
|
||||
|
@ -83,6 +83,7 @@ OBJ_CLASS_INSTANCE(
|
||||
*/
|
||||
|
||||
static int mca_oob_tcp_create_listen(void);
|
||||
static int mca_oob_tcp_create_listen_thread(void);
|
||||
static void mca_oob_tcp_recv_handler(int sd, short flags, void* user);
|
||||
static void mca_oob_tcp_accept(void);
|
||||
|
||||
@ -100,6 +101,12 @@ OBJ_CLASS_INSTANCE(
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_oob_tcp_pending_connection_t,
|
||||
opal_free_list_item_t,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
@ -169,6 +176,9 @@ static inline char* mca_oob_tcp_param_register_str(
|
||||
*/
|
||||
int mca_oob_tcp_component_open(void)
|
||||
{
|
||||
char *listen_type;
|
||||
int tmp;
|
||||
|
||||
#ifdef __WINDOWS__
|
||||
WSADATA win_sock_data;
|
||||
if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
|
||||
@ -190,6 +200,12 @@ int mca_oob_tcp_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_msg_completed, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_match_cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_listen_thread, opal_thread_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_fl, opal_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_out_connections, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_copy_in_connections, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_pending_connections_lock, opal_mutex_t);
|
||||
|
||||
/* register oob module parameters */
|
||||
mca_oob_tcp_component.tcp_peer_limit =
|
||||
@ -207,9 +223,60 @@ int mca_oob_tcp_component_open(void)
|
||||
mca_oob_tcp_component.tcp_rcvbuf =
|
||||
mca_oob_tcp_param_register_int("rcvbuf", 128*1024);
|
||||
|
||||
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
||||
"listen_mode",
|
||||
"Mode for HNP to accept incoming connections: event, listen_thread",
|
||||
false,
|
||||
false,
|
||||
"event",
|
||||
&listen_type);
|
||||
|
||||
if ((0 == strcmp(listen_type, "event")) || NULL == getenv("I_AM_MPIRUN")) {
|
||||
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;
|
||||
} else if (0 == strcmp(listen_type, "listen_thread")) {
|
||||
mca_oob_tcp_component.tcp_listen_type = OOB_TCP_LISTEN_THREAD;
|
||||
} else {
|
||||
opal_output(0, "Invalid value for oob_tcp_listen_mode parameter: %s",
|
||||
listen_type);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"listen_thread_max_queue",
|
||||
"High water mark for queued accepted socket list size",
|
||||
false,
|
||||
false,
|
||||
10,
|
||||
&mca_oob_tcp_component.tcp_copy_max_size);
|
||||
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"listen_thread_max_time",
|
||||
"Maximum amount of time (in milliseconds) to wait between processing accepted socket list",
|
||||
false,
|
||||
false,
|
||||
10,
|
||||
&tmp);
|
||||
|
||||
#if OPAL_TIMER_USEC_NATIVE
|
||||
mca_oob_tcp_component.tcp_copy_delta = tmp * 1000;
|
||||
#else
|
||||
mca_oob_tcp_component.tcp_copy_delta = tmp *
|
||||
opal_timer_base_get_freq() / 1000;
|
||||
#endif
|
||||
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"accept_spin_count",
|
||||
"Number of times to let accept return EWOULDBLOCK before updating accepted socket list",
|
||||
false,
|
||||
false,
|
||||
10,
|
||||
&mca_oob_tcp_component.tcp_copy_spin_count);
|
||||
|
||||
/* initialize state */
|
||||
mca_oob_tcp_component.tcp_shutdown = false;
|
||||
mca_oob_tcp_component.tcp_listen_sd = -1;
|
||||
mca_oob_tcp_component.tcp_match_count = 0;
|
||||
mca_oob_tcp_component.tcp_last_copy_time = 0;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -251,7 +318,7 @@ int mca_oob_tcp_component_close(void)
|
||||
static void mca_oob_tcp_accept(void)
|
||||
{
|
||||
while(true) {
|
||||
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
struct sockaddr_in addr;
|
||||
mca_oob_tcp_event_t* event;
|
||||
int sd;
|
||||
@ -291,7 +358,7 @@ static int mca_oob_tcp_create_listen(void)
|
||||
{
|
||||
int flags;
|
||||
struct sockaddr_in inaddr;
|
||||
ompi_socklen_t addrlen;
|
||||
opal_socklen_t addrlen;
|
||||
|
||||
/* create a listen socket for incoming connections */
|
||||
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
@ -352,6 +419,206 @@ static int mca_oob_tcp_create_listen(void)
|
||||
}
|
||||
|
||||
|
||||
/*
 * Body of the dedicated accept thread used when tcp_listen_type is
 * OOB_TCP_LISTEN_THREAD.
 *
 * Until tcp_shutdown is set, the thread waits (10ms select timeout) on
 * the listen socket, accepts as many connections as it can, queues
 * them on the thread-private tcp_copy_in_connections list, and then
 * splices that list onto tcp_pending_connections under
 * tcp_pending_connections_lock.
 *
 * @param obj  Unused.
 * @return     Always NULL.  The thread exits on shutdown or on an
 *             accept() error other than EAGAIN / EWOULDBLOCK.
 */
static void* mca_oob_tcp_listen_thread(opal_object_t *obj)
{
    int rc, count;
    opal_socklen_t addrlen;
    opal_free_list_item_t *fl_item;
    mca_oob_tcp_pending_connection_t *item;
    struct timeval timeout;
    fd_set readfds;

    while (false == mca_oob_tcp_component.tcp_shutdown) {
        count = 0;

        /* select() may modify both the fd set and the timeout, so
           rebuild them on every iteration */
        FD_ZERO(&readfds);
        FD_SET(mca_oob_tcp_component.tcp_listen_sd, &readfds);
        timeout.tv_sec = 0;
        timeout.tv_usec = 10000;

        rc = select(mca_oob_tcp_component.tcp_listen_sd + 1, &readfds,
                    NULL, NULL, &timeout);
        if (rc < 0) {
            if (EAGAIN != opal_socket_errno && EINTR != opal_socket_errno) {
                perror("select");
            }
            continue;
        }

        /* accept until we have seen tcp_copy_spin_count would-block
           results or the private list hits its high water mark */
        while (count < mca_oob_tcp_component.tcp_copy_spin_count &&
               opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections) <
               (size_t) mca_oob_tcp_component.tcp_copy_max_size) {
            OPAL_FREE_LIST_WAIT(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                fl_item, rc);
            item = (mca_oob_tcp_pending_connection_t*) fl_item;

            /* addrlen is a value-result argument: accept() overwrites
               it, so it must be reset before every call */
            addrlen = sizeof(struct sockaddr_in);
            item->fd = accept(mca_oob_tcp_component.tcp_listen_sd,
                              (struct sockaddr*)&(item->addr), &addrlen);
            if (item->fd < 0) {
                /* capture errno before anything else can overwrite it */
                int accept_errno = opal_socket_errno;

                OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl,
                                      fl_item);

                if (mca_oob_tcp_component.tcp_shutdown) return NULL;

                /* Only a real failure is fatal.  The test must be "&&":
                   with "||" it is always true on platforms where EAGAIN
                   and EWOULDBLOCK differ.  There is no descriptor to
                   close here (accept() failed) and the item has already
                   been handed back to the free list. */
                if (EAGAIN != accept_errno && EWOULDBLOCK != accept_errno) {
                    opal_output(0, "mca_oob_tcp_accept: accept() failed with errno %d.", accept_errno);
                    return NULL;
                }

                count++;
                continue;
            }

            if (mca_oob_tcp_component.tcp_debug) {
                opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
                            ORTE_NAME_ARGS(orte_process_info.my_name),
                            item->fd, opal_socket_errno,
                            inet_ntoa(item->addr.sin_addr),
                            item->addr.sin_port);
            }

            opal_list_append(&mca_oob_tcp_component.tcp_copy_in_connections,
                             (opal_list_item_t*) item);
        }

        /* hand everything accepted in this pass to the progress
           function; the lock is held only for the list splice */
        if (0 < opal_list_get_size(&mca_oob_tcp_component.tcp_copy_in_connections)) {
            opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
            opal_list_join(&mca_oob_tcp_component.tcp_pending_connections,
                           opal_list_get_end(&mca_oob_tcp_component.tcp_pending_connections),
                           &mca_oob_tcp_component.tcp_copy_in_connections);
            opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
        }
    }

    return NULL;
}
|
||||
|
||||
/* called from opal_progress() to create the oob contact information
|
||||
for the file descriptors accepted() by the accept thread. */
|
||||
static int mca_oob_tcp_listen_progress(void)
|
||||
{
|
||||
int count = 0;
|
||||
mca_oob_tcp_pending_connection_t *item;
|
||||
mca_oob_tcp_event_t* event;
|
||||
#if OPAL_TIMER_USEC_NATIVE
|
||||
opal_timer_t now = opal_timer_base_get_usec();
|
||||
#else
|
||||
opal_timer_t now = opal_timer_base_get_cycles();
|
||||
#endif /* OPAL_TIMER_USEC_NATIVE */
|
||||
|
||||
/* if we've not pulled pending connections for a while OR we've
|
||||
hit the high water mark of pending connections, grab all the
|
||||
pending connections */
|
||||
if ((now - mca_oob_tcp_component.tcp_last_copy_time >
|
||||
mca_oob_tcp_component.tcp_copy_delta) ||
|
||||
((size_t) mca_oob_tcp_component.tcp_copy_max_size <
|
||||
opal_list_get_size(&mca_oob_tcp_component.tcp_pending_connections))) {
|
||||
|
||||
/* copy the pending connections from the list the accept
|
||||
thread is inserting into into a temporary list for us to
|
||||
process from. This is an O(1) operation, so we minimize
|
||||
the lock time */
|
||||
opal_mutex_lock(&mca_oob_tcp_component.tcp_pending_connections_lock);
|
||||
opal_list_join(&mca_oob_tcp_component.tcp_copy_out_connections,
|
||||
opal_list_get_end(&mca_oob_tcp_component.tcp_copy_out_connections),
|
||||
&mca_oob_tcp_component.tcp_pending_connections);
|
||||
opal_mutex_unlock(&mca_oob_tcp_component.tcp_pending_connections_lock);
|
||||
|
||||
/* process al the connections */
|
||||
while (NULL != (item = (mca_oob_tcp_pending_connection_t*)
|
||||
opal_list_remove_first(&mca_oob_tcp_component.
|
||||
tcp_copy_out_connections))) {
|
||||
|
||||
/* setup socket options */
|
||||
mca_oob_tcp_set_socket_options(item->fd);
|
||||
|
||||
/* log the accept */
|
||||
if(mca_oob_tcp_component.tcp_debug) {
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_progress: %s:%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
inet_ntoa(item->addr.sin_addr),
|
||||
item->addr.sin_port);
|
||||
}
|
||||
|
||||
/* wait for receipt of peers process identifier to
|
||||
complete this connection */
|
||||
event = OBJ_NEW(mca_oob_tcp_event_t);
|
||||
opal_event_set(&event->event, item->fd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event);
|
||||
opal_event_add(&event->event, 0);
|
||||
OPAL_FREE_LIST_RETURN(&mca_oob_tcp_component.tcp_pending_connections_fl,
|
||||
(opal_free_list_item_t *) item);
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
mca_oob_tcp_component.tcp_last_copy_time = now;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
static int mca_oob_tcp_create_listen_thread(void)
|
||||
{
|
||||
struct sockaddr_in inaddr;
|
||||
opal_socklen_t addrlen;
|
||||
int flags;
|
||||
|
||||
/* create a listen socket for incoming connections */
|
||||
mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if(mca_oob_tcp_component.tcp_listen_sd < 0) {
|
||||
opal_output(0,"mca_oob_tcp_component_init: socket() failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* setup socket options */
|
||||
mca_oob_tcp_set_socket_options(mca_oob_tcp_component.tcp_listen_sd);
|
||||
|
||||
/* bind address */
|
||||
memset(&inaddr, 0, sizeof(inaddr));
|
||||
inaddr.sin_family = AF_INET;
|
||||
inaddr.sin_addr.s_addr = INADDR_ANY;
|
||||
inaddr.sin_port = 0;
|
||||
|
||||
if(bind(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, sizeof(inaddr)) < 0) {
|
||||
opal_output(0,"mca_oob_tcp_create_listen: bind() failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* resolve system assigned port */
|
||||
addrlen = sizeof(struct sockaddr_in);
|
||||
if(getsockname(mca_oob_tcp_component.tcp_listen_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_create_listen: getsockname() failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
mca_oob_tcp_component.tcp_listen_port = inaddr.sin_port;
|
||||
|
||||
/* setup listen backlog to maximum allowed by kernel */
|
||||
if(listen(mca_oob_tcp_component.tcp_listen_sd, SOMAXCONN) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_component_init: listen() failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* set socket up to be non-blocking, otherwise accept could block */
|
||||
if((flags = fcntl(mca_oob_tcp_component.tcp_listen_sd, F_GETFL, 0)) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_GETFL) failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(mca_oob_tcp_component.tcp_listen_sd, F_SETFL, flags) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_component_init: fcntl(F_SETFL) failed with errno=%d", opal_socket_errno);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* start the listen thread */
|
||||
mca_oob_tcp_component.tcp_listen_thread.t_run = mca_oob_tcp_listen_thread;
|
||||
mca_oob_tcp_component.tcp_listen_thread.t_arg = NULL;
|
||||
|
||||
return opal_thread_start(&mca_oob_tcp_component.tcp_listen_thread);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Handle probe
|
||||
*/
|
||||
@ -537,9 +804,23 @@ mca_oob_t* mca_oob_tcp_component_init(int* priority)
|
||||
memset(&mca_oob_tcp_component.tcp_send_event, 0, sizeof(opal_event_t));
|
||||
|
||||
/* create a listen socket */
|
||||
if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) {
|
||||
opal_output(0, "mca_oob_tcp_init: unable to create listen socket\n");
|
||||
return NULL;
|
||||
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
|
||||
if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) {
|
||||
opal_output(0, "mca_oob_tcp_init: unable to create listen socket");
|
||||
return NULL;
|
||||
}
|
||||
} else if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) {
|
||||
if (mca_oob_tcp_create_listen_thread() != ORTE_SUCCESS) {
|
||||
opal_output(0, "mca_oob_tcp_init: unable to create listen thread");
|
||||
return NULL;
|
||||
}
|
||||
opal_free_list_init(&mca_oob_tcp_component.tcp_pending_connections_fl,
|
||||
sizeof(mca_oob_tcp_pending_connection_t),
|
||||
OBJ_CLASS(mca_oob_tcp_pending_connection_t),
|
||||
16, /* initial number */
|
||||
-1, /* maximum number */
|
||||
16); /* increment to grow by */
|
||||
opal_progress_register(mca_oob_tcp_listen_progress);
|
||||
}
|
||||
return &mca_oob_tcp;
|
||||
}
|
||||
@ -932,8 +1213,16 @@ int mca_oob_tcp_fini(void)
|
||||
|
||||
/* close listen socket */
|
||||
if (mca_oob_tcp_component.tcp_listen_sd >= 0) {
|
||||
opal_event_del(&mca_oob_tcp_component.tcp_recv_event);
|
||||
CLOSE_THE_SOCKET(mca_oob_tcp_component.tcp_listen_sd);
|
||||
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
|
||||
opal_event_del(&mca_oob_tcp_component.tcp_recv_event);
|
||||
close(mca_oob_tcp_component.tcp_listen_sd);
|
||||
} else if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) {
|
||||
void *data;
|
||||
mca_oob_tcp_component.tcp_shutdown = true;
|
||||
close(mca_oob_tcp_component.tcp_listen_sd);
|
||||
opal_thread_join(&mca_oob_tcp_component.tcp_listen_thread, &data);
|
||||
opal_progress_unregister(mca_oob_tcp_listen_progress);
|
||||
}
|
||||
mca_oob_tcp_component.tcp_listen_sd = -1;
|
||||
}
|
||||
|
||||
|
@ -34,6 +34,7 @@
|
||||
#include "opal/threads/condition.h"
|
||||
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
|
||||
#include "orte/mca/oob/tcp/oob_tcp_msg.h"
|
||||
#include "opal/mca/timer/base/base.h"
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
@ -223,11 +224,6 @@ void mca_oob_tcp_registry_callback(
|
||||
|
||||
void mca_oob_tcp_set_socket_options(int sd);
|
||||
|
||||
typedef enum {
|
||||
OOB_TCP_EVENT,
|
||||
OOB_TCP_LISTEN_THREAD
|
||||
} mca_oob_tcp_listen_type_t;
|
||||
|
||||
/**
|
||||
* OOB TCP Component
|
||||
*/
|
||||
@ -258,6 +254,19 @@ struct mca_oob_tcp_component_t {
|
||||
opal_condition_t tcp_match_cond; /**< condition variable used in finalize */
|
||||
int tcp_match_count; /**< number of matched recvs in progress */
|
||||
int tcp_debug; /**< debug level */
|
||||
|
||||
bool tcp_shutdown;
|
||||
enum { OOB_TCP_EVENT, OOB_TCP_LISTEN_THREAD } tcp_listen_type;
|
||||
opal_thread_t tcp_listen_thread;
|
||||
opal_free_list_t tcp_pending_connections_fl;
|
||||
opal_list_t tcp_pending_connections;
|
||||
opal_list_t tcp_copy_out_connections;
|
||||
opal_list_t tcp_copy_in_connections;
|
||||
opal_mutex_t tcp_pending_connections_lock;
|
||||
opal_timer_t tcp_last_copy_time;
|
||||
opal_timer_t tcp_copy_delta;
|
||||
int tcp_copy_max_size;
|
||||
int tcp_copy_spin_count;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -273,6 +282,14 @@ ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
|
||||
#define CLOSE_THE_SOCKET(socket) close(socket)
|
||||
#endif /* defined(__WINDOWS__) */
|
||||
|
||||
struct mca_oob_tcp_pending_connection_t {
|
||||
opal_free_list_item_t super;
|
||||
int fd;
|
||||
struct sockaddr_in addr;
|
||||
};
|
||||
typedef struct mca_oob_tcp_pending_connection_t mca_oob_tcp_pending_connection_t;
|
||||
OBJ_CLASS_DECLARATION(mca_oob_tcp_pending_connection_t);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -379,7 +379,7 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
|
||||
static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer)
|
||||
{
|
||||
int so_error = 0;
|
||||
ompi_socklen_t so_length = sizeof(so_error);
|
||||
opal_socklen_t so_length = sizeof(so_error);
|
||||
|
||||
/* unregister from receiving event notifications */
|
||||
opal_event_del(&peer->peer_send_event);
|
||||
@ -467,7 +467,7 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
* get stuck in the orte_wait_kill when receiving messages in the
|
||||
* tcp OOB. */
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
orte_errmgr.abort();
|
||||
orte_errmgr.error_detected(1, "OOB: Connection to HNP lost", NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -787,8 +787,8 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
|
||||
char buff[255];
|
||||
int sndbuf,rcvbuf,nodelay,flags;
|
||||
struct sockaddr_in inaddr;
|
||||
ompi_socklen_t optlen;
|
||||
ompi_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
opal_socklen_t optlen;
|
||||
opal_socklen_t addrlen = sizeof(struct sockaddr_in);
|
||||
|
||||
getsockname(peer->peer_sd, (struct sockaddr*)&inaddr, &addrlen);
|
||||
sprintf(src, "%s", inet_ntoa(inaddr.sin_addr));
|
||||
|
@ -25,7 +25,7 @@ nobase_orte_HEADERS =
|
||||
dist_pkgdata_DATA =
|
||||
|
||||
# local files
|
||||
headers = pls.h
|
||||
headers = pls.h pls_types.h
|
||||
libmca_pls_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
|
@ -19,12 +19,14 @@
|
||||
dist_pkgdata_DATA += base/help-pls-base.txt
|
||||
|
||||
headers += \
|
||||
base/pls_private.h \
|
||||
base/base.h
|
||||
|
||||
libmca_pls_la_SOURCES += \
|
||||
base/pls_base_close.c \
|
||||
base/pls_base_context.c \
|
||||
base/pls_base_general_support_fns.c \
|
||||
base/pls_base_open.c \
|
||||
base/pls_base_receive.c \
|
||||
base/pls_base_select.c \
|
||||
base/pls_base_state.c \
|
||||
base/pls_base_proxy.c
|
||||
base/pls_base_dmn_registry_fns.c \
|
||||
base/pls_base_orted_cmds.c
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
x
Ссылка в новой задаче
Block a user