Change the SOH to the new State Monitoring and Reporting (SMR) framework. New APIs will be appearing in the new framework shortly - this just gets the name change into the system.
Other changes: 1. Remove the old xcpu components as they are not functional. 2. Fix a "bug" in orterun whereby we called dump_aborted_procs even when we normally terminated. There is still some kind of bug in this procedure, however, as we appear to be calling the orterun job_state_callback function every time a process terminates (instead of only once when they have all terminated). I'll continue digging into that one. This will require an autogen/configure, I'm afraid. This commit was SVN r11228.
Этот коммит содержится в:
родитель
6d414f2d44
Коммит
8c7f0ed9ae
@ -1146,7 +1146,7 @@ AC_CONFIG_FILES([
|
||||
test/mca/rmaps/Makefile
|
||||
test/mca/rmgr/Makefile
|
||||
test/mca/schema/Makefile
|
||||
test/mca/soh/Makefile
|
||||
test/mca/smr/Makefile
|
||||
test/memory/Makefile
|
||||
test/runtime/Makefile
|
||||
test/support/Makefile
|
||||
|
@ -47,7 +47,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
@ -45,8 +45,7 @@
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
@ -133,7 +132,7 @@ int ompi_mpi_finalize(void)
|
||||
}
|
||||
*/
|
||||
/* Set process status to "at stg3" */
|
||||
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
|
||||
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
|
||||
ORTE_PROC_STATE_AT_STG3, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
@ -277,15 +276,15 @@ int ompi_mpi_finalize(void)
|
||||
}
|
||||
|
||||
/* Set process status to "finalized" */
|
||||
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
|
||||
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
|
||||
ORTE_PROC_STATE_FINALIZED, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for everyone to get here. This is necessary to allow the soh
|
||||
* Wait for everyone to get here. This is necessary to allow the smr
|
||||
* to update the job state for singletons. Otherwise, we finalize
|
||||
* the RTE while the soh is trying to do the update - which causes
|
||||
* the RTE while the smr is trying to do the update - which causes
|
||||
* an ugly race condition
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,
|
||||
|
@ -41,8 +41,7 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
@ -475,7 +474,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
}
|
||||
|
||||
/* Let system know we are at STG1 Barrier */
|
||||
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
|
||||
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
|
||||
ORTE_PROC_STATE_AT_STG1, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "set process state failed";
|
||||
@ -585,7 +584,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
|
||||
/* Let system know we are at STG2 Barrier */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
|
||||
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
|
||||
ORTE_PROC_STATE_AT_STG2, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "set process state failed";
|
||||
|
@ -90,8 +90,8 @@
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/mca/sds/sds.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
|
||||
@ -228,8 +228,8 @@ void ompi_info::open_components()
|
||||
orte_sds_base_open();
|
||||
component_map["sds"] = &orte_sds_base_components_available;
|
||||
|
||||
orte_soh_base_open();
|
||||
component_map["soh"] = &orte_soh_base.soh_components;
|
||||
orte_smr_base_open();
|
||||
component_map["smr"] = &orte_smr_base.smr_components;
|
||||
|
||||
// MPI frameworks
|
||||
|
||||
@ -296,7 +296,7 @@ void ompi_info::close_components()
|
||||
|
||||
orte_iof_base_close();
|
||||
orte_sds_base_close();
|
||||
orte_soh_base_close();
|
||||
orte_smr_base_close();
|
||||
orte_pls_base_close();
|
||||
orte_rmgr_base_close();
|
||||
orte_rmaps_base_close();
|
||||
|
@ -43,7 +43,7 @@
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/rmgr/rmgr_types.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
|
@ -35,7 +35,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
|
||||
#include "orte/mca/gpr/replica/transition_layer/gpr_replica_tl.h"
|
||||
#include "gpr_replica_fn.h"
|
||||
|
@ -26,10 +26,9 @@
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/oob/oob.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
|
||||
@ -93,18 +92,16 @@ int mca_oob_xcast(
|
||||
orte_std_cntr_t i;
|
||||
int rc;
|
||||
int tag = MCA_OOB_TAG_XCAST;
|
||||
int cmpval;
|
||||
int status;
|
||||
orte_proc_state_t state;
|
||||
|
||||
/* check to see if I am the root process name */
|
||||
cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, root, orte_process_info.my_name);
|
||||
if(NULL != root && 0 == cmpval) {
|
||||
if(NULL != root && ORTE_EQUAL == orte_dss.compare(root, orte_process_info.my_name, ORTE_NAME)) {
|
||||
mca_oob_xcast_t *xcast = OBJ_NEW(mca_oob_xcast_t);
|
||||
xcast->counter = num_peers;
|
||||
for(i=0; i<num_peers; i++) {
|
||||
/* check status of peer to ensure they are alive */
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.get_proc_soh(&state, &status, peers+i))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.get_proc_state(&state, &status, peers+i))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -28,7 +28,7 @@
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
|
||||
|
@ -63,7 +63,7 @@
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
@ -289,9 +289,9 @@ static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
||||
int rc;
|
||||
/* set the state of this process */
|
||||
if(WIFEXITED(status)) {
|
||||
rc = orte_soh.set_proc_soh(proc, ORTE_PROC_STATE_TERMINATED, status);
|
||||
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_TERMINATED, status);
|
||||
} else {
|
||||
rc = orte_soh.set_proc_soh(proc, ORTE_PROC_STATE_ABORTED, status);
|
||||
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -724,7 +724,7 @@ orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
||||
if(dead_node) {
|
||||
/* gotta see if this node belongs to us... arg.. */
|
||||
/* also, we know by order of creation that the node state */
|
||||
/* comes before the node name.. see soh_bproc.c */
|
||||
/* comes before the node name.. see smr_bproc.c */
|
||||
orte_std_cntr_t name_idx;
|
||||
for (name_idx = 0;
|
||||
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
|
||||
|
@ -72,8 +72,7 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/pls/fork/pls_fork.h"
|
||||
|
||||
extern char **environ;
|
||||
@ -171,7 +170,7 @@ static void orte_pls_fork_kill_processes(opal_value_array_t *pids, opal_value_ar
|
||||
|
||||
/* update the process state on the registry */
|
||||
proc = OPAL_VALUE_ARRAY_GET_ITEM(procs, orte_process_name_t, i);
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_proc_soh(&proc, ORTE_PROC_STATE_TERMINATED, exit_status))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(&proc, ORTE_PROC_STATE_TERMINATED, exit_status))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* don't exit out even if this didn't work - we still might need to kill more
|
||||
* processes, so just keep trucking
|
||||
@ -203,9 +202,9 @@ static void orte_pls_fork_wait_proc(pid_t pid, int status, void* cbdata)
|
||||
|
||||
/* set the state of this process */
|
||||
if(WIFEXITED(status)) {
|
||||
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
|
||||
rc = orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
|
||||
} else {
|
||||
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_ABORTED, status);
|
||||
rc = orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -493,7 +492,7 @@ static int orte_pls_fork_proc(
|
||||
the SOH or else everyone else will hang. Don't bother
|
||||
checking whether or not this worked - just fire and forget
|
||||
*/
|
||||
orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_ABORTED, rc);
|
||||
orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_ABORTED, rc);
|
||||
return ORTE_ERR_FATAL;
|
||||
break;
|
||||
}
|
||||
@ -576,7 +575,7 @@ int orte_pls_fork_launch(orte_jobid_t jobid)
|
||||
processes to be launched to ABORTED. This will
|
||||
cause the entire job to abort. */
|
||||
for (; i < map->num_procs; ++i) {
|
||||
orte_soh.set_proc_soh(&map->procs[i]->proc_name,
|
||||
orte_smr.set_proc_state(&map->procs[i]->proc_name,
|
||||
ORTE_PROC_STATE_ABORTED, 0);
|
||||
}
|
||||
|
||||
|
@ -82,8 +82,7 @@
|
||||
#include "orte/mca/ras/base/ras_base_node.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/pls/gridengine/pls_gridengine.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
|
||||
@ -186,7 +185,7 @@ static void orte_pls_gridengine_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
|
||||
orte_session_dir_finalize(&(map->procs[i])->proc_name);
|
||||
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
|
||||
ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
|
@ -43,7 +43,7 @@
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/util/univ_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -347,7 +347,7 @@ static void poe_wait_job(pid_t pid, int status, void* cbdata)
|
||||
|
||||
for(i = 0 ; i < map->num_procs ; ++i) {
|
||||
orte_session_dir_finalize(&(map->procs[i])->proc_name);
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
|
||||
ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
|
@ -79,8 +79,7 @@
|
||||
#include "orte/mca/ras/base/ras_base_node.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/pls/rsh/pls_rsh.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
|
||||
@ -325,7 +324,7 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
|
||||
orte_session_dir_finalize(&(map->procs[i])->proc_name);
|
||||
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
|
||||
ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
|
@ -56,10 +56,9 @@
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "pls_tm.h"
|
||||
@ -344,7 +343,7 @@ pls_tm_launch(orte_jobid_t jobid)
|
||||
* NOT being oversubscribed
|
||||
*/
|
||||
if (node->node_slots > 0 &&
|
||||
opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
|
||||
(orte_std_cntr_t)opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
|
||||
if (mca_pls_tm_component.debug) {
|
||||
opal_output(0, "pls:tm: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
|
||||
node->node_slots,
|
||||
|
@ -1,50 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(pls_xcpu_CPPFLAGS)
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_pls_xcpu_DSO
|
||||
component_noinst =
|
||||
component_install = mca_pls_xcpu.la
|
||||
else
|
||||
component_noinst = libmca_pls_xcpu.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
sources = \
|
||||
pls_xcpu.h \
|
||||
pls_xcpu.c \
|
||||
pls_xcpu_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_pls_xcpu_la_SOURCES = $(sources)
|
||||
mca_pls_xcpu_la_LIBADD = \
|
||||
$(pls_xcpu_LIBS) \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_pls_xcpu_la_SOURCES = $(sources)
|
||||
libmca_pls_xcpu_la_LIBADD = $(pls_xcpu_LIBS)
|
||||
libmca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)
|
@ -1,37 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_pls_xcpu_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_pls_xcpu_CONFIG],[
|
||||
OMPI_CHECK_XCPU([pls_xcpu], [pls_xcpu_good=1], [pls_xcpu_good=0])
|
||||
|
||||
# if check worked, set wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$pls_xcpu_good" = "1"],
|
||||
[pls_xcpu_WRAPPER_EXTRA_LDFLAGS="$pls_xcpu_LDFLAGS"
|
||||
pls_xcpu_WRAPPER_EXTRA_LIBS="$pls_xcpu_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([pls_xcpu_CPPFLAGS])
|
||||
AC_SUBST([pls_xcpu_LDFLAGS])
|
||||
AC_SUBST([pls_xcpu_LIBS])
|
||||
])dnl
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
|
||||
PARAM_INIT_FILE=pls_xcpu.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,792 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
/* @file:
|
||||
* xcpu Lancher to launch jobs on compute nodes..
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#if HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/gpr/base/base.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
#include "pls_xcpu.h"
|
||||
#include <regex.h>
|
||||
#include <dirent.h>
|
||||
/**
|
||||
* Our current evironment
|
||||
*/
|
||||
extern char **environ;
|
||||
extern int errno;
|
||||
|
||||
char **g_environ;
|
||||
int g_regexploc=1;
|
||||
regex_t g_compiled_exp;
|
||||
orte_pls_xcpu_mount_nodes *g_current_m=NULL;
|
||||
orte_pls_xcpu_thread_info *g_thread_info;
|
||||
orte_pls_xcpu_pthread_tindex t_info;
|
||||
orte_pls_xcpu_stdio_thread_info *g_stdout_thread_info, *g_stderr_thread_info;
|
||||
pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
orte_pls_xcpu_pthread_tindex *orte_pls_xcpu_launch_procs(int, char **, char**, orte_process_name_t *);
|
||||
int orte_pls_xcpu_cmd_check(int, char **);
|
||||
void orte_pls_xcpu_cleanup();
|
||||
void *orte_pls_xcpu_start_thread(void *);
|
||||
void *orte_pls_xcpu_stdio_thread(void *);
|
||||
int orte_pls_xcpu_check_exp(char *);
|
||||
|
||||
/**
|
||||
* Initialization of the xcpu module with all the needed function pointers
|
||||
*/
|
||||
/* Positional initializer listing, in order, the launch, terminate_job,
 * terminate_proc and finalize functions of this component.
 * NOTE(review): the order presumably has to match the member order of
 * orte_pls_base_module_t, which is declared outside this file - confirm. */
orte_pls_base_module_t orte_pls_xcpu_module = {
|
||||
orte_pls_xcpu_launch,
|
||||
orte_pls_xcpu_terminate_job,
|
||||
orte_pls_xcpu_terminate_proc,
|
||||
orte_pls_xcpu_finalize
|
||||
};
|
||||
|
||||
/** LOCAL SUPPORT FUNCTIONS **/
|
||||
|
||||
/** provide a local function to release the function stack
|
||||
* required by xcpu
|
||||
*/
|
||||
static void orte_pls_xcpu_free_stack(orte_pls_xcpu_tid_stack *s){
|
||||
if(s){
|
||||
orte_pls_xcpu_free_stack(s->next);
|
||||
free(s);
|
||||
}
|
||||
}
|
||||
|
||||
/* for handling stdout/err */
|
||||
void *orte_pls_xcpu_stdio_thread(void *info){
|
||||
orte_pls_xcpu_stdio_thread_info *io_t_info;
|
||||
char buf[100];int x, rc;
|
||||
io_t_info = (orte_pls_xcpu_stdio_thread_info*)info;
|
||||
if((x=open(io_t_info->stdio_path, O_RDONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}else{
|
||||
while(1){
|
||||
if((rc=read(x, buf, 100))>0){
|
||||
write(io_t_info->outdes, buf, rc);
|
||||
}else{
|
||||
if(rc==-1){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_READ_FAILURE);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
x>=0?close(x):0;
|
||||
free(io_t_info->stdio_path);
|
||||
free(io_t_info);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
/* used by orte_pls_xcpu_launch_procs to start process
|
||||
* on remote compute node.
|
||||
* one thread per process for time being
|
||||
*
|
||||
* @info: contains all the information required by thread
|
||||
* to launch process on remote compute node.
|
||||
*/
|
||||
void *orte_pls_xcpu_start_thread(void *info){
|
||||
orte_pls_xcpu_thread_info *t_info;
|
||||
char *session_clone, session_dir[255], *session_dir_path;
|
||||
int clone_des, rc=0, des1, des2/*, tdes*/, trc[2];
|
||||
char *env_path, *exec_path, *argv_path, *ctl_path;
|
||||
char character[8193];
|
||||
int i;
|
||||
orte_process_name_t *peers;
|
||||
pthread_t tids[2];
|
||||
trc[0]=trc[1]=0;
|
||||
t_info=(orte_pls_xcpu_thread_info*)info;
|
||||
|
||||
session_clone=(char*)malloc(strlen(t_info->local_mounts.name)+7);
|
||||
sprintf(session_clone, "%s/clone", t_info->local_mounts.name);
|
||||
if((clone_des=open(session_clone, O_RDONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}
|
||||
if((rc=read(clone_des, session_dir, 255))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_READ_FAILURE);
|
||||
}
|
||||
else{
|
||||
session_dir[rc]='\0';
|
||||
session_dir_path=(char*)malloc(strlen(t_info->local_mounts.name)+strlen(session_dir)+2);
|
||||
sprintf(session_dir_path, "%s/%s", t_info->local_mounts.name, session_dir);
|
||||
|
||||
/* write environment if needed */
|
||||
env_path=(char*)malloc(strlen(session_dir_path)+5);
|
||||
sprintf(env_path, "%s/env", session_dir_path);
|
||||
if(t_info->env){
|
||||
if((des1=open(env_path, O_WRONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
|
||||
}else{
|
||||
i=0;
|
||||
while(t_info->env[i]){
|
||||
/*printf("from lrx: %s\n", t_info->env[i]);
|
||||
*/if(write(des1, t_info->env[i], strlen(t_info->env[i])) == -1){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
|
||||
break;
|
||||
}else{
|
||||
if(t_info->env[i+1]){
|
||||
if(write(des1, "\n", 1) == -1){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
close(des1);
|
||||
}
|
||||
}
|
||||
free(env_path);
|
||||
|
||||
/*then copy binary*/
|
||||
exec_path=(char*)malloc(strlen(session_dir_path)+6);
|
||||
sprintf(exec_path, "%s/exec", session_dir_path);
|
||||
if((des1=open(exec_path, O_WRONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}else
|
||||
if((des2=open(t_info->binary, O_RDONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}else{
|
||||
while(1){
|
||||
if((rc=read(des2, character, 8192))<=0){
|
||||
if(close(des1)!=0){ /*?????*/
|
||||
/*no ORTE_ERR defined for FILE_CLOSE_FAILURE*/
|
||||
}
|
||||
if(close(des2)!=0){
|
||||
/*no ORTE_ERR defined for FILE_CLOSE_FAILURE*/
|
||||
}
|
||||
break;
|
||||
}else{
|
||||
if(write(des1, character, rc)==-1){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* then write args*/
|
||||
argv_path=(char*)malloc(strlen(session_dir_path)+6);
|
||||
sprintf(argv_path, "%s/argv", session_dir_path);
|
||||
if((des1=open(argv_path, O_WRONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}else{
|
||||
write(des1, t_info->argv, strlen(t_info->argv));
|
||||
close(des1);
|
||||
}
|
||||
/* then write exec into ctl file to start remote execution*/
|
||||
ctl_path=(char*)malloc(strlen(session_dir_path)+5);
|
||||
sprintf(ctl_path, "%s/ctl", session_dir_path);
|
||||
/*continuation of writing ctl*/
|
||||
if((des1=open(ctl_path, O_WRONLY))<0){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
|
||||
}else{
|
||||
if(write(des1, "exec\n", 5)==-1){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
|
||||
}else
|
||||
close(des1);
|
||||
}
|
||||
|
||||
/*then spawn threads for stderr and atdout*/
|
||||
g_stdout_thread_info=(orte_pls_xcpu_stdio_thread_info*)malloc(sizeof(orte_pls_xcpu_stdio_thread_info));
|
||||
g_stdout_thread_info->stdio_path=(char*)malloc(strlen(session_dir_path)+8);
|
||||
sprintf(g_stdout_thread_info->stdio_path, "%s/stdout", session_dir_path);
|
||||
g_stdout_thread_info->outdes=1;
|
||||
if((rc=pthread_create(&tids[0], NULL, orte_pls_xcpu_stdio_thread, (void*)g_stdout_thread_info))==0){
|
||||
trc[0]=1;
|
||||
}else ;
|
||||
/*ORTE_ERR for thread_creation_failure not defined yet*/
|
||||
/*fprintf(stderr, "\nstdout thread creation error\n");*/
|
||||
g_stderr_thread_info=(orte_pls_xcpu_stdio_thread_info*)malloc(sizeof(orte_pls_xcpu_stdio_thread_info));
|
||||
g_stderr_thread_info->stdio_path=(char*)malloc(strlen(session_dir_path)+8);
|
||||
sprintf(g_stderr_thread_info->stdio_path, "%s/stderr", session_dir_path);
|
||||
g_stderr_thread_info->outdes=2;
|
||||
if((rc=pthread_create(&tids[1], NULL, orte_pls_xcpu_stdio_thread, (void*)g_stderr_thread_info))==0){
|
||||
trc[1]=1;
|
||||
}else ;
|
||||
/*ORTE_ERR for thread_creation_failure not defined yet*/
|
||||
/*fprintf(stderr, "stderr thread creation error\n");*/
|
||||
|
||||
free(session_dir_path);
|
||||
free(exec_path);
|
||||
free(argv_path);
|
||||
free(ctl_path);
|
||||
if(trc[0]){
|
||||
pthread_join(tids[0], NULL);
|
||||
}
|
||||
if(trc[1]){
|
||||
pthread_join(tids[1], NULL);
|
||||
}
|
||||
}
|
||||
free(session_clone);
|
||||
(clone_des>0)?close(clone_des):0;
|
||||
/* make registry update thread-safe */
|
||||
pthread_mutex_lock(&mymutex);
|
||||
/*write into registry that you are done*/
|
||||
if (ORTE_SUCCESS != (orte_soh_base_set_proc_soh(t_info->peers, ORTE_PROC_STATE_TERMINATED, 0)) ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
pthread_mutex_unlock(&mymutex);
|
||||
/* free the allocated variables after you are done*/
|
||||
free(t_info->local_mounts.name);
|
||||
free(t_info->binary);
|
||||
free(t_info->argv);
|
||||
free(t_info);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
/* xcpu launcher function.
|
||||
* this function is called once for each process to be launched. or might
|
||||
* be called one time for multiple processes if regular expression is passed
|
||||
* to it. but for now regular expressions are not being passed.
|
||||
*
|
||||
* @argc: number of arguments or number of elements in argv
|
||||
* @argv: it will be name of remote node as mounted at $XCPUBASE or /mnt/xcpu/
|
||||
* @env: environment the needs to be setup on remote node before
|
||||
* starting the process
|
||||
* @peers: process info, this will be passed onto the threads to help them write
|
||||
* process completion information in open-mpi registry.
|
||||
*/
|
||||
orte_pls_xcpu_pthread_tindex *orte_pls_xcpu_launch_procs(int argc, char **argv, char **env, orte_process_name_t *peers){
|
||||
char *xcpu_base, *xcpu_argv;
|
||||
struct dirent *d_entry;
|
||||
DIR *dirp;
|
||||
int temp_fd, rc=0, index=0, argvsize=0, ntids=0;
|
||||
pthread_t *tids;
|
||||
orte_pls_xcpu_mount_nodes *m_nodes, *local_mounts;
|
||||
g_current_m=NULL;
|
||||
m_nodes=NULL;
|
||||
(!(xcpu_base=getenv("XCPUBASE")))?xcpu_base="/mnt/xcpu":0;
|
||||
if(!(dirp=opendir(xcpu_base))){
|
||||
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);/* it should be DIR_OPEN_ERROR */
|
||||
return NULL;
|
||||
}
|
||||
/* this logic should be fast than the one commented below*/
|
||||
m_nodes=(orte_pls_xcpu_mount_nodes*)malloc(sizeof(orte_pls_xcpu_mount_nodes));
|
||||
m_nodes->next=g_current_m;
|
||||
m_nodes->name=(char*)malloc(1+strlen(xcpu_base)+1+
|
||||
strlen(argv[1])+1+strlen("xcpu")+1);
|
||||
sprintf(m_nodes->name, "%s/%s/xcpu", xcpu_base, argv[1]);
|
||||
if((temp_fd=open(m_nodes->name, O_RDONLY))<0){
|
||||
fprintf(stderr, "Node %s/%s/xcpu does not exist\n",xcpu_base, argv[1]);
|
||||
free(m_nodes->name);
|
||||
}else{
|
||||
close(temp_fd);
|
||||
g_current_m=m_nodes;
|
||||
ntids=1;
|
||||
}
|
||||
/* logic ends */
|
||||
|
||||
/*
|
||||
while((d_entry=readdir(dirp))!=NULL){
|
||||
printf("comapring %s %s\n",d_entry->d_name, argv[1]);
|
||||
if((strcmp(d_entry->d_name, ".")==0)||(strcmp(d_entry->d_name, "..")==0))
|
||||
;else
|
||||
if(regexec(&g_compiled_exp, d_entry->d_name, 0, NULL, 0)!=REG_NOMATCH){
|
||||
printf("matched %s\n", argv[1]);
|
||||
ntids++;
|
||||
m_nodes=(orte_pls_xcpu_mount_nodes*)malloc(sizeof(orte_pls_xcpu_mount_nodes));
|
||||
m_nodes->next=g_current_m;
|
||||
m_nodes->name=(char*)malloc(1+strlen(xcpu_base)+1+
|
||||
strlen(d_entry->d_name)+1+strlen("xcpu")+1);
|
||||
sprintf(m_nodes->name, "%s/%s/xcpu", xcpu_base, d_entry->d_name);
|
||||
g_current_m=m_nodes;
|
||||
*/ /* we can break after finding the first one
|
||||
* or if you want to give the user an option of
|
||||
* specifying regular expressions in hostfiles
|
||||
* then don't break here
|
||||
*/
|
||||
/* on a second thought we should not be going thrugh mounted node list
|
||||
* just check if xcpu_base/d_entry->d_name/xcpu exists or not
|
||||
*/
|
||||
/* break;
|
||||
}
|
||||
}*/
|
||||
if(g_current_m==NULL){ /* is that an error.... no?*/
|
||||
return NULL;
|
||||
}
|
||||
closedir(dirp);
|
||||
/* now combine argv's so that they could be passed on */
|
||||
/* g_regexploc will have proper value only if
|
||||
* cmd_check is already called in lrx
|
||||
* and the location of first arg after name of binary will be
|
||||
* argv[g_regexploc+2] because usage: ./o.lrx [-D xx] regexp binary args
|
||||
*/
|
||||
/* number of arguments = argc - g_regexploc - 2;*/
|
||||
index=g_regexploc+2-1; /*argv[0] could be anything*/
|
||||
while(argv[index]){
|
||||
argvsize+=strlen(argv[index])+1;
|
||||
index++;
|
||||
}
|
||||
xcpu_argv=(char*)malloc(argvsize+1);
|
||||
index=g_regexploc+2-1;
|
||||
while(argv[index]){
|
||||
if(index==g_regexploc+2-1)
|
||||
strcpy(xcpu_argv, argv[index]);/* i dont know why strcpy 1st time?*/
|
||||
else
|
||||
strcat(xcpu_argv, argv[index]);
|
||||
strcat(xcpu_argv, " ");
|
||||
index++;
|
||||
}
|
||||
xcpu_argv[argvsize]='\0';
|
||||
local_mounts=g_current_m; /* this is a linked list of mounted directories
|
||||
* where binaries need to run
|
||||
*/
|
||||
tids=(pthread_t*)malloc(ntids*sizeof(pthread_t));
|
||||
index=0;
|
||||
while(local_mounts){
|
||||
/* dont use a shared copy
|
||||
* give every thread its own copy since we dont know
|
||||
* when all threads will exit and when to free a shared copy
|
||||
*/
|
||||
g_thread_info=(orte_pls_xcpu_thread_info*)malloc(sizeof(orte_pls_xcpu_thread_info));
|
||||
/*copy name first*/
|
||||
g_thread_info->local_mounts.name=(char*)malloc(strlen(local_mounts->name)+1);
|
||||
strcpy(g_thread_info->local_mounts.name, local_mounts->name);
|
||||
/*then copy binary*/
|
||||
g_thread_info->binary=(char*)malloc(strlen(argv[g_regexploc+1])+1);
|
||||
strcpy(g_thread_info->binary,argv[g_regexploc+1]);
|
||||
/*then copy argv*/
|
||||
g_thread_info->argv=(char*)malloc(strlen(xcpu_argv)+1);
|
||||
strcpy(g_thread_info->argv, xcpu_argv);
|
||||
/* for env and peers, since we are not allocating space for these
|
||||
* and these will be freed after all the threads are completed at the
|
||||
* end of mpirun (i hope).. otherwise we might have to copy these
|
||||
* first and then pass to threads
|
||||
*/
|
||||
g_thread_info->env=env;
|
||||
g_thread_info->peers=peers;
|
||||
|
||||
/*following thread will free the thread_info structure*/
|
||||
rc=pthread_create(&tids[index], NULL, orte_pls_xcpu_start_thread, (void*)g_thread_info);
|
||||
index++;
|
||||
if(rc){
|
||||
/*ORTE_ERR for thread_creation_failure not defined yet*/
|
||||
/*fprintf(stderr, "pthread_create: error while creating thread %d\n", rc);*/
|
||||
return NULL;
|
||||
}
|
||||
local_mounts=local_mounts->next;
|
||||
}
|
||||
/* use pthrad_join here if you want to wait for threads
|
||||
* to finish execution
|
||||
*//*
|
||||
while(1){
|
||||
index--;
|
||||
pthread_join(tids[index], NULL);
|
||||
if(index==0)
|
||||
break;
|
||||
}
|
||||
free(tids);*/
|
||||
/* remember to free tids in calling function*/
|
||||
free(xcpu_argv);
|
||||
t_info.tids=tids;
|
||||
t_info.index=index;
|
||||
return &t_info;
|
||||
}
|
||||
|
||||
/* this function is to check if argv is in correct format.
|
||||
* Some checks being done in this function (for -D) are not necessary
|
||||
* and will be removed in future.
|
||||
*/
|
||||
int orte_pls_xcpu_cmd_check(int argc, char **argv){
|
||||
char *temp_exp;
|
||||
int rc=0;
|
||||
g_regexploc=1;
|
||||
if(argc>=3){
|
||||
if(argv[1][0]=='-'){
|
||||
switch(argv[1][1]){
|
||||
case 'D': /* for debugging*/
|
||||
g_regexploc+=2;
|
||||
if(argc<5){
|
||||
/*fprintf(stderr, "usage: o.lrx [-D debuglevel"
|
||||
"] nodes binary [argv0 argv1 ...]\n");
|
||||
*/rc=1;
|
||||
}
|
||||
break;
|
||||
default: /* unspecified option*/
|
||||
/*fprintf(stderr, "usage: o.lrx [-D debuglevel"
|
||||
"] nodes binary [argv0 argv1 ...]\n");
|
||||
*/return 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
/*fprintf(stderr, "usage: o.lrx [-D debuglevel"
|
||||
"] nodes binary [argv0 argv1 ...]\n");
|
||||
*/rc=1;
|
||||
}
|
||||
if(!rc){/*check for regular expression*/
|
||||
temp_exp=(char*)malloc(strlen(argv[g_regexploc])+3);
|
||||
sprintf(temp_exp, "^%s$", argv[g_regexploc]);
|
||||
rc=orte_pls_xcpu_check_exp(temp_exp);
|
||||
free(temp_exp);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Release every element of a mount-node list: each element's name string
 * and the element itself. A NULL list is accepted and does nothing.
 */
void orte_pls_xcpu_free_mount(orte_pls_xcpu_mount_nodes *g_current_m){
    orte_pls_xcpu_mount_nodes *following;

    while(NULL!=g_current_m){
        following=g_current_m->next;
        free(g_current_m->name);
        free(g_current_m);
        g_current_m=following;
    }
}
|
||||
|
||||
void orte_pls_xcpu_cleanup(){
|
||||
regfree(&g_compiled_exp);
|
||||
orte_pls_xcpu_free_mount(g_current_m);
|
||||
}
|
||||
|
||||
|
||||
/* Launcher can accept regular expressions as the list of nodes where
|
||||
* processes are going to be launched. This is just a helper function to check
|
||||
* if regular expression is correct or not
|
||||
*/
|
||||
int orte_pls_xcpu_check_exp(char *exp){
|
||||
if(regcomp(&g_compiled_exp, exp, REG_EXTENDED|REG_NOSUB)){
|
||||
/*fprintf(stderr, "Invlid regular expression: %s\n", exp);*/
|
||||
return 1;
|
||||
}
|
||||
/*regfree(&g_compiled_exp);*/
|
||||
return 0; /* now dont forget to call regfree at the end*/
|
||||
}
|
||||
|
||||
/* This is the main launcher function
|
||||
* It will call orte_pls_xcpu_launch_procs which will
|
||||
* start a thread for each process to be launched
|
||||
*/
|
||||
int lrx(int argc, char **argv, char **env, orte_process_name_t *peers){
|
||||
int rc;
|
||||
orte_pls_xcpu_pthread_tindex *t_info;
|
||||
if((rc=orte_pls_xcpu_cmd_check(argc, argv))==1){
|
||||
return 0;
|
||||
}
|
||||
if((t_info=orte_pls_xcpu_launch_procs(argc, argv, env, peers))==NULL){
|
||||
/*fprintf(stderr, "lrx: 0 processes launched\n");*/
|
||||
orte_pls_xcpu_cleanup();
|
||||
return 0;
|
||||
}
|
||||
else{
|
||||
orte_pls_xcpu_cleanup();
|
||||
t_info->index--;
|
||||
rc=t_info->tids[t_info->index];
|
||||
free(t_info->tids);
|
||||
return rc; /* no need to return thread_id
|
||||
* thread will write its completition
|
||||
* itself in the registry
|
||||
*/
|
||||
}
|
||||
/*
|
||||
while(1){
|
||||
t_info->index--;
|
||||
pthread_join(t_info->tids[t_info->index], NULL);
|
||||
if(t_info->index==0)
|
||||
break;
|
||||
}
|
||||
*/
|
||||
return 0;/* can never be called*/
|
||||
}
|
||||
|
||||
|
||||
/** provide a function to setup the environment for the remote
|
||||
* processes. We need to ensure that the remote processes know
|
||||
* their gpr and ns replicas, the universe
|
||||
* to which they belong, etc. - otherwise, they may run, but they
|
||||
* will never actually join the rest of the job. This function
|
||||
* creates the common environment for all the processes.
|
||||
*
|
||||
* @param env a pointer to the environment to setup
|
||||
*/
|
||||
static int orte_pls_xcpu_setup_env(char ***env)
|
||||
{
|
||||
char ** merged;
|
||||
char * var;
|
||||
char * param;
|
||||
int rc;
|
||||
int num_env;
|
||||
|
||||
/** merge in environment */
|
||||
merged = opal_environ_merge(*env, environ);
|
||||
opal_argv_free(*env);
|
||||
*env = merged;
|
||||
|
||||
num_env = opal_argv_count(*env);
|
||||
/** append mca parameters to our environment */
|
||||
if(ORTE_SUCCESS != (rc = mca_base_param_build_env(env, &num_env, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
/** ns replica contact info */
|
||||
if (NULL != orte_process_info.ns_replica) {
|
||||
param = strdup(orte_process_info.ns_replica_uri);
|
||||
} else {
|
||||
param = orte_rml.get_uri();
|
||||
}
|
||||
var = mca_base_param_environ_variable("ns","replica","uri");
|
||||
opal_setenv(var, param, true, env);
|
||||
free(var);
|
||||
var = mca_base_param_environ_variable("ns","replica","uri");
|
||||
opal_setenv(var, param, true, env);
|
||||
free(var);
|
||||
|
||||
/** make sure the frontend hostname does not get pushed out to the backend */
|
||||
var = mca_base_param_environ_variable("orte", "base", "nodename");
|
||||
opal_unsetenv(var, env);
|
||||
free(var);
|
||||
opal_unsetenv("HOSTNAME", env);
|
||||
|
||||
/** gpr replica contact info */
|
||||
if (NULL != orte_process_info.gpr_replica) {
|
||||
param = strdup(orte_process_info.gpr_replica_uri);
|
||||
} else {
|
||||
param = orte_rml.get_uri();
|
||||
}
|
||||
var = mca_base_param_environ_variable("gpr","replica","uri");
|
||||
opal_setenv(var, param, true, env);
|
||||
free(param);
|
||||
free(var);
|
||||
|
||||
/** universe name */
|
||||
var = mca_base_param_environ_variable("universe", NULL, NULL);
|
||||
asprintf(¶m, "%s@%s:%s", orte_universe_info.uid,
|
||||
orte_universe_info.host, orte_universe_info.name);
|
||||
opal_setenv(var, param, true, env);
|
||||
free(param);
|
||||
free(var);
|
||||
|
||||
/** make sure hostname doesn't get pushed to backend node */
|
||||
opal_unsetenv("HOSTNAME", env);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/** LAUNCH **/
|
||||
|
||||
/* This is the main function that will launch jobs on remote compute modes
|
||||
* @param jobid the jobid of the job to launch
|
||||
* @retval ORTE_SUCCESS or error
|
||||
*/
|
||||
int orte_pls_xcpu_launch(orte_jobid_t jobid){
|
||||
opal_list_t mapping;
|
||||
char *param, *var;
|
||||
char *header[] = {
|
||||
"dummy",
|
||||
NULL,
|
||||
NULL};
|
||||
int argc;
|
||||
int rc;
|
||||
int i;
|
||||
size_t nprocs=0, proc_id=0;
|
||||
orte_pls_xcpu_tid_stack *t_stack, *temp_stack;
|
||||
opal_list_item_t *item;
|
||||
orte_rmaps_base_map_t* map;
|
||||
orte_rmaps_base_node_t *node;
|
||||
orte_rmaps_base_proc_t *proc;
|
||||
orte_vpid_t vpid_start, vpid_range;
|
||||
orte_process_name_t *peers;
|
||||
int peer_id, num_peers;
|
||||
/** first get the mapping we are going to use to launch job. The head
|
||||
* of the list is OBJ_CONSTRUCT'd since it is not dynamically allocated. The
|
||||
* get_map function, however, will dynamically allocate the items in the
|
||||
* list itself - these will be released when we OBJ_DESTRUCT the list at
|
||||
* the end
|
||||
*/
|
||||
OBJ_CONSTRUCT(&mapping, opal_list_t);
|
||||
/** get the mapping from the registry. This will provide a linked list, one
|
||||
* item for each mapping. Each item contains the full context of the application
|
||||
* that is to be executed upon that node. In particular, we need to obtain
|
||||
* the argv array that is included in that context as this tells us the application
|
||||
* to launch plus any "flags" to pass to it.
|
||||
*/
|
||||
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/** next, get the vpid_start and range info so we can pass it along */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_vpid_range(jobid, &vpid_start, &vpid_range))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/** we have to do the following so that we can use the opal_argv utilities
|
||||
* to properly insert the header into the app's argv
|
||||
*/
|
||||
header[1] = strdup("dummy");
|
||||
|
||||
/** Now loop through all the provided maps to launch their associated apps */
|
||||
t_stack=NULL;
|
||||
nprocs = 0;
|
||||
peer_id=0;
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&peers, &num_peers, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
for(item = opal_list_get_first(&mapping);
|
||||
item != opal_list_get_end(&mapping);
|
||||
item = opal_list_get_next(item)) {
|
||||
map = (orte_rmaps_base_map_t*) item;
|
||||
|
||||
/** xcpu requires an argv format that has a dummy filler in the
|
||||
* first location, followed by the node name, and then the standard
|
||||
* argv array we've all come to know and love (i.e., the application
|
||||
* name followed by options). We use the opal_argv utilities to
|
||||
* prepend this header info to the application's argv.
|
||||
*
|
||||
* Note: at this point, the header contains a dummy placeholder
|
||||
* for the node name - we'll fill that in later.
|
||||
*/
|
||||
opal_argv_insert(&(map->app->argv), 0, header);
|
||||
|
||||
/** we also need to pass the proper environment to the remote
|
||||
* process so it knows its universe, gpr and ns replicas, etc. Since this
|
||||
* can be specified by the user for each app, we have to do this
|
||||
* each time.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_xcpu_setup_env(&map->app->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/** since it is possible that each node could be executing a different application,
|
||||
* we cannot just do a mass launch - that would only be supported in the special
|
||||
* case of all the application processes being identical. Instead, we are going to
|
||||
* step our way through the list, launching each process individually.
|
||||
*/
|
||||
proc_id=0;
|
||||
while (proc_id < map->num_procs){
|
||||
char** env;
|
||||
proc = (orte_rmaps_base_proc_t*)(map->procs[proc_id]);
|
||||
node = proc->proc_node;
|
||||
proc_id++;
|
||||
|
||||
/** each proc_t entry contains the application to be executed,
|
||||
* the node upon which it is to be executed, and its OpenRTE
|
||||
* process name (plus a few other things). We use that
|
||||
* info to build the launch command by inserting them into
|
||||
* the argv array
|
||||
*/
|
||||
|
||||
/** start by pointing the proper location at the node name where
|
||||
* this process is to be launched
|
||||
*/
|
||||
if (NULL != map->app->argv[1]) free(map->app->argv[1]);
|
||||
map->app->argv[1] = strdup(node->node->node_name);
|
||||
|
||||
/* create a copy of the environment and modify for this proc */
|
||||
env = opal_argv_copy(map->app->env);
|
||||
|
||||
/** now setup the process name in the environment so we can
|
||||
* retrieve it on the other end
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_nds_env_put(&(proc->proc_name),
|
||||
vpid_start, map->num_procs,
|
||||
&env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/** the launcher wants to know how long the argv array is - get that now */
|
||||
argc = opal_argv_count(map->app->argv);
|
||||
|
||||
/** add this process to the stack so we can track it */
|
||||
temp_stack=(orte_pls_xcpu_tid_stack*)malloc(sizeof(orte_pls_xcpu_tid_stack));
|
||||
temp_stack->next=t_stack;
|
||||
t_stack=temp_stack;
|
||||
|
||||
/** launch the process */
|
||||
t_stack->tid=lrx(argc, map->app->argv, env, &peers[peer_id]);
|
||||
if(t_stack->tid==0){
|
||||
/* first kill all the processes started on remote nodes
|
||||
*/
|
||||
i=0;
|
||||
while(i<num_peers){
|
||||
if (ORTE_SUCCESS != (orte_soh_base_set_proc_soh(&peers[i], ORTE_PROC_STATE_TERMINATED, 0)) ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
peer_id++;
|
||||
}
|
||||
}
|
||||
|
||||
/** cleanup local storage */
|
||||
orte_pls_xcpu_free_stack(temp_stack);
|
||||
OBJ_DESTRUCT(&mapping);
|
||||
|
||||
/** launch complete */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Job termination entry point of the xcpu launcher.
 * Placeholder: performs no work and reports success.
 */
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid){
    (void)jobid; /* not used */
    return ORTE_SUCCESS;
}
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name){
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
int orte_pls_xcpu_finalize(void){
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,125 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Header file for the xcpu launcher. This will use xcpu to launch jobs on
|
||||
* the list of nodes that it will get from RAS (resource allocation
|
||||
* system).
|
||||
* -# pls_xcpu is called by orterun. It first sets up the environment for the
|
||||
* process to be launched on remote node, then reads the ompi registry and
|
||||
* then launch the binary on the nodes specified in the registry.
|
||||
*/
|
||||
|
||||
/* Include guard: the macro tested and the macro defined must be the same,
 * otherwise a second inclusion is not suppressed. */
#ifndef ORTE_PLS_XCPU_H_
#define ORTE_PLS_XCPU_H_
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close -- defined in component file
|
||||
*/
|
||||
int orte_pls_xcpu_component_open(void);
|
||||
int orte_pls_xcpu_component_close(void);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file */
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_xcpu_launch(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_xcpu_finalize(void);
|
||||
|
||||
|
||||
/**
|
||||
* (P)rocess (L)aunch (S)ubsystem xcpu Component
|
||||
*/
|
||||
struct orte_pls_xcpu_component_t {
|
||||
/*base_class this is needed others below this may or may not*/
|
||||
orte_pls_base_component_t super;
|
||||
|
||||
int debug; /* If greater than 0 print debugging information */
|
||||
int priority; /* The priority of this component. This will be returned if
|
||||
* we determine that xcpu is available and running on this node,
|
||||
*/
|
||||
int terminate_sig; /* The signal that gets sent to a process to kill it. */
|
||||
size_t num_daemons; /* The number of daemons that are currently running. */
|
||||
orte_pointer_array_t * daemon_names;
|
||||
opal_mutex_t lock; /* Lock used to prevent some race conditions */
|
||||
opal_condition_t condition; /* Condition that is signaled when all the daemons have died */
|
||||
orte_cellid_t cellid;
|
||||
};
|
||||
typedef struct orte_pls_xcpu_component_t orte_pls_xcpu_component_t;
|
||||
|
||||
/* One entry of the stack used by the launcher to track launched processes. */
typedef struct orte_pls_xcpu_tid_stack {
    int tid;                               /* value returned by lrx() for this process */
    struct orte_pls_xcpu_tid_stack *next;  /* entry pushed before this one, or NULL */
} orte_pls_xcpu_tid_stack;
|
||||
|
||||
/* One element of the list of mounted xcpu node directories. */
typedef struct orte_pls_xcpu_mount_nodes {
    char *name;                              /* heap-allocated path of the node's xcpu directory */
    struct orte_pls_xcpu_mount_nodes *next;  /* next element, or NULL */
} orte_pls_xcpu_mount_nodes;
|
||||
|
||||
struct orte_pls_xcpu_thread_info{
|
||||
orte_pls_xcpu_mount_nodes local_mounts;/* can have only *name */
|
||||
char *binary;
|
||||
char *argv;
|
||||
char **env;
|
||||
orte_process_name_t *peers;
|
||||
};
|
||||
typedef struct orte_pls_xcpu_thread_info orte_pls_xcpu_thread_info;
|
||||
|
||||
/* Argument block handed to a stdio forwarding thread. */
typedef struct orte_pls_xcpu_stdio_thread_info {
    char *stdio_path;  /* path of the stdio file to read */
    int outdes;        /* descriptor the data is written to */
} orte_pls_xcpu_stdio_thread_info;
|
||||
|
||||
/* Thread ids created by one launch call together with their count. */
typedef struct orte_pls_xcpu_pthread_tindex {
    pthread_t *tids;  /* array of thread ids */
    int index;        /* number of entries in tids */
} orte_pls_xcpu_pthread_tindex;
|
||||
|
||||
ORTE_DECLSPEC extern orte_pls_xcpu_component_t mca_pls_xcpu_component;
|
||||
ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_xcpu_module; /* this is defined in pls_xcpu.c file */
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif /* ORTE_PLS_XCPU_H_ */
|
||||
|
@ -1,101 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Takes care of the component stuff for the MCA.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "pls_xcpu.h"
|
||||
|
||||
/**
|
||||
* The xcpu component data structure that stores all the relevent data about
|
||||
* this component.
|
||||
*/
|
||||
orte_pls_xcpu_component_t mca_pls_xcpu_component = {
|
||||
{ /* version, data and init members of only first
|
||||
* structure (called super) being initialized
|
||||
*/
|
||||
{
|
||||
ORTE_PLS_BASE_VERSION_1_0_0,
|
||||
"xcpu", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_pls_xcpu_component_open, /* component open */
|
||||
orte_pls_xcpu_component_close /* component close */
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_pls_xcpu_init /* component init */
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Opens the pls_xcpu component, setting all the needed mca parameters and
|
||||
* finishes setting up the component struct.
|
||||
*/
|
||||
int orte_pls_xcpu_component_open(void) {
|
||||
int rc;
|
||||
/* init parameters */
|
||||
/*read trunk/opal/mca/base/mca_base_param.h for reg_int details*/
|
||||
mca_base_component_t *c = &mca_pls_xcpu_component.super.pls_version;
|
||||
mca_base_param_reg_int(c, "priority", NULL, false, false,5,
|
||||
&mca_pls_xcpu_component.priority);
|
||||
mca_base_param_reg_int(c, "debug",
|
||||
"If > 0 prints library debugging information",
|
||||
false, false, 0, &mca_pls_xcpu_component.debug);
|
||||
mca_base_param_reg_int(c, "terminate_sig",
|
||||
"Signal sent to processes to terminate them", false,
|
||||
false, 9, &mca_pls_xcpu_component.terminate_sig);
|
||||
mca_pls_xcpu_component.num_daemons = 0;
|
||||
OBJ_CONSTRUCT(&mca_pls_xcpu_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_pls_xcpu_component.condition, opal_condition_t);
|
||||
rc = orte_pointer_array_init(&mca_pls_xcpu_component.daemon_names, 8, 200000, 8);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the pls_xcpu component
|
||||
*/
|
||||
int orte_pls_xcpu_component_close(void) {
|
||||
OBJ_DESTRUCT(&mca_pls_xcpu_component.lock);
|
||||
OBJ_DESTRUCT(&mca_pls_xcpu_component.condition);
|
||||
OBJ_RELEASE(mca_pls_xcpu_component.daemon_names);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority) {
|
||||
/* check if xcpu component should be loaded or not
|
||||
* if not, then return NULL here
|
||||
*/
|
||||
*priority = mca_pls_xcpu_component.priority;
|
||||
return &orte_pls_xcpu_module; /* this is defined in pls_xcpu.c and will contains
|
||||
* function pointers for launch, terminate_job
|
||||
* terminate_proc and finalize
|
||||
*/
|
||||
}
|
||||
|
@ -25,7 +25,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/ras/base/ras_base_node.h"
|
||||
|
@ -21,7 +21,7 @@
|
||||
#define ORTE_RAS_BASE_NODE_H
|
||||
|
||||
#include "orte/orte_types.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/rmgr/rmgr_types.h"
|
||||
#include "orte/mca/ras/ras.h"
|
||||
|
||||
|
@ -19,7 +19,7 @@
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
|
||||
#ifndef ORTE_MCA_RAS_TYPES_H
|
||||
#define ORTE_MCA_RAS_TYPES_H
|
||||
|
@ -32,7 +32,7 @@
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
|
||||
|
||||
/**
|
||||
|
@ -31,7 +31,7 @@
|
||||
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "orte/mca/ras/base/ras_base_node.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
@ -36,7 +36,7 @@
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
|
||||
@ -187,7 +187,7 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
|
||||
free(trig_keys[0]);
|
||||
|
||||
/* set the job state to "launched" */
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_LAUNCHED))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_LAUNCHED))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -235,22 +235,22 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
|
||||
|
||||
/* set the job state to the appropriate level */
|
||||
if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER)) {
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG1))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG1))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG2_TRIGGER)) {
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG2))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG2))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG3_TRIGGER)) {
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG3))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG3))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_FINALIZED_TRIGGER)) {
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_FINALIZED))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FINALIZED))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
@ -299,7 +299,7 @@ int orte_rmgr_base_proc_stage_gate_mgr_abort(orte_gpr_notify_message_t *msg)
|
||||
|
||||
/* set the job status to "aborted" */
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_ABORTED))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_ABORTED))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
#include "rmgr_types.h"
|
||||
|
||||
/*
|
||||
|
@ -40,7 +40,7 @@
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
|
||||
#include "orte/mca/rmgr/urm/rmgr_urm.h"
|
||||
|
||||
@ -184,7 +184,7 @@ static int orte_rmgr_urm_launch(orte_jobid_t jobid)
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = mca_rmgr_urm_component.urm_pls->launch(jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
ret2 = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED);
|
||||
ret2 = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_ABORTED);
|
||||
if (ORTE_SUCCESS != ret2) {
|
||||
ORTE_ERROR_LOG(ret2);
|
||||
return ret2;
|
||||
|
@ -17,20 +17,20 @@
|
||||
#
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_soh.la
|
||||
libmca_soh_la_SOURCES =
|
||||
noinst_LTLIBRARIES = libmca_smr.la
|
||||
libmca_smr_la_SOURCES =
|
||||
|
||||
# header setup
|
||||
nobase_orte_HEADERS =
|
||||
|
||||
# local files
|
||||
headers = soh.h soh_types.h
|
||||
libmca_soh_la_SOURCES += $(headers)
|
||||
headers = smr.h smr_types.h
|
||||
libmca_smr_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
nobase_orte_HEADERS += $(headers)
|
||||
ortedir = $(includedir)/openmpi/orte/mca/soh
|
||||
ortedir = $(includedir)/openmpi/orte/mca/smr
|
||||
else
|
||||
ortedir = $(includedir)
|
||||
endif
|
37
orte/mca/smr/base/Makefile.am
Обычный файл
37
orte/mca/smr/base/Makefile.am
Обычный файл
@ -0,0 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h
|
||||
|
||||
libmca_smr_la_SOURCES += \
|
||||
base/smr_base_close.c \
|
||||
base/smr_base_select.c \
|
||||
base/smr_base_local_functions.c \
|
||||
base/smr_base_get_proc_state.c \
|
||||
base/smr_base_set_proc_state.c \
|
||||
base/smr_base_get_job_state.c \
|
||||
base/smr_base_set_job_state.c \
|
||||
base/smr_base_open.c \
|
||||
base/data_type_support/smr_data_type_compare_fns.c \
|
||||
base/data_type_support/smr_data_type_copy_fns.c \
|
||||
base/data_type_support/smr_data_type_print_fns.c \
|
||||
base/data_type_support/smr_data_type_release_fns.c \
|
||||
base/data_type_support/smr_data_type_size_fns.c \
|
||||
base/data_type_support/smr_data_type_packing_fns.c \
|
||||
base/data_type_support/smr_data_type_unpacking_fns.c
|
64
orte/mca/smr/base/base.h
Обычный файл
64
orte/mca/smr/base/base.h
Обычный файл
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SMR_BASE_H
|
||||
#define MCA_SMR_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "opal/mca/mca.h"
|
||||
/* #include "orte/mca/ns/ns_types.h" */
|
||||
#include "orte/mca/smr/smr.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for opening, selecting, and closing the SMR framework
|
||||
*/
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
 * Open the SMR framework: sets up the framework's debug output stream,
 * registers the node/proc/job state and exit-code data types with the DSS,
 * and opens the available SMR components. Returns an ORTE error code if
 * the components cannot be opened.
 */
OMPI_DECLSPEC int orte_smr_base_open(void);
|
||||
/*
 * Walk the opened components, call each one's smr_init, and - when a best
 * module was found - copy it into the global orte_smr module.
 */
OMPI_DECLSPEC int orte_smr_base_select(void);
|
||||
/*
 * Close the framework: calls orte_smr.finalize when it is non-NULL, then
 * closes all remaining components on the framework's component list.
 */
OMPI_DECLSPEC int orte_smr_base_close(void);
|
||||
|
||||
/*
 * Framework-wide bookkeeping for the SMR base; a single global instance
 * is exposed as orte_smr_base.
 */
typedef struct orte_smr_base_t {
|
||||
/* Output stream id used for the framework's debug messages; set from
 * opal_output_open(), or to -1 when the smr_base verbose parameter is 0. */
int smr_output;
|
||||
/* List of opened SMR components; filled by mca_base_components_open(),
 * iterated during selection, and released by mca_base_components_close(). */
opal_list_t smr_components;
|
||||
} orte_smr_base_t;
|
||||
|
||||
OMPI_DECLSPEC extern orte_smr_base_t orte_smr_base;
|
||||
|
||||
|
||||
/*
|
||||
* external API functions will be documented in the mca/smr/smr.h file
|
||||
*/
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
@ -24,12 +24,12 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* EXIT CODE
|
||||
*/
|
||||
int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
|
||||
int orte_smr_base_compare_exit_code(orte_exit_code_t *value1,
|
||||
orte_exit_code_t *value2,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
@ -43,7 +43,7 @@ int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
|
||||
/*
|
||||
* NODE STATE
|
||||
*/
|
||||
int orte_soh_base_compare_node_state(orte_node_state_t *value1,
|
||||
int orte_smr_base_compare_node_state(orte_node_state_t *value1,
|
||||
orte_node_state_t *value2,
|
||||
orte_node_state_t type)
|
||||
{
|
||||
@ -57,7 +57,7 @@ int orte_soh_base_compare_node_state(orte_node_state_t *value1,
|
||||
/*
|
||||
* PROC STATE
|
||||
*/
|
||||
int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
|
||||
int orte_smr_base_compare_proc_state(orte_proc_state_t *value1,
|
||||
orte_proc_state_t *value2,
|
||||
orte_proc_state_t type)
|
||||
{
|
||||
@ -71,7 +71,7 @@ int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
|
||||
/*
|
||||
* JOB STATE
|
||||
*/
|
||||
int orte_soh_base_compare_job_state(orte_job_state_t *value1,
|
||||
int orte_smr_base_compare_job_state(orte_job_state_t *value1,
|
||||
orte_job_state_t *value2,
|
||||
orte_job_state_t type)
|
||||
{
|
@ -24,12 +24,12 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* COPY FUNCTIONS FOR NON-COMPLEX (UNSTRUCTURED) TYPES
|
||||
*/
|
||||
int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type)
|
||||
int orte_smr_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type)
|
||||
{
|
||||
orte_proc_state_t *ps;
|
||||
|
||||
@ -45,7 +45,7 @@ int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *s
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type)
|
||||
int orte_smr_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type)
|
||||
{
|
||||
orte_job_state_t *ps;
|
||||
|
||||
@ -61,7 +61,7 @@ int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type)
|
||||
int orte_smr_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type)
|
||||
{
|
||||
orte_node_state_t *ps;
|
||||
|
||||
@ -77,7 +77,7 @@ int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *s
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_soh_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type)
|
||||
int orte_smr_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type)
|
||||
{
|
||||
orte_exit_code_t *ps;
|
||||
|
@ -26,12 +26,12 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* EXIT CODE
|
||||
*/
|
||||
int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_exit_code(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -46,7 +46,7 @@ int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
|
||||
/*
|
||||
* NODE STATE
|
||||
*/
|
||||
int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_node_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -61,7 +61,7 @@ int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
|
||||
/*
|
||||
* PROC STATE
|
||||
*/
|
||||
int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -76,7 +76,7 @@ int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
||||
/*
|
||||
* JOB STATE
|
||||
*/
|
||||
int orte_soh_base_pack_job_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_job_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
@ -23,33 +23,33 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
static void orte_soh_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size);
|
||||
static void orte_smr_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size);
|
||||
|
||||
/*
|
||||
* STANDARD PRINT FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
|
||||
*/
|
||||
int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type)
|
||||
int orte_smr_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type)
|
||||
{
|
||||
/* set default result */
|
||||
*output = NULL;
|
||||
|
||||
switch(type) {
|
||||
case ORTE_PROC_STATE:
|
||||
orte_soh_base_quick_print(output, "ORTE_PROC_STATE", prefix, src, sizeof(orte_proc_state_t));
|
||||
orte_smr_base_quick_print(output, "ORTE_PROC_STATE", prefix, src, sizeof(orte_proc_state_t));
|
||||
break;
|
||||
|
||||
case ORTE_JOB_STATE:
|
||||
orte_soh_base_quick_print(output, "ORTE_JOB_STATE", prefix, src, sizeof(orte_job_state_t));
|
||||
orte_smr_base_quick_print(output, "ORTE_JOB_STATE", prefix, src, sizeof(orte_job_state_t));
|
||||
break;
|
||||
|
||||
case ORTE_NODE_STATE:
|
||||
orte_soh_base_quick_print(output, "ORTE_NODE_STATE", prefix, src, sizeof(orte_node_state_t));
|
||||
orte_smr_base_quick_print(output, "ORTE_NODE_STATE", prefix, src, sizeof(orte_node_state_t));
|
||||
break;
|
||||
|
||||
case ORTE_EXIT_CODE:
|
||||
orte_soh_base_quick_print(output, "ORTE_EXIT_CODE", prefix, src, sizeof(orte_exit_code_t));
|
||||
orte_smr_base_quick_print(output, "ORTE_EXIT_CODE", prefix, src, sizeof(orte_exit_code_t));
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -60,7 +60,7 @@ int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_ty
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void orte_soh_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size)
|
||||
static void orte_smr_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size)
|
||||
{
|
||||
uint8_t *ui8;
|
||||
uint16_t *ui16;
|
@ -23,12 +23,12 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* STANDARD RELEASE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
|
||||
*/
|
||||
void orte_soh_base_std_release(orte_data_value_t *value)
|
||||
void orte_smr_base_std_release(orte_data_value_t *value)
|
||||
{
|
||||
free(value->data);
|
||||
value->data = NULL;
|
@ -23,12 +23,12 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
|
||||
*/
|
||||
int orte_soh_base_std_size(size_t *size, void *src, orte_data_type_t type)
|
||||
int orte_smr_base_std_size(size_t *size, void *src, orte_data_type_t type)
|
||||
{
|
||||
switch(type) {
|
||||
case ORTE_PROC_STATE:
|
@ -26,12 +26,12 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss_internal.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
/*
|
||||
* EXIT CODE
|
||||
*/
|
||||
int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -46,7 +46,7 @@ int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
|
||||
/*
|
||||
* NODE STATE
|
||||
*/
|
||||
int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -61,7 +61,7 @@ int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
|
||||
/*
|
||||
* PROC STATE
|
||||
*/
|
||||
int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
||||
@ -76,7 +76,7 @@ int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
||||
/*
|
||||
* JOB STATE
|
||||
*/
|
||||
int orte_soh_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type)
|
||||
{
|
||||
int rc;
|
@ -21,25 +21,26 @@
|
||||
#include "orte/orte_constants.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
int orte_soh_base_close(void)
|
||||
int orte_smr_base_close(void)
|
||||
{
|
||||
/* If we have a selected component and module, then finalize it */
|
||||
|
||||
if (NULL != orte_soh.finalize) {
|
||||
orte_soh.finalize();
|
||||
if (NULL != orte_smr.finalize) {
|
||||
orte_smr.finalize();
|
||||
}
|
||||
|
||||
/* after the module, close the component?? */
|
||||
/* orte_soh_base_component_finalize (); */
|
||||
/* orte_smr_base_component_finalize (); */
|
||||
|
||||
/* Close all remaining available components (may be one if this is a
|
||||
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
|
||||
|
||||
mca_base_components_close(orte_soh_base.soh_output,
|
||||
&orte_soh_base.soh_components, NULL);
|
||||
mca_base_components_close(orte_smr_base.smr_output,
|
||||
&orte_smr_base.smr_components, NULL);
|
||||
|
||||
/* All done */
|
||||
|
@ -31,9 +31,9 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
int orte_soh_base_get_job_soh(orte_job_state_t *state,
|
||||
int orte_smr_base_get_job_state(orte_job_state_t *state,
|
||||
orte_jobid_t jobid)
|
||||
{
|
||||
orte_gpr_value_t **values;
|
@ -31,9 +31,9 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
int orte_soh_base_get_proc_soh(orte_proc_state_t *state,
|
||||
int orte_smr_base_get_proc_state(orte_proc_state_t *state,
|
||||
int *exit_status,
|
||||
orte_process_name_t *proc)
|
||||
{
|
@ -22,35 +22,34 @@
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
|
||||
int orte_soh_base_get_node_soh_not_available(orte_node_state_t *state,
|
||||
int orte_smr_base_get_node_state_not_available(orte_node_state_t *state,
|
||||
orte_cellid_t cell,
|
||||
char *nodename)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int orte_soh_base_set_node_soh_not_available(orte_cellid_t cell,
|
||||
int orte_smr_base_set_node_state_not_available(orte_cellid_t cell,
|
||||
char *nodename,
|
||||
orte_node_state_t state)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int orte_soh_base_begin_monitoring_not_available(orte_jobid_t job)
|
||||
int orte_smr_base_begin_monitoring_not_available(orte_jobid_t job)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int orte_soh_base_module_finalize_not_available (void)
|
||||
int orte_smr_base_module_finalize_not_available (void)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
@ -30,7 +30,8 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
#include "stdio.h" /* just for gef debug */
|
||||
|
||||
@ -41,7 +42,7 @@
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/soh/base/static-components.h"
|
||||
#include "orte/mca/smr/base/static-components.h"
|
||||
|
||||
/*
|
||||
* globals
|
||||
@ -50,54 +51,54 @@
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
orte_soh_base_t orte_soh_base;
|
||||
orte_smr_base_t orte_smr_base;
|
||||
|
||||
orte_soh_base_module_t orte_soh = {
|
||||
orte_smr_base_module_t orte_smr = {
|
||||
|
||||
orte_soh_base_get_proc_soh,
|
||||
orte_soh_base_set_proc_soh,
|
||||
orte_soh_base_get_node_soh_not_available,
|
||||
orte_soh_base_set_node_soh_not_available,
|
||||
orte_soh_base_get_job_soh,
|
||||
orte_soh_base_set_job_soh,
|
||||
orte_soh_base_begin_monitoring_not_available,
|
||||
orte_soh_base_module_finalize_not_available
|
||||
orte_smr_base_get_proc_state,
|
||||
orte_smr_base_set_proc_state,
|
||||
orte_smr_base_get_node_state_not_available,
|
||||
orte_smr_base_set_node_state_not_available,
|
||||
orte_smr_base_get_job_state,
|
||||
orte_smr_base_set_job_state,
|
||||
orte_smr_base_begin_monitoring_not_available,
|
||||
orte_smr_base_module_finalize_not_available
|
||||
};
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
int orte_soh_base_open(void)
|
||||
int orte_smr_base_open(void)
|
||||
{
|
||||
|
||||
int param, value, rc;
|
||||
orte_data_type_t tmp;
|
||||
|
||||
/* fprintf(stderr,"orte_soh_base_open:enter\n"); */
|
||||
/* fprintf(stderr,"orte_smr_base_open:enter\n"); */
|
||||
|
||||
/* setup output for debug messages */
|
||||
|
||||
orte_soh_base.soh_output = opal_output_open(NULL);
|
||||
param = mca_base_param_reg_int_name("soh_base", "verbose",
|
||||
"Verbosity level for the soh framework",
|
||||
orte_smr_base.smr_output = opal_output_open(NULL);
|
||||
param = mca_base_param_reg_int_name("smr_base", "verbose",
|
||||
"Verbosity level for the smr framework",
|
||||
false, false, 0, &value);
|
||||
if (value != 0) {
|
||||
orte_soh_base.soh_output = opal_output_open(NULL);
|
||||
orte_smr_base.smr_output = opal_output_open(NULL);
|
||||
} else {
|
||||
orte_soh_base.soh_output = -1;
|
||||
orte_smr_base.smr_output = -1;
|
||||
}
|
||||
|
||||
|
||||
/* register the base system types with the DSS */
|
||||
tmp = ORTE_NODE_STATE;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_node_state,
|
||||
orte_soh_base_unpack_node_state,
|
||||
(orte_dss_copy_fn_t)orte_soh_base_copy_node_state,
|
||||
(orte_dss_compare_fn_t)orte_soh_base_compare_node_state,
|
||||
(orte_dss_size_fn_t)orte_soh_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_soh_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_soh_base_std_release,
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_node_state,
|
||||
orte_smr_base_unpack_node_state,
|
||||
(orte_dss_copy_fn_t)orte_smr_base_copy_node_state,
|
||||
(orte_dss_compare_fn_t)orte_smr_base_compare_node_state,
|
||||
(orte_dss_size_fn_t)orte_smr_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_smr_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_smr_base_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_NODE_STATE", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -105,13 +106,13 @@ int orte_soh_base_open(void)
|
||||
}
|
||||
|
||||
tmp = ORTE_PROC_STATE;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_proc_state,
|
||||
orte_soh_base_unpack_proc_state,
|
||||
(orte_dss_copy_fn_t)orte_soh_base_copy_proc_state,
|
||||
(orte_dss_compare_fn_t)orte_soh_base_compare_proc_state,
|
||||
(orte_dss_size_fn_t)orte_soh_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_soh_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_soh_base_std_release,
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_proc_state,
|
||||
orte_smr_base_unpack_proc_state,
|
||||
(orte_dss_copy_fn_t)orte_smr_base_copy_proc_state,
|
||||
(orte_dss_compare_fn_t)orte_smr_base_compare_proc_state,
|
||||
(orte_dss_size_fn_t)orte_smr_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_smr_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_smr_base_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_PROC_STATE", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -119,13 +120,13 @@ int orte_soh_base_open(void)
|
||||
}
|
||||
|
||||
tmp = ORTE_JOB_STATE;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_job_state,
|
||||
orte_soh_base_unpack_job_state,
|
||||
(orte_dss_copy_fn_t)orte_soh_base_copy_job_state,
|
||||
(orte_dss_compare_fn_t)orte_soh_base_compare_job_state,
|
||||
(orte_dss_size_fn_t)orte_soh_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_soh_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_soh_base_std_release,
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_job_state,
|
||||
orte_smr_base_unpack_job_state,
|
||||
(orte_dss_copy_fn_t)orte_smr_base_copy_job_state,
|
||||
(orte_dss_compare_fn_t)orte_smr_base_compare_job_state,
|
||||
(orte_dss_size_fn_t)orte_smr_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_smr_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_smr_base_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_JOB_STATE", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -133,13 +134,13 @@ int orte_soh_base_open(void)
|
||||
}
|
||||
|
||||
tmp = ORTE_EXIT_CODE;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_exit_code,
|
||||
orte_soh_base_unpack_exit_code,
|
||||
(orte_dss_copy_fn_t)orte_soh_base_copy_exit_code,
|
||||
(orte_dss_compare_fn_t)orte_soh_base_compare_exit_code,
|
||||
(orte_dss_size_fn_t)orte_soh_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_soh_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_soh_base_std_release,
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_exit_code,
|
||||
orte_smr_base_unpack_exit_code,
|
||||
(orte_dss_copy_fn_t)orte_smr_base_copy_exit_code,
|
||||
(orte_dss_compare_fn_t)orte_smr_base_compare_exit_code,
|
||||
(orte_dss_size_fn_t)orte_smr_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_smr_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_smr_base_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_EXIT_CODE", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -149,9 +150,9 @@ int orte_soh_base_open(void)
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("soh", orte_soh_base.soh_output,
|
||||
mca_soh_base_static_components,
|
||||
&orte_soh_base.soh_components, true)) {
|
||||
mca_base_components_open("smr", orte_smr_base.smr_output,
|
||||
mca_smr_base_static_components,
|
||||
&orte_smr_base.smr_components, true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
@ -23,34 +23,35 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
|
||||
/**
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
*/
|
||||
int orte_soh_base_select(void)
|
||||
int orte_smr_base_select(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
opal_list_item_t *best_item = NULL;
|
||||
mca_base_component_list_item_t *cli;
|
||||
orte_soh_base_component_t *component, *best_component = NULL;
|
||||
orte_soh_base_module_t *module, *best_module = NULL;
|
||||
orte_smr_base_component_t *component, *best_component = NULL;
|
||||
orte_smr_base_module_t *module, *best_module = NULL;
|
||||
int priority, best_priority = -1;
|
||||
|
||||
/* Iterate through all the available components */
|
||||
|
||||
for (item = opal_list_get_first(&orte_soh_base.soh_components);
|
||||
item != opal_list_get_end(&orte_soh_base.soh_components);
|
||||
for (item = opal_list_get_first(&orte_smr_base.smr_components);
|
||||
item != opal_list_get_end(&orte_smr_base.smr_components);
|
||||
item = opal_list_get_next(item)) {
|
||||
cli = (mca_base_component_list_item_t *) item;
|
||||
component = (orte_soh_base_component_t *) cli->cli_component;
|
||||
component = (orte_smr_base_component_t *) cli->cli_component;
|
||||
|
||||
/* Call the component's init function and see if it wants to be
|
||||
selected */
|
||||
|
||||
module = component->soh_init(&priority);
|
||||
module = component->smr_init(&priority);
|
||||
|
||||
/* If we got a non-NULL module back, then the component wants to
|
||||
be selected. So save its multi/hidden values and save the
|
||||
@ -82,7 +83,7 @@ int orte_soh_base_select(void)
|
||||
/* If it's not the best one, finalize it */
|
||||
|
||||
/* else { */
|
||||
/* component->soh_finalize(); */
|
||||
/* component->smr_finalize(); */
|
||||
/* } */
|
||||
|
||||
} /* for each possible component */
|
||||
@ -90,7 +91,7 @@ int orte_soh_base_select(void)
|
||||
|
||||
/* If we found a module to select, make it the active module */
|
||||
if (NULL != best_module) {
|
||||
orte_soh = *best_module;
|
||||
orte_smr = *best_module;
|
||||
}
|
||||
|
||||
/* all done */
|
@ -31,9 +31,9 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
int orte_soh_base_set_job_soh(orte_jobid_t jobid,
|
||||
int orte_smr_base_set_job_state(orte_jobid_t jobid,
|
||||
orte_job_state_t state)
|
||||
{
|
||||
orte_gpr_value_t *value;
|
@ -31,9 +31,9 @@
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
|
||||
int orte_soh_base_set_proc_soh(orte_process_name_t *proc,
|
||||
int orte_smr_base_set_proc_state(orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
int exit_status)
|
||||
{
|
@ -18,8 +18,8 @@
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SOH_BASE_H
|
||||
#define MCA_SOH_BASE_H
|
||||
#ifndef MCA_SMR_PRIVATE_H
|
||||
#define MCA_SMR_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
@ -29,142 +29,132 @@
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "opal/mca/mca.h"
|
||||
/* #include "orte/mca/ns/ns_types.h" */
|
||||
#include "orte/mca/soh/soh.h"
|
||||
|
||||
#include "orte/dss/dss_types.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
* private functions for use inside SMR components
|
||||
*/
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
OMPI_DECLSPEC int orte_soh_base_open(void);
|
||||
OMPI_DECLSPEC int orte_soh_base_select(void);
|
||||
OMPI_DECLSPEC int orte_soh_base_close(void);
|
||||
|
||||
int orte_soh_base_get_proc_soh(orte_proc_state_t *state,
|
||||
int orte_smr_base_get_proc_state(orte_proc_state_t *state,
|
||||
int *status,
|
||||
orte_process_name_t *proc);
|
||||
|
||||
int orte_soh_base_set_proc_soh(orte_process_name_t *proc,
|
||||
int orte_smr_base_set_proc_state(orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
int status);
|
||||
|
||||
int orte_soh_base_get_node_soh_not_available(orte_node_state_t *state,
|
||||
int orte_smr_base_get_node_state_not_available(orte_node_state_t *state,
|
||||
orte_cellid_t cell,
|
||||
char *nodename);
|
||||
|
||||
int orte_soh_base_set_node_soh_not_available(orte_cellid_t cell,
|
||||
int orte_smr_base_set_node_state_not_available(orte_cellid_t cell,
|
||||
char *nodename,
|
||||
orte_node_state_t state);
|
||||
|
||||
int orte_soh_base_get_job_soh(orte_job_state_t *state,
|
||||
int orte_smr_base_get_job_state(orte_job_state_t *state,
|
||||
orte_jobid_t jobid);
|
||||
|
||||
int orte_soh_base_set_job_soh(orte_jobid_t jobid,
|
||||
int orte_smr_base_set_job_state(orte_jobid_t jobid,
|
||||
orte_job_state_t state);
|
||||
|
||||
int orte_soh_base_begin_monitoring_not_available(orte_jobid_t job);
|
||||
int orte_smr_base_begin_monitoring_not_available(orte_jobid_t job);
|
||||
|
||||
|
||||
int orte_soh_base_module_finalize_not_available (void);
|
||||
int orte_smr_base_module_finalize_not_available (void);
|
||||
|
||||
/*
|
||||
* DATA TYPE PACKING FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_exit_code(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_node_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_pack_job_state(orte_buffer_t *buffer, void *src,
|
||||
int orte_smr_base_pack_job_state(orte_buffer_t *buffer, void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE UNPACKING FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
|
||||
int orte_smr_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE COMPARE FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
|
||||
int orte_smr_base_compare_exit_code(orte_exit_code_t *value1,
|
||||
orte_exit_code_t *value2,
|
||||
orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_compare_node_state(orte_node_state_t *value1,
|
||||
int orte_smr_base_compare_node_state(orte_node_state_t *value1,
|
||||
orte_node_state_t *value2,
|
||||
orte_node_state_t type);
|
||||
|
||||
int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
|
||||
int orte_smr_base_compare_proc_state(orte_proc_state_t *value1,
|
||||
orte_proc_state_t *value2,
|
||||
orte_proc_state_t type);
|
||||
|
||||
int orte_soh_base_compare_job_state(orte_job_state_t *value1,
|
||||
int orte_smr_base_compare_job_state(orte_job_state_t *value1,
|
||||
orte_job_state_t *value2,
|
||||
orte_job_state_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE COPY FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type);
|
||||
int orte_smr_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type);
|
||||
int orte_smr_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type);
|
||||
int orte_smr_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_soh_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type);
|
||||
int orte_smr_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE PRINT FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
|
||||
int orte_smr_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE SIZE FUNCTIONS
|
||||
*/
|
||||
int orte_soh_base_std_size(size_t *size, void *src, orte_data_type_t type);
|
||||
int orte_smr_base_std_size(size_t *size, void *src, orte_data_type_t type);
|
||||
|
||||
/*
|
||||
* DATA TYPE RELEASE FUNCTIONS
|
||||
*/
|
||||
void orte_soh_base_std_release(orte_data_value_t *value);
|
||||
void orte_smr_base_std_release(orte_data_value_t *value);
|
||||
|
||||
/*
|
||||
* globals that might be needed
|
||||
* globals that might be needed within the framework
|
||||
*/
|
||||
|
||||
OMPI_DECLSPEC extern int orte_soh_base_output;
|
||||
OMPI_DECLSPEC extern bool orte_soh_base_selected;
|
||||
|
||||
typedef struct orte_soh_base_t {
|
||||
int soh_output;
|
||||
opal_list_t soh_components;
|
||||
} orte_soh_base_t;
|
||||
|
||||
OMPI_DECLSPEC extern orte_soh_base_t orte_soh_base;
|
||||
OMPI_DECLSPEC extern int orte_smr_base_output;
|
||||
OMPI_DECLSPEC extern bool orte_smr_base_selected;
|
||||
|
||||
|
||||
/*
|
||||
* external API functions will be documented in the mca/soh/soh.h file
|
||||
* external API functions will be documented in the mca/smr/smr.h file
|
||||
*/
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
@ -19,40 +19,40 @@
|
||||
|
||||
|
||||
headers = \
|
||||
soh_bproc.h
|
||||
smr_bproc.h
|
||||
|
||||
if OMPI_BUILD_soh_bproc_DSO
|
||||
if OMPI_BUILD_smr_bproc_DSO
|
||||
component_noinst =
|
||||
component_install = mca_soh_bproc.la
|
||||
component_install = mca_smr_bproc.la
|
||||
else
|
||||
component_noinst = libmca_soh_bproc.la
|
||||
component_noinst = libmca_smr_bproc.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
# Conditionally install the header files
|
||||
|
||||
if WANT_INSTALL_HEADERS
|
||||
ortedir = $(includedir)/openmpi/orte/mca/soh/bproc
|
||||
ortedir = $(includedir)/openmpi/orte/mca/smr/bproc
|
||||
orte_HEADERS = $(headers)
|
||||
else
|
||||
ortedir = $(includedir)
|
||||
endif
|
||||
|
||||
soh_SOURCES = \
|
||||
soh_bproc.c \
|
||||
soh_bproc.h \
|
||||
soh_bproc_component.c
|
||||
smr_SOURCES = \
|
||||
smr_bproc.c \
|
||||
smr_bproc.h \
|
||||
smr_bproc_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_soh_bproc_la_SOURCES = $(soh_SOURCES)
|
||||
mca_soh_bproc_la_LIBADD = \
|
||||
$(soh_bproc_LIBS) \
|
||||
mca_smr_bproc_la_SOURCES = $(smr_SOURCES)
|
||||
mca_smr_bproc_la_LIBADD = \
|
||||
$(smr_bproc_LIBS) \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
mca_soh_bproc_la_LDFLAGS = -module -avoid-version $(soh_bproc_LDFLAGS)
|
||||
mca_smr_bproc_la_LDFLAGS = -module -avoid-version $(smr_bproc_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_soh_bproc_la_SOURCES = $(soh_SOURCES)
|
||||
libmca_soh_bproc_la_LIBADD = $(soh_bproc_LIBS)
|
||||
libmca_soh_bproc_la_LDFLAGS = -module -avoid-version $(soh_bproc_LDFLAGS)
|
||||
libmca_smr_bproc_la_SOURCES = $(smr_SOURCES)
|
||||
libmca_smr_bproc_la_LIBADD = $(smr_bproc_LIBS)
|
||||
libmca_smr_bproc_la_LDFLAGS = -module -avoid-version $(smr_bproc_LDFLAGS)
|
@ -17,32 +17,32 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_soh_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# MCA_smr_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_soh_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([soh_bproc], [soh_bproc_good=1],
|
||||
[soh_bproc_good=0], [soh_bproc_good=0])
|
||||
AC_DEFUN([MCA_smr_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([smr_bproc], [smr_bproc_good=1],
|
||||
[smr_bproc_good=0], [smr_bproc_good=0])
|
||||
|
||||
#BPROC_API_VERSION was added in bproc 4.0.0, and this component
|
||||
#will only compile with >= bproc 4.0.0
|
||||
AS_IF([test "$soh_bproc_good" = "1"],
|
||||
AS_IF([test "$smr_bproc_good" = "1"],
|
||||
[AC_MSG_CHECKING(for BPROC_API_VERSION)
|
||||
AC_TRY_COMPILE([#include <sys/bproc.h>],
|
||||
[int foo = BPROC_API_VERSION;],
|
||||
have_bproc_api_ver_msg=yes soh_bproc_good=1,
|
||||
have_bproc_api_ver_msg=no soh_bproc_good=0)
|
||||
have_bproc_api_ver_msg=yes smr_bproc_good=1,
|
||||
have_bproc_api_ver_msg=no smr_bproc_good=0)
|
||||
AC_MSG_RESULT([$have_bproc_api_ver_msg])])
|
||||
|
||||
# If the check worked, set the wrapper flags.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$soh_bproc_good" = "1"],
|
||||
[soh_bproc_WRAPPER_EXTRA_LDFLAGS="$soh_bproc_LDFLAGS"
|
||||
soh_bproc_WRAPPER_EXTRA_LIBS="$soh_bproc_LIBS"
|
||||
AS_IF([test "$smr_bproc_good" = "1"],
|
||||
[smr_bproc_WRAPPER_EXTRA_LDFLAGS="$smr_bproc_LDFLAGS"
|
||||
smr_bproc_WRAPPER_EXTRA_LIBS="$smr_bproc_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([soh_bproc_CPPFLAGS])
|
||||
AC_SUBST([soh_bproc_LDFLAGS])
|
||||
AC_SUBST([soh_bproc_LIBS])
|
||||
AC_SUBST([smr_bproc_CPPFLAGS])
|
||||
AC_SUBST([smr_bproc_LDFLAGS])
|
||||
AC_SUBST([smr_bproc_LIBS])
|
||||
])dnl
|
@ -19,5 +19,5 @@
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=soh_bproc.c
|
||||
PARAM_INIT_FILE=smr_bproc.c
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -32,8 +32,8 @@
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/gpr/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/soh/bproc/soh_bproc.h"
|
||||
#include "orte/mca/smr/base/smr_private.h"
|
||||
#include "orte/mca/smr/bproc/smr_bproc.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#define BIT_MASK(bit) (bit_set)(1 << (bit))
|
||||
@ -80,15 +80,15 @@ static inline int empty_set(bit_set set)
|
||||
return set == EMPTY_SET;
|
||||
}
|
||||
|
||||
static int orte_soh_bproc_get_proc_soh(orte_proc_state_t *, int *, orte_process_name_t *);
|
||||
static int orte_soh_bproc_set_proc_soh(orte_process_name_t *, orte_proc_state_t, int);
|
||||
static int orte_soh_bproc_finalize(void);
|
||||
static int orte_smr_bproc_get_proc_state(orte_proc_state_t *, int *, orte_process_name_t *);
|
||||
static int orte_smr_bproc_set_proc_state(orte_process_name_t *, orte_proc_state_t, int);
|
||||
static int orte_smr_bproc_finalize(void);
|
||||
|
||||
/**
|
||||
* Query the bproc node status
|
||||
*/
|
||||
|
||||
static int orte_soh_bproc_node_state(char *status)
|
||||
static int orte_smr_bproc_node_state(char *status)
|
||||
{
|
||||
if (strcmp(status, "up") == 0)
|
||||
return ORTE_NODE_STATE_UP;
|
||||
@ -103,8 +103,8 @@ static bit_set find_changes(struct bproc_node_info_t *old, struct bproc_node_inf
|
||||
{
|
||||
bit_set changes = EMPTY_SET;
|
||||
|
||||
if (orte_soh_bproc_node_state(old->status)
|
||||
!= orte_soh_bproc_node_state(new->status))
|
||||
if (orte_smr_bproc_node_state(old->status)
|
||||
!= orte_smr_bproc_node_state(new->status))
|
||||
set_bit(&changes, BIT_NODE_STATE);
|
||||
|
||||
if (strcmp(old->status, new->status) != 0)
|
||||
@ -160,7 +160,7 @@ static void update_registry(bit_set changes, struct bproc_node_info_t *ni)
|
||||
idx = 0;
|
||||
|
||||
if (is_set(changes, BIT_NODE_STATE)) {
|
||||
state = orte_soh_bproc_node_state(ni->status);
|
||||
state = orte_smr_bproc_node_state(ni->status);
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[idx]), ORTE_NODE_STATE_KEY, ORTE_NODE_STATE, &state))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(value);
|
||||
@ -230,30 +230,30 @@ static void update_registry(bit_set changes, struct bproc_node_info_t *ni)
|
||||
}
|
||||
|
||||
if (idx != cnt) {
|
||||
opal_output(0, "soh_bproc: internal error %d != %d\n", idx, cnt);
|
||||
opal_output(0, "smr_bproc: internal error %d != %d\n", idx, cnt);
|
||||
free(node_name);
|
||||
OBJ_RELEASE(value);
|
||||
opal_event_del(&mca_soh_bproc_component.notify_event);
|
||||
opal_event_del(&mca_smr_bproc_component.notify_event);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = orte_schema.get_node_tokens(&(value->tokens), &(value->num_tokens),
|
||||
mca_soh_bproc_component.cellid, node_name);
|
||||
mca_smr_bproc_component.cellid, node_name);
|
||||
|
||||
if (ret != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(value);
|
||||
free(node_name);
|
||||
opal_event_del(&mca_soh_bproc_component.notify_event);
|
||||
opal_event_del(&mca_smr_bproc_component.notify_event);
|
||||
return;
|
||||
}
|
||||
|
||||
if (mca_soh_bproc_component.debug)
|
||||
if (mca_smr_bproc_component.debug)
|
||||
opal_output(0, "updating node %d\n", ni->node);
|
||||
|
||||
if ((ret = orte_gpr.put(1, &value)) != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_event_del(&mca_soh_bproc_component.notify_event);
|
||||
opal_event_del(&mca_smr_bproc_component.notify_event);
|
||||
}
|
||||
|
||||
free(node_name);
|
||||
@ -271,9 +271,9 @@ static int do_update(struct bproc_node_set_t *ns)
|
||||
for (i = 0; i < ns->size; i++) {
|
||||
ni = &ns->node[i];
|
||||
|
||||
if (mca_soh_bproc_component.node_set.size > 0
|
||||
&& mca_soh_bproc_component.node_set.size == ns->size)
|
||||
changes = find_changes(&mca_soh_bproc_component.node_set.node[i], ni);
|
||||
if (mca_smr_bproc_component.node_set.size > 0
|
||||
&& mca_smr_bproc_component.node_set.size == ns->size)
|
||||
changes = find_changes(&mca_smr_bproc_component.node_set.node[i], ni);
|
||||
else
|
||||
changes = BIT_SET_ALL;
|
||||
|
||||
@ -284,21 +284,21 @@ static int do_update(struct bproc_node_set_t *ns)
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
if (mca_soh_bproc_component.node_set.size != 0)
|
||||
bproc_nodeset_free(&mca_soh_bproc_component.node_set);
|
||||
mca_soh_bproc_component.node_set = *ns;
|
||||
if (mca_smr_bproc_component.node_set.size != 0)
|
||||
bproc_nodeset_free(&mca_smr_bproc_component.node_set);
|
||||
mca_smr_bproc_component.node_set = *ns;
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
static void orte_soh_bproc_notify_handler(int fd, short flags, void *user)
|
||||
static void orte_smr_bproc_notify_handler(int fd, short flags, void *user)
|
||||
{
|
||||
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
|
||||
|
||||
if (bproc_nodelist_(&ns, fd) < 0) {
|
||||
/* bproc_nodelist_ error */
|
||||
opal_event_del(&mca_soh_bproc_component.notify_event);
|
||||
opal_event_del(&mca_smr_bproc_component.notify_event);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -309,20 +309,20 @@ static void orte_soh_bproc_notify_handler(int fd, short flags, void *user)
|
||||
/**
|
||||
* Register a callback to receive BProc update notifications
|
||||
*/
|
||||
int orte_soh_bproc_module_init(void)
|
||||
int orte_smr_bproc_module_init(void)
|
||||
{
|
||||
int rc;
|
||||
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
|
||||
|
||||
if (mca_soh_bproc_component.debug)
|
||||
opal_output(0, "init soh_bproc_module\n");
|
||||
if (mca_smr_bproc_component.debug)
|
||||
opal_output(0, "init smr_bproc_module\n");
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_bproc_component.cellid, orte_process_info.my_name))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_smr_bproc_component.cellid, orte_process_info.my_name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
mca_soh_bproc_component.node_set.size = 0;
|
||||
mca_smr_bproc_component.node_set.size = 0;
|
||||
|
||||
/*
|
||||
* Set initial node status
|
||||
@ -338,51 +338,51 @@ int orte_soh_bproc_module_init(void)
|
||||
* Now register notify event
|
||||
*/
|
||||
|
||||
mca_soh_bproc_component.notify_fd = bproc_notifier();
|
||||
if (mca_soh_bproc_component.notify_fd < 0)
|
||||
mca_smr_bproc_component.notify_fd = bproc_notifier();
|
||||
if (mca_smr_bproc_component.notify_fd < 0)
|
||||
return ORTE_ERROR;
|
||||
|
||||
memset(&mca_soh_bproc_component.notify_event, 0, sizeof(opal_event_t));
|
||||
memset(&mca_smr_bproc_component.notify_event, 0, sizeof(opal_event_t));
|
||||
|
||||
opal_event_set(
|
||||
&mca_soh_bproc_component.notify_event,
|
||||
mca_soh_bproc_component.notify_fd,
|
||||
&mca_smr_bproc_component.notify_event,
|
||||
mca_smr_bproc_component.notify_fd,
|
||||
OPAL_EV_READ|OPAL_EV_PERSIST,
|
||||
orte_soh_bproc_notify_handler,
|
||||
orte_smr_bproc_notify_handler,
|
||||
0);
|
||||
|
||||
opal_event_add(&mca_soh_bproc_component.notify_event, 0);
|
||||
opal_event_add(&mca_smr_bproc_component.notify_event, 0);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_soh_base_module_t orte_soh_bproc_module = {
|
||||
orte_soh_bproc_get_proc_soh,
|
||||
orte_soh_bproc_set_proc_soh,
|
||||
orte_soh_base_get_node_soh_not_available,
|
||||
orte_soh_base_set_node_soh_not_available,
|
||||
orte_soh_base_get_job_soh,
|
||||
orte_soh_base_set_job_soh,
|
||||
orte_soh_base_begin_monitoring_not_available,
|
||||
orte_soh_bproc_finalize
|
||||
orte_smr_base_module_t orte_smr_bproc_module = {
|
||||
orte_smr_bproc_get_proc_state,
|
||||
orte_smr_bproc_set_proc_state,
|
||||
orte_smr_base_get_node_state_not_available,
|
||||
orte_smr_base_set_node_state_not_available,
|
||||
orte_smr_base_get_job_state,
|
||||
orte_smr_base_set_job_state,
|
||||
orte_smr_base_begin_monitoring_not_available,
|
||||
orte_smr_bproc_finalize
|
||||
};
|
||||
|
||||
static int orte_soh_bproc_get_proc_soh(orte_proc_state_t *state, int *status, orte_process_name_t *proc)
|
||||
static int orte_smr_bproc_get_proc_state(orte_proc_state_t *state, int *status, orte_process_name_t *proc)
|
||||
{
|
||||
return orte_soh_base_get_proc_soh(state, status, proc);
|
||||
return orte_smr_base_get_proc_state(state, status, proc);
|
||||
}
|
||||
|
||||
static int orte_soh_bproc_set_proc_soh(orte_process_name_t *proc, orte_proc_state_t state, int status)
|
||||
static int orte_smr_bproc_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state, int status)
|
||||
{
|
||||
return orte_soh_base_set_proc_soh(proc, state, status);
|
||||
return orte_smr_base_set_proc_state(proc, state, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup
|
||||
*/
|
||||
|
||||
int orte_soh_bproc_finalize(void)
|
||||
int orte_smr_bproc_finalize(void)
|
||||
{
|
||||
opal_event_del(&mca_soh_bproc_component.notify_event);
|
||||
opal_event_del(&mca_smr_bproc_component.notify_event);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -18,12 +18,12 @@
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef ORTE_SOH_BPROC_H
|
||||
#define ORTE_SOH_BPROC_H
|
||||
#ifndef ORTE_SMR_BPROC_H
|
||||
#define ORTE_SMR_BPROC_H
|
||||
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
@ -33,20 +33,20 @@ extern "C" {
|
||||
/**
|
||||
* Bproc node registry keys
|
||||
*/
|
||||
#define ORTE_SOH_BPROC_NODE_STATUS "orte-node-bproc-status"
|
||||
#define ORTE_SOH_BPROC_NODE_MODE "orte-node-bproc-mode"
|
||||
#define ORTE_SOH_BPROC_NODE_USER "orte-node-bproc-user"
|
||||
#define ORTE_SOH_BPROC_NODE_GROUP "orte-node-bproc-group"
|
||||
#define ORTE_SMR_BPROC_NODE_STATUS "orte-node-bproc-status"
|
||||
#define ORTE_SMR_BPROC_NODE_MODE "orte-node-bproc-mode"
|
||||
#define ORTE_SMR_BPROC_NODE_USER "orte-node-bproc-user"
|
||||
#define ORTE_SMR_BPROC_NODE_GROUP "orte-node-bproc-group"
|
||||
|
||||
|
||||
/**
|
||||
* Module init/fini
|
||||
*/
|
||||
int orte_soh_bproc_module_init(void);
|
||||
int orte_soh_bproc_module_finalize(void);
|
||||
int orte_smr_bproc_module_init(void);
|
||||
int orte_smr_bproc_module_finalize(void);
|
||||
|
||||
struct orte_soh_bproc_component_t {
|
||||
orte_soh_base_component_t super;
|
||||
struct orte_smr_bproc_component_t {
|
||||
orte_smr_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
opal_event_t notify_event;
|
||||
@ -54,10 +54,10 @@ struct orte_soh_bproc_component_t {
|
||||
orte_cellid_t cellid;
|
||||
struct bproc_node_set_t node_set;
|
||||
};
|
||||
typedef struct orte_soh_bproc_component_t orte_soh_bproc_component_t;
|
||||
typedef struct orte_smr_bproc_component_t orte_smr_bproc_component_t;
|
||||
|
||||
OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_bproc_module;
|
||||
OMPI_COMP_EXPORT extern orte_soh_bproc_component_t mca_soh_bproc_component;
|
||||
OMPI_COMP_EXPORT extern orte_smr_base_module_t orte_smr_bproc_module;
|
||||
OMPI_COMP_EXPORT extern orte_smr_bproc_component_t mca_smr_bproc_component;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
@ -22,32 +22,32 @@
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/mca/soh/bproc/soh_bproc.h"
|
||||
#include "orte/mca/smr/bproc/smr_bproc.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_soh_bproc_open(void);
|
||||
static int orte_soh_bproc_close(void);
|
||||
static orte_soh_base_module_t* orte_soh_bproc_init(int*);
|
||||
static int orte_smr_bproc_open(void);
|
||||
static int orte_smr_bproc_close(void);
|
||||
static orte_smr_base_module_t* orte_smr_bproc_init(int*);
|
||||
|
||||
orte_soh_bproc_component_t mca_soh_bproc_component = {
|
||||
orte_smr_bproc_component_t mca_smr_bproc_component = {
|
||||
{
|
||||
/* First, the mca_base_module_t struct containing meta
|
||||
information about the module itself */
|
||||
{
|
||||
/* Indicate that we are a bproc soh v1.0.0 module (which also
|
||||
/* Indicate that we are a bproc smr v1.3.0 module (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_SOH_BASE_VERSION_1_0_0,
|
||||
ORTE_SMR_BASE_VERSION_1_3_0,
|
||||
|
||||
"bproc", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_soh_bproc_open, /* component open */
|
||||
orte_soh_bproc_close /* component close */
|
||||
orte_smr_bproc_open, /* component open */
|
||||
orte_smr_bproc_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 module meta data */
|
||||
@ -58,18 +58,18 @@ orte_soh_bproc_component_t mca_soh_bproc_component = {
|
||||
false
|
||||
},
|
||||
|
||||
orte_soh_bproc_init
|
||||
orte_smr_bproc_init
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Utility function to register parameters
|
||||
*/
|
||||
static int orte_soh_bproc_param_register_int(
|
||||
static int orte_smr_bproc_param_register_int(
|
||||
const char* param_name,
|
||||
int default_value)
|
||||
{
|
||||
int id = mca_base_param_register_int("soh","bproc",param_name,NULL,default_value);
|
||||
int id = mca_base_param_register_int("smr","bproc",param_name,NULL,default_value);
|
||||
int param_value = default_value;
|
||||
mca_base_param_lookup_int(id,&param_value);
|
||||
return param_value;
|
||||
@ -79,12 +79,12 @@ static int orte_soh_bproc_param_register_int(
|
||||
*
|
||||
*/
|
||||
|
||||
static int orte_soh_bproc_open(void)
|
||||
static int orte_smr_bproc_open(void)
|
||||
{
|
||||
mca_soh_bproc_component.debug =
|
||||
orte_soh_bproc_param_register_int("debug", 0);
|
||||
mca_soh_bproc_component.priority =
|
||||
orte_soh_bproc_param_register_int("priority", 1);
|
||||
mca_smr_bproc_component.debug =
|
||||
orte_smr_bproc_param_register_int("debug", 0);
|
||||
mca_smr_bproc_component.priority =
|
||||
orte_smr_bproc_param_register_int("priority", 1);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -92,14 +92,14 @@ static int orte_soh_bproc_open(void)
|
||||
*
|
||||
*/
|
||||
|
||||
static orte_soh_base_module_t* orte_soh_bproc_init(int *priority)
|
||||
static orte_smr_base_module_t* orte_smr_bproc_init(int *priority)
|
||||
{
|
||||
if (!orte_process_info.seed)
|
||||
return NULL;
|
||||
|
||||
*priority = mca_soh_bproc_component.priority;
|
||||
orte_soh_bproc_module_init();
|
||||
return &orte_soh_bproc_module;
|
||||
*priority = mca_smr_bproc_component.priority;
|
||||
orte_smr_bproc_module_init();
|
||||
return &orte_smr_bproc_module;
|
||||
}
|
||||
|
||||
|
||||
@ -107,7 +107,7 @@ static orte_soh_base_module_t* orte_soh_bproc_init(int *priority)
|
||||
*
|
||||
*/
|
||||
|
||||
static int orte_soh_bproc_close(void)
|
||||
static int orte_smr_bproc_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -21,8 +21,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ORTE_SOH_H
|
||||
#define ORTE_SOH_H
|
||||
#ifndef ORTE_SMR_H
|
||||
#define ORTE_SMR_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
@ -34,7 +34,7 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/soh/soh_types.h"
|
||||
#include "orte/mca/smr/smr_types.h"
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
@ -42,114 +42,114 @@
|
||||
|
||||
|
||||
/*
|
||||
* Query the state-of-health of a process
|
||||
* Query a process state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_get_proc_soh_fn_t)(orte_proc_state_t *state,
|
||||
typedef int (*orte_smr_base_module_get_proc_state_fn_t)(orte_proc_state_t *state,
|
||||
int *status,
|
||||
orte_process_name_t *proc);
|
||||
|
||||
/*
|
||||
* Set the state-of-health of a process
|
||||
* Set a process state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_set_proc_soh_fn_t)(orte_process_name_t *proc,
|
||||
typedef int (*orte_smr_base_module_set_proc_state_fn_t)(orte_process_name_t *proc,
|
||||
orte_proc_state_t state, int status);
|
||||
|
||||
/*
|
||||
* Query SOH of a node
|
||||
* Query a node state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_get_node_soh_fn_t)(orte_node_state_t *state,
|
||||
typedef int (*orte_smr_base_module_get_node_state_fn_t)(orte_node_state_t *state,
|
||||
orte_cellid_t cell,
|
||||
char *nodename);
|
||||
/*
|
||||
* Set SOH of a node
|
||||
* Set a node state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_set_node_soh_fn_t)(orte_cellid_t cell,
|
||||
typedef int (*orte_smr_base_module_set_node_state_fn_t)(orte_cellid_t cell,
|
||||
char *nodename,
|
||||
orte_node_state_t state);
|
||||
|
||||
/*
|
||||
* Query the state-of-health of a job
|
||||
* Query a job state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_get_job_soh_fn_t)(orte_job_state_t *state,
|
||||
typedef int (*orte_smr_base_module_get_job_state_fn_t)(orte_job_state_t *state,
|
||||
orte_jobid_t jobid);
|
||||
|
||||
/*
|
||||
* Set the state-of-health of a job
|
||||
* Set a job state
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_set_job_soh_fn_t)(orte_jobid_t jobid,
|
||||
typedef int (*orte_smr_base_module_set_job_state_fn_t)(orte_jobid_t jobid,
|
||||
orte_job_state_t state);
|
||||
|
||||
/*
|
||||
* Initiate monitoring of a job
|
||||
* This function notifies the soh that it should initiate monitoring of the specified
|
||||
* This function notifies the smr that it should initiate monitoring of the specified
|
||||
* jobid. It is called by the resource manager once a job has been launched. Calling
|
||||
* the function, allows soh components (e.g., the BProc component that monitors daemons
|
||||
* the function, allows smr components (e.g., the BProc component that monitors daemons
|
||||
* via the BProc-provided centralized alerting system) to make the necessary connections
|
||||
* for monitoring the job.
|
||||
*/
|
||||
typedef int (*orte_soh_base_module_begin_monitoring_fn_t)(orte_jobid_t job);
|
||||
typedef int (*orte_smr_base_module_begin_monitoring_fn_t)(orte_jobid_t job);
|
||||
|
||||
/* Shutdown the module nicely
|
||||
*/
|
||||
|
||||
typedef int (*orte_soh_base_module_finalize_fn_t)(void);
|
||||
typedef int (*orte_smr_base_module_finalize_fn_t)(void);
|
||||
|
||||
|
||||
|
||||
/* below are the prototypes needed by the MCA */
|
||||
|
||||
/*
|
||||
* Ver 1.0.0
|
||||
* Ver 1.3.0
|
||||
*/
|
||||
struct orte_soh_base_module_1_0_0_t {
|
||||
orte_soh_base_module_get_proc_soh_fn_t get_proc_soh;
|
||||
orte_soh_base_module_set_proc_soh_fn_t set_proc_soh;
|
||||
orte_soh_base_module_get_node_soh_fn_t get_node_soh;
|
||||
orte_soh_base_module_set_node_soh_fn_t set_node_soh;
|
||||
orte_soh_base_module_get_job_soh_fn_t get_job_soh;
|
||||
orte_soh_base_module_set_job_soh_fn_t set_job_soh;
|
||||
orte_soh_base_module_begin_monitoring_fn_t begin_monitoring_job;
|
||||
orte_soh_base_module_finalize_fn_t finalize;
|
||||
struct orte_smr_base_module_1_3_0_t {
|
||||
orte_smr_base_module_get_proc_state_fn_t get_proc_state;
|
||||
orte_smr_base_module_set_proc_state_fn_t set_proc_state;
|
||||
orte_smr_base_module_get_node_state_fn_t get_node_state;
|
||||
orte_smr_base_module_set_node_state_fn_t set_node_state;
|
||||
orte_smr_base_module_get_job_state_fn_t get_job_state;
|
||||
orte_smr_base_module_set_job_state_fn_t set_job_state;
|
||||
orte_smr_base_module_begin_monitoring_fn_t begin_monitoring_job;
|
||||
orte_smr_base_module_finalize_fn_t finalize;
|
||||
};
|
||||
|
||||
typedef struct orte_soh_base_module_1_0_0_t orte_soh_base_module_1_0_0_t;
|
||||
typedef orte_soh_base_module_1_0_0_t orte_soh_base_module_t;
|
||||
typedef struct orte_smr_base_module_1_3_0_t orte_smr_base_module_1_3_0_t;
|
||||
typedef orte_smr_base_module_1_3_0_t orte_smr_base_module_t;
|
||||
|
||||
/*
|
||||
* SMR Component
|
||||
*/
|
||||
|
||||
typedef orte_soh_base_module_t* (*orte_soh_base_component_init_fn_t)(
|
||||
typedef orte_smr_base_module_t* (*orte_smr_base_component_init_fn_t)(
|
||||
int *priority);
|
||||
|
||||
typedef int (*orte_soh_base_component_finalize_fn_t)(void);
|
||||
typedef int (*orte_smr_base_component_finalize_fn_t)(void);
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
|
||||
struct orte_soh_base_component_1_0_0_t {
|
||||
mca_base_component_t soh_version;
|
||||
mca_base_component_data_1_0_0_t soh_data;
|
||||
orte_soh_base_component_init_fn_t soh_init;
|
||||
orte_soh_base_component_finalize_fn_t soh_finalize;
|
||||
struct orte_smr_base_component_1_3_0_t {
|
||||
mca_base_component_t smr_version;
|
||||
mca_base_component_data_1_0_0_t smr_data;
|
||||
orte_smr_base_component_init_fn_t smr_init;
|
||||
orte_smr_base_component_finalize_fn_t smr_finalize;
|
||||
};
|
||||
|
||||
typedef struct orte_soh_base_component_1_0_0_t orte_soh_base_component_1_0_0_t;
|
||||
typedef struct orte_smr_base_component_1_3_0_t orte_smr_base_component_1_3_0_t;
|
||||
|
||||
typedef orte_soh_base_component_1_0_0_t orte_soh_base_component_t;
|
||||
typedef orte_smr_base_component_1_3_0_t orte_smr_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type smr v1.3.0
|
||||
*/
|
||||
#define ORTE_SOH_BASE_VERSION_1_0_0 \
|
||||
/* soh v1.0 is chained to MCA v1.0 */ \
|
||||
#define ORTE_SMR_BASE_VERSION_1_3_0 \
|
||||
/* smr v1.3 is chained to MCA v1.0 */ \
|
||||
MCA_BASE_VERSION_1_0_0, \
|
||||
/* soh v1.0 */ \
|
||||
"soh", 1, 0, 0
|
||||
/* smr v1.3 */ \
|
||||
"smr", 1, 3, 0
|
||||
|
||||
OMPI_DECLSPEC extern orte_soh_base_module_t orte_soh; /* holds selected module's function pointers */
|
||||
OMPI_DECLSPEC extern orte_smr_base_module_t orte_smr; /* holds selected module's function pointers */
|
||||
|
||||
#endif /* ORTE_SOH_H */
|
||||
#endif /* ORTE_SMR_H */
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h
|
||||
|
||||
libmca_soh_la_SOURCES += \
|
||||
base/soh_base_close.c \
|
||||
base/soh_base_select.c \
|
||||
base/soh_base_local_functions.c \
|
||||
base/soh_base_get_proc_soh.c \
|
||||
base/soh_base_set_proc_soh.c \
|
||||
base/soh_base_get_job_soh.c \
|
||||
base/soh_base_set_job_soh.c \
|
||||
base/soh_base_open.c \
|
||||
base/data_type_support/soh_data_type_compare_fns.c \
|
||||
base/data_type_support/soh_data_type_copy_fns.c \
|
||||
base/data_type_support/soh_data_type_print_fns.c \
|
||||
base/data_type_support/soh_data_type_release_fns.c \
|
||||
base/data_type_support/soh_data_type_size_fns.c \
|
||||
base/data_type_support/soh_data_type_packing_fns.c \
|
||||
base/data_type_support/soh_data_type_unpacking_fns.c
|
@ -1,85 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include "pcm_bproc.h"
|
||||
#include "mca/pcm/pcm.h"
|
||||
#include "mca/pcm/base/base.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "mca/pcm/base/base_job_track.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_bproc_kill_proc(struct mca_pcm_base_module_1_0_0_t* me_super,
|
||||
ompi_process_name_t *name, int flags)
|
||||
{
|
||||
mca_pcm_bproc_module_t *me = (mca_pcm_bproc_module_t*) me_super;
|
||||
pid_t doomed;
|
||||
|
||||
if (NULL == me) return ORTE_ERR_BAD_PARAM;
|
||||
if (NULL == name) return ORTE_ERR_BAD_PARAM;
|
||||
|
||||
doomed = mca_pcm_base_job_list_get_starter(me->jobs,
|
||||
mca_ns_base_get_jobid(name),
|
||||
mca_ns_base_get_vpid(name),
|
||||
true);
|
||||
if (doomed > 0) {
|
||||
kill(doomed, SIGTERM);
|
||||
} else {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
mca_pcm_bproc_kill_job(struct mca_pcm_base_module_1_0_0_t* me_super,
|
||||
mca_ns_base_jobid_t jobid, int flags)
|
||||
{
|
||||
mca_pcm_bproc_module_t *me = (mca_pcm_bproc_module_t*) me_super;
|
||||
pid_t *doomed;
|
||||
size_t doomed_len, i;
|
||||
int ret;
|
||||
|
||||
if (NULL == me) return ORTE_ERR_BAD_PARAM;
|
||||
/* check for invalid jobid */
|
||||
|
||||
ret = mca_pcm_base_job_list_get_starters(me->jobs,
|
||||
jobid, &doomed, &doomed_len,
|
||||
true);
|
||||
if (ORTE_SUCCESS != ret) return ret;
|
||||
|
||||
for (i = 0 ; i < doomed_len ; ++i) {
|
||||
kill(doomed[i], SIGTERM);
|
||||
}
|
||||
|
||||
if (NULL != doomed) {
|
||||
free(doomed);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,69 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "pcm_bproc.h"
|
||||
#include "mca/pcm/pcm.h"
|
||||
#include "mca/pcm/base/base.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/runtime_types.h"
|
||||
#include "ompi/runtime/ompi_rte_wait.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "mca/pcm/base/base_kill_track.h"
|
||||
#include "mca/pcm/base/base_job_track.h"
|
||||
|
||||
void
|
||||
mca_pcm_bproc_monitor_cb(pid_t pid, int status, void *data)
|
||||
{
|
||||
mca_ns_base_jobid_t jobid = 0;
|
||||
mca_ns_base_vpid_t upper = 0;
|
||||
mca_ns_base_vpid_t lower = 0;
|
||||
mca_ns_base_vpid_t i = 0;
|
||||
int ret;
|
||||
ompi_process_name_t *proc_name;
|
||||
mca_pcm_bproc_module_t *me = (mca_pcm_bproc_module_t*) data;
|
||||
ompi_rte_process_status_t proc_status;
|
||||
|
||||
printf("pcm: bproc: process %d exited with status %d\n", pid, status);
|
||||
|
||||
ret = mca_pcm_base_job_list_get_job_info(me->jobs, pid, &jobid,
|
||||
&lower, &upper, true);
|
||||
if (ret != ORTE_SUCCESS) {
|
||||
opal_show_help("help-mca-pcm-bproc.txt",
|
||||
"spawn:no-process-record", true, pid, status);
|
||||
return;
|
||||
}
|
||||
|
||||
/* unregister all the procs */
|
||||
proc_status.status_key = OMPI_PROC_KILLED;
|
||||
proc_status.exit_code = (ompi_exit_code_t)status;
|
||||
for (i = lower ; i <= upper ; ++i) {
|
||||
proc_name = mca_ns_base_create_process_name(0, jobid, i);
|
||||
ompi_rte_set_process_status(&proc_status, proc_name);
|
||||
free(proc_name);
|
||||
}
|
||||
|
||||
mca_pcm_base_kill_unregister((mca_pcm_base_module_t*)me, jobid, lower, upper);
|
||||
}
|
@ -1,234 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/poll.h>
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/oob/oob.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
#include "svc_bproc_soh.h"
|
||||
|
||||
|
||||
mca_svc_base_module_t mca_svc_bproc_soh_module = {
|
||||
mca_svc_bproc_soh_module_init,
|
||||
mca_svc_bproc_soh_module_fini
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Add a BProc node to the virtual machine SOH segment
|
||||
*/
|
||||
int
|
||||
mca_svc_bproc_soh_add_node(mca_ns_base_cellid_t cellid, int node)
|
||||
{
|
||||
ompi_rte_vm_status_t *vmdata;
|
||||
int err;
|
||||
|
||||
vmdata = (ompi_rte_vm_status_t*)malloc(sizeof(ompi_rte_vm_status_t));
|
||||
vmdata->cell = cellid;
|
||||
asprintf(&(vmdata->nodename), "%d", node);
|
||||
err = bproc_getnodeattr(ni->node, "cpus", &cpus, sizeof(cpus));
|
||||
if (err != 0)
|
||||
cpus = 1;
|
||||
vmdata->cpus = (uint16_t)cpus;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a BProc update notice
|
||||
*/
|
||||
|
||||
int
|
||||
mca_svc_bproc_soh_status_changed(struct bproc_node_info_t *old, struct bproc_node_info_t *new)
|
||||
{
|
||||
if (old->node != new->node)
|
||||
return 0;
|
||||
if (strcmp(old->status, new->status))
|
||||
return 1;
|
||||
if (old->user != new->user)
|
||||
return 1;
|
||||
if (old->group != new->group)
|
||||
return 1;
|
||||
if (old->mode != new->mode)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
mca_svc_bproc_soh_update_node_info(mca_ns_base_cellid_t cellid, struct bproc_node_info_t *ni)
|
||||
{
|
||||
int err;
|
||||
int cpus;
|
||||
char *node;
|
||||
ompi_rte_vm_status__t *vmdata;
|
||||
|
||||
asprintf(&node, "%d", ni->node);
|
||||
vmdata = ompi_rte_get_vm_status(cellid, node);
|
||||
if (vmdata == NULL) { /* this node isn't present yet - add it */
|
||||
mca_svc_bproc_soh_add_node(cellid, ni->node);
|
||||
|
||||
return;
|
||||
|
||||
/* in long-term, we will store the soh data in key-value pairs. for now,
|
||||
* we store it simply as values so we can get it working - I will update
|
||||
* this later to the final form.
|
||||
*/
|
||||
vmdata->user = ni->user;
|
||||
vmdata->group = ni->group;
|
||||
vmdata->mode = ni->mode;
|
||||
if (NULL != vmdata->status) {
|
||||
free(vmdata->status);
|
||||
}
|
||||
vmdata->status = strdup(ni->status);
|
||||
/*
|
||||
ompi_vm_status_data_add_int(vmdata, "user", ni->user);
|
||||
ompi_vm_status_data_add_int(vmdata, "group", ni->group);
|
||||
ompi_vm_status_data_add_int(vmdata, "mode", ni->mode);
|
||||
ompi_vm_status_data_add_string(vmdata, "status", ni->status);
|
||||
*/
|
||||
|
||||
/* probably should optimize this so it only happens once */
|
||||
/* ompi_vm_status_data_add_int(vmdata, "#cpus", cpus); */
|
||||
|
||||
/* registry_put(segment, cell, node, vmdata); */
|
||||
|
||||
free(node);
|
||||
ompit_vm_status_data_finish(vmdata);
|
||||
}
|
||||
|
||||
void
|
||||
mca_svc_bproc_soh_check_node_info(char *segment, char *cell,
|
||||
struct bproc_node_set_t **old,
|
||||
struct bproc_node_set_t *new)
|
||||
{
|
||||
/* we assume the number of nodes does not change */
|
||||
for (i = 0; i < new->size; i++) {
|
||||
ni = &new->node[i];
|
||||
if (!old->size || status_changed((*old)->node[i], ni))
|
||||
update_node_info(segment, cell, ni);
|
||||
}
|
||||
|
||||
if ((*old)->size)
|
||||
bproc_nodeset_free(*old);
|
||||
bproc_nodeset_init(*old, new->size);
|
||||
memcpy((*old)->node, new->node, sizeof(*new->node) * new->size);
|
||||
}
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS
/*
 * Thread body: block in poll() on the module's notify descriptor, read the
 * node list each time it becomes readable, and hand it to
 * mca_svc_bproc_soh_check_node_info.  The loop ends on a poll or node-list
 * error; the thread is otherwise stopped by cancellation.
 *
 * thread->t_arg must carry the mca_svc_bproc_soh_module_t for this service.
 */
static void *
mca_svc_bproc_soh_status_thread(opal_thread_t *thread)
{
    struct pollfd pfd;
    struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
    mca_svc_bproc_soh_module_t *module = (mca_svc_bproc_soh_module_t *)thread->t_arg;
    int res;

    /* This thread enter in a cancel enabled state */
    pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
    pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );

    for (;;) {
        pfd.fd = module->notify_fd;
        pfd.events = POLLIN;
        res = poll(&pfd, 1, -1);   /* -1: wait without timeout */
        if (res < 0) {
            /* poll error */
            break;
        }
        if (bproc_nodelist_(&ns, module->notify_fd) < 0) {
            /* bproc_nodelist_ error */
            break;
        }

        /* the callee takes the new set by pointer */
        mca_svc_bproc_soh_check_node_info(module->segment, module->cell, &module->node_info, &ns);

        bproc_nodeset_free(&ns);
    }

    return PTHREAD_CANCELED;
}
#endif /* OMPI_HAVE_POSIX_THREADS */
|
||||
|
||||
|
||||
/**
|
||||
* Register a callback to receive BProc update notifications
|
||||
*/
|
||||
|
||||
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t* base)
|
||||
{
|
||||
int i;
|
||||
int num_nodes;
|
||||
bproc_node_set_t node_list;
|
||||
int node_num;
|
||||
char *segment, *jobid_string;
|
||||
mca_svc_bproc_soh_module_t *module /* = somthing */;
|
||||
|
||||
jobid_string = ompi_name_server.get_jobid_string(ompi_rte_get_self());
|
||||
asprintf(&module->segment, "%s-bproc", OMPI_RTE_VM_STATUS_SEGMENT);
|
||||
module->cell = /* get cell somehow */;
|
||||
|
||||
num_nodes = bproc_nodelist(&module->node_info);
|
||||
if (num_nodes < 0)
|
||||
return OMPI_ERROR;
|
||||
|
||||
for (i = 0; i < module->node_info->size; i++) {
|
||||
update_node_info(&module->node_info[i]);
|
||||
}
|
||||
|
||||
module->notify_fd = bproc_notifier();
|
||||
if (module->notify_fd < 0)
|
||||
return OMPI_ERROR;
|
||||
|
||||
if (ompi_using_thread()) {
|
||||
#if OMPI_HAVE_POSIX_THREADS
|
||||
module->thread.t_handle = 0;
|
||||
module->thread.t_run = (opal_thread_fn_t)mca_bproc_status_thread;
|
||||
module->thread.t_arg = (void *)module;
|
||||
#endif /* OMPI_HAVE_POSIX_THREADS */
|
||||
}
|
||||
|
||||
return opal_thread_start(&module->thread);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup
|
||||
*/
|
||||
|
||||
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t* base)
|
||||
{
|
||||
mca_svc_bproc_soh_module_t *module /* = somthing */;
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS
|
||||
if (module->thread.t_handle != 0) {
|
||||
void *thread_return;
|
||||
pthread_cancel(ptl->thread.t_handle);
|
||||
opal_thread_join(&(module->thread), &thread_return);
|
||||
}
|
||||
#endif /* OMPI_HAVE_POSIX_THREADS */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,57 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef _MCA_SVC_BPROC_SOH_
|
||||
#define _MCA_SVC_BPROC_SOH_
|
||||
|
||||
#include "mca/svc/svc.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Component open/close/init
|
||||
*/
|
||||
int mca_svc_bproc_soh_component_open(void);
|
||||
int mca_svc_bproc_soh_component_close(void);
|
||||
mca_svc_base_module_t* mca_svc_bproc_soh_component_init(void);
|
||||
|
||||
/**
|
||||
* Module init/fini
|
||||
*/
|
||||
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t*);
|
||||
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t*);
|
||||
|
||||
struct mca_svc_bproc_soh_component_t {
|
||||
mca_svc_base_component_t base;
|
||||
int debug;
|
||||
};
|
||||
typedef struct mca_svc_bproc_soh_component_t mca_svc_bproc_soh_component_t;
|
||||
|
||||
extern mca_svc_base_module_t mca_svc_bproc_soh_module;
|
||||
extern mca_svc_soh_component_t mca_svc_bproc_soh_component;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
@ -1,99 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "svc_bproc_soh.h"
|
||||
|
||||
|
||||
mca_svc_bproc_soh_component_t mca_svc_bproc_soh_component = {
|
||||
{
|
||||
/* First, the mca_base_module_t struct containing meta
|
||||
information about the module itself */
|
||||
{
|
||||
/* Indicate that we are a bproc soh v1.0.0 module (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
MCA_SVC_BASE_VERSION_1_0_0,
|
||||
|
||||
"bproc_soh", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
mca_svc_bproc_soh_component_open, /* component open */
|
||||
mca_svc_bproc_soh_component_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 module meta data */
|
||||
|
||||
{
|
||||
/* Whether the module is checkpointable or not */
|
||||
|
||||
false
|
||||
},
|
||||
|
||||
mca_svc_bproc_soh_component_init
|
||||
},
|
||||
0 /* exec_debug */
|
||||
};
|
||||
|
||||
/**
 * Utility function to register parameters
 *
 * Registers the integer MCA parameter svc/bproc_soh/<param_name> with the
 * given default and returns its looked-up value.  The default is returned
 * when the lookup leaves the value untouched.
 */
static inline int mca_svc_bproc_soh_param_register_int(
    const char* param_name,
    int default_value)
{
    int id = mca_base_param_register_int("svc","bproc_soh",param_name,NULL,default_value);
    int param_value = default_value;
    /* the lookup writes through the pointer */
    mca_base_param_lookup_int(id,&param_value);
    return param_value;
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_svc_bproc_soh_component_open(void)
|
||||
{
|
||||
mca_svc_bproc_soh_component.debug =
|
||||
mca_svc_bproc_soh_param_register_int("debug", 0);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
mca_svc_base_module_t* mca_svc_bproc_soh_component_init(void)
|
||||
{
|
||||
return &mca_svc_bproc_soh_module;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_svc_bproc_soh_component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,43 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Sources shared by the DSO build and the convenience-library build.
xcpu_SOURCES = \
    soh_xcpu.c \
    soh_xcpu.h \
    soh_xcpu_component.c

# Exactly one of the two library names is non-empty, depending on whether
# the component is built as a DSO.
if OMPI_BUILD_soh_xcpu_DSO
component_install = mca_soh_xcpu.la
component_noinst =
else
component_install =
component_noinst = libmca_soh_xcpu.la
endif

# Installed DSO flavour.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_soh_xcpu_la_SOURCES = $(xcpu_SOURCES)
mca_soh_xcpu_la_LDFLAGS = -module -avoid-version
mca_soh_xcpu_la_LIBADD = \
    $(top_ompi_builddir)/orte/liborte.la \
    $(top_ompi_builddir)/opal/libopal.la

# Non-installed convenience-library flavour.
noinst_LTLIBRARIES = $(component_noinst)
libmca_soh_xcpu_la_SOURCES = $(xcpu_SOURCES)
libmca_soh_xcpu_la_LDFLAGS = -module -avoid-version
libmca_soh_xcpu_la_LIBADD =
|
@ -1,30 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_soh_xcpu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# The result flag is hard-wired to 0, so the second action is always the
# one that runs.
AC_DEFUN([MCA_soh_xcpu_CONFIG],[
    # no need for soh_xcpu for time being
    # once xcpu is detected as present and working, set soh_xcpu_good=1
    soh_xcpu_good=0

    # Evaluate succeed / fail
    AS_IF([test "$soh_xcpu_good" = "1"], [$1], [$2])
])dnl
|
@ -1,23 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Settings specific to the soh/xcpu component

# Files the build system should generate for this component.
PARAM_CONFIG_FILES="Makefile"
# Source file named as this component's init file.
PARAM_INIT_FILE="soh_xcpu.c"
|
@ -1,94 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <pwd.h>
|
||||
#include <grp.h>
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/gpr/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/soh/xcpu/soh_xcpu.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t);
|
||||
static int orte_soh_xcpu_finalize(void);
|
||||
|
||||
int orte_soh_xcpu_module_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid, orte_process_info.my_name))) {
|
||||
fprintf(stderr, "orte_soh_xcpu_module_init error\n");
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
orte_soh_base_module_t orte_soh_xcpu_module = {
|
||||
orte_soh_base_get_proc_soh,
|
||||
orte_soh_base_set_proc_soh,
|
||||
orte_soh_base_get_node_soh_not_available,
|
||||
orte_soh_base_set_node_soh_not_available,
|
||||
orte_soh_base_get_job_soh,
|
||||
orte_soh_base_set_job_soh,
|
||||
orte_soh_xcpu_begin_monitoring_job,
|
||||
orte_soh_xcpu_finalize
|
||||
};
|
||||
|
||||
/* @begin_monitoring: right now, its only trying to update registry so
|
||||
* that mpirun can exit normally
|
||||
* pls_xcpu is waiting for all threads to finish before calling this function
|
||||
*/
|
||||
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t jobid){
|
||||
int rc;
|
||||
size_t num_procs, i;
|
||||
orte_process_name_t *peers;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&peers, &num_procs, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}else
|
||||
for (i=0; i < num_procs; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_soh_base_set_proc_soh(&peers[i], ORTE_PROC_STATE_TERMINATED, 0)) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(peers);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int orte_soh_xcpu_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,66 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef ORTE_SOH_XCPU_H
|
||||
#define ORTE_SOH_XCPU_H
|
||||
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* XCPU node registry keys
|
||||
*/
|
||||
#define ORTE_SOH_XCPU_NODE_STATUS "orte-node-xcpu-status"
|
||||
#define ORTE_SOH_XCPU_NODE_MODE "orte-node-xcpu-mode"
|
||||
#define ORTE_SOH_XCPU_NODE_USER "orte-node-xcpu-user"
|
||||
#define ORTE_SOH_XCPU_NODE_GROUP "orte-node-xcpu-group"
|
||||
|
||||
|
||||
/**
|
||||
* Module init/fini
|
||||
*/
|
||||
int orte_soh_xcpu_module_init(void);
|
||||
int orte_soh_xcpu_module_finalize(void);
|
||||
|
||||
struct orte_soh_xcpu_component_t {
|
||||
orte_soh_base_component_t super;
|
||||
/* not sure which of the following variabels are
|
||||
* needed
|
||||
* */
|
||||
int debug;
|
||||
int priority;
|
||||
opal_event_t notify_event;
|
||||
int notify_fd;
|
||||
orte_cellid_t cellid;
|
||||
/*struct xcpu_node_set_t node_set;*/
|
||||
};
|
||||
typedef struct orte_soh_xcpu_component_t orte_soh_xcpu_component_t;
|
||||
|
||||
OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_xcpu_module;
|
||||
OMPI_COMP_EXPORT extern orte_soh_xcpu_component_t mca_soh_xcpu_component;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
@ -1,99 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/mca/soh/xcpu/soh_xcpu.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_soh_xcpu_open(void);
|
||||
static int orte_soh_xcpu_close(void);
|
||||
static orte_soh_base_module_t* orte_soh_xcpu_init(int*);
|
||||
|
||||
orte_soh_xcpu_component_t mca_soh_xcpu_component = {
|
||||
{
|
||||
/* First, the mca_base_module_t struct containing meta
|
||||
information about the module itself */
|
||||
{
|
||||
/* Indicate that we are a xcpu soh v1.0.0 module (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_SOH_BASE_VERSION_1_0_0,
|
||||
|
||||
"xcpu", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
orte_soh_xcpu_open, /* component open */
|
||||
orte_soh_xcpu_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 module meta data */
|
||||
|
||||
{
|
||||
/* Whether the module is checkpointable or not */
|
||||
|
||||
false
|
||||
},
|
||||
|
||||
orte_soh_xcpu_init
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Utility function to register parameters
 *
 * Registers the integer MCA parameter soh/xcpu/<param_name> with the given
 * default and returns its looked-up value.  The default is returned when
 * the lookup leaves the value untouched.
 */
static int orte_soh_xcpu_param_register_int(
    const char* param_name,
    int default_value)
{
    int id = mca_base_param_register_int("soh","xcpu",param_name,NULL,default_value);
    int param_value = default_value;
    /* the lookup writes through the pointer */
    mca_base_param_lookup_int(id,&param_value);
    return param_value;
}
|
||||
|
||||
static int orte_soh_xcpu_open(void)
|
||||
{
|
||||
mca_soh_xcpu_component.debug =
|
||||
orte_soh_xcpu_param_register_int("debug", 0);
|
||||
mca_soh_xcpu_component.priority =
|
||||
orte_soh_xcpu_param_register_int("priority", 100);
|
||||
/*fprintf(stdout, "soh_xcpu: open\n");*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static orte_soh_base_module_t* orte_soh_xcpu_init(int *priority)
|
||||
{
|
||||
*priority = mca_soh_xcpu_component.priority;
|
||||
orte_soh_xcpu_module_init();/*do we need this???*/
|
||||
return &orte_soh_xcpu_module;
|
||||
}
|
||||
|
||||
static int orte_soh_xcpu_close(void)
|
||||
{
|
||||
fprintf(stdout, "soh_xcpu: close\n");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -52,7 +52,7 @@
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/schema/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/util/univ_info.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
@ -364,15 +364,15 @@ int orte_init_stage1(bool infrastructure)
|
||||
/*
|
||||
* setup the state-of-health monitor
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_soh_base_open())) {
|
||||
if (ORTE_SUCCESS != (ret = orte_smr_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_soh_base_open";
|
||||
error = "orte_smr_base_open";
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_soh_base_select())) {
|
||||
if (ORTE_SUCCESS != (ret = orte_smr_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_soh_base_select";
|
||||
error = "orte_smr_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
@ -29,9 +29,6 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
@ -39,7 +39,7 @@
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/gpr/base/base.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
#include "orte/mca/smr/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/util/univ_info.h"
|
||||
@ -87,7 +87,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_soh_base_close())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr_base_close())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -225,7 +225,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_soh_base_open())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -246,7 +246,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_soh_base_select())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr_base_select())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -57,7 +57,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/universe_setup_file_io.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rds/rds_types.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
|
@ -60,10 +60,9 @@
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/soh/soh.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/rmgr/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
@ -387,7 +386,7 @@ int main(int argc, char *argv[])
|
||||
* Set my process status to "starting". Note that this must be done
|
||||
* after the rte init is completed.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
|
||||
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
|
||||
ORTE_PROC_STATE_RUNNING, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
|
@ -65,7 +65,6 @@
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/gpr/base/base.h"
|
||||
#include "orte/mca/schema/base/base.h"
|
||||
#include "orte/mca/soh/base/base.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
@ -559,7 +559,7 @@ static void dump_aborted_procs(orte_jobid_t jobid)
|
||||
continue;
|
||||
}
|
||||
if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
@ -660,7 +660,7 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERMINATED:
|
||||
dump_aborted_procs(jobid);
|
||||
orterun_globals.exit_status = 0; /* set the exit status to indicate normal termination */
|
||||
orterun_globals.exit = true;
|
||||
opal_condition_signal(&orterun_globals.cond);
|
||||
break;
|
||||
|
@ -19,4 +19,4 @@
|
||||
|
||||
|
||||
SUBDIRS = oob schema
|
||||
DIST_SUBDIRS = gpr gpr/remote ns oob ras rds rmaps rmgr schema soh
|
||||
DIST_SUBDIRS = gpr gpr/remote ns oob ras rds rmaps rmgr schema smr
|
||||
|
@ -21,48 +21,48 @@
|
||||
AM_CPPFLAGS = -I$(top_srcdir)/test/support
|
||||
|
||||
check_PROGRAMS = \
|
||||
soh_dt_buffer \
|
||||
soh_dt_compare \
|
||||
soh_dt_print \
|
||||
soh_dt_size \
|
||||
soh_dt_release \
|
||||
soh_dt_copy
|
||||
smr_dt_buffer \
|
||||
smr_dt_compare \
|
||||
smr_dt_print \
|
||||
smr_dt_size \
|
||||
smr_dt_release \
|
||||
smr_dt_copy
|
||||
|
||||
TESTS = \
|
||||
$(check_PROGRAMS)
|
||||
|
||||
soh_dt_buffer_SOURCES = soh_dt_buffer.c
|
||||
soh_dt_buffer_LDADD = \
|
||||
smr_dt_buffer_SOURCES = smr_dt_buffer.c
|
||||
smr_dt_buffer_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_buffer_DEPENDENCIES = $(soh_dt_buffer_LDADD)
|
||||
smr_dt_buffer_DEPENDENCIES = $(smr_dt_buffer_LDADD)
|
||||
|
||||
soh_dt_copy_SOURCES = soh_dt_copy.c
|
||||
soh_dt_copy_LDADD = \
|
||||
smr_dt_copy_SOURCES = smr_dt_copy.c
|
||||
smr_dt_copy_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_copy_DEPENDENCIES = $(soh_dt_copy_LDADD)
|
||||
smr_dt_copy_DEPENDENCIES = $(smr_dt_copy_LDADD)
|
||||
|
||||
soh_dt_compare_SOURCES = soh_dt_compare.c
|
||||
soh_dt_compare_LDADD = \
|
||||
smr_dt_compare_SOURCES = smr_dt_compare.c
|
||||
smr_dt_compare_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_compare_DEPENDENCIES = $(soh_dt_compare_LDADD)
|
||||
smr_dt_compare_DEPENDENCIES = $(smr_dt_compare_LDADD)
|
||||
|
||||
soh_dt_print_SOURCES = soh_dt_print.c
|
||||
soh_dt_print_LDADD = \
|
||||
smr_dt_print_SOURCES = smr_dt_print.c
|
||||
smr_dt_print_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_print_DEPENDENCIES = $(soh_dt_print_LDADD)
|
||||
smr_dt_print_DEPENDENCIES = $(smr_dt_print_LDADD)
|
||||
|
||||
soh_dt_size_SOURCES = soh_dt_size.c
|
||||
soh_dt_size_LDADD = \
|
||||
smr_dt_size_SOURCES = smr_dt_size.c
|
||||
smr_dt_size_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_size_DEPENDENCIES = $(soh_dt_size_LDADD)
|
||||
smr_dt_size_DEPENDENCIES = $(smr_dt_size_LDADD)
|
||||
|
||||
soh_dt_release_SOURCES = soh_dt_release.c
|
||||
soh_dt_release_LDADD = \
|
||||
smr_dt_release_SOURCES = smr_dt_release.c
|
||||
smr_dt_release_LDADD = \
|
||||
$(top_builddir)/orte/liborte.la \
|
||||
$(top_builddir)/opal/libopal.la
|
||||
soh_dt_release_DEPENDENCIES = $(soh_dt_release_LDADD)
|
||||
smr_dt_release_DEPENDENCIES = $(smr_dt_release_LDADD)
|
Загрузка…
x
Ссылка в новой задаче
Block a user