1
1

Change the SOH framework to the new State Monitoring and Reporting (SMR) framework. New APIs will appear in the new framework shortly - this commit just gets the name change into the system.

Other changes:

1. Remove the old xcpu components as they are not functional.

2. Fix a "bug" in orterun whereby we called dump_aborted_procs even when we normally terminated. There is still some kind of bug in this procedure, however, as we appear to be calling the orterun job_state_callback function every time a process terminates (instead of only once when they have all terminated). I'll continue digging into that one.

This will require an autogen/configure, I'm afraid.

This commit was SVN r11228.
Этот коммит содержится в:
Ralph Castain 2006-08-16 16:35:09 +00:00
родитель 6d414f2d44
Коммит 8c7f0ed9ae
83 изменённых файлов: 515 добавлений и 2501 удалений

Просмотреть файл

@ -1146,7 +1146,7 @@ AC_CONFIG_FILES([
test/mca/rmaps/Makefile
test/mca/rmgr/Makefile
test/mca/schema/Makefile
test/mca/soh/Makefile
test/mca/smr/Makefile
test/memory/Makefile
test/runtime/Makefile
test/support/Makefile

Просмотреть файл

@ -47,7 +47,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/runtime.h"

Просмотреть файл

@ -45,8 +45,7 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/runtime.h"
@ -133,7 +132,7 @@ int ompi_mpi_finalize(void)
}
*/
/* Set process status to "at stg3" */
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
ORTE_PROC_STATE_AT_STG3, 0))) {
ORTE_ERROR_LOG(ret);
}
@ -277,15 +276,15 @@ int ompi_mpi_finalize(void)
}
/* Set process status to "finalized" */
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
ORTE_PROC_STATE_FINALIZED, 0))) {
ORTE_ERROR_LOG(ret);
}
/*
* Wait for everyone to get here. This is necessary to allow the soh
* Wait for everyone to get here. This is necessary to allow the smr
* to update the job state for singletons. Otherwise, we finalize
* the RTE while the soh is trying to do the update - which causes
* the RTE while the smr is trying to do the update - which causes
* an ugly race condition
*/
if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL,

Просмотреть файл

@ -41,8 +41,7 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/schema/schema.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/constants.h"
@ -475,7 +474,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
/* Let system know we are at STG1 Barrier */
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
ORTE_PROC_STATE_AT_STG1, 0))) {
ORTE_ERROR_LOG(ret);
error = "set process state failed";
@ -585,7 +584,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* Let system know we are at STG2 Barrier */
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
ORTE_PROC_STATE_AT_STG2, 0))) {
ORTE_ERROR_LOG(ret);
error = "set process state failed";

Просмотреть файл

@ -90,8 +90,8 @@
#include "orte/mca/rml/base/base.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/smr/base/base.h"
#include "orte/mca/sds/sds.h"
#include "orte/mca/sds/base/base.h"
@ -228,8 +228,8 @@ void ompi_info::open_components()
orte_sds_base_open();
component_map["sds"] = &orte_sds_base_components_available;
orte_soh_base_open();
component_map["soh"] = &orte_soh_base.soh_components;
orte_smr_base_open();
component_map["smr"] = &orte_smr_base.smr_components;
// MPI frameworks
@ -296,7 +296,7 @@ void ompi_info::close_components()
orte_iof_base_close();
orte_sds_base_close();
orte_soh_base_close();
orte_smr_base_close();
orte_pls_base_close();
orte_rmgr_base_close();
orte_rmaps_base_close();

Просмотреть файл

@ -43,7 +43,7 @@
#include "orte/dss/dss_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/rmgr/rmgr_types.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {

Просмотреть файл

@ -35,7 +35,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/gpr/replica/transition_layer/gpr_replica_tl.h"
#include "gpr_replica_fn.h"

Просмотреть файл

@ -26,10 +26,9 @@
#include "orte/dss/dss.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/runtime.h"
@ -93,18 +92,16 @@ int mca_oob_xcast(
orte_std_cntr_t i;
int rc;
int tag = MCA_OOB_TAG_XCAST;
int cmpval;
int status;
orte_proc_state_t state;
/* check to see if I am the root process name */
cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, root, orte_process_info.my_name);
if(NULL != root && 0 == cmpval) {
if(NULL != root && ORTE_EQUAL == orte_dss.compare(root, orte_process_info.my_name, ORTE_NAME)) {
mca_oob_xcast_t *xcast = OBJ_NEW(mca_oob_xcast_t);
xcast->counter = num_peers;
for(i=0; i<num_peers; i++) {
/* check status of peer to ensure they are alive */
if (ORTE_SUCCESS != (rc = orte_soh.get_proc_soh(&state, &status, peers+i))) {
if (ORTE_SUCCESS != (rc = orte_smr.get_proc_state(&state, &status, peers+i))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -28,7 +28,7 @@
#include "orte/mca/pls/base/base.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/schema/schema.h"

Просмотреть файл

@ -63,7 +63,7 @@
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
@ -289,9 +289,9 @@ static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
int rc;
/* set the state of this process */
if(WIFEXITED(status)) {
rc = orte_soh.set_proc_soh(proc, ORTE_PROC_STATE_TERMINATED, status);
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_TERMINATED, status);
} else {
rc = orte_soh.set_proc_soh(proc, ORTE_PROC_STATE_ABORTED, status);
rc = orte_smr.set_proc_state(proc, ORTE_PROC_STATE_ABORTED, status);
}
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -724,7 +724,7 @@ orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
if(dead_node) {
/* gotta see if this node belongs to us... arg.. */
/* also, we know by order of creation that the node state */
/* comes before the node name.. see soh_bproc.c */
/* comes before the node name.. see smr_bproc.c */
orte_std_cntr_t name_idx;
for (name_idx = 0;
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);

Просмотреть файл

@ -72,8 +72,7 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/pls/fork/pls_fork.h"
extern char **environ;
@ -171,7 +170,7 @@ static void orte_pls_fork_kill_processes(opal_value_array_t *pids, opal_value_ar
/* update the process state on the registry */
proc = OPAL_VALUE_ARRAY_GET_ITEM(procs, orte_process_name_t, i);
if (ORTE_SUCCESS != (rc = orte_soh.set_proc_soh(&proc, ORTE_PROC_STATE_TERMINATED, exit_status))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(&proc, ORTE_PROC_STATE_TERMINATED, exit_status))) {
ORTE_ERROR_LOG(rc);
/* don't exit out even if this didn't work - we still might need to kill more
* processes, so just keep trucking
@ -203,9 +202,9 @@ static void orte_pls_fork_wait_proc(pid_t pid, int status, void* cbdata)
/* set the state of this process */
if(WIFEXITED(status)) {
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
rc = orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_TERMINATED, status);
} else {
rc = orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_ABORTED, status);
rc = orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_ABORTED, status);
}
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -493,7 +492,7 @@ static int orte_pls_fork_proc(
the SOH or else everyone else will hang. Don't bother
checking whether or not this worked - just fire and forget
*/
orte_soh.set_proc_soh(&proc->proc_name, ORTE_PROC_STATE_ABORTED, rc);
orte_smr.set_proc_state(&proc->proc_name, ORTE_PROC_STATE_ABORTED, rc);
return ORTE_ERR_FATAL;
break;
}
@ -576,7 +575,7 @@ int orte_pls_fork_launch(orte_jobid_t jobid)
processes to be launched to ABORTED. This will
cause the entire job to abort. */
for (; i < map->num_procs; ++i) {
orte_soh.set_proc_soh(&map->procs[i]->proc_name,
orte_smr.set_proc_state(&map->procs[i]->proc_name,
ORTE_PROC_STATE_ABORTED, 0);
}

Просмотреть файл

@ -82,8 +82,7 @@
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/pls/gridengine/pls_gridengine.h"
#include "orte/util/sys_info.h"
@ -186,7 +185,7 @@ static void orte_pls_gridengine_wait_daemon(pid_t pid, int status, void* cbdata)
orte_session_dir_finalize(&(map->procs[i])->proc_name);
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
}
if (ORTE_SUCCESS != rc) {

Просмотреть файл

@ -43,7 +43,7 @@
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
@ -347,7 +347,7 @@ static void poe_wait_job(pid_t pid, int status, void* cbdata)
for(i = 0 ; i < map->num_procs ; ++i) {
orte_session_dir_finalize(&(map->procs[i])->proc_name);
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
}
if(ORTE_SUCCESS != rc) {

Просмотреть файл

@ -79,8 +79,7 @@
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/pls/rsh/pls_rsh.h"
#include "orte/util/sys_info.h"
@ -325,7 +324,7 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
orte_session_dir_finalize(&(map->procs[i])->proc_name);
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
rc = orte_smr.set_proc_state(&(map->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
}
if (ORTE_SUCCESS != rc) {

Просмотреть файл

@ -56,10 +56,9 @@
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "pls_tm.h"
@ -344,7 +343,7 @@ pls_tm_launch(orte_jobid_t jobid)
* NOT being oversubscribed
*/
if (node->node_slots > 0 &&
opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
(orte_std_cntr_t)opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
node->node_slots,

Просмотреть файл

@ -1,50 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Preprocessor flags for this component come from the pls_xcpu_CPPFLAGS
# substitution.
AM_CPPFLAGS = $(pls_xcpu_CPPFLAGS)
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_pls_xcpu_DSO conditional.
if OMPI_BUILD_pls_xcpu_DSO
component_noinst =
component_install = mca_pls_xcpu.la
else
component_noinst = libmca_pls_xcpu.la
component_install =
endif
# Source files shared by the DSO and the static variant of the component.
sources = \
pls_xcpu.h \
pls_xcpu.c \
pls_xcpu_component.c
# DSO variant: listed under mcacomponentdir, i.e. $(libdir)/openmpi, and
# linked against liborte and libopal in addition to the xcpu libraries.
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_pls_xcpu_la_SOURCES = $(sources)
mca_pls_xcpu_la_LIBADD = \
$(pls_xcpu_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)
# Static variant: a noinst libtool library built from the same sources and
# linked only against the xcpu libraries.
noinst_LTLIBRARIES = $(component_noinst)
libmca_pls_xcpu_la_SOURCES = $(sources)
libmca_pls_xcpu_la_LIBADD = $(pls_xcpu_LIBS)
libmca_pls_xcpu_la_LDFLAGS = -module -avoid-version $(pls_xcpu_LDFLAGS)

Просмотреть файл

@ -1,37 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pls_xcpu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Configure-time probe for the xcpu process launcher component.
# Calls OMPI_CHECK_XCPU with the pls_xcpu prefix, records the outcome in
# the shell variable pls_xcpu_good, and then runs the first argument when
# that variable is 1 or the second argument otherwise.  On success the
# wrapper-compiler link flags and libraries are copied from pls_xcpu_LDFLAGS
# and pls_xcpu_LIBS.  The three pls_xcpu_* build variables are substituted
# into the generated Makefiles in either case.
AC_DEFUN([MCA_pls_xcpu_CONFIG],[
OMPI_CHECK_XCPU([pls_xcpu], [pls_xcpu_good=1], [pls_xcpu_good=0])
# if check worked, set wrapper flags.
# Evaluate succeed / fail
AS_IF([test "$pls_xcpu_good" = "1"],
[pls_xcpu_WRAPPER_EXTRA_LDFLAGS="$pls_xcpu_LDFLAGS"
pls_xcpu_WRAPPER_EXTRA_LIBS="$pls_xcpu_LIBS"
$1],
[$2])
# set build flags to use in makefile
AC_SUBST([pls_xcpu_CPPFLAGS])
AC_SUBST([pls_xcpu_LDFLAGS])
AC_SUBST([pls_xcpu_LIBS])
])dnl

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=pls_xcpu.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,792 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/* @file:
* xcpu launcher: starts jobs on compute nodes.
*/
#include "orte_config.h"
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <signal.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "orte/dss/dss.h"
#include "orte/util/sys_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/base/base.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "pls_xcpu.h"
#include <regex.h>
#include <dirent.h>
/**
 * Our current environment
 */
extern char **environ;
/* NOTE(review): <errno.h> is already included above; redeclaring errno by
 * hand is not portable where errno is a macro - confirm and consider
 * removing this line. */
extern int errno;
/* Not referenced by any function visible in this part of the file. */
char **g_environ;
/* Index into argv of the node expression; reset to 1 and possibly advanced
 * by orte_pls_xcpu_cmd_check(). */
int g_regexploc=1;
/* Compiled by orte_pls_xcpu_check_exp(), released by orte_pls_xcpu_cleanup(). */
regex_t g_compiled_exp;
/* Head of the list of mounted xcpu nodes; built by
 * orte_pls_xcpu_launch_procs(), freed by orte_pls_xcpu_cleanup(). */
orte_pls_xcpu_mount_nodes *g_current_m=NULL;
/* Most recent per-thread launch record allocated by
 * orte_pls_xcpu_launch_procs(); the launched thread frees it. */
orte_pls_xcpu_thread_info *g_thread_info;
/* Result record returned (by address) from orte_pls_xcpu_launch_procs(). */
orte_pls_xcpu_pthread_tindex t_info;
/* Scratch pointers used by orte_pls_xcpu_start_thread() while it sets up
 * the stdout/stderr forwarding threads. */
orte_pls_xcpu_stdio_thread_info *g_stdout_thread_info, *g_stderr_thread_info;
/* Held around the registry update at the end of orte_pls_xcpu_start_thread(). */
pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
/* Forward declarations of the helpers defined later in this file. */
orte_pls_xcpu_pthread_tindex *orte_pls_xcpu_launch_procs(int, char **, char**, orte_process_name_t *);
int orte_pls_xcpu_cmd_check(int, char **);
void orte_pls_xcpu_cleanup();
void *orte_pls_xcpu_start_thread(void *);
void *orte_pls_xcpu_stdio_thread(void *);
int orte_pls_xcpu_check_exp(char *);
/**
 * Initialization of the xcpu module with all the needed function pointers.
 *
 * The entries are, in order: launch, terminate-job, terminate-proc and
 * finalize.
 * NOTE(review): this order presumably mirrors the field order of
 * orte_pls_base_module_t - confirm against the pls framework header.
 */
orte_pls_base_module_t orte_pls_xcpu_module = {
orte_pls_xcpu_launch,
orte_pls_xcpu_terminate_job,
orte_pls_xcpu_terminate_proc,
orte_pls_xcpu_finalize
};
/** LOCAL SUPPORT FUNCTIONS **/

/**
 * Release every node of the thread-id stack required by xcpu.
 *
 * @param s head of the singly linked stack; a NULL head frees nothing.
 */
static void orte_pls_xcpu_free_stack(orte_pls_xcpu_tid_stack *s){
    orte_pls_xcpu_tid_stack *node = s;

    while (NULL != node) {
        /* remember the successor before the current node goes away */
        orte_pls_xcpu_tid_stack *following = node->next;
        free(node);
        node = following;
    }
}
/* for handling stdout/err */
void *orte_pls_xcpu_stdio_thread(void *info){
orte_pls_xcpu_stdio_thread_info *io_t_info;
char buf[100];int x, rc;
io_t_info = (orte_pls_xcpu_stdio_thread_info*)info;
if((x=open(io_t_info->stdio_path, O_RDONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}else{
while(1){
if((rc=read(x, buf, 100))>0){
write(io_t_info->outdes, buf, rc);
}else{
if(rc==-1){
ORTE_ERROR_LOG(ORTE_ERR_FILE_READ_FAILURE);
}
break;
}
}
}
x>=0?close(x):0;
free(io_t_info->stdio_path);
free(io_t_info);
pthread_exit(NULL);
}
/* used by orte_pls_xcpu_launch_procs to start process
* on remote compute node.
* one thread per process for time being
*
* @info: contains all the information required by thread
* to launch process on remote compute node.
*/
void *orte_pls_xcpu_start_thread(void *info){
orte_pls_xcpu_thread_info *t_info;
char *session_clone, session_dir[255], *session_dir_path;
int clone_des, rc=0, des1, des2/*, tdes*/, trc[2];
char *env_path, *exec_path, *argv_path, *ctl_path;
char character[8193];
int i;
orte_process_name_t *peers;
pthread_t tids[2];
trc[0]=trc[1]=0;
t_info=(orte_pls_xcpu_thread_info*)info;
session_clone=(char*)malloc(strlen(t_info->local_mounts.name)+7);
sprintf(session_clone, "%s/clone", t_info->local_mounts.name);
if((clone_des=open(session_clone, O_RDONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}
if((rc=read(clone_des, session_dir, 255))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_READ_FAILURE);
}
else{
session_dir[rc]='\0';
session_dir_path=(char*)malloc(strlen(t_info->local_mounts.name)+strlen(session_dir)+2);
sprintf(session_dir_path, "%s/%s", t_info->local_mounts.name, session_dir);
/* write environment if needed */
env_path=(char*)malloc(strlen(session_dir_path)+5);
sprintf(env_path, "%s/env", session_dir_path);
if(t_info->env){
if((des1=open(env_path, O_WRONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
}else{
i=0;
while(t_info->env[i]){
/*printf("from lrx: %s\n", t_info->env[i]);
*/if(write(des1, t_info->env[i], strlen(t_info->env[i])) == -1){
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
break;
}else{
if(t_info->env[i+1]){
if(write(des1, "\n", 1) == -1){
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
break;
}
}
}
i++;
}
close(des1);
}
}
free(env_path);
/*then copy binary*/
exec_path=(char*)malloc(strlen(session_dir_path)+6);
sprintf(exec_path, "%s/exec", session_dir_path);
if((des1=open(exec_path, O_WRONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}else
if((des2=open(t_info->binary, O_RDONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}else{
while(1){
if((rc=read(des2, character, 8192))<=0){
if(close(des1)!=0){ /*?????*/
/*no ORTE_ERR defined for FILE_CLOSE_FAILURE*/
}
if(close(des2)!=0){
/*no ORTE_ERR defined for FILE_CLOSE_FAILURE*/
}
break;
}else{
if(write(des1, character, rc)==-1){
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
break;
}
}
}
}
/* then write args*/
argv_path=(char*)malloc(strlen(session_dir_path)+6);
sprintf(argv_path, "%s/argv", session_dir_path);
if((des1=open(argv_path, O_WRONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}else{
write(des1, t_info->argv, strlen(t_info->argv));
close(des1);
}
/* then write exec into ctl file to start remote execution*/
ctl_path=(char*)malloc(strlen(session_dir_path)+5);
sprintf(ctl_path, "%s/ctl", session_dir_path);
/*continuation of writing ctl*/
if((des1=open(ctl_path, O_WRONLY))<0){
ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
}else{
if(write(des1, "exec\n", 5)==-1){
ORTE_ERROR_LOG(ORTE_ERR_FILE_WRITE_FAILURE);
}else
close(des1);
}
/*then spawn threads for stderr and atdout*/
g_stdout_thread_info=(orte_pls_xcpu_stdio_thread_info*)malloc(sizeof(orte_pls_xcpu_stdio_thread_info));
g_stdout_thread_info->stdio_path=(char*)malloc(strlen(session_dir_path)+8);
sprintf(g_stdout_thread_info->stdio_path, "%s/stdout", session_dir_path);
g_stdout_thread_info->outdes=1;
if((rc=pthread_create(&tids[0], NULL, orte_pls_xcpu_stdio_thread, (void*)g_stdout_thread_info))==0){
trc[0]=1;
}else ;
/*ORTE_ERR for thread_creation_failure not defined yet*/
/*fprintf(stderr, "\nstdout thread creation error\n");*/
g_stderr_thread_info=(orte_pls_xcpu_stdio_thread_info*)malloc(sizeof(orte_pls_xcpu_stdio_thread_info));
g_stderr_thread_info->stdio_path=(char*)malloc(strlen(session_dir_path)+8);
sprintf(g_stderr_thread_info->stdio_path, "%s/stderr", session_dir_path);
g_stderr_thread_info->outdes=2;
if((rc=pthread_create(&tids[1], NULL, orte_pls_xcpu_stdio_thread, (void*)g_stderr_thread_info))==0){
trc[1]=1;
}else ;
/*ORTE_ERR for thread_creation_failure not defined yet*/
/*fprintf(stderr, "stderr thread creation error\n");*/
free(session_dir_path);
free(exec_path);
free(argv_path);
free(ctl_path);
if(trc[0]){
pthread_join(tids[0], NULL);
}
if(trc[1]){
pthread_join(tids[1], NULL);
}
}
free(session_clone);
(clone_des>0)?close(clone_des):0;
/* make registry update thread-safe */
pthread_mutex_lock(&mymutex);
/*write into registry that you are done*/
if (ORTE_SUCCESS != (orte_soh_base_set_proc_soh(t_info->peers, ORTE_PROC_STATE_TERMINATED, 0)) ){
ORTE_ERROR_LOG(rc);
}
pthread_mutex_unlock(&mymutex);
/* free the allocated variables after you are done*/
free(t_info->local_mounts.name);
free(t_info->binary);
free(t_info->argv);
free(t_info);
pthread_exit(NULL);
}
/* xcpu launcher function.
 * this function is called once for each process to be launched. or might
 * be called one time for multiple processes if regular expression is passed
 * to it. but for now regular expressions are not being passed.
 *
 * @argc: number of arguments or number of elements in argv (not consulted
 *        here; argv is walked up to its NULL terminator)
 * @argv: it will be name of remote node as mounted at $XCPUBASE or /mnt/xcpu/
 *        (argv[1]), with the binary at argv[g_regexploc+1] followed by its
 *        arguments
 * @env: environment the needs to be setup on remote node before
 *       starting the process; shared with the thread, not copied
 * @peers: process info, this will be passed onto the threads to help them write
 *         process completion information in open-mpi registry.
 *
 * @return address of the file-scope t_info record holding the thread ids and
 *         their count, or NULL when the xcpu base directory or the node is
 *         missing or a thread could not be created.  The caller must free
 *         the tids array; every resource allocated here is released again
 *         on the failure paths.
 */
orte_pls_xcpu_pthread_tindex *orte_pls_xcpu_launch_procs(int argc, char **argv, char **env, orte_process_name_t *peers){
    char *xcpu_base, *xcpu_argv;
    DIR *dirp;
    int temp_fd, rc=0, index=0, argvsize=0, ntids=0;
    /* g_regexploc will have proper value only if cmd_check is already
     * called in lrx; usage: ./o.lrx [-D xx] regexp binary args, so the
     * binary (passed on as argv[0] of the remote command) sits right after
     * the node expression */
    const int first_arg=g_regexploc+2-1;
    pthread_t *tids;
    orte_pls_xcpu_mount_nodes *m_nodes, *local_mounts;
    orte_pls_xcpu_thread_info *th_info;

    g_current_m=NULL;
    if(!(xcpu_base=getenv("XCPUBASE"))){
        xcpu_base="/mnt/xcpu";
    }
    if(!(dirp=opendir(xcpu_base))){
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);/* it should be DIR_OPEN_ERROR */
        return NULL;
    }
    /* the directory is opened only to prove that the base exists; the list
     * of mounted nodes is not scanned, we just check below whether
     * xcpu_base/<node>/xcpu can be opened */
    closedir(dirp);

    m_nodes=(orte_pls_xcpu_mount_nodes*)malloc(sizeof(orte_pls_xcpu_mount_nodes));
    m_nodes->next=g_current_m;
    m_nodes->name=(char*)malloc(1+strlen(xcpu_base)+1+
                                strlen(argv[1])+1+strlen("xcpu")+1);
    sprintf(m_nodes->name, "%s/%s/xcpu", xcpu_base, argv[1]);
    if((temp_fd=open(m_nodes->name, O_RDONLY))<0){
        fprintf(stderr, "Node %s/%s/xcpu does not exist\n",xcpu_base, argv[1]);
        free(m_nodes->name);
        free(m_nodes);
        return NULL;
    }
    close(temp_fd);
    g_current_m=m_nodes;
    ntids=1;

    /* now combine argv's so that they could be passed on: every word is
     * followed by one blank, hence strlen+1 per word plus the terminator */
    for(index=first_arg; argv[index]; index++){
        argvsize+=strlen(argv[index])+1;
    }
    xcpu_argv=(char*)malloc(argvsize+1);
    xcpu_argv[0]='\0';
    for(index=first_arg; argv[index]; index++){
        strcat(xcpu_argv, argv[index]);
        strcat(xcpu_argv, " ");
    }
    xcpu_argv[argvsize]='\0';

    local_mounts=g_current_m; /* this is a linked list of mounted directories
                               * where binaries need to run
                               */
    tids=(pthread_t*)malloc(ntids*sizeof(pthread_t));
    index=0;
    while(local_mounts){
        /* dont use a shared copy
         * give every thread its own copy since we dont know
         * when all threads will exit and when to free a shared copy
         */
        th_info=(orte_pls_xcpu_thread_info*)malloc(sizeof(orte_pls_xcpu_thread_info));
        g_thread_info=th_info;
        /*copy name first*/
        th_info->local_mounts.name=(char*)malloc(strlen(local_mounts->name)+1);
        strcpy(th_info->local_mounts.name, local_mounts->name);
        /*then copy binary*/
        th_info->binary=(char*)malloc(strlen(argv[g_regexploc+1])+1);
        strcpy(th_info->binary,argv[g_regexploc+1]);
        /*then copy argv*/
        th_info->argv=(char*)malloc(strlen(xcpu_argv)+1);
        strcpy(th_info->argv, xcpu_argv);
        /* env and peers are not copied: the caller keeps them alive until
         * all the threads are completed */
        th_info->env=env;
        th_info->peers=peers;
        /*following thread will free the thread_info structure*/
        rc=pthread_create(&tids[index], NULL, orte_pls_xcpu_start_thread, (void*)th_info);
        if(rc){
            /*ORTE_ERR for thread_creation_failure not defined yet*/
            /* no thread took ownership, so release everything built for it */
            free(th_info->local_mounts.name);
            free(th_info->binary);
            free(th_info->argv);
            free(th_info);
            g_thread_info=NULL;
            free(tids);
            free(xcpu_argv);
            return NULL;
        }
        index++;
        local_mounts=local_mounts->next;
    }
    /* the threads are not joined here; each one records its own completion.
     * remember to free tids in calling function */
    free(xcpu_argv);
    t_info.tids=tids;
    t_info.index=index;
    return &t_info;
}
/* Check that argv follows
 *     o.lrx [-D debuglevel] nodes binary [argv0 argv1 ...]
 * and compile the node expression.
 * Some checks being done in this function (for -D) are not necessary
 * and will be removed in future.
 *
 * Side effect: g_regexploc is reset to 1 and moved forward by 2 when the
 * -D option is present, so that it indexes the node expression.
 *
 * @return 0 when the command line is acceptable and the anchored expression
 *         compiled, 1 otherwise
 */
int orte_pls_xcpu_cmd_check(int argc, char **argv){
    char *anchored;
    int status;

    g_regexploc=1;
    if(argc<3){
        /* too few words for "nodes binary" */
        return 1;
    }
    if('-'==argv[1][0]){
        if('D'!=argv[1][1]){
            /* unspecified option*/
            return 1;
        }
        /* -D (for debugging) consumes the option and its level */
        g_regexploc+=2;
        if(argc<5){
            return 1;
        }
    }
    /*check for regular expression, anchored at both ends*/
    anchored=(char*)malloc(strlen(argv[g_regexploc])+3);
    sprintf(anchored, "^%s$", argv[g_regexploc]);
    status=orte_pls_xcpu_check_exp(anchored);
    free(anchored);
    return status;
}
/**
 * Free a list of mounted xcpu nodes together with each node's name string.
 *
 * @param g_current_m head of the list (shadows the file-scope variable of
 *                    the same name); a NULL head frees nothing.
 */
void orte_pls_xcpu_free_mount(orte_pls_xcpu_mount_nodes *g_current_m){
    orte_pls_xcpu_mount_nodes *node = g_current_m;

    while (NULL != node) {
        orte_pls_xcpu_mount_nodes *following = node->next;
        free(node->name);
        free(node);
        node = following;
    }
}
/**
 * Release the launcher's file-scope state: the compiled node expression and
 * the list of mounted nodes.
 *
 * The list head is reset to NULL afterwards so that no dangling pointer is
 * left behind.  The compiled expression must have been set up by a
 * successful orte_pls_xcpu_check_exp() call before this is used.
 */
void orte_pls_xcpu_cleanup(){
    regfree(&g_compiled_exp);
    orte_pls_xcpu_free_mount(g_current_m);
    g_current_m=NULL;
}
/* Launcher can accept regular expressions as the list of nodes where
 * processes are going to be launched. This helper checks whether such an
 * expression is valid by compiling it (extended syntax, no sub-matches)
 * into the file-scope g_compiled_exp.
 *
 * @return 0 if the expression compiled, 1 if it is invalid.  After a
 *         successful call regfree must be called at the end (see
 *         orte_pls_xcpu_cleanup).
 */
int orte_pls_xcpu_check_exp(char *exp){
    int compile_status = regcomp(&g_compiled_exp, exp, REG_EXTENDED|REG_NOSUB);

    return (0 != compile_status) ? 1 : 0;
}
/* This is the main launcher function
 * It validates the command line, then calls orte_pls_xcpu_launch_procs
 * which will start a thread for each process to be launched.
 *
 * The launched threads are not joined here; each thread writes its
 * completion itself in the registry.
 *
 * @return 0 when the command line is rejected or nothing was launched,
 *         otherwise the id of the last created thread converted to int
 */
int lrx(int argc, char **argv, char **env, orte_process_name_t *peers){
    orte_pls_xcpu_pthread_tindex *launched;
    int last_tid;

    if(1==orte_pls_xcpu_cmd_check(argc, argv)){
        return 0;
    }

    launched=orte_pls_xcpu_launch_procs(argc, argv, env, peers);
    /* the compiled expression and mount list are no longer needed,
     * whether or not the launch worked */
    orte_pls_xcpu_cleanup();
    if(NULL==launched){
        /* 0 processes launched */
        return 0;
    }

    launched->index--;
    last_tid=launched->tids[launched->index];
    free(launched->tids);
    return last_tid;
}
/** provide a function to setup the environment for the remote
 * processes. We need to ensure that the remote processes know
 * their gpr and ns replicas, the universe
 * to which they belong, etc. - otherwise, they may run, but they
 * will never actually join the rest of the job. This function
 * creates the common environment for all the processes.
 *
 * The caller's array is replaced: *env is merged with this process's
 * environ, the old array is freed and *env is pointed at the merged copy.
 *
 * @param env a pointer to the environment to setup
 * @retval ORTE_SUCCESS always; a failure to append the MCA parameters is
 *         logged but does not abort the setup
 */
static int orte_pls_xcpu_setup_env(char ***env)
{
    char ** merged;
    char * var;
    char * param;
    int rc;
    int num_env;

    /** merge in environment */
    merged = opal_environ_merge(*env, environ);
    opal_argv_free(*env);
    *env = merged;
    num_env = opal_argv_count(*env);

    /** append mca parameters to our environment */
    if(ORTE_SUCCESS != (rc = mca_base_param_build_env(env, &num_env, false))) {
        ORTE_ERROR_LOG(rc);
    }

    /** ns replica contact info */
    if (NULL != orte_process_info.ns_replica) {
        param = strdup(orte_process_info.ns_replica_uri);
    } else {
        param = orte_rml.get_uri();
    }
    var = mca_base_param_environ_variable("ns","replica","uri");
    opal_setenv(var, param, true, env);
    /* release the URI string here, exactly as the gpr block below does,
     * before param is reused */
    free(param);
    free(var);

    /** make sure the frontend hostname does not get pushed out to the backend */
    var = mca_base_param_environ_variable("orte", "base", "nodename");
    opal_unsetenv(var, env);
    free(var);
    opal_unsetenv("HOSTNAME", env);

    /** gpr replica contact info */
    if (NULL != orte_process_info.gpr_replica) {
        param = strdup(orte_process_info.gpr_replica_uri);
    } else {
        param = orte_rml.get_uri();
    }
    var = mca_base_param_environ_variable("gpr","replica","uri");
    opal_setenv(var, param, true, env);
    free(param);
    free(var);

    /** universe name */
    var = mca_base_param_environ_variable("universe", NULL, NULL);
    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
             orte_universe_info.host, orte_universe_info.name);
    opal_setenv(var, param, true, env);
    free(param);
    free(var);

    return ORTE_SUCCESS;
}
/** LAUNCH **/
/* This is the main function that will launch jobs on remote compute nodes.
 *
 * The job's map is read from the registry and every process of every map is
 * launched individually through lrx. When lrx reports a failure (returns 0),
 * all peers of the job are marked ORTE_PROC_STATE_TERMINATED and no further
 * process is launched.
 *
 * @param jobid the jobid of the job to launch
 * @retval ORTE_SUCCESS or error. NOTE(review): as before, a failed lrx call
 *         still yields ORTE_SUCCESS - only failures of the setup calls are
 *         returned to the caller.
 */
int orte_pls_xcpu_launch(orte_jobid_t jobid){
    opal_list_t mapping;
    char *header[] = {
        "dummy",
        NULL,
        NULL};
    int argc;
    int rc;
    int i;
    int launch_failed = 0;
    size_t proc_id = 0;
    orte_pls_xcpu_tid_stack *t_stack = NULL, *temp_stack;
    opal_list_item_t *item;
    orte_rmaps_base_map_t* map;
    orte_rmaps_base_node_t *node;
    orte_rmaps_base_proc_t *proc;
    orte_vpid_t vpid_start, vpid_range;
    orte_process_name_t *peers = NULL;
    int peer_id = 0, num_peers = 0;
    char** env;

    /** first get the mapping we are going to use to launch job. The head
     * of the list is OBJ_CONSTRUCT'd since it is not dynamically allocated. The
     * get_map function, however, will dynamically allocate the items in the
     * list itself - these will be released when we OBJ_DESTRUCT the list at
     * the end
     */
    OBJ_CONSTRUCT(&mapping, opal_list_t);

    /** get the mapping from the registry. This will provide a linked list, one
     * item for each mapping. Each item contains the full context of the application
     * that is to be executed upon that node. In particular, we need to obtain
     * the argv array that is included in that context as this tells us the application
     * to launch plus any "flags" to pass to it.
     */
    if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /** next, get the vpid_start and range info so we can pass it along */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_vpid_range(jobid, &vpid_start, &vpid_range))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /** the peers are handed to every launched process; without them there is
     * nothing valid to pass along, so give up instead of using an unset pointer
     */
    if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&peers, &num_peers, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /** we have to do the following so that we can use the opal_argv utilities
     * to properly insert the header into the app's argv
     *
     * NOTE(review): header[1] is never released in this function; whether
     * opal_argv_insert copies its source strings is not visible here -
     * confirm before adding a free.
     */
    header[1] = strdup("dummy");

    /** Now loop through all the provided maps to launch their associated apps */
    for(item = opal_list_get_first(&mapping);
        !launch_failed && item != opal_list_get_end(&mapping);
        item = opal_list_get_next(item)) {
        map = (orte_rmaps_base_map_t*) item;

        /** xcpu requires an argv format that has a dummy filler in the
         * first location, followed by the node name, and then the standard
         * argv array we've all come to know and love (i.e., the application
         * name followed by options). We use the opal_argv utilities to
         * prepend this header info to the application's argv.
         *
         * Note: at this point, the header contains a dummy placeholder
         * for the node name - we'll fill that in later.
         */
        opal_argv_insert(&(map->app->argv), 0, header);

        /** we also need to pass the proper environment to the remote
         * process so it knows its universe, gpr and ns replicas, etc. Since this
         * can be specified by the user for each app, we have to do this
         * each time.
         */
        if (ORTE_SUCCESS != (rc = orte_pls_xcpu_setup_env(&map->app->env))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }

        /** since it is possible that each node could be executing a different application,
         * we cannot just do a mass launch - that would only be supported in the special
         * case of all the application processes being identical. Instead, we are going to
         * step our way through the list, launching each process individually.
         */
        proc_id = 0;
        while (proc_id < map->num_procs){
            proc = (orte_rmaps_base_proc_t*)(map->procs[proc_id]);
            node = proc->proc_node;
            proc_id++;

            /** each proc_t entry contains the application to be executed,
             * the node upon which it is to be executed, and its OpenRTE
             * process name (plus a few other things). We use that
             * info to build the launch command by inserting them into
             * the argv array
             */

            /** start by pointing the proper location at the node name where
             * this process is to be launched
             */
            if (NULL != map->app->argv[1]) free(map->app->argv[1]);
            map->app->argv[1] = strdup(node->node->node_name);

            /* create a copy of the environment and modify for this proc */
            env = opal_argv_copy(map->app->env);

            /** now setup the process name in the environment so we can
             * retrieve it on the other end
             */
            if (ORTE_SUCCESS != (rc = orte_ns_nds_env_put(&(proc->proc_name),
                                                          vpid_start, map->num_procs,
                                                          &env))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): the env copy made above is not released on this path */
                goto CLEANUP;
            }

            /** the launcher wants to know how long the argv array is - get that now */
            argc = opal_argv_count(map->app->argv);

            /** add this process to the stack so we can track it */
            temp_stack=(orte_pls_xcpu_tid_stack*)malloc(sizeof(orte_pls_xcpu_tid_stack));
            temp_stack->next=t_stack;
            t_stack=temp_stack;

            /** launch the process */
            t_stack->tid=lrx(argc, map->app->argv, env, &peers[peer_id]);
            if(t_stack->tid==0){
                /* the launch failed: mark every process of the job as
                 * terminated and stop launching - the flag also ends the
                 * outer loop over the remaining maps
                 */
                for(i = 0; i < num_peers; i++){
                    rc = orte_soh_base_set_proc_soh(&peers[i], ORTE_PROC_STATE_TERMINATED, 0);
                    if (ORTE_SUCCESS != rc){
                        ORTE_ERROR_LOG(rc);
                    }
                }
                launch_failed = 1;
                break;
            }
            peer_id++;
        }
    }

    /** launch complete */
    rc = ORTE_SUCCESS;

CLEANUP:
    /** cleanup local storage - release the stack from its head; nothing was
     * pushed when no process was launched
     */
    if (NULL != t_stack) {
        orte_pls_xcpu_free_stack(t_stack);
    }
    OBJ_DESTRUCT(&mapping);
    return rc;
}
/* Terminate a job. Nothing is done for xcpu: the request is accepted and
 * success is reported.
 *
 * @param jobid the job to terminate (unused)
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid){
    (void)jobid;
    return ORTE_SUCCESS;
}
/* Terminate a single process. Nothing is done for xcpu: the request is
 * accepted and success is reported.
 *
 * @param proc_name name of the process to terminate (unused)
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name){
    (void)proc_name;
    return ORTE_SUCCESS;
}
/* Finalize the xcpu launcher module. There is nothing to release here.
 *
 * @retval ORTE_SUCCESS always
 */
int orte_pls_xcpu_finalize(void)
{
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,125 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*
*/
/**
* @file:
* Header file for the xcpu launcher. This will use xcpu to launch jobs on
* the list of nodes that it will get from RAS (resource allocation
* system).
* -# pls_xcpu is called by orterun. It first sets up the environment for the
* process to be launched on the remote node, then reads the ompi registry and
* then launches the binary on the nodes specified in the registry.
*/
#ifndef ORTE_PLS_XCPU_H_
/* the guard must define the very macro that is tested above, otherwise a
 * second inclusion is not suppressed
 */
#define ORTE_PLS_XCPU_H_

#include "orte_config.h"

/* pthread_t is used by orte_pls_xcpu_pthread_tindex below */
#include <pthread.h>

#include "orte/class/orte_pointer_array.h"
#include "orte/orte_constants.h"
#include "orte/mca/pls/base/base.h"
#include "orte/util/proc_info.h"
#include "opal/threads/condition.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

/*
 * Module open / close -- defined in component file
 */
int orte_pls_xcpu_component_open(void);
int orte_pls_xcpu_component_close(void);

/*
 * Startup / Shutdown
 */
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file */

/*
 * Interface
 */
int orte_pls_xcpu_launch(orte_jobid_t);
int orte_pls_xcpu_terminate_job(orte_jobid_t);
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_xcpu_finalize(void);

/**
 * (P)rocess (L)aunch (S)ubsystem xcpu Component
 */
struct orte_pls_xcpu_component_t {
    /* base class - this one is needed, the members below may or may not be */
    orte_pls_base_component_t super;
    int debug;          /* If greater than 0 print debugging information */
    int priority;       /* The priority of this component. This will be returned if
                         * we determine that xcpu is available and running on this node,
                         */
    int terminate_sig;  /* The signal that gets sent to a process to kill it. */
    size_t num_daemons; /* The number of daemons that are currently running. */
    orte_pointer_array_t * daemon_names;
    opal_mutex_t lock;  /* Lock used to prevent some race conditions */
    opal_condition_t condition; /* Condition that is signaled when all the daemons have died */
    orte_cellid_t cellid;
};
typedef struct orte_pls_xcpu_component_t orte_pls_xcpu_component_t;

/* Singly linked stack with one entry per launched process; tid holds the
 * value the launcher returned for that process.
 */
struct orte_pls_xcpu_tid_stack {
    int tid;
    struct orte_pls_xcpu_tid_stack *next;
};
typedef struct orte_pls_xcpu_tid_stack orte_pls_xcpu_tid_stack;

/* Singly linked list of mount nodes; every node carries a name string. */
struct orte_pls_xcpu_mount_nodes{
    char *name;
    struct orte_pls_xcpu_mount_nodes *next;
};
typedef struct orte_pls_xcpu_mount_nodes orte_pls_xcpu_mount_nodes;

/* Per-thread launch information. */
struct orte_pls_xcpu_thread_info{
    orte_pls_xcpu_mount_nodes local_mounts;/* can have only *name */
    char *binary;
    char *argv;
    char **env;
    orte_process_name_t *peers;
};
typedef struct orte_pls_xcpu_thread_info orte_pls_xcpu_thread_info;

/* Information for a stdio thread: a path and an output descriptor. */
struct orte_pls_xcpu_stdio_thread_info{
    char *stdio_path;
    int outdes;
};
typedef struct orte_pls_xcpu_stdio_thread_info orte_pls_xcpu_stdio_thread_info;

/* Array of thread ids together with an index into it. */
struct orte_pls_xcpu_pthread_tindex{
    pthread_t *tids;
    int index;
};
typedef struct orte_pls_xcpu_pthread_tindex orte_pls_xcpu_pthread_tindex;

ORTE_DECLSPEC extern orte_pls_xcpu_component_t mca_pls_xcpu_component;
ORTE_DECLSPEC extern orte_pls_base_module_t orte_pls_xcpu_module; /* this is defined in pls_xcpu.c file */

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif /* ORTE_PLS_XCPU_H_ */

Просмотреть файл

@ -1,101 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/**
* @file:
* Takes care of the component stuff for the MCA.
*/
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "pls_xcpu.h"
/**
* The xcpu component data structure that stores all the relevant data about
* this component. Only the leading base-class member (super) is initialized
* here; the remaining members are filled in by orte_pls_xcpu_component_open.
*/
orte_pls_xcpu_component_t mca_pls_xcpu_component = {
{ /* version, data and init members of only the first
* structure (called super) are being initialized
*/
{
ORTE_PLS_BASE_VERSION_1_0_0,
"xcpu", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_pls_xcpu_component_open, /* component open */
orte_pls_xcpu_component_close /* component close */
},
{
false /* checkpoint / restart */
},
orte_pls_xcpu_init /* component init */
}
};
/**
 * Opens the pls_xcpu component: registers the "priority", "debug" and
 * "terminate_sig" MCA parameters and finishes setting up the component
 * struct (daemon counter, lock, condition and the daemon name array).
 *
 * @retval ORTE_SUCCESS on success, otherwise the (logged) error returned by
 *         orte_pointer_array_init
 */
int orte_pls_xcpu_component_open(void) {
    /* see trunk/opal/mca/base/mca_base_param.h for reg_int details */
    mca_base_component_t *component = &mca_pls_xcpu_component.super.pls_version;
    int status;

    /* init parameters */
    mca_base_param_reg_int(component, "priority", NULL, false, false, 5,
                           &mca_pls_xcpu_component.priority);
    mca_base_param_reg_int(component, "debug",
                           "If > 0 prints library debugging information",
                           false, false, 0, &mca_pls_xcpu_component.debug);
    mca_base_param_reg_int(component, "terminate_sig",
                           "Signal sent to processes to terminate them", false,
                           false, 9, &mca_pls_xcpu_component.terminate_sig);

    /* runtime state of the component */
    mca_pls_xcpu_component.num_daemons = 0;
    OBJ_CONSTRUCT(&mca_pls_xcpu_component.lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_pls_xcpu_component.condition, opal_condition_t);

    status = orte_pointer_array_init(&mca_pls_xcpu_component.daemon_names, 8, 200000, 8);
    if(ORTE_SUCCESS == status) {
        return status;
    }
    ORTE_ERROR_LOG(status);
    return status;
}
/**
* Closes the pls_xcpu component: destructs the lock and the condition that
* orte_pls_xcpu_component_open constructed and releases the daemon name
* array.
*
* @retval ORTE_SUCCESS always
*/
int orte_pls_xcpu_component_close(void) {
OBJ_DESTRUCT(&mca_pls_xcpu_component.lock);
OBJ_DESTRUCT(&mca_pls_xcpu_component.condition);
/* NOTE(review): daemon_names is released unconditionally - confirm it
* cannot be unset here when the open call failed */
OBJ_RELEASE(mca_pls_xcpu_component.daemon_names);
return ORTE_SUCCESS;
}
/**
 * Component init: reports the component priority and hands out the xcpu
 * launcher module.
 *
 * No check is made here whether the xcpu component should be loaded; if such
 * a check is added, this is the place to return NULL.
 *
 * @param priority receives the priority stored in the component struct
 * @return the module defined in pls_xcpu.c, which contains the function
 *         pointers for launch, terminate_job, terminate_proc and finalize
 */
orte_pls_base_module_t* orte_pls_xcpu_init(int *priority) {
    orte_pls_base_module_t *module = &orte_pls_xcpu_module;

    *priority = mca_pls_xcpu_component.priority;
    return module;
}

Просмотреть файл

@ -25,7 +25,7 @@
#include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/ras/base/ras_base_node.h"

Просмотреть файл

@ -21,7 +21,7 @@
#define ORTE_RAS_BASE_NODE_H
#include "orte/orte_types.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/rmgr/rmgr_types.h"
#include "orte/mca/ras/ras.h"

Просмотреть файл

@ -19,7 +19,7 @@
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#ifndef ORTE_MCA_RAS_TYPES_H
#define ORTE_MCA_RAS_TYPES_H

Просмотреть файл

@ -32,7 +32,7 @@
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
/**

Просмотреть файл

@ -31,7 +31,7 @@
#include "orte/util/sys_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"

Просмотреть файл

@ -36,7 +36,7 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmgr/base/base.h"
@ -187,7 +187,7 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
free(trig_keys[0]);
/* set the job state to "launched" */
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_LAUNCHED))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_LAUNCHED))) {
ORTE_ERROR_LOG(rc);
}
@ -235,22 +235,22 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
/* set the job state to the appropriate level */
if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG1))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG1))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG2_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG2))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG2))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_STG3_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_AT_STG3))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_AT_STG3))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
} else if (orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_FINALIZED_TRIGGER)) {
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_FINALIZED))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_FINALIZED))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
@ -299,7 +299,7 @@ int orte_rmgr_base_proc_stage_gate_mgr_abort(orte_gpr_notify_message_t *msg)
/* set the job status to "aborted" */
if (ORTE_SUCCESS != (rc = orte_soh.set_job_soh(job, ORTE_JOB_STATE_ABORTED))) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(job, ORTE_JOB_STATE_ABORTED))) {
ORTE_ERROR_LOG(rc);
}

Просмотреть файл

@ -38,7 +38,7 @@
#include "opal/mca/mca.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
#include "rmgr_types.h"
/*

Просмотреть файл

@ -40,7 +40,7 @@
#include "orte/mca/iof/iof.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmgr/urm/rmgr_urm.h"
@ -184,7 +184,7 @@ static int orte_rmgr_urm_launch(orte_jobid_t jobid)
if (ORTE_SUCCESS !=
(ret = mca_rmgr_urm_component.urm_pls->launch(jobid))) {
ORTE_ERROR_LOG(ret);
ret2 = orte_soh.set_job_soh(jobid, ORTE_JOB_STATE_ABORTED);
ret2 = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_ABORTED);
if (ORTE_SUCCESS != ret2) {
ORTE_ERROR_LOG(ret2);
return ret2;

Просмотреть файл

@ -17,20 +17,20 @@
#
# main library setup
noinst_LTLIBRARIES = libmca_soh.la
libmca_soh_la_SOURCES =
noinst_LTLIBRARIES = libmca_smr.la
libmca_smr_la_SOURCES =
# header setup
nobase_orte_HEADERS =
# local files
headers = soh.h soh_types.h
libmca_soh_la_SOURCES += $(headers)
headers = smr.h smr_types.h
libmca_smr_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/soh
ortedir = $(includedir)/openmpi/orte/mca/smr
else
ortedir = $(includedir)
endif

37
orte/mca/smr/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Header of the smr base, appended to the "headers" list.
# NOTE(review): the += appends and the base/ path prefixes suggest that this
# fragment is included from the Makefile.am one directory up - confirm.
headers += \
base/base.h
# Sources of the smr base: open/select/close, the get/set state functions,
# the local functions, and the data type support functions (compare, copy,
# print, release, size, packing, unpacking).
libmca_smr_la_SOURCES += \
base/smr_base_close.c \
base/smr_base_select.c \
base/smr_base_local_functions.c \
base/smr_base_get_proc_state.c \
base/smr_base_set_proc_state.c \
base/smr_base_get_job_state.c \
base/smr_base_set_job_state.c \
base/smr_base_open.c \
base/data_type_support/smr_data_type_compare_fns.c \
base/data_type_support/smr_data_type_copy_fns.c \
base/data_type_support/smr_data_type_print_fns.c \
base/data_type_support/smr_data_type_release_fns.c \
base/data_type_support/smr_data_type_size_fns.c \
base/data_type_support/smr_data_type_packing_fns.c \
base/data_type_support/smr_data_type_unpacking_fns.c

64
orte/mca/smr/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,64 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SMR_BASE_H
#define MCA_SMR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/class/opal_list.h"
#include "orte/dss/dss_types.h"
#include "opal/mca/mca.h"
/* #include "orte/mca/ns/ns_types.h" */
#include "orte/mca/smr/smr.h"
/*
* Global functions for MCA overall collective open and close
*/
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Find and open either all smr components or the one that was specifically
* requested via an MCA parameter. */
OMPI_DECLSPEC int orte_smr_base_open(void);
/* Select the smr component/module to use.
* NOTE(review): the selection rules are not visible from this header. */
OMPI_DECLSPEC int orte_smr_base_select(void);
/* Finalize the selected module (if it has a finalize function) and close all
* remaining available components. */
OMPI_DECLSPEC int orte_smr_base_close(void);
/* Global state of the smr framework base. */
typedef struct orte_smr_base_t {
/* handle returned by opal_output_open for debug messages, or -1 when the
* verbosity parameter is zero */
int smr_output;
/* list of the opened smr components */
opal_list_t smr_components;
} orte_smr_base_t;
/* the single instance of the framework state */
OMPI_DECLSPEC extern orte_smr_base_t orte_smr_base;
/*
* external API functions will be documented in the mca/smr/smr.h file
*/
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -24,12 +24,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* EXIT CODE
*/
int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
int orte_smr_base_compare_exit_code(orte_exit_code_t *value1,
orte_exit_code_t *value2,
orte_data_type_t type)
{
@ -43,7 +43,7 @@ int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
/*
* NODE STATE
*/
int orte_soh_base_compare_node_state(orte_node_state_t *value1,
int orte_smr_base_compare_node_state(orte_node_state_t *value1,
orte_node_state_t *value2,
orte_node_state_t type)
{
@ -57,7 +57,7 @@ int orte_soh_base_compare_node_state(orte_node_state_t *value1,
/*
* PROC STATE
*/
int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
int orte_smr_base_compare_proc_state(orte_proc_state_t *value1,
orte_proc_state_t *value2,
orte_proc_state_t type)
{
@ -71,7 +71,7 @@ int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
/*
* JOB STATE
*/
int orte_soh_base_compare_job_state(orte_job_state_t *value1,
int orte_smr_base_compare_job_state(orte_job_state_t *value1,
orte_job_state_t *value2,
orte_job_state_t type)
{

Просмотреть файл

@ -24,12 +24,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* COPY FOR NON-COMPLEX FUNCTIONS
*/
int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type)
int orte_smr_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type)
{
orte_proc_state_t *ps;
@ -45,7 +45,7 @@ int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *s
return ORTE_SUCCESS;
}
int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type)
int orte_smr_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type)
{
orte_job_state_t *ps;
@ -61,7 +61,7 @@ int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src,
return ORTE_SUCCESS;
}
int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type)
int orte_smr_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type)
{
orte_node_state_t *ps;
@ -77,7 +77,7 @@ int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *s
return ORTE_SUCCESS;
}
int orte_soh_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type)
int orte_smr_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type)
{
orte_exit_code_t *ps;

Просмотреть файл

@ -26,12 +26,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* EXIT CODE
*/
int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_exit_code(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type)
{
int rc;
@ -46,7 +46,7 @@ int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
/*
* NODE STATE
*/
int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_node_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type)
{
int rc;
@ -61,7 +61,7 @@ int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
/*
* PROC STATE
*/
int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_proc_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type)
{
int rc;
@ -76,7 +76,7 @@ int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
/*
* JOB STATE
*/
int orte_soh_base_pack_job_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_job_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type)
{
int rc;

Просмотреть файл

@ -23,33 +23,33 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
static void orte_soh_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size);
static void orte_smr_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size);
/*
* STANDARD PRINT FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
*/
int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type)
int orte_smr_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type)
{
/* set default result */
*output = NULL;
switch(type) {
case ORTE_PROC_STATE:
orte_soh_base_quick_print(output, "ORTE_PROC_STATE", prefix, src, sizeof(orte_proc_state_t));
orte_smr_base_quick_print(output, "ORTE_PROC_STATE", prefix, src, sizeof(orte_proc_state_t));
break;
case ORTE_JOB_STATE:
orte_soh_base_quick_print(output, "ORTE_JOB_STATE", prefix, src, sizeof(orte_job_state_t));
orte_smr_base_quick_print(output, "ORTE_JOB_STATE", prefix, src, sizeof(orte_job_state_t));
break;
case ORTE_NODE_STATE:
orte_soh_base_quick_print(output, "ORTE_NODE_STATE", prefix, src, sizeof(orte_node_state_t));
orte_smr_base_quick_print(output, "ORTE_NODE_STATE", prefix, src, sizeof(orte_node_state_t));
break;
case ORTE_EXIT_CODE:
orte_soh_base_quick_print(output, "ORTE_EXIT_CODE", prefix, src, sizeof(orte_exit_code_t));
orte_smr_base_quick_print(output, "ORTE_EXIT_CODE", prefix, src, sizeof(orte_exit_code_t));
break;
default:
@ -60,7 +60,7 @@ int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_ty
return ORTE_SUCCESS;
}
static void orte_soh_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size)
static void orte_smr_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size)
{
uint8_t *ui8;
uint16_t *ui16;

Просмотреть файл

@ -23,12 +23,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
*/
void orte_soh_base_std_release(orte_data_value_t *value)
void orte_smr_base_std_release(orte_data_value_t *value)
{
free(value->data);
value->data = NULL;

Просмотреть файл

@ -23,12 +23,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
*/
int orte_soh_base_std_size(size_t *size, void *src, orte_data_type_t type)
int orte_smr_base_std_size(size_t *size, void *src, orte_data_type_t type)
{
switch(type) {
case ORTE_PROC_STATE:

Просмотреть файл

@ -26,12 +26,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/*
* EXIT CODE
*/
int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type)
{
int rc;
@ -46,7 +46,7 @@ int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
/*
* NODE STATE
*/
int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type)
{
int rc;
@ -61,7 +61,7 @@ int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
/*
* PROC STATE
*/
int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type)
{
int rc;
@ -76,7 +76,7 @@ int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
/*
* JOB STATE
*/
int orte_soh_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type)
{
int rc;

Просмотреть файл

@ -21,25 +21,26 @@
#include "orte/orte_constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_close(void)
int orte_smr_base_close(void)
{
/* If we have a selected component and module, then finalize it */
if (NULL != orte_soh.finalize) {
orte_soh.finalize();
if (NULL != orte_smr.finalize) {
orte_smr.finalize();
}
/* after the module, close the component?? */
/* orte_soh_base_component_finalize (); */
/* orte_smr_base_component_finalize (); */
/* Close all remaining available components (may be one if this is a
OMPI RTE program, or [possibly] multiple if this is ompi_info) */
mca_base_components_close(orte_soh_base.soh_output,
&orte_soh_base.soh_components, NULL);
mca_base_components_close(orte_smr_base.smr_output,
&orte_smr_base.smr_components, NULL);
/* All done */

Просмотреть файл

@ -31,9 +31,9 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_get_job_soh(orte_job_state_t *state,
int orte_smr_base_get_job_state(orte_job_state_t *state,
orte_jobid_t jobid)
{
orte_gpr_value_t **values;

Просмотреть файл

@ -31,9 +31,9 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_get_proc_soh(orte_proc_state_t *state,
int orte_smr_base_get_proc_state(orte_proc_state_t *state,
int *exit_status,
orte_process_name_t *proc)
{

Просмотреть файл

@ -22,35 +22,34 @@
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/mca/soh/base/base.h"
#include "orte/orte_constants.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_get_node_soh_not_available(orte_node_state_t *state,
int orte_smr_base_get_node_state_not_available(orte_node_state_t *state,
orte_cellid_t cell,
char *nodename)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
int orte_soh_base_set_node_soh_not_available(orte_cellid_t cell,
int orte_smr_base_set_node_state_not_available(orte_cellid_t cell,
char *nodename,
orte_node_state_t state)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
int orte_soh_base_begin_monitoring_not_available(orte_jobid_t job)
int orte_smr_base_begin_monitoring_not_available(orte_jobid_t job)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
int orte_soh_base_module_finalize_not_available (void)
int orte_smr_base_module_finalize_not_available (void)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}

Просмотреть файл

@ -30,7 +30,8 @@
#include "orte/util/proc_info.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
#include "stdio.h" /* just for gef debug */
@ -41,7 +42,7 @@
* component's public mca_base_component_t struct.
*/
#include "orte/mca/soh/base/static-components.h"
#include "orte/mca/smr/base/static-components.h"
/*
* globals
@ -50,54 +51,54 @@
/*
* Global variables
*/
orte_soh_base_t orte_soh_base;
orte_smr_base_t orte_smr_base;
orte_soh_base_module_t orte_soh = {
orte_smr_base_module_t orte_smr = {
orte_soh_base_get_proc_soh,
orte_soh_base_set_proc_soh,
orte_soh_base_get_node_soh_not_available,
orte_soh_base_set_node_soh_not_available,
orte_soh_base_get_job_soh,
orte_soh_base_set_job_soh,
orte_soh_base_begin_monitoring_not_available,
orte_soh_base_module_finalize_not_available
orte_smr_base_get_proc_state,
orte_smr_base_set_proc_state,
orte_smr_base_get_node_state_not_available,
orte_smr_base_set_node_state_not_available,
orte_smr_base_get_job_state,
orte_smr_base_set_job_state,
orte_smr_base_begin_monitoring_not_available,
orte_smr_base_module_finalize_not_available
};
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
*/
int orte_soh_base_open(void)
int orte_smr_base_open(void)
{
int param, value, rc;
orte_data_type_t tmp;
/* fprintf(stderr,"orte_soh_base_open:enter\n"); */
/* fprintf(stderr,"orte_smr_base_open:enter\n"); */
/* setup output for debug messages */
orte_soh_base.soh_output = opal_output_open(NULL);
param = mca_base_param_reg_int_name("soh_base", "verbose",
"Verbosity level for the soh framework",
orte_smr_base.smr_output = opal_output_open(NULL);
param = mca_base_param_reg_int_name("smr_base", "verbose",
"Verbosity level for the smr framework",
false, false, 0, &value);
if (value != 0) {
orte_soh_base.soh_output = opal_output_open(NULL);
orte_smr_base.smr_output = opal_output_open(NULL);
} else {
orte_soh_base.soh_output = -1;
orte_smr_base.smr_output = -1;
}
/* register the base system types with the DSS */
tmp = ORTE_NODE_STATE;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_node_state,
orte_soh_base_unpack_node_state,
(orte_dss_copy_fn_t)orte_soh_base_copy_node_state,
(orte_dss_compare_fn_t)orte_soh_base_compare_node_state,
(orte_dss_size_fn_t)orte_soh_base_std_size,
(orte_dss_print_fn_t)orte_soh_base_std_print,
(orte_dss_release_fn_t)orte_soh_base_std_release,
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_node_state,
orte_smr_base_unpack_node_state,
(orte_dss_copy_fn_t)orte_smr_base_copy_node_state,
(orte_dss_compare_fn_t)orte_smr_base_compare_node_state,
(orte_dss_size_fn_t)orte_smr_base_std_size,
(orte_dss_print_fn_t)orte_smr_base_std_print,
(orte_dss_release_fn_t)orte_smr_base_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_NODE_STATE", &tmp))) {
ORTE_ERROR_LOG(rc);
@ -105,13 +106,13 @@ int orte_soh_base_open(void)
}
tmp = ORTE_PROC_STATE;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_proc_state,
orte_soh_base_unpack_proc_state,
(orte_dss_copy_fn_t)orte_soh_base_copy_proc_state,
(orte_dss_compare_fn_t)orte_soh_base_compare_proc_state,
(orte_dss_size_fn_t)orte_soh_base_std_size,
(orte_dss_print_fn_t)orte_soh_base_std_print,
(orte_dss_release_fn_t)orte_soh_base_std_release,
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_proc_state,
orte_smr_base_unpack_proc_state,
(orte_dss_copy_fn_t)orte_smr_base_copy_proc_state,
(orte_dss_compare_fn_t)orte_smr_base_compare_proc_state,
(orte_dss_size_fn_t)orte_smr_base_std_size,
(orte_dss_print_fn_t)orte_smr_base_std_print,
(orte_dss_release_fn_t)orte_smr_base_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_PROC_STATE", &tmp))) {
ORTE_ERROR_LOG(rc);
@ -119,13 +120,13 @@ int orte_soh_base_open(void)
}
tmp = ORTE_JOB_STATE;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_job_state,
orte_soh_base_unpack_job_state,
(orte_dss_copy_fn_t)orte_soh_base_copy_job_state,
(orte_dss_compare_fn_t)orte_soh_base_compare_job_state,
(orte_dss_size_fn_t)orte_soh_base_std_size,
(orte_dss_print_fn_t)orte_soh_base_std_print,
(orte_dss_release_fn_t)orte_soh_base_std_release,
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_job_state,
orte_smr_base_unpack_job_state,
(orte_dss_copy_fn_t)orte_smr_base_copy_job_state,
(orte_dss_compare_fn_t)orte_smr_base_compare_job_state,
(orte_dss_size_fn_t)orte_smr_base_std_size,
(orte_dss_print_fn_t)orte_smr_base_std_print,
(orte_dss_release_fn_t)orte_smr_base_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_JOB_STATE", &tmp))) {
ORTE_ERROR_LOG(rc);
@ -133,13 +134,13 @@ int orte_soh_base_open(void)
}
tmp = ORTE_EXIT_CODE;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_soh_base_pack_exit_code,
orte_soh_base_unpack_exit_code,
(orte_dss_copy_fn_t)orte_soh_base_copy_exit_code,
(orte_dss_compare_fn_t)orte_soh_base_compare_exit_code,
(orte_dss_size_fn_t)orte_soh_base_std_size,
(orte_dss_print_fn_t)orte_soh_base_std_print,
(orte_dss_release_fn_t)orte_soh_base_std_release,
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_smr_base_pack_exit_code,
orte_smr_base_unpack_exit_code,
(orte_dss_copy_fn_t)orte_smr_base_copy_exit_code,
(orte_dss_compare_fn_t)orte_smr_base_compare_exit_code,
(orte_dss_size_fn_t)orte_smr_base_std_size,
(orte_dss_print_fn_t)orte_smr_base_std_print,
(orte_dss_release_fn_t)orte_smr_base_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_EXIT_CODE", &tmp))) {
ORTE_ERROR_LOG(rc);
@ -149,9 +150,9 @@ int orte_soh_base_open(void)
/* Open up all available components */
if (ORTE_SUCCESS !=
mca_base_components_open("soh", orte_soh_base.soh_output,
mca_soh_base_static_components,
&orte_soh_base.soh_components, true)) {
mca_base_components_open("smr", orte_smr_base.smr_output,
mca_smr_base_static_components,
&orte_smr_base.smr_components, true)) {
return ORTE_ERROR;
}

Просмотреть файл

@ -23,34 +23,35 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
/**
* Function for selecting one component from all those that are
* available.
*/
int orte_soh_base_select(void)
int orte_smr_base_select(void)
{
opal_list_item_t *item;
opal_list_item_t *best_item = NULL;
mca_base_component_list_item_t *cli;
orte_soh_base_component_t *component, *best_component = NULL;
orte_soh_base_module_t *module, *best_module = NULL;
orte_smr_base_component_t *component, *best_component = NULL;
orte_smr_base_module_t *module, *best_module = NULL;
int priority, best_priority = -1;
/* Iterate through all the available components */
for (item = opal_list_get_first(&orte_soh_base.soh_components);
item != opal_list_get_end(&orte_soh_base.soh_components);
for (item = opal_list_get_first(&orte_smr_base.smr_components);
item != opal_list_get_end(&orte_smr_base.smr_components);
item = opal_list_get_next(item)) {
cli = (mca_base_component_list_item_t *) item;
component = (orte_soh_base_component_t *) cli->cli_component;
component = (orte_smr_base_component_t *) cli->cli_component;
/* Call the component's init function and see if it wants to be
selected */
module = component->soh_init(&priority);
module = component->smr_init(&priority);
/* If we got a non-NULL module back, then the component wants to
be selected. So save its multi/hidden values and save the
@ -82,7 +83,7 @@ int orte_soh_base_select(void)
/* If it's not the best one, finalize it */
/* else { */
/* component->soh_finalize(); */
/* component->smr_finalize(); */
/* } */
} /* for each possible component */
@ -90,7 +91,7 @@ int orte_soh_base_select(void)
/* If we didn't find one to select, barf */
if (NULL != best_module) {
orte_soh = *best_module;
orte_smr = *best_module;
}
/* all done */

Просмотреть файл

@ -31,9 +31,9 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_set_job_soh(orte_jobid_t jobid,
int orte_smr_base_set_job_state(orte_jobid_t jobid,
orte_job_state_t state)
{
orte_gpr_value_t *value;

Просмотреть файл

@ -31,9 +31,9 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/smr_private.h"
int orte_soh_base_set_proc_soh(orte_process_name_t *proc,
int orte_smr_base_set_proc_state(orte_process_name_t *proc,
orte_proc_state_t state,
int exit_status)
{

Просмотреть файл

@ -18,8 +18,8 @@
/** @file:
*/
#ifndef MCA_SOH_BASE_H
#define MCA_SOH_BASE_H
#ifndef MCA_SMR_PRIVATE_H
#define MCA_SMR_PRIVATE_H
/*
* includes
@ -29,142 +29,132 @@
#include "orte/orte_types.h"
#include "opal/class/opal_list.h"
#include "orte/dss/dss_types.h"
#include "opal/mca/mca.h"
/* #include "orte/mca/ns/ns_types.h" */
#include "orte/mca/soh/soh.h"
#include "orte/dss/dss_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/smr/base/base.h"
/*
* Global functions for MCA overall collective open and close
* private functions for use inside SMR components
*/
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OMPI_DECLSPEC int orte_soh_base_open(void);
OMPI_DECLSPEC int orte_soh_base_select(void);
OMPI_DECLSPEC int orte_soh_base_close(void);
int orte_soh_base_get_proc_soh(orte_proc_state_t *state,
int orte_smr_base_get_proc_state(orte_proc_state_t *state,
int *status,
orte_process_name_t *proc);
int orte_soh_base_set_proc_soh(orte_process_name_t *proc,
int orte_smr_base_set_proc_state(orte_process_name_t *proc,
orte_proc_state_t state,
int status);
int orte_soh_base_get_node_soh_not_available(orte_node_state_t *state,
int orte_smr_base_get_node_state_not_available(orte_node_state_t *state,
orte_cellid_t cell,
char *nodename);
int orte_soh_base_set_node_soh_not_available(orte_cellid_t cell,
int orte_smr_base_set_node_state_not_available(orte_cellid_t cell,
char *nodename,
orte_node_state_t state);
int orte_soh_base_get_job_soh(orte_job_state_t *state,
int orte_smr_base_get_job_state(orte_job_state_t *state,
orte_jobid_t jobid);
int orte_soh_base_set_job_soh(orte_jobid_t jobid,
int orte_smr_base_set_job_state(orte_jobid_t jobid,
orte_job_state_t state);
int orte_soh_base_begin_monitoring_not_available(orte_jobid_t job);
int orte_smr_base_begin_monitoring_not_available(orte_jobid_t job);
int orte_soh_base_module_finalize_not_available (void);
int orte_smr_base_module_finalize_not_available (void);
/*
* DATA TYPE PACKING FUNCTIONS
*/
int orte_soh_base_pack_exit_code(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_exit_code(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_soh_base_pack_node_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_node_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_proc_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
int orte_soh_base_pack_job_state(orte_buffer_t *buffer, void *src,
int orte_smr_base_pack_job_state(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
/*
* DATA TYPE UNPACKING FUNCTIONS
*/
int orte_soh_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_exit_code(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
int orte_soh_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_node_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
int orte_soh_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
int orte_smr_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
/*
* DATA TYPE COMPARE FUNCTIONS
*/
int orte_soh_base_compare_exit_code(orte_exit_code_t *value1,
int orte_smr_base_compare_exit_code(orte_exit_code_t *value1,
orte_exit_code_t *value2,
orte_data_type_t type);
int orte_soh_base_compare_node_state(orte_node_state_t *value1,
int orte_smr_base_compare_node_state(orte_node_state_t *value1,
orte_node_state_t *value2,
orte_node_state_t type);
int orte_soh_base_compare_proc_state(orte_proc_state_t *value1,
int orte_smr_base_compare_proc_state(orte_proc_state_t *value1,
orte_proc_state_t *value2,
orte_proc_state_t type);
int orte_soh_base_compare_job_state(orte_job_state_t *value1,
int orte_smr_base_compare_job_state(orte_job_state_t *value1,
orte_job_state_t *value2,
orte_job_state_t type);
/*
* DATA TYPE COPY FUNCTIONS
*/
int orte_soh_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type);
int orte_smr_base_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, orte_data_type_t type);
int orte_soh_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type);
int orte_smr_base_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, orte_data_type_t type);
int orte_soh_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type);
int orte_smr_base_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, orte_data_type_t type);
int orte_soh_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type);
int orte_smr_base_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, orte_data_type_t type);
/*
* DATA TYPE PRINT FUNCTIONS
*/
int orte_soh_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
int orte_smr_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
/*
* DATA TYPE SIZE FUNCTIONS
*/
int orte_soh_base_std_size(size_t *size, void *src, orte_data_type_t type);
int orte_smr_base_std_size(size_t *size, void *src, orte_data_type_t type);
/*
* DATA TYPE RELEASE FUNCTIONS
*/
void orte_soh_base_std_release(orte_data_value_t *value);
void orte_smr_base_std_release(orte_data_value_t *value);
/*
* globals that might be needed
* globals that might be needed within the framework
*/
OMPI_DECLSPEC extern int orte_soh_base_output;
OMPI_DECLSPEC extern bool orte_soh_base_selected;
typedef struct orte_soh_base_t {
int soh_output;
opal_list_t soh_components;
} orte_soh_base_t;
OMPI_DECLSPEC extern orte_soh_base_t orte_soh_base;
OMPI_DECLSPEC extern int orte_smr_base_output;
OMPI_DECLSPEC extern bool orte_smr_base_selected;
/*
* external API functions will be documented in the mca/soh/soh.h file
* external API functions will be documented in the mca/smr/smr.h file
*/
#if defined(c_plusplus) || defined(__cplusplus)

Просмотреть файл

@ -19,40 +19,40 @@
headers = \
soh_bproc.h
smr_bproc.h
if OMPI_BUILD_soh_bproc_DSO
if OMPI_BUILD_smr_bproc_DSO
component_noinst =
component_install = mca_soh_bproc.la
component_install = mca_smr_bproc.la
else
component_noinst = libmca_soh_bproc.la
component_noinst = libmca_smr_bproc.la
component_install =
endif
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(includedir)/openmpi/orte/mca/soh/bproc
ortedir = $(includedir)/openmpi/orte/mca/smr/bproc
orte_HEADERS = $(headers)
else
ortedir = $(includedir)
endif
soh_SOURCES = \
soh_bproc.c \
soh_bproc.h \
soh_bproc_component.c
smr_SOURCES = \
smr_bproc.c \
smr_bproc.h \
smr_bproc_component.c
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_soh_bproc_la_SOURCES = $(soh_SOURCES)
mca_soh_bproc_la_LIBADD = \
$(soh_bproc_LIBS) \
mca_smr_bproc_la_SOURCES = $(smr_SOURCES)
mca_smr_bproc_la_LIBADD = \
$(smr_bproc_LIBS) \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_soh_bproc_la_LDFLAGS = -module -avoid-version $(soh_bproc_LDFLAGS)
mca_smr_bproc_la_LDFLAGS = -module -avoid-version $(smr_bproc_LDFLAGS)
noinst_LTLIBRARIES = $(component_noinst)
libmca_soh_bproc_la_SOURCES = $(soh_SOURCES)
libmca_soh_bproc_la_LIBADD = $(soh_bproc_LIBS)
libmca_soh_bproc_la_LDFLAGS = -module -avoid-version $(soh_bproc_LDFLAGS)
libmca_smr_bproc_la_SOURCES = $(smr_SOURCES)
libmca_smr_bproc_la_LIBADD = $(smr_bproc_LIBS)
libmca_smr_bproc_la_LDFLAGS = -module -avoid-version $(smr_bproc_LDFLAGS)

Просмотреть файл

@ -17,32 +17,32 @@
# $HEADER$
#
# MCA_soh_bproc_CONFIG([action-if-found], [action-if-not-found])
# MCA_smr_bproc_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_soh_bproc_CONFIG],[
OMPI_CHECK_BPROC([soh_bproc], [soh_bproc_good=1],
[soh_bproc_good=0], [soh_bproc_good=0])
AC_DEFUN([MCA_smr_bproc_CONFIG],[
OMPI_CHECK_BPROC([smr_bproc], [smr_bproc_good=1],
[smr_bproc_good=0], [smr_bproc_good=0])
#BPROC_API_VERSION was added in bproc 4.0.0, and this component
#will only compile with >= bproc 4.0.0
AS_IF([test "$soh_bproc_good" = "1"],
AS_IF([test "$smr_bproc_good" = "1"],
[AC_MSG_CHECKING(for BPROC_API_VERSION)
AC_TRY_COMPILE([#include <sys/bproc.h>],
[int foo = BPROC_API_VERSION;],
have_bproc_api_ver_msg=yes soh_bproc_good=1,
have_bproc_api_ver_msg=no soh_bproc_good=0)
have_bproc_api_ver_msg=yes smr_bproc_good=1,
have_bproc_api_ver_msg=no smr_bproc_good=0)
AC_MSG_RESULT([$have_bproc_api_ver_msg])])
# If the check worked, set the wrapper flags.
# Evaluate succeed / fail
AS_IF([test "$soh_bproc_good" = "1"],
[soh_bproc_WRAPPER_EXTRA_LDFLAGS="$soh_bproc_LDFLAGS"
soh_bproc_WRAPPER_EXTRA_LIBS="$soh_bproc_LIBS"
AS_IF([test "$smr_bproc_good" = "1"],
[smr_bproc_WRAPPER_EXTRA_LDFLAGS="$smr_bproc_LDFLAGS"
smr_bproc_WRAPPER_EXTRA_LIBS="$smr_bproc_LIBS"
$1],
[$2])
# set build flags to use in makefile
AC_SUBST([soh_bproc_CPPFLAGS])
AC_SUBST([soh_bproc_LDFLAGS])
AC_SUBST([soh_bproc_LIBS])
AC_SUBST([smr_bproc_CPPFLAGS])
AC_SUBST([smr_bproc_LDFLAGS])
AC_SUBST([smr_bproc_LIBS])
])dnl

Просмотреть файл

@ -19,5 +19,5 @@
# Specific to this module
PARAM_INIT_FILE=soh_bproc.c
PARAM_INIT_FILE=smr_bproc.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -32,8 +32,8 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/soh/bproc/soh_bproc.h"
#include "orte/mca/smr/base/smr_private.h"
#include "orte/mca/smr/bproc/smr_bproc.h"
#include "opal/util/output.h"
#define BIT_MASK(bit) (bit_set)(1 << (bit))
@ -80,15 +80,15 @@ static inline int empty_set(bit_set set)
return set == EMPTY_SET;
}
static int orte_soh_bproc_get_proc_soh(orte_proc_state_t *, int *, orte_process_name_t *);
static int orte_soh_bproc_set_proc_soh(orte_process_name_t *, orte_proc_state_t, int);
static int orte_soh_bproc_finalize(void);
static int orte_smr_bproc_get_proc_state(orte_proc_state_t *, int *, orte_process_name_t *);
static int orte_smr_bproc_set_proc_state(orte_process_name_t *, orte_proc_state_t, int);
static int orte_smr_bproc_finalize(void);
/**
* Query the bproc node status
*/
static int orte_soh_bproc_node_state(char *status)
static int orte_smr_bproc_node_state(char *status)
{
if (strcmp(status, "up") == 0)
return ORTE_NODE_STATE_UP;
@ -103,8 +103,8 @@ static bit_set find_changes(struct bproc_node_info_t *old, struct bproc_node_inf
{
bit_set changes = EMPTY_SET;
if (orte_soh_bproc_node_state(old->status)
!= orte_soh_bproc_node_state(new->status))
if (orte_smr_bproc_node_state(old->status)
!= orte_smr_bproc_node_state(new->status))
set_bit(&changes, BIT_NODE_STATE);
if (strcmp(old->status, new->status) != 0)
@ -160,7 +160,7 @@ static void update_registry(bit_set changes, struct bproc_node_info_t *ni)
idx = 0;
if (is_set(changes, BIT_NODE_STATE)) {
state = orte_soh_bproc_node_state(ni->status);
state = orte_smr_bproc_node_state(ni->status);
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[idx]), ORTE_NODE_STATE_KEY, ORTE_NODE_STATE, &state))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
@ -230,30 +230,30 @@ static void update_registry(bit_set changes, struct bproc_node_info_t *ni)
}
if (idx != cnt) {
opal_output(0, "soh_bproc: internal error %d != %d\n", idx, cnt);
opal_output(0, "smr_bproc: internal error %d != %d\n", idx, cnt);
free(node_name);
OBJ_RELEASE(value);
opal_event_del(&mca_soh_bproc_component.notify_event);
opal_event_del(&mca_smr_bproc_component.notify_event);
return;
}
ret = orte_schema.get_node_tokens(&(value->tokens), &(value->num_tokens),
mca_soh_bproc_component.cellid, node_name);
mca_smr_bproc_component.cellid, node_name);
if (ret != ORTE_SUCCESS) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(value);
free(node_name);
opal_event_del(&mca_soh_bproc_component.notify_event);
opal_event_del(&mca_smr_bproc_component.notify_event);
return;
}
if (mca_soh_bproc_component.debug)
if (mca_smr_bproc_component.debug)
opal_output(0, "updating node %d\n", ni->node);
if ((ret = orte_gpr.put(1, &value)) != ORTE_SUCCESS) {
ORTE_ERROR_LOG(ret);
opal_event_del(&mca_soh_bproc_component.notify_event);
opal_event_del(&mca_smr_bproc_component.notify_event);
}
free(node_name);
@ -271,9 +271,9 @@ static int do_update(struct bproc_node_set_t *ns)
for (i = 0; i < ns->size; i++) {
ni = &ns->node[i];
if (mca_soh_bproc_component.node_set.size > 0
&& mca_soh_bproc_component.node_set.size == ns->size)
changes = find_changes(&mca_soh_bproc_component.node_set.node[i], ni);
if (mca_smr_bproc_component.node_set.size > 0
&& mca_smr_bproc_component.node_set.size == ns->size)
changes = find_changes(&mca_smr_bproc_component.node_set.node[i], ni);
else
changes = BIT_SET_ALL;
@ -284,21 +284,21 @@ static int do_update(struct bproc_node_set_t *ns)
}
if (changed) {
if (mca_soh_bproc_component.node_set.size != 0)
bproc_nodeset_free(&mca_soh_bproc_component.node_set);
mca_soh_bproc_component.node_set = *ns;
if (mca_smr_bproc_component.node_set.size != 0)
bproc_nodeset_free(&mca_smr_bproc_component.node_set);
mca_smr_bproc_component.node_set = *ns;
}
return changed;
}
static void orte_soh_bproc_notify_handler(int fd, short flags, void *user)
static void orte_smr_bproc_notify_handler(int fd, short flags, void *user)
{
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
if (bproc_nodelist_(&ns, fd) < 0) {
/* bproc_nodelist_ error */
opal_event_del(&mca_soh_bproc_component.notify_event);
opal_event_del(&mca_smr_bproc_component.notify_event);
return;
}
@ -309,20 +309,20 @@ static void orte_soh_bproc_notify_handler(int fd, short flags, void *user)
/**
* Register a callback to receive BProc update notifications
*/
int orte_soh_bproc_module_init(void)
int orte_smr_bproc_module_init(void)
{
int rc;
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
if (mca_soh_bproc_component.debug)
opal_output(0, "init soh_bproc_module\n");
if (mca_smr_bproc_component.debug)
opal_output(0, "init smr_bproc_module\n");
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_bproc_component.cellid, orte_process_info.my_name))) {
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_smr_bproc_component.cellid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
mca_soh_bproc_component.node_set.size = 0;
mca_smr_bproc_component.node_set.size = 0;
/*
* Set initial node status
@ -338,51 +338,51 @@ int orte_soh_bproc_module_init(void)
* Now register notify event
*/
mca_soh_bproc_component.notify_fd = bproc_notifier();
if (mca_soh_bproc_component.notify_fd < 0)
mca_smr_bproc_component.notify_fd = bproc_notifier();
if (mca_smr_bproc_component.notify_fd < 0)
return ORTE_ERROR;
memset(&mca_soh_bproc_component.notify_event, 0, sizeof(opal_event_t));
memset(&mca_smr_bproc_component.notify_event, 0, sizeof(opal_event_t));
opal_event_set(
&mca_soh_bproc_component.notify_event,
mca_soh_bproc_component.notify_fd,
&mca_smr_bproc_component.notify_event,
mca_smr_bproc_component.notify_fd,
OPAL_EV_READ|OPAL_EV_PERSIST,
orte_soh_bproc_notify_handler,
orte_smr_bproc_notify_handler,
0);
opal_event_add(&mca_soh_bproc_component.notify_event, 0);
opal_event_add(&mca_smr_bproc_component.notify_event, 0);
return ORTE_SUCCESS;
}
orte_soh_base_module_t orte_soh_bproc_module = {
orte_soh_bproc_get_proc_soh,
orte_soh_bproc_set_proc_soh,
orte_soh_base_get_node_soh_not_available,
orte_soh_base_set_node_soh_not_available,
orte_soh_base_get_job_soh,
orte_soh_base_set_job_soh,
orte_soh_base_begin_monitoring_not_available,
orte_soh_bproc_finalize
orte_smr_base_module_t orte_smr_bproc_module = {
orte_smr_bproc_get_proc_state,
orte_smr_bproc_set_proc_state,
orte_smr_base_get_node_state_not_available,
orte_smr_base_set_node_state_not_available,
orte_smr_base_get_job_state,
orte_smr_base_set_job_state,
orte_smr_base_begin_monitoring_not_available,
orte_smr_bproc_finalize
};
static int orte_soh_bproc_get_proc_soh(orte_proc_state_t *state, int *status, orte_process_name_t *proc)
static int orte_smr_bproc_get_proc_state(orte_proc_state_t *state, int *status, orte_process_name_t *proc)
{
return orte_soh_base_get_proc_soh(state, status, proc);
return orte_smr_base_get_proc_state(state, status, proc);
}
static int orte_soh_bproc_set_proc_soh(orte_process_name_t *proc, orte_proc_state_t state, int status)
static int orte_smr_bproc_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state, int status)
{
return orte_soh_base_set_proc_soh(proc, state, status);
return orte_smr_base_set_proc_state(proc, state, status);
}
/**
* Cleanup
*/
int orte_soh_bproc_finalize(void)
int orte_smr_bproc_finalize(void)
{
opal_event_del(&mca_soh_bproc_component.notify_event);
opal_event_del(&mca_smr_bproc_component.notify_event);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -18,12 +18,12 @@
/**
* @file
*/
#ifndef ORTE_SOH_BPROC_H
#define ORTE_SOH_BPROC_H
#ifndef ORTE_SMR_BPROC_H
#define ORTE_SMR_BPROC_H
#include <sys/bproc.h>
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "opal/event/event.h"
#if defined(c_plusplus) || defined(__cplusplus)
@ -33,20 +33,20 @@ extern "C" {
/**
* Bproc node registry keys
*/
#define ORTE_SOH_BPROC_NODE_STATUS "orte-node-bproc-status"
#define ORTE_SOH_BPROC_NODE_MODE "orte-node-bproc-mode"
#define ORTE_SOH_BPROC_NODE_USER "orte-node-bproc-user"
#define ORTE_SOH_BPROC_NODE_GROUP "orte-node-bproc-group"
#define ORTE_SMR_BPROC_NODE_STATUS "orte-node-bproc-status"
#define ORTE_SMR_BPROC_NODE_MODE "orte-node-bproc-mode"
#define ORTE_SMR_BPROC_NODE_USER "orte-node-bproc-user"
#define ORTE_SMR_BPROC_NODE_GROUP "orte-node-bproc-group"
/**
* Module init/fini
*/
int orte_soh_bproc_module_init(void);
int orte_soh_bproc_module_finalize(void);
int orte_smr_bproc_module_init(void);
int orte_smr_bproc_module_finalize(void);
struct orte_soh_bproc_component_t {
orte_soh_base_component_t super;
struct orte_smr_bproc_component_t {
orte_smr_base_component_t super;
int debug;
int priority;
opal_event_t notify_event;
@ -54,10 +54,10 @@ struct orte_soh_bproc_component_t {
orte_cellid_t cellid;
struct bproc_node_set_t node_set;
};
typedef struct orte_soh_bproc_component_t orte_soh_bproc_component_t;
typedef struct orte_smr_bproc_component_t orte_smr_bproc_component_t;
OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_bproc_module;
OMPI_COMP_EXPORT extern orte_soh_bproc_component_t mca_soh_bproc_component;
OMPI_COMP_EXPORT extern orte_smr_base_module_t orte_smr_bproc_module;
OMPI_COMP_EXPORT extern orte_smr_bproc_component_t mca_smr_bproc_component;
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -22,32 +22,32 @@
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "orte/mca/soh/bproc/soh_bproc.h"
#include "orte/mca/smr/bproc/smr_bproc.h"
/*
* Local functions
*/
static int orte_soh_bproc_open(void);
static int orte_soh_bproc_close(void);
static orte_soh_base_module_t* orte_soh_bproc_init(int*);
static int orte_smr_bproc_open(void);
static int orte_smr_bproc_close(void);
static orte_smr_base_module_t* orte_smr_bproc_init(int*);
orte_soh_bproc_component_t mca_soh_bproc_component = {
orte_smr_bproc_component_t mca_smr_bproc_component = {
{
/* First, the mca_base_module_t struct containing meta
information about the module itself */
{
/* Indicate that we are a bproc soh v1.0.0 module (which also
/* Indicate that we are a bproc smr v1.3.0 module (which also
implies a specific MCA version) */
ORTE_SOH_BASE_VERSION_1_0_0,
ORTE_SMR_BASE_VERSION_1_3_0,
"bproc", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_soh_bproc_open, /* component open */
orte_soh_bproc_close /* component close */
orte_smr_bproc_open, /* component open */
orte_smr_bproc_close /* component close */
},
/* Next the MCA v1.0.0 module meta data */
@ -58,18 +58,18 @@ orte_soh_bproc_component_t mca_soh_bproc_component = {
false
},
orte_soh_bproc_init
orte_smr_bproc_init
}
};
/**
* Utility function to register parameters
*/
static int orte_soh_bproc_param_register_int(
static int orte_smr_bproc_param_register_int(
const char* param_name,
int default_value)
{
int id = mca_base_param_register_int("soh","bproc",param_name,NULL,default_value);
int id = mca_base_param_register_int("smr","bproc",param_name,NULL,default_value);
int param_value = default_value;
mca_base_param_lookup_int(id,&param_value);
return param_value;
@ -79,12 +79,12 @@ static int orte_soh_bproc_param_register_int(
*
*/
static int orte_soh_bproc_open(void)
static int orte_smr_bproc_open(void)
{
mca_soh_bproc_component.debug =
orte_soh_bproc_param_register_int("debug", 0);
mca_soh_bproc_component.priority =
orte_soh_bproc_param_register_int("priority", 1);
mca_smr_bproc_component.debug =
orte_smr_bproc_param_register_int("debug", 0);
mca_smr_bproc_component.priority =
orte_smr_bproc_param_register_int("priority", 1);
return ORTE_SUCCESS;
}
@ -92,14 +92,14 @@ static int orte_soh_bproc_open(void)
*
*/
static orte_soh_base_module_t* orte_soh_bproc_init(int *priority)
static orte_smr_base_module_t* orte_smr_bproc_init(int *priority)
{
if (!orte_process_info.seed)
return NULL;
*priority = mca_soh_bproc_component.priority;
orte_soh_bproc_module_init();
return &orte_soh_bproc_module;
*priority = mca_smr_bproc_component.priority;
orte_smr_bproc_module_init();
return &orte_smr_bproc_module;
}
@ -107,7 +107,7 @@ static orte_soh_base_module_t* orte_soh_bproc_init(int *priority)
*
*/
static int orte_soh_bproc_close(void)
static int orte_smr_bproc_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -21,8 +21,8 @@
*
*/
#ifndef ORTE_SOH_H
#define ORTE_SOH_H
#ifndef ORTE_SMR_H
#define ORTE_SMR_H
/*
* includes
@ -34,7 +34,7 @@
#include "opal/mca/mca.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/soh/soh_types.h"
#include "orte/mca/smr/smr_types.h"
/*
* Component functions - all MUST be provided!
@ -42,114 +42,114 @@
/*
* Query the state-of-health of a process
* Query a process state
*/
typedef int (*orte_soh_base_module_get_proc_soh_fn_t)(orte_proc_state_t *state,
typedef int (*orte_smr_base_module_get_proc_state_fn_t)(orte_proc_state_t *state,
int *status,
orte_process_name_t *proc);
/*
* Set the state-of-health of a process
* Set a process state
*/
typedef int (*orte_soh_base_module_set_proc_soh_fn_t)(orte_process_name_t *proc,
typedef int (*orte_smr_base_module_set_proc_state_fn_t)(orte_process_name_t *proc,
orte_proc_state_t state, int status);
/*
* Query SOH of a node
* Query a node state
*/
typedef int (*orte_soh_base_module_get_node_soh_fn_t)(orte_node_state_t *state,
typedef int (*orte_smr_base_module_get_node_state_fn_t)(orte_node_state_t *state,
orte_cellid_t cell,
char *nodename);
/*
* Set SOH of a node
* Set a node state
*/
typedef int (*orte_soh_base_module_set_node_soh_fn_t)(orte_cellid_t cell,
typedef int (*orte_smr_base_module_set_node_state_fn_t)(orte_cellid_t cell,
char *nodename,
orte_node_state_t state);
/*
* Query the state-of-health of a job
* Query a job state
*/
typedef int (*orte_soh_base_module_get_job_soh_fn_t)(orte_job_state_t *state,
typedef int (*orte_smr_base_module_get_job_state_fn_t)(orte_job_state_t *state,
orte_jobid_t jobid);
/*
* Set the state-of-health of a job
* Set a job state
*/
typedef int (*orte_soh_base_module_set_job_soh_fn_t)(orte_jobid_t jobid,
typedef int (*orte_smr_base_module_set_job_state_fn_t)(orte_jobid_t jobid,
orte_job_state_t state);
/*
* Initiate monitoring of a job
* This function notifies the soh that it should initiate monitoring of the specified
* This function notifies the smr that it should initiate monitoring of the specified
* jobid. It is called by the resource manager once a job has been launched. Calling
* the function, allows soh components (e.g., the BProc component that monitors daemons
* the function, allows smr components (e.g., the BProc component that monitors daemons
* via the BProc-provided centralized alerting system) to make the necessary connections
* for monitoring the job.
*/
typedef int (*orte_soh_base_module_begin_monitoring_fn_t)(orte_jobid_t job);
typedef int (*orte_smr_base_module_begin_monitoring_fn_t)(orte_jobid_t job);
/* Shutdown the module nicely
*/
typedef int (*orte_soh_base_module_finalize_fn_t)(void);
typedef int (*orte_smr_base_module_finalize_fn_t)(void);
/* below are the prototypes needed by the MCA */
/*
* Ver 1.0.0
* Ver 1.3.0
*/
struct orte_soh_base_module_1_0_0_t {
orte_soh_base_module_get_proc_soh_fn_t get_proc_soh;
orte_soh_base_module_set_proc_soh_fn_t set_proc_soh;
orte_soh_base_module_get_node_soh_fn_t get_node_soh;
orte_soh_base_module_set_node_soh_fn_t set_node_soh;
orte_soh_base_module_get_job_soh_fn_t get_job_soh;
orte_soh_base_module_set_job_soh_fn_t set_job_soh;
orte_soh_base_module_begin_monitoring_fn_t begin_monitoring_job;
orte_soh_base_module_finalize_fn_t finalize;
struct orte_smr_base_module_1_3_0_t {
orte_smr_base_module_get_proc_state_fn_t get_proc_state;
orte_smr_base_module_set_proc_state_fn_t set_proc_state;
orte_smr_base_module_get_node_state_fn_t get_node_state;
orte_smr_base_module_set_node_state_fn_t set_node_state;
orte_smr_base_module_get_job_state_fn_t get_job_state;
orte_smr_base_module_set_job_state_fn_t set_job_state;
orte_smr_base_module_begin_monitoring_fn_t begin_monitoring_job;
orte_smr_base_module_finalize_fn_t finalize;
};
typedef struct orte_soh_base_module_1_0_0_t orte_soh_base_module_1_0_0_t;
typedef orte_soh_base_module_1_0_0_t orte_soh_base_module_t;
typedef struct orte_smr_base_module_1_3_0_t orte_smr_base_module_1_3_0_t;
typedef orte_smr_base_module_1_3_0_t orte_smr_base_module_t;
/*
* SMR Component
*/
typedef orte_soh_base_module_t* (*orte_soh_base_component_init_fn_t)(
typedef orte_smr_base_module_t* (*orte_smr_base_component_init_fn_t)(
int *priority);
typedef int (*orte_soh_base_component_finalize_fn_t)(void);
typedef int (*orte_smr_base_component_finalize_fn_t)(void);
/*
* the standard component data structure
*/
struct orte_soh_base_component_1_0_0_t {
mca_base_component_t soh_version;
mca_base_component_data_1_0_0_t soh_data;
orte_soh_base_component_init_fn_t soh_init;
orte_soh_base_component_finalize_fn_t soh_finalize;
struct orte_smr_base_component_1_3_0_t {
mca_base_component_t smr_version;
mca_base_component_data_1_0_0_t smr_data;
orte_smr_base_component_init_fn_t smr_init;
orte_smr_base_component_finalize_fn_t smr_finalize;
};
typedef struct orte_soh_base_component_1_0_0_t orte_soh_base_component_1_0_0_t;
typedef struct orte_smr_base_component_1_3_0_t orte_smr_base_component_1_3_0_t;
typedef orte_soh_base_component_1_0_0_t orte_soh_base_component_t;
typedef orte_smr_base_component_1_3_0_t orte_smr_base_component_t;
/*
* Macro for use in components that are of type smr v1.3.0
*/
#define ORTE_SOH_BASE_VERSION_1_0_0 \
/* soh v1.0 is chained to MCA v1.0 */ \
#define ORTE_SMR_BASE_VERSION_1_3_0 \
/* smr v1.3 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* soh v1.0 */ \
"soh", 1, 0, 0
/* smr v1.3 */ \
"smr", 1, 3, 0
OMPI_DECLSPEC extern orte_soh_base_module_t orte_soh; /* holds selected module's function pointers */
OMPI_DECLSPEC extern orte_smr_base_module_t orte_smr; /* holds selected module's function pointers */
#endif /* ORTE_SOH_H */
#endif /* ORTE_SMR_H */

Просмотреть файл

Просмотреть файл

@ -1,37 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_soh_la_SOURCES += \
base/soh_base_close.c \
base/soh_base_select.c \
base/soh_base_local_functions.c \
base/soh_base_get_proc_soh.c \
base/soh_base_set_proc_soh.c \
base/soh_base_get_job_soh.c \
base/soh_base_set_job_soh.c \
base/soh_base_open.c \
base/data_type_support/soh_data_type_compare_fns.c \
base/data_type_support/soh_data_type_copy_fns.c \
base/data_type_support/soh_data_type_print_fns.c \
base/data_type_support/soh_data_type_release_fns.c \
base/data_type_support/soh_data_type_size_fns.c \
base/data_type_support/soh_data_type_packing_fns.c \
base/data_type_support/soh_data_type_unpacking_fns.c

Просмотреть файл

@ -1,85 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include <errno.h>
#include <signal.h>
#include "pcm_bproc.h"
#include "mca/pcm/pcm.h"
#include "mca/pcm/base/base.h"
#include "opal/class/opal_list.h"
#include "mca/pcm/base/base_job_track.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/ns/base/base.h"
/*
 * Send SIGTERM to the starter process recorded for one named process.
 *
 * me_super  module instance (cast to the bproc module type)
 * name      process whose starter should be signalled
 * flags     not used by this implementation
 *
 * Returns ORTE_ERR_BAD_PARAM when either pointer is NULL,
 * ORTE_ERR_NOT_FOUND when no starter pid (> 0) is recorded for the process,
 * and ORTE_SUCCESS once the signal has been issued.
 */
int
mca_pcm_bproc_kill_proc(struct mca_pcm_base_module_1_0_0_t* me_super,
                        ompi_process_name_t *name, int flags)
{
    mca_pcm_bproc_module_t *module = (mca_pcm_bproc_module_t*) me_super;
    pid_t target;

    if (NULL == module || NULL == name) {
        return ORTE_ERR_BAD_PARAM;
    }

    target = mca_pcm_base_job_list_get_starter(module->jobs,
                                               mca_ns_base_get_jobid(name),
                                               mca_ns_base_get_vpid(name),
                                               true);
    if (target <= 0) {
        return ORTE_ERR_NOT_FOUND;
    }

    /* The result of kill() is not inspected. */
    kill(target, SIGTERM);
    return ORTE_SUCCESS;
}
/*
 * Send SIGTERM to every starter process recorded for a job.
 *
 * me_super  module instance (cast to the bproc module type)
 * jobid     job whose starters should be signalled
 * flags     not used by this implementation
 *
 * Returns ORTE_ERR_BAD_PARAM when the module pointer is NULL, the error code
 * of the starter lookup when that fails (e.g. for an unknown jobid), and
 * ORTE_SUCCESS otherwise.
 */
int
mca_pcm_bproc_kill_job(struct mca_pcm_base_module_1_0_0_t* me_super,
                       mca_ns_base_jobid_t jobid, int flags)
{
    mca_pcm_bproc_module_t *module = (mca_pcm_bproc_module_t*) me_super;
    pid_t *starters;
    size_t num_starters, idx;
    int rc;

    if (NULL == module) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* The lookup doubles as the check for an invalid jobid. */
    rc = mca_pcm_base_job_list_get_starters(module->jobs, jobid,
                                            &starters, &num_starters,
                                            true);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    idx = 0;
    while (idx < num_starters) {
        kill(starters[idx], SIGTERM);
        ++idx;
    }

    /* The pid array is released here by the caller of the lookup. */
    if (NULL != starters) {
        free(starters);
    }
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,69 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include <errno.h>
#include <unistd.h>
#include "pcm_bproc.h"
#include "mca/pcm/pcm.h"
#include "mca/pcm/base/base.h"
#include "opal/class/opal_list.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/runtime_types.h"
#include "ompi/runtime/ompi_rte_wait.h"
#include "opal/util/show_help.h"
#include "mca/pcm/base/base_kill_track.h"
#include "mca/pcm/base/base_job_track.h"
/*
 * Callback invoked with the pid and exit status of a process that exited.
 *
 * pid     pid of the process that exited
 * status  its exit status
 * data    the bproc pcm module instance
 *
 * Looks up the job and vpid range associated with the pid, records a
 * "killed" status carrying the exit code for every process name in that
 * range, and finally removes the range from kill tracking.  When no record
 * exists for the pid a help message is shown and nothing else is done.
 */
void
mca_pcm_bproc_monitor_cb(pid_t pid, int status, void *data)
{
    mca_pcm_bproc_module_t *module = (mca_pcm_bproc_module_t*) data;
    mca_ns_base_jobid_t job = 0;
    mca_ns_base_vpid_t first_vpid = 0;
    mca_ns_base_vpid_t last_vpid = 0;
    mca_ns_base_vpid_t vpid = 0;
    ompi_rte_process_status_t exit_record;
    ompi_process_name_t *name;
    int rc;

    printf("pcm: bproc: process %d exited with status %d\n", pid, status);

    rc = mca_pcm_base_job_list_get_job_info(module->jobs, pid, &job,
                                            &first_vpid, &last_vpid, true);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-mca-pcm-bproc.txt",
                       "spawn:no-process-record", true, pid, status);
        return;
    }

    /* Publish the same exit record for each process in the inclusive range. */
    exit_record.status_key = OMPI_PROC_KILLED;
    exit_record.exit_code = (ompi_exit_code_t)status;
    vpid = first_vpid;
    while (vpid <= last_vpid) {
        name = mca_ns_base_create_process_name(0, job, vpid);
        ompi_rte_set_process_status(&exit_record, name);
        free(name);
        ++vpid;
    }

    mca_pcm_base_kill_unregister((mca_pcm_base_module_t*)module, job, first_vpid, last_vpid);
}

Просмотреть файл

@ -1,234 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/poll.h>
#include <sys/bproc.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/orte_constants.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/ns/base/base.h"
#include "orte/runtime/runtime.h"
#include "svc_bproc_soh.h"
/* Module function table for the BProc SOH service: its init and fini entry points. */
mca_svc_base_module_t mca_svc_bproc_soh_module = {
mca_svc_bproc_soh_module_init,
mca_svc_bproc_soh_module_fini
};
/*
 * Add a BProc node to the virtual machine SOH segment
 *
 * Allocates a status record and fills in the cell id, the node number
 * rendered as a decimal string, and the node's CPU count (1 is used when the
 * "cpus" attribute query fails).
 *
 * NOTE(review): this does not look complete as written -- `ni` and `cpus` are
 * not declared in this scope (the parameter is `node`), the malloc() and
 * asprintf() results are not checked, the record is neither stored nor
 * released, and nothing is returned despite the int return type.
 */
int
mca_svc_bproc_soh_add_node(mca_ns_base_cellid_t cellid, int node)
{
ompi_rte_vm_status_t *vmdata;
int err;
vmdata = (ompi_rte_vm_status_t*)malloc(sizeof(ompi_rte_vm_status_t));
vmdata->cell = cellid;
asprintf(&(vmdata->nodename), "%d", node);
err = bproc_getnodeattr(ni->node, "cpus", &cpus, sizeof(cpus));
/* fall back to a single CPU when the attribute cannot be read */
if (err != 0)
cpus = 1;
vmdata->cpus = (uint16_t)cpus;
}
/**
 * Decide whether two node-info records describe a change of the same node.
 *
 * Returns 0 when the records belong to different nodes or when status, user,
 * group and mode are all equal; returns 1 as soon as one of those four
 * attributes differs.
 */
int
mca_svc_bproc_soh_status_changed(struct bproc_node_info_t *old, struct bproc_node_info_t *new)
{
    /* Records of different nodes are never reported as a change. */
    if (old->node != new->node) {
        return 0;
    }

    return (0 != strcmp(old->status, new->status) ||
            old->user != new->user ||
            old->group != new->group ||
            old->mode != new->mode) ? 1 : 0;
}
/*
 * Refresh the VM status record of one BProc node from a node-info snapshot.
 *
 * cellid  cell the node belongs to
 * ni      node information (node number, user, group, mode, status string)
 *
 * The node number is rendered as a decimal string and used to look up the
 * node's status record.  When no record exists yet the node is added and the
 * function returns; otherwise user, group, mode and a private copy of the
 * status string are stored in the record.
 *
 * Changes relative to the previous revision: the "node not present" branch is
 * now properly closed, the temporary node-name string is released on every
 * path, a failed asprintf() is detected, the record type is spelled
 * ompi_rte_vm_status_t, and the unused locals were dropped.
 */
void
mca_svc_bproc_soh_update_node_info(mca_ns_base_cellid_t cellid, struct bproc_node_info_t *ni)
{
    char *node;
    ompi_rte_vm_status_t *vmdata;

    /* asprintf() leaves the output pointer undefined on failure */
    if (asprintf(&node, "%d", ni->node) < 0) {
        return;
    }

    vmdata = ompi_rte_get_vm_status(cellid, node);
    if (vmdata == NULL) { /* this node isn't present yet - add it */
        free(node);
        mca_svc_bproc_soh_add_node(cellid, ni->node);
        return;
    }

    /* in long-term, we will store the soh data in key-value pairs. for now,
     * we store it simply as values so we can get it working - I will update
     * this later to the final form.
     */
    vmdata->user = ni->user;
    vmdata->group = ni->group;
    vmdata->mode = ni->mode;
    if (NULL != vmdata->status) {
        free(vmdata->status);
    }
    vmdata->status = strdup(ni->status);

    /*
    ompi_vm_status_data_add_int(vmdata, "user", ni->user);
    ompi_vm_status_data_add_int(vmdata, "group", ni->group);
    ompi_vm_status_data_add_int(vmdata, "mode", ni->mode);
    ompi_vm_status_data_add_string(vmdata, "status", ni->status);
    */
    /* probably should optimize this so it only happens once */
    /* ompi_vm_status_data_add_int(vmdata, "#cpus", cpus); */
    /* registry_put(segment, cell, node, vmdata); */

    free(node);
    /* NOTE(review): the "ompit_" prefix looks like a typo for "ompi_" (compare
     * the commented-out ompi_vm_status_data_* calls above) -- confirm against
     * the declaring header before renaming. */
    ompit_vm_status_data_finish(vmdata);
}
/*
 * Compare a freshly read node set against the cached one, push an update for
 * each node whose status changed (or for every node while the cache is
 * empty), then replace the cache contents with a copy of the new set.
 *
 * NOTE(review): this does not look compilable as written -- `i` and `ni` are
 * undeclared, `old` is a pointer-to-pointer but is dereferenced only once in
 * `old->size`, and the helpers are invoked by short names (status_changed,
 * update_node_info) with argument lists that differ from the
 * mca_svc_bproc_soh_* definitions.  Confirm before reuse.
 */
void
mca_svc_bproc_soh_check_node_info(char *segment, char *cell,
struct bproc_node_set_t **old,
struct bproc_node_set_t *new)
{
/* we assume the number of nodes does not change */
for (i = 0; i < new->size; i++) {
ni = &new->node[i];
if (!old->size || status_changed((*old)->node[i], ni))
update_node_info(segment, cell, ni);
}
/* drop the previous cache (if any) and re-create it at the new size */
if ((*old)->size)
bproc_nodeset_free(*old);
bproc_nodeset_init(*old, new->size);
memcpy((*old)->node, new->node, sizeof(*new->node) * new->size);
}
#if OMPI_HAVE_POSIX_THREADS
/*
 * Thread body: wait for BProc notifications and process each node-set update.
 *
 * The module instance is taken from thread->t_arg.  Each iteration blocks in
 * poll() on the module's notify_fd with no timeout, reads the node list from
 * that descriptor, hands it to mca_svc_bproc_soh_check_node_info() and frees
 * it.  The loop ends on a poll() or node-list error; the function then
 * returns PTHREAD_CANCELED.
 *
 * NOTE(review): `res` is not declared, and `ns` is passed by value where
 * mca_svc_bproc_soh_check_node_info() takes a pointer -- confirm before reuse.
 */
static void *
mca_svc_bproc_soh_status_thread(opal_thread_t *thread)
{
struct pollfd pfd;
struct bproc_node_set_t ns = BPROC_EMPTY_NODESET;
mca_svc_bproc_soh_module_t *module = (mca_svc_bproc_soh_module_t *)thread->t_arg;
/* This thread enter in a cancel enabled state */
pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );
for (;;) {
pfd.fd = module->notify_fd;
pfd.events = POLLIN;
/* timeout of -1: wait until the descriptor becomes readable */
res = poll(&pfd, 1, -1);
if (res < 0) {
/* poll error */
break;
}
if (bproc_nodelist_(&ns, module->notify_fd) < 0) {
/* bproc_nodelist_ error */
break;
}
mca_svc_bproc_soh_check_node_info(module->segment, module->cell, &module->node_info, ns);
bproc_nodeset_free(&ns);
}
return PTHREAD_CANCELED;
}
#endif /* OMPI_HAVE_POSIX_THREADS */
/**
 * Register a callback to receive BProc update notifications
 *
 * Builds the segment name, reads the initial node list and pushes an update
 * for each node, opens the BProc notifier descriptor, fills in the thread
 * descriptor when threads are in use, and starts the thread.  Returns
 * OMPI_ERROR when the node list or the notifier cannot be obtained, otherwise
 * the result of opal_thread_start().
 *
 * NOTE(review): this is placeholder code and does not compile as written --
 * `module` is never assigned, the `module->cell` assignment has no right-hand
 * side, `update_node_info` and `mca_bproc_status_thread` do not match the
 * mca_svc_bproc_soh_* names defined in this file, several locals are unused,
 * and opal_thread_start() is called even when the thread fields were not set.
 */
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t* base)
{
int i;
int num_nodes;
bproc_node_set_t node_list;
int node_num;
char *segment, *jobid_string;
mca_svc_bproc_soh_module_t *module /* = somthing */;
jobid_string = ompi_name_server.get_jobid_string(ompi_rte_get_self());
asprintf(&module->segment, "%s-bproc", OMPI_RTE_VM_STATUS_SEGMENT);
module->cell = /* get cell somehow */;
/* take the initial snapshot of the node set */
num_nodes = bproc_nodelist(&module->node_info);
if (num_nodes < 0)
return OMPI_ERROR;
for (i = 0; i < module->node_info->size; i++) {
update_node_info(&module->node_info[i]);
}
/* descriptor that the status thread polls for updates */
module->notify_fd = bproc_notifier();
if (module->notify_fd < 0)
return OMPI_ERROR;
if (ompi_using_thread()) {
#if OMPI_HAVE_POSIX_THREADS
module->thread.t_handle = 0;
module->thread.t_run = (opal_thread_fn_t)mca_bproc_status_thread;
module->thread.t_arg = (void *)module;
#endif /* OMPI_HAVE_POSIX_THREADS */
}
return opal_thread_start(&module->thread);
}
/**
 * Cleanup
 *
 * When a status thread handle is recorded, cancels that thread and joins it.
 * Always returns ORTE_SUCCESS.
 *
 * NOTE(review): placeholder code -- `module` is never assigned and the
 * pthread_cancel() call refers to an undeclared `ptl` (presumably `module`
 * was intended; confirm).
 */
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t* base)
{
mca_svc_bproc_soh_module_t *module /* = somthing */;
#if OMPI_HAVE_POSIX_THREADS
if (module->thread.t_handle != 0) {
void *thread_return;
pthread_cancel(ptl->thread.t_handle);
opal_thread_join(&(module->thread), &thread_return);
}
#endif /* OMPI_HAVE_POSIX_THREADS */
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,57 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef _MCA_SVC_BPROC_SOH_
#define _MCA_SVC_BPROC_SOH_
#include "mca/svc/svc.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* Component open/close/init
*/
int mca_svc_bproc_soh_component_open(void);
int mca_svc_bproc_soh_component_close(void);
mca_svc_base_module_t* mca_svc_bproc_soh_component_init(void);
/**
* Module init/fini
*/
int mca_svc_bproc_soh_module_init(mca_svc_base_module_t*);
int mca_svc_bproc_soh_module_fini(mca_svc_base_module_t*);
/**
 * BProc SOH service component: the generic svc component descriptor plus a
 * debug setting.
 */
struct mca_svc_bproc_soh_component_t {
    mca_svc_base_component_t base;  /* generic svc component descriptor */
    int debug;                      /* value of the "debug" MCA parameter */
};
typedef struct mca_svc_bproc_soh_component_t mca_svc_bproc_soh_component_t;

/*
 * Module function table and component instance.  The component is declared
 * with the typedef above; the previous declaration named a type
 * (mca_svc_soh_component_t) that this header does not declare.
 */
extern mca_svc_base_module_t mca_svc_bproc_soh_module;
extern mca_svc_bproc_soh_component_t mca_svc_bproc_soh_component;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,99 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "svc_bproc_soh.h"
/*
 * The BProc SOH service component instance: MCA version/identity data, the
 * open and close hooks, the checkpointable flag, the init hook, and the
 * initial value of the debug setting.
 */
mca_svc_bproc_soh_component_t mca_svc_bproc_soh_component = {
{
/* First, the mca_base_module_t struct containing meta
information about the module itself */
{
/* Indicate that we are a bproc soh v1.0.0 module (which also
implies a specific MCA version) */
MCA_SVC_BASE_VERSION_1_0_0,
"bproc_soh", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
mca_svc_bproc_soh_component_open, /* component open */
mca_svc_bproc_soh_component_close /* component close */
},
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
},
mca_svc_bproc_soh_component_init
},
0 /* debug (overwritten from the "debug" parameter in component open) */
};
/**
 * Register an integer MCA parameter under svc/bproc_soh and return its
 * current value.
 *
 * param_name     name of the parameter
 * default_value  value registered as the default; also returned when the
 *                lookup does not overwrite it
 */
static inline int mca_svc_bproc_soh_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int handle = mca_base_param_register_int("svc", "bproc_soh", param_name, NULL, default_value);

    mca_base_param_lookup_int(handle, &value);
    return value;
}
/**
 * Component open hook: registers the "debug" parameter (default 0) and stores
 * its value in the component.  Always returns ORTE_SUCCESS.
 */
int mca_svc_bproc_soh_component_open(void)
{
    int debug_level = mca_svc_bproc_soh_param_register_int("debug", 0);

    mca_svc_bproc_soh_component.debug = debug_level;
    return ORTE_SUCCESS;
}
/**
 * Component init hook: hands back the address of the statically defined
 * BProc SOH module; no other work is done here.
 */
mca_svc_base_module_t* mca_svc_bproc_soh_component_init(void)
{
    mca_svc_base_module_t *module = &mca_svc_bproc_soh_module;

    return module;
}
/**
 * Component close hook: nothing to release, so it only reports success.
 */
int mca_svc_bproc_soh_component_close(void)
{
    const int rc = ORTE_SUCCESS;

    return rc;
}

Просмотреть файл

@ -1,43 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
if OMPI_BUILD_soh_xcpu_DSO
component_noinst =
component_install = mca_soh_xcpu.la
else
component_noinst = libmca_soh_xcpu.la
component_install =
endif
xcpu_SOURCES = \
soh_xcpu.c \
soh_xcpu.h \
soh_xcpu_component.c
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_soh_xcpu_la_SOURCES = $(xcpu_SOURCES)
mca_soh_xcpu_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
mca_soh_xcpu_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_soh_xcpu_la_SOURCES = $(xcpu_SOURCES)
libmca_soh_xcpu_la_LIBADD =
libmca_soh_xcpu_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,30 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_soh_xcpu_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_soh_xcpu_CONFIG],[
soh_xcpu_good=0
# no need for soh_xcpu for time being
# if xcpu is present and working, soh_xcpu_good=1.
# Evaluate succeed / fail
AS_IF([test "$soh_xcpu_good" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,23 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=soh_xcpu.c
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,94 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <pwd.h>
#include <grp.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "orte/util/proc_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/soh/xcpu/soh_xcpu.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "opal/util/output.h"
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t);
static int orte_soh_xcpu_finalize(void);
/*
 * Module initialisation: fetch the cell id of this process from the name
 * service and store it in the component.
 *
 * Returns ORTE_SUCCESS, or the name-service error code after reporting it on
 * stderr and through ORTE_ERROR_LOG.
 */
int orte_soh_xcpu_module_init(void)
{
    int rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid, orte_process_info.my_name);

    if (ORTE_SUCCESS == rc) {
        return ORTE_SUCCESS;
    }

    fprintf(stderr, "orte_soh_xcpu_module_init error\n");
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * Function table of the xcpu SOH module.  Process and job state use the
 * orte_soh_base_* implementations, the node get/set slots use the base
 * "not_available" functions (presumably stubs -- confirm in the base), and
 * only begin-monitoring and finalize are implemented in this file.
 */
orte_soh_base_module_t orte_soh_xcpu_module = {
orte_soh_base_get_proc_soh,
orte_soh_base_set_proc_soh,
orte_soh_base_get_node_soh_not_available,
orte_soh_base_set_node_soh_not_available,
orte_soh_base_get_job_soh,
orte_soh_base_set_job_soh,
orte_soh_xcpu_begin_monitoring_job,
orte_soh_xcpu_finalize
};
/* @begin_monitoring: right now, its only trying to update registry so
 * that mpirun can exit normally
 * pls_xcpu is waiting for all threads to finish before calling this function
 *
 * Marks every process of the job as ORTE_PROC_STATE_TERMINATED (status 0).
 *
 * jobid  job whose processes are to be marked
 *
 * Returns ORTE_SUCCESS, or the first error reported by the peer lookup or by
 * a state update (remaining processes are then left untouched).
 *
 * The peer array pointer starts out as NULL so that the common cleanup at the
 * end never frees an uninitialised pointer when the peer lookup fails.
 */
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t jobid)
{
    int rc;
    size_t num_procs, i;
    orte_process_name_t *peers = NULL;

    if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&peers, &num_procs, jobid))) {
        ORTE_ERROR_LOG(rc);
    } else {
        for (i = 0; i < num_procs; i++) {
            if (ORTE_SUCCESS != (rc = orte_soh_base_set_proc_soh(&peers[i], ORTE_PROC_STATE_TERMINATED, 0))) {
                ORTE_ERROR_LOG(rc);
                break;
            }
        }
    }

    /* free(NULL) is a no-op, so this is safe on the failure path as well */
    free(peers);
    return rc;
}
/* Module shutdown hook: there is nothing to tear down, so report success. */
static int orte_soh_xcpu_finalize(void)
{
    const int rc = ORTE_SUCCESS;

    return rc;
}

Просмотреть файл

@ -1,66 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef ORTE_SOH_XCPU_H
#define ORTE_SOH_XCPU_H
#include "orte/mca/soh/soh.h"
#include "opal/event/event.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* XCPU node registry keys
*/
#define ORTE_SOH_XCPU_NODE_STATUS "orte-node-xcpu-status"
#define ORTE_SOH_XCPU_NODE_MODE "orte-node-xcpu-mode"
#define ORTE_SOH_XCPU_NODE_USER "orte-node-xcpu-user"
#define ORTE_SOH_XCPU_NODE_GROUP "orte-node-xcpu-group"
/**
* Module init/fini
*/
int orte_soh_xcpu_module_init(void);
int orte_soh_xcpu_module_finalize(void);
/*
 * xcpu SOH component: the generic SOH component descriptor followed by
 * xcpu-specific settings and state.
 */
struct orte_soh_xcpu_component_t {
orte_soh_base_component_t super;
/* not sure which of the following variables are
* needed
* */
int debug;
int priority;
opal_event_t notify_event;
int notify_fd;
orte_cellid_t cellid;
/*struct xcpu_node_set_t node_set;*/
};
typedef struct orte_soh_xcpu_component_t orte_soh_xcpu_component_t;
OMPI_COMP_EXPORT extern orte_soh_base_module_t orte_soh_xcpu_module;
OMPI_COMP_EXPORT extern orte_soh_xcpu_component_t mca_soh_xcpu_component;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,99 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "orte/mca/soh/xcpu/soh_xcpu.h"
/*
* Local functions
*/
static int orte_soh_xcpu_open(void);
static int orte_soh_xcpu_close(void);
static orte_soh_base_module_t* orte_soh_xcpu_init(int*);
/*
 * The xcpu SOH component instance: MCA version/identity data, the open and
 * close hooks, the checkpointable flag, and the init hook.  Only the `super`
 * member is initialised explicitly here.
 */
orte_soh_xcpu_component_t mca_soh_xcpu_component = {
{
/* First, the mca_base_module_t struct containing meta
information about the module itself */
{
/* Indicate that we are a xcpu soh v1.0.0 module (which also
implies a specific MCA version) */
ORTE_SOH_BASE_VERSION_1_0_0,
"xcpu", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_soh_xcpu_open, /* component open */
orte_soh_xcpu_close /* component close */
},
/* Next the MCA v1.0.0 module meta data */
{
/* Whether the module is checkpointable or not */
false
},
orte_soh_xcpu_init
}
};
/**
 * Register an integer MCA parameter under soh/xcpu and return its current
 * value.
 *
 * param_name     name of the parameter
 * default_value  value registered as the default; also returned when the
 *                lookup does not overwrite it
 */
static int orte_soh_xcpu_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int handle = mca_base_param_register_int("soh", "xcpu", param_name, NULL, default_value);

    mca_base_param_lookup_int(handle, &value);
    return value;
}
/*
 * Component open hook: registers the "debug" (default 0) and "priority"
 * (default 100) parameters and stores their values in the component.
 * Always returns ORTE_SUCCESS.
 */
static int orte_soh_xcpu_open(void)
{
    int debug_level;
    int priority;

    /* registration order is kept: "debug" first, then "priority" */
    debug_level = orte_soh_xcpu_param_register_int("debug", 0);
    mca_soh_xcpu_component.debug = debug_level;

    priority = orte_soh_xcpu_param_register_int("priority", 100);
    mca_soh_xcpu_component.priority = priority;

    /*fprintf(stdout, "soh_xcpu: open\n");*/
    return ORTE_SUCCESS;
}
/*
 * Component init hook: reports the component's priority through *priority,
 * runs the module initialisation (its result is not examined) and returns
 * the xcpu module function table.
 */
static orte_soh_base_module_t* orte_soh_xcpu_init(int *priority)
{
    orte_soh_base_module_t *module = &orte_soh_xcpu_module;

    *priority = mca_soh_xcpu_component.priority;
    orte_soh_xcpu_module_init(); /* original author's open question: is this call needed? */
    return module;
}
/*
 * Component close hook: prints a trace line to stdout and reports success.
 */
static int orte_soh_xcpu_close(void)
{
    const int rc = ORTE_SUCCESS;

    fprintf(stdout, "soh_xcpu: close\n");
    return rc;
}

Просмотреть файл

@ -52,7 +52,7 @@
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/schema/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/base.h"
#include "orte/util/univ_info.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
@ -364,15 +364,15 @@ int orte_init_stage1(bool infrastructure)
/*
* setup the state monitoring and reporting (SMR) framework
*/
if (ORTE_SUCCESS != (ret = orte_soh_base_open())) {
if (ORTE_SUCCESS != (ret = orte_smr_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_soh_base_open";
error = "orte_smr_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_soh_base_select())) {
if (ORTE_SUCCESS != (ret = orte_smr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_soh_base_select";
error = "orte_smr_base_select";
goto error;
}

Просмотреть файл

@ -29,9 +29,6 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/runtime/runtime.h"

Просмотреть файл

@ -39,7 +39,7 @@
#include "orte/mca/ns/base/base.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/mca/smr/base/base.h"
#include "orte/util/proc_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/univ_info.h"
@ -87,7 +87,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_soh_base_close())) {
if (ORTE_SUCCESS != (rc = orte_smr_base_close())) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -225,7 +225,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_soh_base_open())) {
if (ORTE_SUCCESS != (rc = orte_smr_base_open())) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -246,7 +246,7 @@ int orte_restart(orte_process_name_t *name, const char* uri)
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_soh_base_select())) {
if (ORTE_SUCCESS != (rc = orte_smr_base_select())) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -57,7 +57,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rds/rds_types.h"
#include "orte/mca/ns/ns.h"

Просмотреть файл

@ -60,10 +60,9 @@
#include "orte/mca/ns/base/base.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/soh/soh.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/runtime/runtime.h"
@ -387,7 +386,7 @@ int main(int argc, char *argv[])
* Set my process status to "starting". Note that this must be done
* after the rte init is completed.
*/
if (ORTE_SUCCESS != (ret = orte_soh.set_proc_soh(orte_process_info.my_name,
if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
ORTE_PROC_STATE_RUNNING, 0))) {
ORTE_ERROR_LOG(ret);
return ret;

Просмотреть файл

@ -65,7 +65,6 @@
#include "orte/mca/ns/base/base.h"
#include "orte/mca/gpr/base/base.h"
#include "orte/mca/schema/base/base.h"
#include "orte/mca/soh/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"

Просмотреть файл

@ -559,7 +559,7 @@ static void dump_aborted_procs(orte_jobid_t jobid)
continue;
}
if(strcmp(keyval->key, ORTE_PROC_RANK_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_SIZE))) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, keyval->value, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
continue;
}
@ -660,7 +660,7 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
break;
case ORTE_PROC_STATE_TERMINATED:
dump_aborted_procs(jobid);
orterun_globals.exit_status = 0; /* set the exit status to indicate normal termination */
orterun_globals.exit = true;
opal_condition_signal(&orterun_globals.cond);
break;

Просмотреть файл

@ -19,4 +19,4 @@
SUBDIRS = oob schema
DIST_SUBDIRS = gpr gpr/remote ns oob ras rds rmaps rmgr schema soh
DIST_SUBDIRS = gpr gpr/remote ns oob ras rds rmaps rmgr schema smr

Просмотреть файл

@ -21,48 +21,48 @@
AM_CPPFLAGS = -I$(top_srcdir)/test/support
check_PROGRAMS = \
soh_dt_buffer \
soh_dt_compare \
soh_dt_print \
soh_dt_size \
soh_dt_release \
soh_dt_copy
smr_dt_buffer \
smr_dt_compare \
smr_dt_print \
smr_dt_size \
smr_dt_release \
smr_dt_copy
TESTS = \
$(check_PROGRAMS)
soh_dt_buffer_SOURCES = soh_dt_buffer.c
soh_dt_buffer_LDADD = \
smr_dt_buffer_SOURCES = smr_dt_buffer.c
smr_dt_buffer_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_buffer_DEPENDENCIES = $(soh_dt_buffer_LDADD)
smr_dt_buffer_DEPENDENCIES = $(smr_dt_buffer_LDADD)
soh_dt_copy_SOURCES = soh_dt_copy.c
soh_dt_copy_LDADD = \
smr_dt_copy_SOURCES = smr_dt_copy.c
smr_dt_copy_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_copy_DEPENDENCIES = $(soh_dt_copy_LDADD)
smr_dt_copy_DEPENDENCIES = $(smr_dt_copy_LDADD)
soh_dt_compare_SOURCES = soh_dt_compare.c
soh_dt_compare_LDADD = \
smr_dt_compare_SOURCES = smr_dt_compare.c
smr_dt_compare_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_compare_DEPENDENCIES = $(soh_dt_compare_LDADD)
smr_dt_compare_DEPENDENCIES = $(smr_dt_compare_LDADD)
soh_dt_print_SOURCES = soh_dt_print.c
soh_dt_print_LDADD = \
smr_dt_print_SOURCES = smr_dt_print.c
smr_dt_print_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_print_DEPENDENCIES = $(soh_dt_print_LDADD)
smr_dt_print_DEPENDENCIES = $(smr_dt_print_LDADD)
soh_dt_size_SOURCES = soh_dt_size.c
soh_dt_size_LDADD = \
smr_dt_size_SOURCES = smr_dt_size.c
smr_dt_size_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_size_DEPENDENCIES = $(soh_dt_size_LDADD)
smr_dt_size_DEPENDENCIES = $(smr_dt_size_LDADD)
soh_dt_release_SOURCES = soh_dt_release.c
soh_dt_release_LDADD = \
smr_dt_release_SOURCES = smr_dt_release.c
smr_dt_release_LDADD = \
$(top_builddir)/orte/liborte.la \
$(top_builddir)/opal/libopal.la
soh_dt_release_DEPENDENCIES = $(soh_dt_release_LDADD)
smr_dt_release_DEPENDENCIES = $(smr_dt_release_LDADD)

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл