Merge pull request #5944 from rhc54/topic/psrvr
Remove the stale orte-dvm code
Этот коммит содержится в:
Коммит
6213d23f0b
@ -248,14 +248,12 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
||||
AC_MSG_ERROR([Cannot continue])])
|
||||
|
||||
AC_MSG_CHECKING([if user requested internal PMIx support($with_pmix)])
|
||||
opal_prun_happy=no
|
||||
opal_external_pmix_happy=no
|
||||
opal_external_have_pmix1=0
|
||||
|
||||
AS_IF([test "$with_pmix" = "internal"],
|
||||
[AC_MSG_RESULT([yes])
|
||||
opal_external_pmix_happy=no
|
||||
opal_prun_happy=yes
|
||||
opal_external_pmix_version=internal],
|
||||
|
||||
[AC_MSG_RESULT([no])
|
||||
@ -376,7 +374,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
||||
[AC_MSG_RESULT([found])
|
||||
opal_external_pmix_version=2x
|
||||
opal_external_pmix_version_found=1
|
||||
opal_prun_happy=yes
|
||||
opal_external_pmix_happy=yes],
|
||||
[AC_MSG_RESULT([not found])])])
|
||||
|
||||
@ -436,7 +433,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
||||
|
||||
AC_DEFINE_UNQUOTED([OPAL_PMIX_V1],[$opal_external_have_pmix1],
|
||||
[Whether the external PMIx library is v1])
|
||||
AM_CONDITIONAL([OPAL_WANT_PRUN], [test "$opal_prun_happy" = "yes"])
|
||||
|
||||
AS_IF([test "$opal_external_pmix_happy" = "yes"],
|
||||
[AS_IF([test "$opal_external_pmix_version" = "1x"],
|
||||
|
@ -30,7 +30,5 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
|
||||
orte/tools/orte-top/Makefile
|
||||
orte/tools/orte-info/Makefile
|
||||
orte/tools/orte-server/Makefile
|
||||
orte/tools/orte-dvm/Makefile
|
||||
orte/tools/ompi-prun/Makefile
|
||||
])
|
||||
])
|
||||
|
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -30,12 +30,6 @@ libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
|
||||
|
||||
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1
|
||||
|
||||
if OPAL_WANT_PRUN
|
||||
if WANT_INSTALL_HEADERS
|
||||
man_pages += ompi-dvm.1
|
||||
endif
|
||||
endif
|
||||
|
||||
if OPAL_INSTALL_BINARIES
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
@ -46,9 +40,6 @@ install-exec-hook:
|
||||
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
|
||||
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
|
||||
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
|
||||
if OPAL_WANT_PRUN
|
||||
(cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT))
|
||||
endif
|
||||
|
||||
uninstall-local:
|
||||
rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
|
||||
@ -57,9 +48,6 @@ uninstall-local:
|
||||
$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
|
||||
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
|
||||
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
|
||||
if OPAL_WANT_PRUN
|
||||
rm -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT)
|
||||
endif
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
@ -96,10 +84,5 @@ $(top_builddir)/orte/tools/orte-server/orte-server.1:
|
||||
ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
|
||||
|
||||
if OPAL_WANT_PRUN
|
||||
ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1
|
||||
cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1
|
||||
endif
|
||||
|
||||
clean-local:
|
||||
rm -f $(man_pages)
|
||||
|
@ -1,37 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Files that make up the dvm errmgr component.
sources = \
        errmgr_dvm.h \
        errmgr_dvm_component.c \
        errmgr_dvm.c

# The component is built in this directory either as an installable
# DSO named mca_<type>_<name>.la, or as a non-installed convenience
# library named libmca_<type>_<name>.la for static builds.  Exactly
# one of the two variables below is non-empty.

if MCA_BUILD_orte_errmgr_dvm_DSO
component_noinst =
component_install = mca_errmgr_dvm.la
else
component_noinst = libmca_errmgr_dvm.la
component_install =
endif

# DSO flavour
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_dvm_la_SOURCES = $(sources)
mca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
mca_errmgr_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la

# Static (convenience library) flavour
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_dvm_la_SOURCES =$(sources)
libmca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
|
@ -1,632 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_dvm.h"
|
||||
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
/******************
|
||||
* dvm module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orte_errmgr_base_abort,
|
||||
.abort_peers = orte_errmgr_base_abort_peers
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static void job_errors(int fd, short args, void *cbdata);
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
/* setup state machine to trap job errors */
|
||||
orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
|
||||
|
||||
/* set the lost connection state to run at MSG priority so
|
||||
* we can process any last messages from the proc
|
||||
*/
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
|
||||
|
||||
/* setup state machine to trap proc errors */
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Ask the PLM to terminate the procs of the given job.
 *
 * A single stack-allocated proc object named (jobid, ORTE_VPID_WILDCARD)
 * is placed in a temporary pointer array and handed to
 * orte_plm.terminate_procs().  Both temporaries are destructed before
 * returning.
 */
static void _terminate_job(orte_jobid_t jobid)
{
    orte_proc_t wildcard;
    opal_pointer_array_t targets;

    /* build the wildcard proc that stands for the job */
    OBJ_CONSTRUCT(&wildcard, orte_proc_t);
    wildcard.name.jobid = jobid;
    wildcard.name.vpid = ORTE_VPID_WILDCARD;

    /* wrap it in the array form that terminate_procs expects */
    OBJ_CONSTRUCT(&targets, opal_pointer_array_t);
    opal_pointer_array_init(&targets, 1, 1, 1);
    opal_pointer_array_add(&targets, &wildcard);

    orte_plm.terminate_procs(&targets);

    OBJ_DESTRUCT(&targets);
    OBJ_DESTRUCT(&wildcard);
}
|
||||
|
||||
static void job_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_job_state_t jobstate;
|
||||
opal_buffer_t *answer;
|
||||
int32_t rc, ret;
|
||||
int room, *rmptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* if the jdata is NULL, then we ignore it as this
|
||||
* is reporting an unrecoverable error
|
||||
*/
|
||||
if (NULL == caddy->jdata) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* update the state */
|
||||
jdata = caddy->jdata;
|
||||
jobstate = caddy->job_state;
|
||||
jdata->state = jobstate;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: job %s reported state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(jobstate)));
|
||||
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||
* then this could well have been caused by a daemon not finding
|
||||
* a way back to us. In this case, output a message indicating a daemon
|
||||
* died without reporting. Otherwise, say nothing as we
|
||||
* likely already output an error message */
|
||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||
jdata->num_procs != jdata->num_reported) {
|
||||
orte_routing_is_enabled = false;
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||
}
|
||||
/* there really isn't much else we can do since the problem
|
||||
* is in the DVM itself, so best just to terminate */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
/* activate the terminated state so we can exit */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* all other cases involve jobs submitted to the DVM - therefore,
|
||||
* we only inform the submitter of the problem, but do NOT terminate
|
||||
* the DVM itself */
|
||||
|
||||
rc = jobstate;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* pack the room number */
|
||||
rmptr = &room;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm sending notification of job %s failure to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&jdata->originator)));
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
/* ensure we terminate any processes left running in the DVM */
|
||||
_terminate_job(jdata->jobid);
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pptr, *proct;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
int i;
|
||||
int32_t i32, *i32ptr;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
/* could be a race condition */
|
||||
goto cleanup;
|
||||
}
|
||||
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||
|
||||
/* get the management conduit's routed module */
|
||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
||||
|
||||
/* we MUST handle a communication failure before doing anything else
|
||||
* as it requires some special care to avoid normal termination issues
|
||||
* for local application procs
|
||||
*/
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
/* is this to a daemon? */
|
||||
if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
|
||||
/* nope - ignore it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure to non-daemon proc - ignoring it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto cleanup;
|
||||
}
|
||||
/* if this is my own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure on my own connection - ignoring it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto cleanup;
|
||||
}
|
||||
/* mark the daemon as gone */
|
||||
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
|
||||
/* update the state */
|
||||
pptr->state = state;
|
||||
/* adjust our num_procs */
|
||||
--orte_process_info.num_procs;
|
||||
/* if we have ordered orteds to terminate or abort
|
||||
* is in progress, record it */
|
||||
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: daemons terminating - recording daemon %s as gone",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(rtmod, proc);
|
||||
/* if all my routes and local children are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes(rtmod)) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* at least one is still alive */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: at least one proc (%s) still alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proct->name)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr_dvm: all routes and children gone - ordering exit",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: %d routes remain alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)orte_routed.num_routes(rtmod)));
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: daemon %s - aborting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* record the first one to fail */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* output an error message so the user knows what happened */
|
||||
orte_show_help("help-errmgr-base.txt", "node-died", true,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_process_info.nodename,
|
||||
ORTE_NAME_PRINT(proc),
|
||||
pptr->node->name);
|
||||
/* mark the daemon job as failed */
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* update our exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* just in case the exit code hadn't been set, do it here - this
|
||||
* won't override any reported exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* update the proc state - can get multiple reports on a proc
|
||||
* depending on circumstances, so ensure we only do this once
|
||||
*/
|
||||
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
|
||||
pptr->state = state;
|
||||
}
|
||||
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* terminate ourselves. */
|
||||
if (orte_orteds_term_ordered) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
|
||||
goto keep_going;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes(rtmod)) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:dvm all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
/* ensure we record the failed proc properly so we can report
|
||||
* the error once we terminate
|
||||
*/
|
||||
switch (state) {
|
||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s killed by cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||
* and we don't flag it as such
|
||||
*/
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
/* don't abort the job as this isn't an abnormal termination */
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s aborted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s aborted by signal",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s terminated without sync",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* now treat a special case - if the proc exit'd without a required
|
||||
* sync, it may have done so with a zero exit code. We want to ensure
|
||||
* that the user realizes there was an error, so in this -one- case,
|
||||
* we overwrite the process' exit code with the default error code
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
opal_buffer_t *answer;
|
||||
int id, *idptr, ret;
|
||||
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START) {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
} else {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
|
||||
}
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* send a notification to the requestor - indicate that this is a spawn response */
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
/* pack the return status */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &pptr->exit_code, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* pack the jobid to be returned */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
idptr = &id;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&idptr, OPAL_INT)) {
|
||||
/* pack the sender's index to the tracking object */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, idptr, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
||||
/* we need to send the requestor more info about what happened */
|
||||
opal_dss.pack(answer, &jdata->state, 1, ORTE_JOB_STATE_T);
|
||||
opal_dss.pack(answer, &pptr, 1, ORTE_PROC);
|
||||
opal_dss.pack(answer, &pptr->node, 1, ORTE_NODE);
|
||||
}
|
||||
/* return response */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
/* record that we notified about this job */
|
||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
||||
CLEANUP:
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
/* if this was a daemon, report it */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* output a message indicating we failed to launch a daemon */
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s called abort with exit code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc), pptr->exit_code));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
|
||||
/* point to the first proc to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s exited with non-zero status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
pptr->exit_code));
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* track the number of non-zero exits */
|
||||
i32 = 0;
|
||||
i32ptr = &i32;
|
||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
|
||||
++i32;
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
|
||||
if (orte_abort_non_zero_exit) {
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
} else {
|
||||
/* user requested we consider this normal termination */
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s heartbeat failed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(rtmod, proc);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: unable to send message to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* shouldn't get this, but terminate job if required */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s default error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
if (jdata->num_terminated == jdata->num_procs) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* if the waitpid fired, be sure to let the state machine know */
|
||||
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
@ -1,39 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_dvm_EXPORT_H
|
||||
#define MCA_ERRMGR_dvm_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_dvm_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_dvm_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_dvm_EXPORT_H */
|
@ -1,102 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_dvm.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_dvm_component_version_string =
|
||||
"ORTE ERRMGR dvm MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int dvm_register(void);
|
||||
static int dvm_open(void);
|
||||
static int dvm_close(void);
|
||||
static int dvm_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_dvm_component = {
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component dvm
|
||||
*/
|
||||
.base_version = {
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
.mca_component_name = "dvm",
|
||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = dvm_open,
|
||||
.mca_close_component = dvm_close,
|
||||
.mca_query_component = dvm_component_query,
|
||||
.mca_register_component_params = dvm_register,
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static int my_priority;
|
||||
|
||||
static int dvm_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_errmgr_dvm_component.base_version;
|
||||
|
||||
my_priority = 1000;
|
||||
(void) mca_base_component_var_register(c, "priority",
|
||||
"Priority of the dvm errmgr component",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &my_priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int dvm_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int dvm_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Selection hook for the dvm errmgr component.
 *
 * Offers orte_errmgr_dvm_module at my_priority only when the calling
 * process is a DVM master (ORTE_PROC_IS_MASTER).  For every other
 * process the outputs are cleared (*module = NULL, *priority = -1) and
 * ORTE_ERROR is returned.
 */
static int dvm_component_query(mca_base_module_t **module, int *priority)
{
    if (!ORTE_PROC_IS_MASTER) {
        /* not a DVM master: decline */
        *module = NULL;
        *priority = -1;
        return ORTE_ERROR;
    }

    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_dvm_module;
    return ORTE_SUCCESS;
}
|
@ -1,7 +0,0 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: INTEL
|
||||
status: active
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Source files that make up the state/dvm component.
sources = \
        state_dvm.h \
        state_dvm_component.c \
        state_dvm.c

# The component is built in this directory either as an installable DSO
# (mca_<type>_<name>.la) or as a non-installed convenience library
# (libmca_<type>_<name>.la) for static builds.  Exactly one of the two
# variables below is non-empty.

if MCA_BUILD_orte_state_dvm_DSO
component_noinst =
component_install = mca_state_dvm.la
else
component_noinst = libmca_state_dvm.la
component_install =
endif

# DSO flavour
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_dvm_la_SOURCES = $(sources)
mca_state_dvm_la_LDFLAGS = -module -avoid-version
mca_state_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la

# Static flavour
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_dvm_la_SOURCES = $(sources)
libmca_state_dvm_la_LDFLAGS = -module -avoid-version
|
@ -1,7 +0,0 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: INTEL
|
||||
status: active
|
@ -1,688 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/regx/regx.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/base/state_private.h"
|
||||
#include "state_dvm.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
/* module entry points placed in orte_state_dvm_module */
static int init(void);
static int finalize(void);

/* local functions - job-state callbacks registered by init() */
static void init_complete(int fd, short args, void *cbdata);
static void vm_ready(int fd, short args, void *cbdata);
static void check_complete(int fd, short args, void *cbdata);
static void cleanup_job(int fd, short args, void *cbdata);
|
||||
|
||||
/******************
|
||||
* DVM module - used when mpirun is persistent
|
||||
******************/
|
||||
orte_state_base_module_t orte_state_dvm_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
orte_state_base_add_job_state,
|
||||
orte_state_base_set_job_state_callback,
|
||||
orte_state_base_set_job_state_priority,
|
||||
orte_state_base_remove_job_state,
|
||||
orte_state_base_activate_proc_state,
|
||||
orte_state_base_add_proc_state,
|
||||
orte_state_base_set_proc_state_callback,
|
||||
orte_state_base_set_proc_state_priority,
|
||||
orte_state_base_remove_proc_state
|
||||
};
|
||||
|
||||
static void dvm_notify(int sd, short args, void *cbdata);
|
||||
|
||||
/* defined default state machine sequence - individual
|
||||
* plm's must add a state for launching daemons
|
||||
*/
|
||||
static orte_job_state_t launch_states[] = {
|
||||
ORTE_JOB_STATE_INIT,
|
||||
ORTE_JOB_STATE_INIT_COMPLETE,
|
||||
ORTE_JOB_STATE_ALLOCATE,
|
||||
ORTE_JOB_STATE_ALLOCATION_COMPLETE,
|
||||
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
|
||||
ORTE_JOB_STATE_DAEMONS_REPORTED,
|
||||
ORTE_JOB_STATE_VM_READY,
|
||||
ORTE_JOB_STATE_MAP,
|
||||
ORTE_JOB_STATE_MAP_COMPLETE,
|
||||
ORTE_JOB_STATE_SYSTEM_PREP,
|
||||
ORTE_JOB_STATE_LAUNCH_APPS,
|
||||
ORTE_JOB_STATE_SEND_LAUNCH_MSG,
|
||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||
ORTE_JOB_STATE_RUNNING,
|
||||
ORTE_JOB_STATE_REGISTERED,
|
||||
/* termination states */
|
||||
ORTE_JOB_STATE_TERMINATED,
|
||||
ORTE_JOB_STATE_NOTIFY_COMPLETED,
|
||||
ORTE_JOB_STATE_NOTIFIED,
|
||||
ORTE_JOB_STATE_ALL_JOBS_COMPLETE
|
||||
};
|
||||
static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_setup_job,
|
||||
init_complete,
|
||||
orte_ras_base_allocate,
|
||||
orte_plm_base_allocation_complete,
|
||||
orte_plm_base_daemons_launched,
|
||||
orte_plm_base_daemons_reported,
|
||||
vm_ready,
|
||||
orte_rmaps_base_map_job,
|
||||
orte_plm_base_mapping_complete,
|
||||
orte_plm_base_complete_setup,
|
||||
orte_plm_base_launch_apps,
|
||||
orte_plm_base_send_launch_msg,
|
||||
orte_state_base_local_launch_complete,
|
||||
orte_plm_base_post_launch,
|
||||
orte_plm_base_registered,
|
||||
check_complete,
|
||||
dvm_notify,
|
||||
cleanup_job,
|
||||
orte_quit
|
||||
};
|
||||
|
||||
static orte_proc_state_t proc_states[] = {
|
||||
ORTE_PROC_STATE_RUNNING,
|
||||
ORTE_PROC_STATE_REGISTERED,
|
||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
||||
ORTE_PROC_STATE_WAITPID_FIRED,
|
||||
ORTE_PROC_STATE_TERMINATED
|
||||
};
|
||||
static orte_state_cbfunc_t proc_callbacks[] = {
|
||||
orte_state_base_track_procs,
|
||||
orte_state_base_track_procs,
|
||||
orte_state_base_track_procs,
|
||||
orte_state_base_track_procs,
|
||||
orte_state_base_track_procs
|
||||
};
|
||||
|
||||
static void force_quit(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
/* give us a chance to stop the orteds */
|
||||
orte_plm.terminate_orteds();
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
int i, rc;
|
||||
int num_states;
|
||||
|
||||
/* setup the state machines */
|
||||
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
|
||||
|
||||
/* setup the job state machine */
|
||||
num_states = sizeof(launch_states) / sizeof(orte_job_state_t);
|
||||
for (i=0; i < num_states; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i],
|
||||
launch_callbacks[i],
|
||||
ORTE_SYS_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
/* add the termination response */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
|
||||
orte_quit, ORTE_SYS_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* add a default error response */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
|
||||
force_quit, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* add callback to report progress, if requested */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
|
||||
orte_state_base_report_progress, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
|
||||
orte_state_base_print_job_state_machine();
|
||||
}
|
||||
|
||||
/* populate the proc state machine to allow us to
|
||||
* track proc lifecycle changes
|
||||
*/
|
||||
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
|
||||
for (i=0; i < num_states; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
|
||||
proc_callbacks[i],
|
||||
ORTE_SYS_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
|
||||
orte_state_base_print_proc_state_machine();
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup the proc state machine */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_proc_states);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void files_ready(int status, void *cbdata)
|
||||
{
|
||||
orte_job_t *jdata = (orte_job_t*)cbdata;
|
||||
|
||||
if (ORTE_SUCCESS != status) {
|
||||
ORTE_FORCED_TERMINATE(status);
|
||||
return;
|
||||
} else {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
||||
}
|
||||
}
|
||||
|
||||
static void init_complete(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* nothing to do here but move along - if it is the
|
||||
* daemon job, then next step is allocate */
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void vm_ready(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
int rc;
|
||||
opal_buffer_t *buf;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_NIDMAP_CMD;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
opal_buffer_t *wireup;
|
||||
orte_job_t *jptr;
|
||||
orte_proc_t *dmn;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
int8_t flag;
|
||||
int32_t numbytes, v;
|
||||
char *nidmap;
|
||||
opal_list_t *modex;
|
||||
opal_value_t *val, *kv;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* if this is my job, then we are done */
|
||||
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
||||
/* if there is only one daemon in the job, then there
|
||||
* is just a little bit to do */
|
||||
if (1 == orte_process_info.num_procs) {
|
||||
if (!orte_nidmap_communicated) {
|
||||
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &orte_node_regex))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
orte_nidmap_communicated = true;
|
||||
}
|
||||
} else {
|
||||
/* send the daemon map to every daemon in this DVM - we
|
||||
* do this here so we don't have to do it for every
|
||||
* job we are going to launch */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD);
|
||||
/* if we couldn't provide the allocation regex on the orted
|
||||
* cmd line, then we need to provide all the info here */
|
||||
if (!orte_nidmap_communicated) {
|
||||
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &nidmap))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
orte_nidmap_communicated = true;
|
||||
} else {
|
||||
nidmap = NULL;
|
||||
}
|
||||
opal_dss.pack(buf, &nidmap, 1, OPAL_STRING);
|
||||
if (NULL != nidmap) {
|
||||
free(nidmap);
|
||||
}
|
||||
/* provide the info on the capabilities of each node */
|
||||
if (!orte_node_info_communicated) {
|
||||
flag = 1;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
if (ORTE_SUCCESS != (rc = orte_regx.encode_nodemap(buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
orte_node_info_communicated = true;
|
||||
/* get wireup info for daemons */
|
||||
jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
for (v=0; v < jptr->procs->size; v++) {
|
||||
if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
|
||||
continue;
|
||||
}
|
||||
val = NULL;
|
||||
if (opal_pmix.legacy_get()) {
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
} else {
|
||||
/* pack the name of the daemon */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
/* pack the URI */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(val);
|
||||
}
|
||||
} else {
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
} else {
|
||||
/* pack the name of the daemon */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
/* the data is returned as a list of key-value pairs in the opal_value_t */
|
||||
if (OPAL_PTR != val->type) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
modex = (opal_list_t*)val->data.ptr;
|
||||
numbytes = (int32_t)opal_list_get_size(modex);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(wireup);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OPAL_LIST_RELEASE(modex);
|
||||
OBJ_RELEASE(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* put it in a byte object for xmission */
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the byte object - zero-byte objects are fine */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
if (NULL != bo.bytes) {
|
||||
free(bo.bytes);
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
} else {
|
||||
flag = 0;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
}
|
||||
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(sig);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
/* notify that the vm is ready */
|
||||
fprintf(stdout, "DVM ready\n"); fflush(stdout);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* progress the job */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
|
||||
|
||||
/* position any required files */
|
||||
if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void check_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
orte_node_t *node;
|
||||
orte_job_map_t *map;
|
||||
orte_std_cntr_t index;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm:check_job_complete on job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));
|
||||
|
||||
if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* just check to see if the daemons are complete */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm:check_job_complete - received NULL job, checking daemons",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
||||
if (0 == orte_routed.num_routes(rtmod)) {
|
||||
/* orteds are done! */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s orteds complete - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if (NULL == jdata) {
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
}
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* mark the job as terminated, but don't override any
|
||||
* abnormal termination flags
|
||||
*/
|
||||
if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
|
||||
/* tell the IOF that the job is complete */
|
||||
if (NULL != orte_iof.complete) {
|
||||
orte_iof.complete(jdata);
|
||||
}
|
||||
|
||||
/* tell the PMIx subsystem the job is complete */
|
||||
if (NULL != opal_pmix.server_deregister_nspace) {
|
||||
opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
|
||||
}
|
||||
|
||||
/* Release the resources used by this job. Since some errmgrs may want
|
||||
* to continue using resources allocated to the job as part of their
|
||||
* fault recovery procedure, we only do this once the job is "complete".
|
||||
* Note that an aborted/killed job -is- flagged as complete and will
|
||||
* therefore have its resources released. We need to do this after
|
||||
* we call the errmgr so that any attempt to restart the job will
|
||||
* avoid doing so in the exact same place as the current job
|
||||
*/
|
||||
if (NULL != jdata->map) {
|
||||
map = jdata->map;
|
||||
for (index = 0; index < map->nodes->size; index++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm releasing procs from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
for (i = 0; i < node->procs->size; i++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (proc->name.jobid != jdata->jobid) {
|
||||
/* skip procs from another job */
|
||||
continue;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_TOOL)) {
|
||||
node->slots_inuse--;
|
||||
node->num_procs--;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm releasing proc %s from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), node->name));
|
||||
/* set the entry in the node array to NULL */
|
||||
opal_pointer_array_set_item(node->procs, i, NULL);
|
||||
/* release the proc once for the map entry */
|
||||
OBJ_RELEASE(proc);
|
||||
}
|
||||
/* set the node location to NULL */
|
||||
opal_pointer_array_set_item(map->nodes, index, NULL);
|
||||
/* maintain accounting */
|
||||
OBJ_RELEASE(node);
|
||||
/* flag that the node is no longer in a map */
|
||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
}
|
||||
OBJ_RELEASE(map);
|
||||
jdata->map = NULL;
|
||||
}
|
||||
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
/* this was a debugger daemon. notify that a debugger has detached */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
||||
} else if (jdata->state != ORTE_JOB_STATE_NOTIFIED) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm:check_job_completed state is terminated - activating notify",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
|
||||
/* mark the job as notified */
|
||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void cleanup_job(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
/* remove this object from the job array */
|
||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
opal_list_t *info;
|
||||
orte_job_t *jdata;
|
||||
} mycaddy_t;
|
||||
|
||||
static void notify_complete(int status, void *cbdata)
|
||||
{
|
||||
mycaddy_t *mycaddy = (mycaddy_t*)cbdata;
|
||||
|
||||
OPAL_LIST_RELEASE(mycaddy->info);
|
||||
ORTE_ACTIVATE_JOB_STATE(mycaddy->jdata, ORTE_JOB_STATE_NOTIFIED);
|
||||
OBJ_RELEASE(mycaddy->jdata);
|
||||
free(mycaddy);
|
||||
}
|
||||
|
||||
static void dvm_notify(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_proc_t *pptr=NULL;
|
||||
int ret;
|
||||
opal_buffer_t *reply;
|
||||
orte_daemon_cmd_flag_t command;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
bool notify = true;
|
||||
opal_list_t *info;
|
||||
opal_value_t *val;
|
||||
opal_process_name_t pname, *proc, pnotify;
|
||||
mycaddy_t *mycaddy;
|
||||
|
||||
/* see if there was any problem */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) {
|
||||
ret = pptr->exit_code;
|
||||
/* or whether we got cancelled by the user */
|
||||
} else if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CANCELLED, NULL, OPAL_BOOL)) {
|
||||
ret = ORTE_ERR_JOB_CANCELLED;
|
||||
} else {
|
||||
ret = ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (0 == ret && orte_get_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION, NULL, OPAL_BOOL)) {
|
||||
notify = false;
|
||||
}
|
||||
/* if the jobid matches that of the requestor, then don't notify */
|
||||
proc = &pnotify;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&proc, OPAL_NAME)) {
|
||||
if (pnotify.jobid == jdata->jobid) {
|
||||
notify = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (notify) {
|
||||
/* the source is the job that terminated */
|
||||
pname.jobid = jdata->jobid;
|
||||
pname.vpid = OPAL_VPID_WILDCARD;
|
||||
|
||||
info = OBJ_NEW(opal_list_t);
|
||||
/* ensure this only goes to the job terminated event handler */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(info, &val->super);
|
||||
/* tell the server not to cache the event as subsequent jobs
|
||||
* do not need to know about it */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_EVENT_DO_NOT_CACHE);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(info, &val->super);
|
||||
/* provide the status */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_JOB_TERM_STATUS);
|
||||
val->type = OPAL_STATUS;
|
||||
val->data.status = ret;
|
||||
opal_list_append(info, &val->super);
|
||||
/* tell the requestor which job or proc */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PROCID);
|
||||
val->type = OPAL_NAME;
|
||||
val->data.name.jobid = jdata->jobid;
|
||||
if (NULL != pptr) {
|
||||
val->data.name.vpid = pptr->name.vpid;
|
||||
} else {
|
||||
val->data.name.vpid = ORTE_VPID_WILDCARD;
|
||||
}
|
||||
opal_list_append(info, &val->super);
|
||||
/* pass along the proc to be notified */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
|
||||
val->type = OPAL_NAME;
|
||||
val->data.name.jobid = pnotify.jobid;
|
||||
val->data.name.vpid = pnotify.vpid;
|
||||
opal_list_append(info, &val->super);
|
||||
/* setup the caddy */
|
||||
mycaddy = (mycaddy_t*)malloc(sizeof(mycaddy_t));
|
||||
mycaddy->info = info;
|
||||
OBJ_RETAIN(jdata);
|
||||
mycaddy->jdata = jdata;
|
||||
opal_pmix.server_notify_event(OPAL_ERR_JOB_TERMINATED, &pname,
|
||||
info, notify_complete, mycaddy);
|
||||
}
|
||||
|
||||
/* now ensure that _all_ daemons know that this job has terminated so even
|
||||
* those that did not participate in it will know to cleanup the resources
|
||||
* they assigned to the job. This is necessary now that the mapping function
|
||||
* has been moved to the backend daemons - otherwise, non-participating daemons
|
||||
* retain the slot assignments on the participating daemons, and then incorrectly
|
||||
* map subsequent jobs thinking those nodes are still "busy" */
|
||||
reply = OBJ_NEW(opal_buffer_t);
|
||||
command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
|
||||
opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD);
|
||||
opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_RELEASE(sig);
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_STATE_DVM_EXPORT_H
|
||||
#define MCA_STATE_DVM_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dvm_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dvm_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_STATE_DVM_EXPORT_H */
|
@ -1,83 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "state_dvm.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_state_dvm_component_version_string =
|
||||
"ORTE STATE dvm MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int state_dvm_open(void);
|
||||
static int state_dvm_close(void);
|
||||
static int state_dvm_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_state_base_component_t mca_state_dvm_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component
|
||||
*/
|
||||
.base_version = {
|
||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||
/* Component name and version */
|
||||
.mca_component_name = "dvm",
|
||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = state_dvm_open,
|
||||
.mca_close_component = state_dvm_close,
|
||||
.mca_query_component = state_dvm_component_query,
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static int state_dvm_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_dvm_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Selection hook for the dvm state component.
 *
 * Offers orte_state_dvm_module at priority 100 only when the calling
 * process is a DVM master (ORTE_PROC_IS_MASTER).  For every other
 * process the outputs are cleared (*priority = 0, *module = NULL) and
 * ORTE_ERR_NOT_AVAILABLE is returned.
 */
static int state_dvm_component_query(mca_base_module_t **module, int *priority)
{
    if (!ORTE_PROC_IS_MASTER) {
        /* not a DVM master: decline */
        *priority = 0;
        *module = NULL;
        return ORTE_ERR_NOT_AVAILABLE;
    }

    *priority = 100;
    *module = (mca_base_module_t *)&orte_state_dvm_module;
    return ORTE_SUCCESS;
}
|
@ -42,12 +42,4 @@ DIST_SUBDIRS += \
|
||||
tools/wrappers \
|
||||
tools/orte-top \
|
||||
tools/orte-info \
|
||||
tools/orte-server \
|
||||
tools/orte-dvm \
|
||||
tools/ompi-prun
|
||||
|
||||
if OPAL_WANT_PRUN
|
||||
SUBDIRS += \
|
||||
tools/ompi-prun \
|
||||
tools/orte-dvm
|
||||
endif
|
||||
tools/orte-server
|
||||
|
@ -1,59 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# This is not quite in the Automake spirit, but we have to do it.
|
||||
# Since the totalview portion of the library must be built with -g, we
|
||||
# must eliminate the CFLAGS that are passed in here by default (which
|
||||
# may already have debugging and/or optimization flags). We use
|
||||
# post-processed forms of the CFLAGS in the library targets down
|
||||
# below.
|
||||
|
||||
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
|
||||
|
||||
include $(top_srcdir)/Makefile.ompi-rules
|
||||
|
||||
man_pages = ompi-prun.1
|
||||
EXTRA_DIST = $(man_pages:.1=.1in)
|
||||
|
||||
if OPAL_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = ompi-prun
|
||||
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
||||
# changes; a "good enough" way to know if configure was run again (and
|
||||
# therefore the release date or version may have changed)
|
||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
ompi_prun_SOURCES = \
|
||||
main.c \
|
||||
prun.c \
|
||||
prun.h
|
||||
|
||||
ompi_prun_LDADD = \
|
||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||
|
||||
distclean-local:
|
||||
rm -f $(man_pages)
|
@ -1,33 +0,0 @@
|
||||
/***************************************************************************
|
||||
* *
|
||||
* Open MPI: Open Source High Performance Computing *
|
||||
* *
|
||||
* http://www.open-mpi.org/ *
|
||||
* *
|
||||
***************************************************************************/
|
||||
|
||||
#include "prun.h"
|
||||
|
||||
/* Program entry point: forwards argc/argv unchanged to prun() (declared in prun.h) and returns its result as the exit status. */
int main(int argc, char *argv[])
|
||||
{
|
||||
return prun(argc, argv);
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,228 +0,0 @@
|
||||
#! /bin/sh
|
||||
|
||||
# prun - temporary wrapper script for .libs/prun
|
||||
# Generated by libtool (GNU libtool) 2.4.6
|
||||
#
|
||||
# The prun program cannot be directly executed until all the libtool
|
||||
# libraries that it depends on are installed.
|
||||
#
|
||||
# This wrapper script should never be moved out of the build directory.
|
||||
# If it is, it will not operate correctly.
|
||||
|
||||
# Sed substitution that helps us do robust quoting. It backslashifies
|
||||
# metacharacters that are still active within double-quoted strings.
|
||||
sed_quote_subst='s|\([`"$\\]\)|\\\1|g'
|
||||
|
||||
# Be Bourne compatible
|
||||
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
|
||||
emulate sh
|
||||
NULLCMD=:
|
||||
# Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
|
||||
# is contrary to our usage. Disable this feature.
|
||||
alias -g '${1+"$@"}'='"$@"'
|
||||
setopt NO_GLOB_SUBST
|
||||
else
|
||||
case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
|
||||
fi
|
||||
BIN_SH=xpg4; export BIN_SH # for Tru64
|
||||
DUALCASE=1; export DUALCASE # for MKS sh
|
||||
|
||||
# The HP-UX ksh and POSIX shell print the target directory to stdout
|
||||
# if CDPATH is set.
|
||||
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
|
||||
|
||||
relink_command="(cd /home/common/openmpi/foobar/orte/tools/prun; LIBRARY_PATH=/opt/local/lib; export LIBRARY_PATH; { test -z \"\${COMPILER_PATH+set}\" || unset COMPILER_PATH || { COMPILER_PATH=; export COMPILER_PATH; }; }; { test -z \"\${GCC_EXEC_PREFIX+set}\" || unset GCC_EXEC_PREFIX || { GCC_EXEC_PREFIX=; export GCC_EXEC_PREFIX; }; }; { test -z \"\${LD_RUN_PATH+set}\" || unset LD_RUN_PATH || { LD_RUN_PATH=; export LD_RUN_PATH; }; }; LD_LIBRARY_PATH=/home/common/openmpi/build/foobar/lib:/home/common/local/lib:/home/common/pmix/build/prrte/lib; export LD_LIBRARY_PATH; PATH=/home/common/openmpi/build/foobar/bin:/home/common/local/bin:/home/common/pmix/build/prrte/bin:/home/common/local/sbin:/usr/lib64/qt-3.3/bin:/home/rhc/perl5/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/rhc/.local/bin:/home/rhc/bin; export PATH; gcc -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -fno-strict-aliasing -mcx16 -pthread -g -o \$progdir/\$file main.o prun.o ../../../orte/.libs/libopen-rte.so /home/common/openmpi/foobar/opal/.libs/libopen-pal.so ../../../opal/.libs/libopen-pal.so -ldl -ludev -lrt -lm -lutil -lz -pthread -Wl,-rpath -Wl,/home/common/openmpi/foobar/orte/.libs -Wl,-rpath -Wl,/home/common/openmpi/foobar/opal/.libs -Wl,-rpath -Wl,/home/common/openmpi/build/foobar/lib)"
|
||||
|
||||
# This environment variable determines our operation mode.
|
||||
if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then
|
||||
# install mode needs the following variables:
|
||||
generated_by_libtool_version='2.4.6'
|
||||
notinst_deplibs=' ../../../orte/libopen-rte.la /home/common/openmpi/foobar/opal/libopen-pal.la ../../../opal/libopen-pal.la'
|
||||
else
|
||||
# When we are sourced in execute mode, $file and $ECHO are already set.
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
file="$0"
|
||||
|
||||
# A function that is used when there is no print builtin or printf.
|
||||
func_fallback_echo ()
|
||||
{
|
||||
eval 'cat <<_LTECHO_EOF
|
||||
$1
|
||||
_LTECHO_EOF'
|
||||
}
|
||||
ECHO="printf %s\\n"
|
||||
fi
|
||||
|
||||
# Very basic option parsing. These options are (a) specific to
|
||||
# the libtool wrapper, (b) are identical between the wrapper
|
||||
# /script/ and the wrapper /executable/ that is used only on
|
||||
# windows platforms, and (c) all begin with the string --lt-
|
||||
# (application programs are unlikely to have options that match
|
||||
# this pattern).
|
||||
#
|
||||
# There are only two supported options: --lt-debug and
|
||||
# --lt-dump-script. There is, deliberately, no --lt-help.
|
||||
#
|
||||
# The first argument to this parsing function should be the
|
||||
# script's $0 value, followed by "$@".
|
||||
lt_option_debug=
|
||||
func_parse_lt_options ()
|
||||
{
|
||||
lt_script_arg0=$0
|
||||
shift
|
||||
for lt_opt
|
||||
do
|
||||
case "$lt_opt" in
|
||||
--lt-debug) lt_option_debug=1 ;;
|
||||
--lt-dump-script)
|
||||
lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'`
|
||||
test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=.
|
||||
lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'`
|
||||
cat "$lt_dump_D/$lt_dump_F"
|
||||
exit 0
|
||||
;;
|
||||
--lt-*)
|
||||
$ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Print the debug banner immediately:
|
||||
if test -n "$lt_option_debug"; then
|
||||
echo "prun:prun:$LINENO: libtool wrapper (GNU libtool) 2.4.6" 1>&2
|
||||
fi
|
||||
}
|
||||
|
||||
# Used when --lt-debug. Prints its arguments to stdout
|
||||
# (redirection is the responsibility of the caller)
|
||||
func_lt_dump_args ()
|
||||
{
|
||||
lt_dump_args_N=1;
|
||||
for lt_arg
|
||||
do
|
||||
$ECHO "prun:prun:$LINENO: newargv[$lt_dump_args_N]: $lt_arg"
|
||||
lt_dump_args_N=`expr $lt_dump_args_N + 1`
|
||||
done
|
||||
}
|
||||
|
||||
# Core function for launching the target application
|
||||
func_exec_program_core ()
|
||||
{
|
||||
|
||||
if test -n "$lt_option_debug"; then
|
||||
$ECHO "prun:prun:$LINENO: newargv[0]: $progdir/$program" 1>&2
|
||||
func_lt_dump_args ${1+"$@"} 1>&2
|
||||
fi
|
||||
exec "$progdir/$program" ${1+"$@"}
|
||||
|
||||
$ECHO "$0: cannot exec $program $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# A function to encapsulate launching the target application
|
||||
# Strips options in the --lt-* namespace from $@ and
|
||||
# launches target application with the remaining arguments.
|
||||
func_exec_program ()
|
||||
{
|
||||
case " $* " in
|
||||
*\ --lt-*)
|
||||
for lt_wr_arg
|
||||
do
|
||||
case $lt_wr_arg in
|
||||
--lt-*) ;;
|
||||
*) set x "$@" "$lt_wr_arg"; shift;;
|
||||
esac
|
||||
shift
|
||||
done ;;
|
||||
esac
|
||||
func_exec_program_core ${1+"$@"}
|
||||
}
|
||||
|
||||
# Parse options
|
||||
func_parse_lt_options "$0" ${1+"$@"}
|
||||
|
||||
# Find the directory that this script lives in.
|
||||
thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
|
||||
test "x$thisdir" = "x$file" && thisdir=.
|
||||
|
||||
# Follow symbolic links until we get to the real thisdir.
|
||||
file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'`
|
||||
while test -n "$file"; do
|
||||
destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
|
||||
|
||||
# If there was a directory component, then change thisdir.
|
||||
if test "x$destdir" != "x$file"; then
|
||||
case "$destdir" in
|
||||
[\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;;
|
||||
*) thisdir="$thisdir/$destdir" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'`
|
||||
file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'`
|
||||
done
|
||||
|
||||
# Usually 'no', except on cygwin/mingw when embedded into
|
||||
# the cwrapper.
|
||||
WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no
|
||||
if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then
|
||||
# special case for '.'
|
||||
if test "$thisdir" = "."; then
|
||||
thisdir=`pwd`
|
||||
fi
|
||||
# remove .libs from thisdir
|
||||
case "$thisdir" in
|
||||
*[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;;
|
||||
.libs ) thisdir=. ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Try to get the absolute directory name.
|
||||
absdir=`cd "$thisdir" && pwd`
|
||||
test -n "$absdir" && thisdir="$absdir"
|
||||
|
||||
program=lt-'prun'
|
||||
progdir="$thisdir/.libs"
|
||||
|
||||
if test ! -f "$progdir/$program" ||
|
||||
{ file=`ls -1dt "$progdir/$program" "$progdir/../$program" 2>/dev/null | /usr/bin/sed 1q`; \
|
||||
test "X$file" != "X$progdir/$program"; }; then
|
||||
|
||||
file="$$-$program"
|
||||
|
||||
if test ! -d "$progdir"; then
|
||||
mkdir "$progdir"
|
||||
else
|
||||
rm -f "$progdir/$file"
|
||||
fi
|
||||
|
||||
# relink executable if necessary
|
||||
if test -n "$relink_command"; then
|
||||
if relink_command_output=`eval $relink_command 2>&1`; then :
|
||||
else
|
||||
$ECHO "$relink_command_output" >&2
|
||||
rm -f "$progdir/$file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
mv -f "$progdir/$file" "$progdir/$program" 2>/dev/null ||
|
||||
{ rm -f "$progdir/$program";
|
||||
mv -f "$progdir/$file" "$progdir/$program"; }
|
||||
rm -f "$progdir/$file"
|
||||
fi
|
||||
|
||||
if test -f "$progdir/$program"; then
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
# Run the actual program with our arguments.
|
||||
func_exec_program ${1+"$@"}
|
||||
fi
|
||||
else
|
||||
# The program doesn't exist.
|
||||
$ECHO "$0: error: '$progdir/$program' does not exist" 1>&2
|
||||
$ECHO "This script is just a wrapper for $program." 1>&2
|
||||
$ECHO "See the libtool documentation for more information." 1>&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,37 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef PRUN_H
|
||||
#define PRUN_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/**
|
||||
* Main body of prun functionality
|
||||
*/
|
||||
int prun(int argc, char *argv[]);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* PRUN_H */
|
@ -1,57 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# This is not quite in the Automake spirit, but we have to do it.
|
||||
# Since the totalview portion of the library must be built with -g, we
|
||||
# must eliminate the CFLAGS that are passed in here by default (which
|
||||
# may already have debugging and/or optimization flags). We use
|
||||
# post-processed forms of the CFLAGS in the library targets down
|
||||
# below.
|
||||
|
||||
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
|
||||
|
||||
include $(top_srcdir)/Makefile.ompi-rules
|
||||
|
||||
man_pages = orte-dvm.1
|
||||
EXTRA_DIST = $(man_pages:.1=.1in)
|
||||
|
||||
if OPAL_INSTALL_BINARIES
|
||||
|
||||
bin_PROGRAMS = orte-dvm
|
||||
|
||||
nodist_man_MANS = $(man_pages)
|
||||
|
||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
||||
# changes; a "good enough" way to know if configure was run again (and
|
||||
# therefore the release date or version may have changed)
|
||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
||||
|
||||
endif # OPAL_INSTALL_BINARIES
|
||||
|
||||
orte_dvm_SOURCES = \
|
||||
orte-dvm.c
|
||||
|
||||
orte_dvm_LDADD = \
|
||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
||||
|
||||
distclean-local:
|
||||
rm -f $(man_pages)
|
@ -1,193 +0,0 @@
|
||||
.\" -*- nroff -*-
|
||||
.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
||||
.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
.\" Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
.\" $COPYRIGHT$
|
||||
.\"
|
||||
.\" Man page for ORTE's orte-dvm command
|
||||
.\"
|
||||
.\" .TH name section center-footer left-footer center-header
|
||||
.TH ORTE-DVM 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
||||
.\" **************************
|
||||
.\" Name Section
|
||||
.\" **************************
|
||||
.SH NAME
|
||||
.
|
||||
orte-dvm, ompi-dvm \- Establish a Distributed Virtual Machine (DVM).
|
||||
|
||||
.B Note:
|
||||
\fIorte-dvm\fP and \fIompi-dvm\fP are synonyms for each
|
||||
other. Using either of the names will produce the same behavior.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Synopsis Section
|
||||
.\" **************************
|
||||
.SH SYNOPSIS
|
||||
.
|
||||
.PP
|
||||
.B orte-dvm
|
||||
[ options ]
|
||||
.P
|
||||
|
||||
Invoking \fIorte-dvm\fP via an absolute path
|
||||
name is equivalent to specifying the \fI--prefix\fP option with a
|
||||
\fI<dir>\fR value equivalent to the directory where \fIorte-dvm\fR
|
||||
resides, minus its last subdirectory. For example:
|
||||
|
||||
\fB%\fP /usr/local/bin/orte-dvm ...
|
||||
|
||||
is equivalent to
|
||||
|
||||
\fB%\fP orte-dvm --prefix /usr/local
|
||||
|
||||
.
|
||||
.\" **************************
|
||||
.\" Quick Summary Section
|
||||
.\" **************************
|
||||
.SH QUICK SUMMARY
|
||||
.
|
||||
\fIorte-dvm\fP will establish a DVM that can be used to execute subsequent
|
||||
applications. Use of \fIorte-dvm\fP can be advantageous, for example, when you want to
|
||||
execute a number of short-lived tasks. In such cases, the time required to start
|
||||
the ORTE DVM can be a significant fraction of the time to execute the
|
||||
overall application. Thus, creating a persistent DVM can speed the overall
|
||||
execution. In addition, a persistent DVM will support executing multiple parallel
|
||||
applications while maintaining separation between their respective cores.
|
||||
.\" **************************
|
||||
.\" Options Section
|
||||
.\" **************************
|
||||
.SH OPTIONS
|
||||
.
|
||||
.\"
|
||||
.\" Start options listing
|
||||
.\" Indent 10 characters from start of first column to start of second column
|
||||
.
|
||||
.TP
|
||||
.B -h\fR,\fP --help
|
||||
Display help for this command
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -V\fR,\fP --version
|
||||
Print version number. If no other arguments are given, this will also
|
||||
cause orte-dvm to exit.
|
||||
.
|
||||
.
|
||||
.P
|
||||
Use one of the following options to specify which hosts (nodes) of the cluster to use
|
||||
for the DVM.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -H\fR,\fP -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
||||
List of hosts for the DVM.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B
|
||||
-hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
||||
Provide a hostfile to use.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
||||
Synonym for \fI-hostfile\fP.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --prefix \fR<dir>\fP
|
||||
Prefix directory that will be used to set the \fIPATH\fR and
|
||||
\fILD_LIBRARY_PATH\fR on the remote node before invoking the ORTE daemon.
|
||||
.
|
||||
.
|
||||
.P
|
||||
Setting MCA parameters:
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
||||
the parameter name; \fI<value>\fP is the parameter value.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -mca\fR,\fP --mca <key> <value>
|
||||
Send arguments to various MCA modules. See the "MCA" section, below.
|
||||
.
|
||||
.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B -report-uri\fR,\fP --report-uri <channel>
|
||||
Print out orte-dvm's URI during startup. The channel must be either a '-' to indicate that
|
||||
the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr,
|
||||
or a filename to which the URI is to be written.
|
||||
.
|
||||
.
|
||||
.P
|
||||
The following options are useful for developers; they are not generally
|
||||
useful to most ORTE and/or MPI users:
|
||||
.
|
||||
.TP
|
||||
.B -d\fR,\fP --debug-devel
|
||||
Enable debugging of the ORTE layer.
|
||||
.
|
||||
.
|
||||
.TP
|
||||
.B --debug-daemons-file
|
||||
Enable debugging of the ORTE daemons in the DVM, storing
|
||||
output in files.
|
||||
.
|
||||
.
|
||||
.P
|
||||
There may be other options listed with \fIorte-dvm --help\fP.
|
||||
.
|
||||
.
|
||||
.\" **************************
|
||||
.\" Description Section
|
||||
.\" **************************
|
||||
.SH DESCRIPTION
|
||||
.
|
||||
\fIorte-dvm\fP starts a Distributed Virtual Machine (DVM) by launching
|
||||
a daemon on each node of the allocation, as modified or specified by
|
||||
the \fI-host\fP and \fI-hostfile\fP options. Applications can subsequently
|
||||
be executed using the \fIorte-submit\fP command.
|
||||
.
|
||||
The DVM remains in operation until receiving the \fIorte-submit -terminate\fP
|
||||
command.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS Specifying Host Nodes
|
||||
.
|
||||
Host nodes can be identified on the \fIorte-dvm\fP command line with the \fI-host\fP
|
||||
option or in a hostfile.
|
||||
.
|
||||
.PP
|
||||
For example,
|
||||
.
|
||||
.TP 4
|
||||
orte-dvm -H aa,aa,bb ./a.out
|
||||
launches two processes on node aa and one on bb.
|
||||
.
|
||||
.PP
|
||||
Or, consider the hostfile
|
||||
.
|
||||
|
||||
\fB%\fP cat myhostfile
|
||||
aa slots=2
|
||||
bb slots=2
|
||||
cc slots=2
|
||||
|
||||
.
|
||||
.PP
|
||||
Here, we list not only the host names (aa, bb, and cc) but also how many "slots"
|
||||
there are for each. Slots indicate how many processes can potentially execute
|
||||
on a node. For best performance, the number of slots may be chosen to be the
|
||||
number of cores on the node or the number of processor sockets. If the hostfile
|
||||
does not provide slots information, a default of 1 is assumed.
|
||||
When running under resource managers (e.g., SLURM, Torque, etc.),
|
||||
Open MPI will obtain both the hostnames and the number of slots directly
|
||||
from the resource manager.
|
||||
.
|
||||
.
|
@ -1,482 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif /* HAVE_STRINGS_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif /* HAVE_SYS_WAIT_H */
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
#include <fcntl.h>
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/fd.h"
|
||||
#include "opal/util/daemon_init.h"
|
||||
|
||||
#include "opal/version.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_info_support.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
||||
|
||||
/*
|
||||
* Storage for the values of the command-line options declared in cmd_line_init
|
||||
*/
|
||||
static struct {
|
||||
bool help;
|
||||
bool version;
|
||||
char *prefix;
|
||||
bool run_as_root;
|
||||
bool set_sid;
|
||||
bool daemonize;
|
||||
bool system_server;
|
||||
char *report_uri;
|
||||
bool remote_connections;
|
||||
} myglobals;
|
||||
|
||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, 'h', NULL, "help", 0,
|
||||
&myglobals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
{ NULL, 'V', NULL, "version", 0,
|
||||
&myglobals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit" },
|
||||
|
||||
{ NULL, '\0', "prefix", "prefix", 1,
|
||||
&myglobals.prefix, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Prefix to be used to look for ORTE executables" },
|
||||
|
||||
{ "orte_daemonize", '\0', NULL, "daemonize", 0,
|
||||
&myglobals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Daemonize the orte-dvm into the background" },
|
||||
|
||||
{ NULL, '\0', NULL, "set-sid", 0,
|
||||
&myglobals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Direct the orte-dvm to separate from the current session"},
|
||||
|
||||
{ "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Debug daemons" },
|
||||
|
||||
{ "orte_debug", 'd', "debug-devel", "debug-devel", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable debugging of OpenRTE" },
|
||||
|
||||
{ NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0,
|
||||
&myglobals.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Allow execution as root (STRONGLY DISCOURAGED)" },
|
||||
|
||||
/* Specify the launch agent to be used */
|
||||
{ "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Command used to start processes on remote nodes (default: orted)" },
|
||||
|
||||
/* maximum size of VM - typically used to subdivide an allocation */
|
||||
{ "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Maximum size of VM" },
|
||||
|
||||
/* Set a hostfile */
|
||||
{ NULL, '\0', "hostfile", "hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ NULL, '\0', "machinefile", "machinefile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a default hostfile" },
|
||||
|
||||
{ NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
{ NULL, '\0', "system-server", "system-server", 0,
|
||||
&myglobals.system_server, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Provide a system-level server connection point - only one allowed per node" },
|
||||
|
||||
{ NULL, '\0', "report-uri", "report-uri", 1,
|
||||
&myglobals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout URI on stdout [-], stderr [+], or a file [anything else]",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
|
||||
{ NULL, '\0', "remote-tools", "remote-tools", 0,
|
||||
&myglobals.remote_connections, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable connections from remote tools" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int rc, i, j;
|
||||
opal_cmd_line_t cmd_line;
|
||||
char *param, *value;
|
||||
orte_job_t *jdata=NULL;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* Setup and parse the command line */
|
||||
memset(&myglobals, 0, sizeof(myglobals));
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
orte_basename = opal_basename(argv[0]);
|
||||
|
||||
opal_cmd_line_create(&cmd_line, cmd_line_init);
|
||||
mca_base_cmd_line_setup(&cmd_line);
|
||||
if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, false,
|
||||
argc, argv)) ) {
|
||||
if (OPAL_ERR_SILENT != rc) {
|
||||
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
||||
opal_strerror(rc));
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* print version if requested. Do this before check for help so
|
||||
that --version --help works as one might expect. */
|
||||
if (myglobals.version) {
|
||||
char *str;
|
||||
str = opal_info_make_version_str("all",
|
||||
OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
||||
OPAL_RELEASE_VERSION,
|
||||
OPAL_GREEK_VERSION,
|
||||
OPAL_REPO_REV);
|
||||
if (NULL != str) {
|
||||
fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
|
||||
orte_basename, str, PACKAGE_BUGREPORT);
|
||||
free(str);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* check if we are running as root - if we are, then only allow
|
||||
* us to proceed if the allow-run-as-root flag was given. Otherwise,
|
||||
* exit with a giant warning flag
|
||||
*/
|
||||
if (0 == geteuid() && !myglobals.run_as_root) {
|
||||
/* show_help is not yet available, so print an error manually */
|
||||
fprintf(stderr, "--------------------------------------------------------------------------\n");
|
||||
if (myglobals.help) {
|
||||
fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename);
|
||||
} else {
|
||||
fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename);
|
||||
}
|
||||
|
||||
fprintf(stderr, "Running at root is *strongly* discouraged as any mistake (e.g., in\n");
|
||||
fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n");
|
||||
fprintf(stderr, "file system, leaving your system in an unusable state.\n\n");
|
||||
|
||||
fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename);
|
||||
|
||||
fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
|
||||
fprintf(stderr, "option to your command line. However, we reiterate our strong advice\n");
|
||||
fprintf(stderr, "against doing so - please do so at your own risk.\n");
|
||||
fprintf(stderr, "--------------------------------------------------------------------------\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since this process can now handle MCA/GMCA parameters, make sure to
|
||||
* process them.
|
||||
* NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
|
||||
* opal_init_util() since mca_base_cmd_line_process_args() does *not*
|
||||
* depend upon opal_init_util() functionality.
|
||||
*/
|
||||
if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Need to initialize OPAL so that install_dirs are filled in */
|
||||
if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Check for help request */
|
||||
if (myglobals.help) {
|
||||
char *str, *args = NULL;
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
||||
project_name = "Open MPI";
|
||||
} else {
|
||||
project_name = "OpenRTE";
|
||||
}
|
||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
||||
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
|
||||
orte_basename, project_name, OPAL_VERSION,
|
||||
orte_basename, args,
|
||||
PACKAGE_BUGREPORT);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
|
||||
/* If someone asks for help, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (myglobals.system_server) {
|
||||
/* we should act as system-level PMIx server */
|
||||
opal_setenv(OPAL_MCA_PREFIX"pmix_system_server", "1", true, &environ);
|
||||
}
|
||||
/* always act as session-level PMIx server */
|
||||
opal_setenv(OPAL_MCA_PREFIX"pmix_session_server", "1", true, &environ);
|
||||
/* if we were asked to report a uri, set the MCA param to do so */
|
||||
if (NULL != myglobals.report_uri) {
|
||||
opal_setenv("PMIX_MCA_ptl_tcp_report_uri", myglobals.report_uri, true, &environ);
|
||||
}
|
||||
if (myglobals.remote_connections) {
|
||||
opal_setenv("PMIX_MCA_ptl_tcp_remote_connections", "1", true, &environ);
|
||||
}
|
||||
|
||||
/* Setup MCA params */
|
||||
orte_register_params();
|
||||
|
||||
/* save the environment for launch purposes. This MUST be
|
||||
* done so that we can pass it to any local procs we
|
||||
* spawn - otherwise, those local procs won't see any
|
||||
* non-MCA envars that were set in the environment prior to calling
|
||||
* orterun
|
||||
*/
|
||||
orte_launch_environ = opal_argv_copy(environ);
|
||||
|
||||
#if defined(HAVE_SETSID)
|
||||
/* see if we were directed to separate from current session */
|
||||
if (myglobals.set_sid) {
|
||||
setsid();
|
||||
}
|
||||
#endif
|
||||
|
||||
/* detach from controlling terminal if asked to daemonize and not debugging;
|
||||
* otherwise, remain attached so output can get to us
|
||||
*/
|
||||
if(!orte_debug_flag &&
|
||||
!orte_debug_daemons_flag &&
|
||||
myglobals.daemonize) {
|
||||
opal_daemon_init(NULL);
|
||||
}
|
||||
|
||||
/* Initialize our Open RTE environment */
|
||||
if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_MASTER))) {
|
||||
/* cannot call ORTE_ERROR_LOG as it could be the errmgr
|
||||
* never got loaded!
|
||||
*/
|
||||
return rc;
|
||||
}
|
||||
/* finalize OPAL. As it was opened again from orte_init->opal_init
|
||||
* we continue to have a reference count on it. So we have to finalize it twice...
|
||||
*/
|
||||
opal_finalize();
|
||||
|
||||
/* get the daemon job object - was created by ess/hnp component */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
orte_show_help("help-orterun.txt", "bad-job-object", true,
|
||||
orte_basename);
|
||||
exit(0);
|
||||
}
|
||||
/* also should have created a daemon "app" */
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
|
||||
orte_show_help("help-orterun.txt", "bad-app-object", true,
|
||||
orte_basename);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* Did the user specify a prefix, or want prefix by default? */
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
|
||||
size_t param_len;
|
||||
/* if both the prefix was given and we have a prefix
|
||||
* given above, check to see if they match
|
||||
*/
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
|
||||
NULL != myglobals.prefix) {
|
||||
/* if they don't match, then that merits a warning */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
/* ensure we strip any trailing '/' */
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
|
||||
param[strlen(param)-1] = '\0';
|
||||
}
|
||||
value = strdup(myglobals.prefix);
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
|
||||
value[strlen(value)-1] = '\0';
|
||||
}
|
||||
if (0 != strcmp(param, value)) {
|
||||
orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
|
||||
true, orte_basename, value, param);
|
||||
/* let the global-level prefix take precedence since we
|
||||
* know that one is being used
|
||||
*/
|
||||
free(param);
|
||||
param = strdup(myglobals.prefix);
|
||||
}
|
||||
free(value);
|
||||
} else if (NULL != myglobals.prefix) {
|
||||
param = myglobals.prefix;
|
||||
} else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
|
||||
/* must be --prefix alone */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
} else {
|
||||
/* --enable-orterun-prefix-default was given to orterun */
|
||||
param = strdup(opal_install_dirs.prefix);
|
||||
}
|
||||
|
||||
if (NULL != param) {
|
||||
/* "Parse" the param, aka remove superfluous path_sep. */
|
||||
param_len = strlen(param);
|
||||
while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
|
||||
param[param_len-1] = '\0';
|
||||
param_len--;
|
||||
if (0 == param_len) {
|
||||
orte_show_help("help-orterun.txt", "orterun:empty-prefix",
|
||||
true, orte_basename, orte_basename);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
|
||||
free(param);
|
||||
}
|
||||
}
|
||||
|
||||
/* Did the user specify a hostfile? Need to check for both
|
||||
* hostfile and machine file.
|
||||
* We can only deal with one hostfile per app context, otherwise give an error.
|
||||
*/
|
||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
|
||||
if(1 < j) {
|
||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
||||
true, orte_basename, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
} else {
|
||||
value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
|
||||
}
|
||||
}
|
||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
|
||||
if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
|
||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
||||
true, orte_basename, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
} else {
|
||||
value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
|
||||
}
|
||||
}
|
||||
|
||||
/* Did the user specify any hosts? */
|
||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
|
||||
char **targ=NULL, *tval;
|
||||
for (i = 0; i < j; ++i) {
|
||||
value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
|
||||
opal_argv_append_nosize(&targ, value);
|
||||
}
|
||||
tval = opal_argv_join(targ, ',');
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
|
||||
opal_argv_free(targ);
|
||||
free(tval);
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd_line);
|
||||
|
||||
/* setup to listen for commands sent specifically to me, even though I would probably
|
||||
* be the one sending them! Unfortunately, since I am a participating daemon,
|
||||
* there are times I need to send a command to "all daemons", and that means *I* have
|
||||
* to receive it too
|
||||
*/
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
|
||||
ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
|
||||
|
||||
/* spawn the DVM - we skip the initial steps as this
|
||||
* isn't a user-level application */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);
|
||||
|
||||
/* loop the event lib until an exit event is detected */
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
|
||||
if (orte_debug_flag) {
|
||||
fprintf(stderr, "exiting with status %d\n", orte_exit_status);
|
||||
}
|
||||
exit(orte_exit_status);
|
||||
}
|
Загрузка…
Ссылка в новой задаче
Block a user