Merge pull request #5944 from rhc54/topic/psrvr
Remove the stale orte-dvm code
Этот коммит содержится в:
Коммит
6213d23f0b
@ -248,14 +248,12 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
|||||||
AC_MSG_ERROR([Cannot continue])])
|
AC_MSG_ERROR([Cannot continue])])
|
||||||
|
|
||||||
AC_MSG_CHECKING([if user requested internal PMIx support($with_pmix)])
|
AC_MSG_CHECKING([if user requested internal PMIx support($with_pmix)])
|
||||||
opal_prun_happy=no
|
|
||||||
opal_external_pmix_happy=no
|
opal_external_pmix_happy=no
|
||||||
opal_external_have_pmix1=0
|
opal_external_have_pmix1=0
|
||||||
|
|
||||||
AS_IF([test "$with_pmix" = "internal"],
|
AS_IF([test "$with_pmix" = "internal"],
|
||||||
[AC_MSG_RESULT([yes])
|
[AC_MSG_RESULT([yes])
|
||||||
opal_external_pmix_happy=no
|
opal_external_pmix_happy=no
|
||||||
opal_prun_happy=yes
|
|
||||||
opal_external_pmix_version=internal],
|
opal_external_pmix_version=internal],
|
||||||
|
|
||||||
[AC_MSG_RESULT([no])
|
[AC_MSG_RESULT([no])
|
||||||
@ -376,7 +374,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
|||||||
[AC_MSG_RESULT([found])
|
[AC_MSG_RESULT([found])
|
||||||
opal_external_pmix_version=2x
|
opal_external_pmix_version=2x
|
||||||
opal_external_pmix_version_found=1
|
opal_external_pmix_version_found=1
|
||||||
opal_prun_happy=yes
|
|
||||||
opal_external_pmix_happy=yes],
|
opal_external_pmix_happy=yes],
|
||||||
[AC_MSG_RESULT([not found])])])
|
[AC_MSG_RESULT([not found])])])
|
||||||
|
|
||||||
@ -436,7 +433,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
|
|||||||
|
|
||||||
AC_DEFINE_UNQUOTED([OPAL_PMIX_V1],[$opal_external_have_pmix1],
|
AC_DEFINE_UNQUOTED([OPAL_PMIX_V1],[$opal_external_have_pmix1],
|
||||||
[Whether the external PMIx library is v1])
|
[Whether the external PMIx library is v1])
|
||||||
AM_CONDITIONAL([OPAL_WANT_PRUN], [test "$opal_prun_happy" = "yes"])
|
|
||||||
|
|
||||||
AS_IF([test "$opal_external_pmix_happy" = "yes"],
|
AS_IF([test "$opal_external_pmix_happy" = "yes"],
|
||||||
[AS_IF([test "$opal_external_pmix_version" = "1x"],
|
[AS_IF([test "$opal_external_pmix_version" = "1x"],
|
||||||
|
@ -30,7 +30,5 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
|
|||||||
orte/tools/orte-top/Makefile
|
orte/tools/orte-top/Makefile
|
||||||
orte/tools/orte-info/Makefile
|
orte/tools/orte-info/Makefile
|
||||||
orte/tools/orte-server/Makefile
|
orte/tools/orte-server/Makefile
|
||||||
orte/tools/orte-dvm/Makefile
|
|
||||||
orte/tools/ompi-prun/Makefile
|
|
||||||
])
|
])
|
||||||
])
|
])
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
# Copyright (c) 2012 Los Alamos National Security, LLC.
|
# Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||||
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
# Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -30,12 +30,6 @@ libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
|
|||||||
|
|
||||||
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1
|
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1
|
||||||
|
|
||||||
if OPAL_WANT_PRUN
|
|
||||||
if WANT_INSTALL_HEADERS
|
|
||||||
man_pages += ompi-dvm.1
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
|
|
||||||
if OPAL_INSTALL_BINARIES
|
if OPAL_INSTALL_BINARIES
|
||||||
nodist_man_MANS = $(man_pages)
|
nodist_man_MANS = $(man_pages)
|
||||||
|
|
||||||
@ -46,9 +40,6 @@ install-exec-hook:
|
|||||||
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
|
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
|
||||||
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
|
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
|
||||||
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
|
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
|
||||||
if OPAL_WANT_PRUN
|
|
||||||
(cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT))
|
|
||||||
endif
|
|
||||||
|
|
||||||
uninstall-local:
|
uninstall-local:
|
||||||
rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
|
rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
|
||||||
@ -57,9 +48,6 @@ uninstall-local:
|
|||||||
$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
|
$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
|
||||||
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
|
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
|
||||||
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
|
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
|
||||||
if OPAL_WANT_PRUN
|
|
||||||
rm -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT)
|
|
||||||
endif
|
|
||||||
|
|
||||||
endif # OPAL_INSTALL_BINARIES
|
endif # OPAL_INSTALL_BINARIES
|
||||||
|
|
||||||
@ -96,10 +84,5 @@ $(top_builddir)/orte/tools/orte-server/orte-server.1:
|
|||||||
ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
|
ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
|
||||||
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
|
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
|
||||||
|
|
||||||
if OPAL_WANT_PRUN
|
|
||||||
ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1
|
|
||||||
cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1
|
|
||||||
endif
|
|
||||||
|
|
||||||
clean-local:
|
clean-local:
|
||||||
rm -f $(man_pages)
|
rm -f $(man_pages)
|
||||||
|
@ -1,37 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2016 Intel, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# Files that make up the dvm errmgr component.
sources = \
        errmgr_dvm.h \
        errmgr_dvm_component.c \
        errmgr_dvm.c

# The component is built exactly one way:
#   - DSO build:    installable module  mca_errmgr_dvm.la
#   - static build: convenience library libmca_errmgr_dvm.la
# Whichever variable is left empty below disables the other flavor.

if MCA_BUILD_orte_errmgr_dvm_DSO
component_install = mca_errmgr_dvm.la
component_noinst =
else
component_install =
component_noinst = libmca_errmgr_dvm.la
endif

# Installed (DSO) flavor
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_dvm_la_SOURCES = $(sources)
mca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
mca_errmgr_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la

# Non-installed (static) flavor
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_dvm_la_SOURCES = $(sources)
libmca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
|
|
@ -1,632 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#ifdef HAVE_UNISTD_H
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif /* HAVE_UNISTD_H */
|
|
||||||
#include <string.h>
|
|
||||||
#ifdef HAVE_SYS_WAIT_H
|
|
||||||
#include <sys/wait.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/dss/dss.h"
|
|
||||||
|
|
||||||
#include "orte/mca/iof/base/base.h"
|
|
||||||
#include "orte/mca/rml/rml.h"
|
|
||||||
#include "orte/mca/odls/odls.h"
|
|
||||||
#include "orte/mca/odls/base/base.h"
|
|
||||||
#include "orte/mca/odls/base/odls_private.h"
|
|
||||||
#include "orte/mca/plm/base/plm_private.h"
|
|
||||||
#include "orte/mca/plm/plm.h"
|
|
||||||
#include "orte/mca/rmaps/rmaps_types.h"
|
|
||||||
#include "orte/mca/routed/routed.h"
|
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
|
||||||
#include "orte/mca/ess/ess.h"
|
|
||||||
#include "orte/mca/state/state.h"
|
|
||||||
|
|
||||||
#include "orte/util/error_strings.h"
|
|
||||||
#include "orte/util/name_fns.h"
|
|
||||||
#include "orte/util/proc_info.h"
|
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
#include "orte/util/threads.h"
|
|
||||||
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
#include "orte/runtime/orte_locks.h"
|
|
||||||
#include "orte/runtime/orte_quit.h"
|
|
||||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/errmgr/base/base.h"
|
|
||||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
||||||
|
|
||||||
#include "errmgr_dvm.h"
|
|
||||||
|
|
||||||
static int init(void);
|
|
||||||
static int finalize(void);
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* dvm module
|
|
||||||
******************/
|
|
||||||
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
|
||||||
.init = init,
|
|
||||||
.finalize = finalize,
|
|
||||||
.logfn = orte_errmgr_base_log,
|
|
||||||
.abort = orte_errmgr_base_abort,
|
|
||||||
.abort_peers = orte_errmgr_base_abort_peers
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local functions
|
|
||||||
*/
|
|
||||||
static void job_errors(int fd, short args, void *cbdata);
|
|
||||||
static void proc_errors(int fd, short args, void *cbdata);
|
|
||||||
|
|
||||||
static int init(void)
|
|
||||||
{
|
|
||||||
/* setup state machine to trap job errors */
|
|
||||||
orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
|
|
||||||
|
|
||||||
/* set the lost connection state to run at MSG priority so
|
|
||||||
* we can process any last messages from the proc
|
|
||||||
*/
|
|
||||||
orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
|
|
||||||
|
|
||||||
/* setup state machine to trap proc errors */
|
|
||||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int finalize(void)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Ask the PLM to terminate the procs of the given job.
 *
 * A temporary proc object named (jobid, ORTE_VPID_WILDCARD) is placed in a
 * one-slot pointer array and handed to orte_plm.terminate_procs(); the
 * wildcard vpid presumably selects every proc in the job (confirm against
 * the PLM implementation). The return value of terminate_procs() is not
 * examined.
 */
static void _terminate_job(orte_jobid_t jobid)
{
    orte_proc_t wildcard;
    opal_pointer_array_t targets;

    /* describe "this job, wildcard vpid" */
    OBJ_CONSTRUCT(&wildcard, orte_proc_t);
    wildcard.name.jobid = jobid;
    wildcard.name.vpid = ORTE_VPID_WILDCARD;

    /* wrap it in the array form the PLM expects */
    OBJ_CONSTRUCT(&targets, opal_pointer_array_t);
    opal_pointer_array_init(&targets, 1, 1, 1);
    opal_pointer_array_add(&targets, &wildcard);

    orte_plm.terminate_procs(&targets);

    /* the array only references the stack object, so tear it down first */
    OBJ_DESTRUCT(&targets);
    OBJ_DESTRUCT(&wildcard);
}
|
|
||||||
|
|
||||||
static void job_errors(int fd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_job_state_t jobstate;
|
|
||||||
opal_buffer_t *answer;
|
|
||||||
int32_t rc, ret;
|
|
||||||
int room, *rmptr;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* if orte is trying to shutdown, just let it
|
|
||||||
*/
|
|
||||||
if (orte_finalizing) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if the jdata is NULL, then we ignore it as this
|
|
||||||
* is reporting an unrecoverable error
|
|
||||||
*/
|
|
||||||
if (NULL == caddy->jdata) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* update the state */
|
|
||||||
jdata = caddy->jdata;
|
|
||||||
jobstate = caddy->job_state;
|
|
||||||
jdata->state = jobstate;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: job %s reported state %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_JOBID_PRINT(jdata->jobid),
|
|
||||||
orte_job_state_to_str(jobstate)));
|
|
||||||
|
|
||||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
|
||||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
|
||||||
* then this could well have been caused by a daemon not finding
|
|
||||||
* a way back to us. In this case, output a message indicating a daemon
|
|
||||||
* died without reporting. Otherwise, say nothing as we
|
|
||||||
* likely already output an error message */
|
|
||||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
|
||||||
jdata->num_procs != jdata->num_reported) {
|
|
||||||
orte_routing_is_enabled = false;
|
|
||||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
|
||||||
}
|
|
||||||
/* there really isn't much else we can do since the problem
|
|
||||||
* is in the DVM itself, so best just to terminate */
|
|
||||||
jdata->num_terminated = jdata->num_procs;
|
|
||||||
/* activate the terminated state so we can exit */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* all other cases involve jobs submitted to the DVM - therefore,
|
|
||||||
* we only inform the submitter of the problem, but do NOT terminate
|
|
||||||
* the DVM itself */
|
|
||||||
|
|
||||||
rc = jobstate;
|
|
||||||
answer = OBJ_NEW(opal_buffer_t);
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/* pack the room number */
|
|
||||||
rmptr = &room;
|
|
||||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm sending notification of job %s failure to %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_JOBID_PRINT(jdata->jobid),
|
|
||||||
ORTE_NAME_PRINT(&jdata->originator)));
|
|
||||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
|
||||||
&jdata->originator, answer,
|
|
||||||
ORTE_RML_TAG_LAUNCH_RESP,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(answer);
|
|
||||||
}
|
|
||||||
/* ensure we terminate any processes left running in the DVM */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
|
|
||||||
/* cleanup */
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void proc_errors(int fd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_proc_t *pptr, *proct;
|
|
||||||
orte_process_name_t *proc = &caddy->name;
|
|
||||||
orte_proc_state_t state = caddy->proc_state;
|
|
||||||
int i;
|
|
||||||
int32_t i32, *i32ptr;
|
|
||||||
char *rtmod;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: for proc %s state %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc),
|
|
||||||
orte_proc_state_to_str(state)));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* if orte is trying to shutdown, just let it
|
|
||||||
*/
|
|
||||||
if (orte_finalizing) {
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get the job object */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
|
||||||
/* could be a race condition */
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
|
||||||
|
|
||||||
/* get the management conduit's routed module */
|
|
||||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
|
||||||
|
|
||||||
/* we MUST handle a communication failure before doing anything else
|
|
||||||
* as it requires some special care to avoid normal termination issues
|
|
||||||
* for local application procs
|
|
||||||
*/
|
|
||||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
|
||||||
/* is this to a daemon? */
|
|
||||||
if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
|
|
||||||
/* nope - ignore it */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure to non-daemon proc - ignoring it",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* if this is my own connection, ignore it */
|
|
||||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure on my own connection - ignoring it",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* mark the daemon as gone */
|
|
||||||
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
|
|
||||||
/* update the state */
|
|
||||||
pptr->state = state;
|
|
||||||
/* adjust our num_procs */
|
|
||||||
--orte_process_info.num_procs;
|
|
||||||
/* if we have ordered orteds to terminate or abort
|
|
||||||
* is in progress, record it */
|
|
||||||
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure: daemons terminating - recording daemon %s as gone",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
|
||||||
/* remove from dependent routes, if it is one */
|
|
||||||
orte_routed.route_lost(rtmod, proc);
|
|
||||||
/* if all my routes and local children are gone, then terminate ourselves */
|
|
||||||
if (0 == orte_routed.num_routes(rtmod)) {
|
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
|
||||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
|
||||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
|
||||||
/* at least one is still alive */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure: at least one proc (%s) still alive",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&proct->name)));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* call our appropriate exit procedure */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr_dvm: all routes and children gone - ordering exit",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
|
||||||
} else {
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure: %d routes remain alive",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(int)orte_routed.num_routes(rtmod)));
|
|
||||||
}
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s Comm failure: daemon %s - aborting",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
|
||||||
/* record the first one to fail */
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
/* output an error message so the user knows what happened */
|
|
||||||
orte_show_help("help-errmgr-base.txt", "node-died", true,
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
orte_process_info.nodename,
|
|
||||||
ORTE_NAME_PRINT(proc),
|
|
||||||
pptr->node->name);
|
|
||||||
/* mark the daemon job as failed */
|
|
||||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
|
||||||
/* point to the lowest rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
/* update our exit code */
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* just in case the exit code hadn't been set, do it here - this
|
|
||||||
* won't override any reported exit code */
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
|
||||||
}
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* update the proc state - can get multiple reports on a proc
|
|
||||||
* depending on circumstances, so ensure we only do this once
|
|
||||||
*/
|
|
||||||
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
|
|
||||||
pptr->state = state;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
|
||||||
* any of our routes or local children remain alive - if not, then
|
|
||||||
* terminate ourselves. */
|
|
||||||
if (orte_orteds_term_ordered) {
|
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
|
||||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
|
||||||
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
|
|
||||||
goto keep_going;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* if all my routes and children are gone, then terminate
|
|
||||||
ourselves nicely (i.e., this is a normal termination) */
|
|
||||||
if (0 == orte_routed.num_routes(rtmod)) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:default:dvm all routes gone - exiting",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
keep_going:
|
|
||||||
/* ensure we record the failed proc properly so we can report
|
|
||||||
* the error once we terminate
|
|
||||||
*/
|
|
||||||
switch (state) {
|
|
||||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s killed by cmd",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
|
||||||
* and we don't flag it as such
|
|
||||||
*/
|
|
||||||
if (jdata->num_terminated >= jdata->num_procs) {
|
|
||||||
/* this job has terminated */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
|
||||||
}
|
|
||||||
/* don't abort the job as this isn't an abnormal termination */
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_ABORTED:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s aborted",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s aborted by signal",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s terminated without sync",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* now treat a special case - if the proc exit'd without a required
|
|
||||||
* sync, it may have done so with a zero exit code. We want to ensure
|
|
||||||
* that the user realizes there was an error, so in this -one- case,
|
|
||||||
* we overwrite the process' exit code with the default error code
|
|
||||||
*/
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc),
|
|
||||||
orte_proc_state_to_str(state)));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
opal_buffer_t *answer;
|
|
||||||
int id, *idptr, ret;
|
|
||||||
|
|
||||||
if (ORTE_PROC_STATE_FAILED_TO_START) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
|
||||||
} else {
|
|
||||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
|
|
||||||
}
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
/* send a notification to the requestor - indicate that this is a spawn response */
|
|
||||||
answer = OBJ_NEW(opal_buffer_t);
|
|
||||||
/* pack the return status */
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &pptr->exit_code, 1, OPAL_INT32))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(answer);
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
/* pack the jobid to be returned */
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(answer);
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
idptr = &id;
|
|
||||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&idptr, OPAL_INT)) {
|
|
||||||
/* pack the sender's index to the tracking object */
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, idptr, 1, OPAL_INT))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(answer);
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
|
||||||
/* we need to send the requestor more info about what happened */
|
|
||||||
opal_dss.pack(answer, &jdata->state, 1, ORTE_JOB_STATE_T);
|
|
||||||
opal_dss.pack(answer, &pptr, 1, ORTE_PROC);
|
|
||||||
opal_dss.pack(answer, &pptr->node, 1, ORTE_NODE);
|
|
||||||
}
|
|
||||||
/* return response */
|
|
||||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
|
||||||
&jdata->originator, answer,
|
|
||||||
ORTE_RML_TAG_LAUNCH_RESP,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
OBJ_RELEASE(answer);
|
|
||||||
}
|
|
||||||
/* record that we notified about this job */
|
|
||||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
|
||||||
CLEANUP:
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
/* if this was a daemon, report it */
|
|
||||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
|
||||||
/* output a message indicating we failed to launch a daemon */
|
|
||||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
|
||||||
}
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s called abort with exit code %d",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc), pptr->exit_code));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
|
|
||||||
/* point to the first proc to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s exited with non-zero status %d",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc),
|
|
||||||
pptr->exit_code));
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* track the number of non-zero exits */
|
|
||||||
i32 = 0;
|
|
||||||
i32ptr = &i32;
|
|
||||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
|
|
||||||
++i32;
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
|
|
||||||
if (orte_abort_non_zero_exit) {
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* user requested we consider this normal termination */
|
|
||||||
if (jdata->num_terminated >= jdata->num_procs) {
|
|
||||||
/* this job has terminated */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s heartbeat failed",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
|
|
||||||
/* point to the first rank to cause the problem */
|
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
|
||||||
/* retain the object so it doesn't get free'd */
|
|
||||||
OBJ_RETAIN(pptr);
|
|
||||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
|
||||||
/* kill the job */
|
|
||||||
_terminate_job(jdata->jobid);
|
|
||||||
}
|
|
||||||
/* remove from dependent routes, if it is one */
|
|
||||||
orte_routed.route_lost(rtmod, proc);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: unable to send message to proc %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc)));
|
|
||||||
/* if this proc is one of my daemons, then we are truly
|
|
||||||
* hosed - so just exit out
|
|
||||||
*/
|
|
||||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
/* shouldn't get this, but terminate job if required */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"%s errmgr:dvm: proc %s default error %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(proc),
|
|
||||||
orte_proc_state_to_str(state)));
|
|
||||||
if (jdata->num_terminated == jdata->num_procs) {
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* if the waitpid fired, be sure to let the state machine know */
|
|
||||||
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
@ -1,39 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_ERRMGR_dvm_EXPORT_H
|
|
||||||
#define MCA_ERRMGR_dvm_EXPORT_H
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local Component structures
|
|
||||||
*/
|
|
||||||
|
|
||||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_dvm_component;
|
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_dvm_module;
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* MCA_ERRMGR_dvm_EXPORT_H */
|
|
@ -1,102 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/errmgr/base/base.h"
|
|
||||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
||||||
#include "errmgr_dvm.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Public string for version number
|
|
||||||
*/
|
|
||||||
const char *orte_errmgr_dvm_component_version_string =
|
|
||||||
"ORTE ERRMGR dvm MCA component version " ORTE_VERSION;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local functionality
|
|
||||||
*/
|
|
||||||
static int dvm_register(void);
|
|
||||||
static int dvm_open(void);
|
|
||||||
static int dvm_close(void);
|
|
||||||
static int dvm_component_query(mca_base_module_t **module, int *priority);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Instantiate the public struct with all of our public information
|
|
||||||
* and pointer to our public functions in it
|
|
||||||
*/
|
|
||||||
orte_errmgr_base_component_t mca_errmgr_dvm_component = {
|
|
||||||
/* Handle the general mca_component_t struct containing
|
|
||||||
* meta information about the component dvm
|
|
||||||
*/
|
|
||||||
.base_version = {
|
|
||||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
|
||||||
/* Component name and version */
|
|
||||||
.mca_component_name = "dvm",
|
|
||||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
|
||||||
ORTE_RELEASE_VERSION),
|
|
||||||
|
|
||||||
/* Component open and close functions */
|
|
||||||
.mca_open_component = dvm_open,
|
|
||||||
.mca_close_component = dvm_close,
|
|
||||||
.mca_query_component = dvm_component_query,
|
|
||||||
.mca_register_component_params = dvm_register,
|
|
||||||
},
|
|
||||||
.base_data = {
|
|
||||||
/* The component is checkpoint ready */
|
|
||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
static int my_priority;
|
|
||||||
|
|
||||||
static int dvm_register(void)
|
|
||||||
{
|
|
||||||
mca_base_component_t *c = &mca_errmgr_dvm_component.base_version;
|
|
||||||
|
|
||||||
my_priority = 1000;
|
|
||||||
(void) mca_base_component_var_register(c, "priority",
|
|
||||||
"Priority of the dvm errmgr component",
|
|
||||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
||||||
OPAL_INFO_LVL_9,
|
|
||||||
MCA_BASE_VAR_SCOPE_READONLY, &my_priority);
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int dvm_open(void)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int dvm_close(void)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Component query hook.
 *
 * Only a DVM master (ORTE_PROC_IS_MASTER) selects this component: it gets
 * the dvm errmgr module at the registered priority.  Any other process
 * gets a NULL module, priority -1 and ORTE_ERROR.
 */
static int dvm_component_query(mca_base_module_t **module, int *priority)
{
    if (!ORTE_PROC_IS_MASTER) {
        /* not a DVM master - decline to be selected */
        *module = NULL;
        *priority = -1;
        return ORTE_ERROR;
    }

    /* used by DVM masters */
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_dvm_module;
    return ORTE_SUCCESS;
}
|
|
@ -1,7 +0,0 @@
|
|||||||
#
|
|
||||||
# owner/status file
|
|
||||||
# owner: institution that is responsible for this package
|
|
||||||
# status: e.g. active, maintenance, unmaintained
|
|
||||||
#
|
|
||||||
owner: INTEL
|
|
||||||
status: active
|
|
@ -1,36 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# Files that make up the dvm state component
sources = \
        state_dvm.h \
        state_dvm_component.c \
        state_dvm.c

# The output library lives in this directory and is named either
# mca_<type>_<name>.la (DSO builds, installed) or
# libmca_<type>_<name>.la (static builds, not installed).

if MCA_BUILD_orte_state_dvm_DSO
component_noinst =
component_install = mca_state_dvm.la
else
component_noinst = libmca_state_dvm.la
component_install =
endif

# DSO flavour: installed into the ORTE component directory
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_dvm_la_SOURCES = $(sources)
mca_state_dvm_la_LDFLAGS = -module -avoid-version
mca_state_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la

# static flavour: convenience library, never installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_dvm_la_SOURCES = $(sources)
libmca_state_dvm_la_LDFLAGS = -module -avoid-version
|
|
@ -1,7 +0,0 @@
|
|||||||
#
|
|
||||||
# owner/status file
|
|
||||||
# owner: institution that is responsible for this package
|
|
||||||
# status: e.g. active, maintenance, unmaintained
|
|
||||||
#
|
|
||||||
owner: INTEL
|
|
||||||
status: active
|
|
@ -1,688 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#ifdef HAVE_UNISTD_H
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif /* HAVE_UNISTD_H */
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/mca/pmix/pmix.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/filem/filem.h"
|
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
|
||||||
#include "orte/mca/iof/base/base.h"
|
|
||||||
#include "orte/mca/odls/odls_types.h"
|
|
||||||
#include "orte/mca/plm/base/base.h"
|
|
||||||
#include "orte/mca/ras/base/base.h"
|
|
||||||
#include "orte/mca/regx/regx.h"
|
|
||||||
#include "orte/mca/rmaps/base/base.h"
|
|
||||||
#include "orte/mca/rml/rml.h"
|
|
||||||
#include "orte/mca/rml/base/rml_contact.h"
|
|
||||||
#include "orte/mca/routed/routed.h"
|
|
||||||
#include "orte/util/session_dir.h"
|
|
||||||
#include "orte/util/threads.h"
|
|
||||||
#include "orte/runtime/orte_quit.h"
|
|
||||||
#include "orte/runtime/orte_wait.h"
|
|
||||||
|
|
||||||
#include "orte/mca/state/state.h"
|
|
||||||
#include "orte/mca/state/base/base.h"
|
|
||||||
#include "orte/mca/state/base/state_private.h"
|
|
||||||
#include "state_dvm.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Module functions: Global
|
|
||||||
*/
|
|
||||||
static int init(void);
|
|
||||||
static int finalize(void);
|
|
||||||
|
|
||||||
/* local functions */
|
|
||||||
static void init_complete(int fd, short args, void *cbdata);
|
|
||||||
static void vm_ready(int fd, short args, void *cbdata);
|
|
||||||
static void check_complete(int fd, short args, void *cbdata);
|
|
||||||
static void cleanup_job(int fd, short args, void *cbdata);
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* DVM module - used when mpirun is persistent
|
|
||||||
******************/
|
|
||||||
orte_state_base_module_t orte_state_dvm_module = {
|
|
||||||
init,
|
|
||||||
finalize,
|
|
||||||
orte_state_base_activate_job_state,
|
|
||||||
orte_state_base_add_job_state,
|
|
||||||
orte_state_base_set_job_state_callback,
|
|
||||||
orte_state_base_set_job_state_priority,
|
|
||||||
orte_state_base_remove_job_state,
|
|
||||||
orte_state_base_activate_proc_state,
|
|
||||||
orte_state_base_add_proc_state,
|
|
||||||
orte_state_base_set_proc_state_callback,
|
|
||||||
orte_state_base_set_proc_state_priority,
|
|
||||||
orte_state_base_remove_proc_state
|
|
||||||
};
|
|
||||||
|
|
||||||
static void dvm_notify(int sd, short args, void *cbdata);
|
|
||||||
|
|
||||||
/* defined default state machine sequence - individual
|
|
||||||
* plm's must add a state for launching daemons
|
|
||||||
*/
|
|
||||||
static orte_job_state_t launch_states[] = {
|
|
||||||
ORTE_JOB_STATE_INIT,
|
|
||||||
ORTE_JOB_STATE_INIT_COMPLETE,
|
|
||||||
ORTE_JOB_STATE_ALLOCATE,
|
|
||||||
ORTE_JOB_STATE_ALLOCATION_COMPLETE,
|
|
||||||
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
|
|
||||||
ORTE_JOB_STATE_DAEMONS_REPORTED,
|
|
||||||
ORTE_JOB_STATE_VM_READY,
|
|
||||||
ORTE_JOB_STATE_MAP,
|
|
||||||
ORTE_JOB_STATE_MAP_COMPLETE,
|
|
||||||
ORTE_JOB_STATE_SYSTEM_PREP,
|
|
||||||
ORTE_JOB_STATE_LAUNCH_APPS,
|
|
||||||
ORTE_JOB_STATE_SEND_LAUNCH_MSG,
|
|
||||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
|
||||||
ORTE_JOB_STATE_RUNNING,
|
|
||||||
ORTE_JOB_STATE_REGISTERED,
|
|
||||||
/* termination states */
|
|
||||||
ORTE_JOB_STATE_TERMINATED,
|
|
||||||
ORTE_JOB_STATE_NOTIFY_COMPLETED,
|
|
||||||
ORTE_JOB_STATE_NOTIFIED,
|
|
||||||
ORTE_JOB_STATE_ALL_JOBS_COMPLETE
|
|
||||||
};
|
|
||||||
static orte_state_cbfunc_t launch_callbacks[] = {
|
|
||||||
orte_plm_base_setup_job,
|
|
||||||
init_complete,
|
|
||||||
orte_ras_base_allocate,
|
|
||||||
orte_plm_base_allocation_complete,
|
|
||||||
orte_plm_base_daemons_launched,
|
|
||||||
orte_plm_base_daemons_reported,
|
|
||||||
vm_ready,
|
|
||||||
orte_rmaps_base_map_job,
|
|
||||||
orte_plm_base_mapping_complete,
|
|
||||||
orte_plm_base_complete_setup,
|
|
||||||
orte_plm_base_launch_apps,
|
|
||||||
orte_plm_base_send_launch_msg,
|
|
||||||
orte_state_base_local_launch_complete,
|
|
||||||
orte_plm_base_post_launch,
|
|
||||||
orte_plm_base_registered,
|
|
||||||
check_complete,
|
|
||||||
dvm_notify,
|
|
||||||
cleanup_job,
|
|
||||||
orte_quit
|
|
||||||
};
|
|
||||||
|
|
||||||
static orte_proc_state_t proc_states[] = {
|
|
||||||
ORTE_PROC_STATE_RUNNING,
|
|
||||||
ORTE_PROC_STATE_REGISTERED,
|
|
||||||
ORTE_PROC_STATE_IOF_COMPLETE,
|
|
||||||
ORTE_PROC_STATE_WAITPID_FIRED,
|
|
||||||
ORTE_PROC_STATE_TERMINATED
|
|
||||||
};
|
|
||||||
static orte_state_cbfunc_t proc_callbacks[] = {
|
|
||||||
orte_state_base_track_procs,
|
|
||||||
orte_state_base_track_procs,
|
|
||||||
orte_state_base_track_procs,
|
|
||||||
orte_state_base_track_procs,
|
|
||||||
orte_state_base_track_procs
|
|
||||||
};
|
|
||||||
|
|
||||||
static void force_quit(int fd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
|
|
||||||
/* give us a chance to stop the orteds */
|
|
||||||
orte_plm.terminate_orteds();
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
/************************
|
|
||||||
* API Definitions
|
|
||||||
************************/
|
|
||||||
static int init(void)
|
|
||||||
{
|
|
||||||
int i, rc;
|
|
||||||
int num_states;
|
|
||||||
|
|
||||||
/* setup the state machines */
|
|
||||||
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
|
|
||||||
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
|
|
||||||
|
|
||||||
/* setup the job state machine */
|
|
||||||
num_states = sizeof(launch_states) / sizeof(orte_job_state_t);
|
|
||||||
for (i=0; i < num_states; i++) {
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i],
|
|
||||||
launch_callbacks[i],
|
|
||||||
ORTE_SYS_PRI))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* add the termination response */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
|
|
||||||
orte_quit, ORTE_SYS_PRI))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* add a default error response */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
|
|
||||||
force_quit, ORTE_ERROR_PRI))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* add callback to report progress, if requested */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
|
|
||||||
orte_state_base_report_progress, ORTE_ERROR_PRI))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
|
|
||||||
orte_state_base_print_job_state_machine();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* populate the proc state machine to allow us to
|
|
||||||
* track proc lifecycle changes
|
|
||||||
*/
|
|
||||||
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
|
|
||||||
for (i=0; i < num_states; i++) {
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
|
|
||||||
proc_callbacks[i],
|
|
||||||
ORTE_SYS_PRI))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
|
|
||||||
orte_state_base_print_proc_state_machine();
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int finalize(void)
|
|
||||||
{
|
|
||||||
opal_list_item_t *item;
|
|
||||||
|
|
||||||
/* cleanup the proc state machine */
|
|
||||||
while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
|
|
||||||
OBJ_RELEASE(item);
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&orte_proc_states);
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void files_ready(int status, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_job_t *jdata = (orte_job_t*)cbdata;
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != status) {
|
|
||||||
ORTE_FORCED_TERMINATE(status);
|
|
||||||
return;
|
|
||||||
} else {
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void init_complete(int sd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
|
|
||||||
/* nothing to do here but move along - if it is the
|
|
||||||
* daemon job, then next step is allocate */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vm_ready(int fd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
int rc;
|
|
||||||
opal_buffer_t *buf;
|
|
||||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_NIDMAP_CMD;
|
|
||||||
orte_grpcomm_signature_t *sig;
|
|
||||||
opal_buffer_t *wireup;
|
|
||||||
orte_job_t *jptr;
|
|
||||||
orte_proc_t *dmn;
|
|
||||||
opal_byte_object_t bo, *boptr;
|
|
||||||
int8_t flag;
|
|
||||||
int32_t numbytes, v;
|
|
||||||
char *nidmap;
|
|
||||||
opal_list_t *modex;
|
|
||||||
opal_value_t *val, *kv;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
|
|
||||||
/* if this is my job, then we are done */
|
|
||||||
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
|
||||||
/* if there is only one daemon in the job, then there
|
|
||||||
* is just a little bit to do */
|
|
||||||
if (1 == orte_process_info.num_procs) {
|
|
||||||
if (!orte_nidmap_communicated) {
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &orte_node_regex))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
orte_nidmap_communicated = true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* send the daemon map to every daemon in this DVM - we
|
|
||||||
* do this here so we don't have to do it for every
|
|
||||||
* job we are going to launch */
|
|
||||||
buf = OBJ_NEW(opal_buffer_t);
|
|
||||||
opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD);
|
|
||||||
/* if we couldn't provide the allocation regex on the orted
|
|
||||||
* cmd line, then we need to provide all the info here */
|
|
||||||
if (!orte_nidmap_communicated) {
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &nidmap))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
orte_nidmap_communicated = true;
|
|
||||||
} else {
|
|
||||||
nidmap = NULL;
|
|
||||||
}
|
|
||||||
opal_dss.pack(buf, &nidmap, 1, OPAL_STRING);
|
|
||||||
if (NULL != nidmap) {
|
|
||||||
free(nidmap);
|
|
||||||
}
|
|
||||||
/* provide the info on the capabilities of each node */
|
|
||||||
if (!orte_node_info_communicated) {
|
|
||||||
flag = 1;
|
|
||||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_regx.encode_nodemap(buf))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
orte_node_info_communicated = true;
|
|
||||||
/* get wireup info for daemons */
|
|
||||||
jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
|
||||||
wireup = OBJ_NEW(opal_buffer_t);
|
|
||||||
for (v=0; v < jptr->procs->size; v++) {
|
|
||||||
if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
val = NULL;
|
|
||||||
if (opal_pmix.legacy_get()) {
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
} else {
|
|
||||||
/* pack the name of the daemon */
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/* pack the URI */
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(val);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
} else {
|
|
||||||
/* pack the name of the daemon */
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/* the data is returned as a list of key-value pairs in the opal_value_t */
|
|
||||||
if (OPAL_PTR != val->type) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
modex = (opal_list_t*)val->data.ptr;
|
|
||||||
numbytes = (int32_t)opal_list_get_size(modex);
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
OPAL_LIST_RELEASE(modex);
|
|
||||||
OBJ_RELEASE(val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* put it in a byte object for xmission */
|
|
||||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
|
||||||
/* pack the byte object - zero-byte objects are fine */
|
|
||||||
bo.size = numbytes;
|
|
||||||
boptr = &bo;
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/* release the data since it has now been copied into our buffer */
|
|
||||||
if (NULL != bo.bytes) {
|
|
||||||
free(bo.bytes);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(wireup);
|
|
||||||
} else {
|
|
||||||
flag = 0;
|
|
||||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* goes to all daemons */
|
|
||||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
|
||||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
|
||||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
|
||||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buf))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
OBJ_RELEASE(sig);
|
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(buf);
|
|
||||||
}
|
|
||||||
/* notify that the vm is ready */
|
|
||||||
fprintf(stdout, "DVM ready\n"); fflush(stdout);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* progress the job */
|
|
||||||
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
|
|
||||||
|
|
||||||
/* position any required files */
|
|
||||||
if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
|
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* cleanup */
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_complete(int fd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_proc_t *proc;
|
|
||||||
int i;
|
|
||||||
orte_node_t *node;
|
|
||||||
orte_job_map_t *map;
|
|
||||||
orte_std_cntr_t index;
|
|
||||||
char *rtmod;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
jdata = caddy->jdata;
|
|
||||||
|
|
||||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
|
||||||
"%s state:dvm:check_job_complete on job %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));
|
|
||||||
|
|
||||||
if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
|
||||||
/* just check to see if the daemons are complete */
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
|
||||||
"%s state:dvm:check_job_complete - received NULL job, checking daemons",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
|
||||||
if (0 == orte_routed.num_routes(rtmod)) {
|
|
||||||
/* orteds are done! */
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
|
||||||
"%s orteds complete - exiting",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
if (NULL == jdata) {
|
|
||||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
|
||||||
}
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* mark the job as terminated, but don't override any
|
|
||||||
* abnormal termination flags
|
|
||||||
*/
|
|
||||||
if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* tell the IOF that the job is complete */
|
|
||||||
if (NULL != orte_iof.complete) {
|
|
||||||
orte_iof.complete(jdata);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* tell the PMIx subsystem the job is complete */
|
|
||||||
if (NULL != opal_pmix.server_deregister_nspace) {
|
|
||||||
opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Release the resources used by this job. Since some errmgrs may want
|
|
||||||
* to continue using resources allocated to the job as part of their
|
|
||||||
* fault recovery procedure, we only do this once the job is "complete".
|
|
||||||
* Note that an aborted/killed job -is- flagged as complete and will
|
|
||||||
* therefore have its resources released. We need to do this after
|
|
||||||
* we call the errmgr so that any attempt to restart the job will
|
|
||||||
* avoid doing so in the exact same place as the current job
|
|
||||||
*/
|
|
||||||
if (NULL != jdata->map) {
|
|
||||||
map = jdata->map;
|
|
||||||
for (index = 0; index < map->nodes->size; index++) {
|
|
||||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
|
||||||
"%s state:dvm releasing procs from node %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
node->name));
|
|
||||||
for (i = 0; i < node->procs->size; i++) {
|
|
||||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (proc->name.jobid != jdata->jobid) {
|
|
||||||
/* skip procs from another job */
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_TOOL)) {
|
|
||||||
node->slots_inuse--;
|
|
||||||
node->num_procs--;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
|
||||||
"%s state:dvm releasing proc %s from node %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&proc->name), node->name));
|
|
||||||
/* set the entry in the node array to NULL */
|
|
||||||
opal_pointer_array_set_item(node->procs, i, NULL);
|
|
||||||
/* release the proc once for the map entry */
|
|
||||||
OBJ_RELEASE(proc);
|
|
||||||
}
|
|
||||||
/* set the node location to NULL */
|
|
||||||
opal_pointer_array_set_item(map->nodes, index, NULL);
|
|
||||||
/* maintain accounting */
|
|
||||||
OBJ_RELEASE(node);
|
|
||||||
/* flag that the node is no longer in a map */
|
|
||||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(map);
|
|
||||||
jdata->map = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
|
||||||
/* this was a debugger daemon. notify that a debugger has detached */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
|
||||||
} else if (jdata->state != ORTE_JOB_STATE_NOTIFIED) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
|
||||||
"%s state:dvm:check_job_completed state is terminated - activating notify",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
|
|
||||||
/* mark the job as notified */
|
|
||||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
|
||||||
}
|
|
||||||
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void cleanup_job(int sd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
|
|
||||||
ORTE_ACQUIRE_OBJECT(caddy);
|
|
||||||
jdata = caddy->jdata;
|
|
||||||
|
|
||||||
/* remove this object from the job array */
|
|
||||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
|
||||||
|
|
||||||
OBJ_RELEASE(caddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
opal_list_t *info;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
} mycaddy_t;
|
|
||||||
|
|
||||||
static void notify_complete(int status, void *cbdata)
|
|
||||||
{
|
|
||||||
mycaddy_t *mycaddy = (mycaddy_t*)cbdata;
|
|
||||||
|
|
||||||
OPAL_LIST_RELEASE(mycaddy->info);
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(mycaddy->jdata, ORTE_JOB_STATE_NOTIFIED);
|
|
||||||
OBJ_RELEASE(mycaddy->jdata);
|
|
||||||
free(mycaddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void dvm_notify(int sd, short args, void *cbdata)
|
|
||||||
{
|
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
|
||||||
orte_job_t *jdata = caddy->jdata;
|
|
||||||
orte_proc_t *pptr=NULL;
|
|
||||||
int ret;
|
|
||||||
opal_buffer_t *reply;
|
|
||||||
orte_daemon_cmd_flag_t command;
|
|
||||||
orte_grpcomm_signature_t *sig;
|
|
||||||
bool notify = true;
|
|
||||||
opal_list_t *info;
|
|
||||||
opal_value_t *val;
|
|
||||||
opal_process_name_t pname, *proc, pnotify;
|
|
||||||
mycaddy_t *mycaddy;
|
|
||||||
|
|
||||||
/* see if there was any problem */
|
|
||||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) {
|
|
||||||
ret = pptr->exit_code;
|
|
||||||
/* or whether we got cancelled by the user */
|
|
||||||
} else if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CANCELLED, NULL, OPAL_BOOL)) {
|
|
||||||
ret = ORTE_ERR_JOB_CANCELLED;
|
|
||||||
} else {
|
|
||||||
ret = ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (0 == ret && orte_get_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION, NULL, OPAL_BOOL)) {
|
|
||||||
notify = false;
|
|
||||||
}
|
|
||||||
/* if the jobid matches that of the requestor, then don't notify */
|
|
||||||
proc = &pnotify;
|
|
||||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&proc, OPAL_NAME)) {
|
|
||||||
if (pnotify.jobid == jdata->jobid) {
|
|
||||||
notify = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (notify) {
|
|
||||||
/* the source is the job that terminated */
|
|
||||||
pname.jobid = jdata->jobid;
|
|
||||||
pname.vpid = OPAL_VPID_WILDCARD;
|
|
||||||
|
|
||||||
info = OBJ_NEW(opal_list_t);
|
|
||||||
/* ensure this only goes to the job terminated event handler */
|
|
||||||
val = OBJ_NEW(opal_value_t);
|
|
||||||
val->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
|
|
||||||
val->type = OPAL_BOOL;
|
|
||||||
val->data.flag = true;
|
|
||||||
opal_list_append(info, &val->super);
|
|
||||||
/* tell the server not to cache the event as subsequent jobs
|
|
||||||
* do not need to know about it */
|
|
||||||
val = OBJ_NEW(opal_value_t);
|
|
||||||
val->key = strdup(OPAL_PMIX_EVENT_DO_NOT_CACHE);
|
|
||||||
val->type = OPAL_BOOL;
|
|
||||||
val->data.flag = true;
|
|
||||||
opal_list_append(info, &val->super);
|
|
||||||
/* provide the status */
|
|
||||||
val = OBJ_NEW(opal_value_t);
|
|
||||||
val->key = strdup(OPAL_PMIX_JOB_TERM_STATUS);
|
|
||||||
val->type = OPAL_STATUS;
|
|
||||||
val->data.status = ret;
|
|
||||||
opal_list_append(info, &val->super);
|
|
||||||
/* tell the requestor which job or proc */
|
|
||||||
val = OBJ_NEW(opal_value_t);
|
|
||||||
val->key = strdup(OPAL_PMIX_PROCID);
|
|
||||||
val->type = OPAL_NAME;
|
|
||||||
val->data.name.jobid = jdata->jobid;
|
|
||||||
if (NULL != pptr) {
|
|
||||||
val->data.name.vpid = pptr->name.vpid;
|
|
||||||
} else {
|
|
||||||
val->data.name.vpid = ORTE_VPID_WILDCARD;
|
|
||||||
}
|
|
||||||
opal_list_append(info, &val->super);
|
|
||||||
/* pass along the proc to be notified */
|
|
||||||
val = OBJ_NEW(opal_value_t);
|
|
||||||
val->key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
|
|
||||||
val->type = OPAL_NAME;
|
|
||||||
val->data.name.jobid = pnotify.jobid;
|
|
||||||
val->data.name.vpid = pnotify.vpid;
|
|
||||||
opal_list_append(info, &val->super);
|
|
||||||
/* setup the caddy */
|
|
||||||
mycaddy = (mycaddy_t*)malloc(sizeof(mycaddy_t));
|
|
||||||
mycaddy->info = info;
|
|
||||||
OBJ_RETAIN(jdata);
|
|
||||||
mycaddy->jdata = jdata;
|
|
||||||
opal_pmix.server_notify_event(OPAL_ERR_JOB_TERMINATED, &pname,
|
|
||||||
info, notify_complete, mycaddy);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* now ensure that _all_ daemons know that this job has terminated so even
|
|
||||||
* those that did not participate in it will know to cleanup the resources
|
|
||||||
* they assigned to the job. This is necessary now that the mapping function
|
|
||||||
* has been moved to the backend daemons - otherwise, non-participating daemons
|
|
||||||
* retain the slot assignments on the participating daemons, and then incorrectly
|
|
||||||
* map subsequent jobs thinking those nodes are still "busy" */
|
|
||||||
reply = OBJ_NEW(opal_buffer_t);
|
|
||||||
command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
|
|
||||||
opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD);
|
|
||||||
opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
|
|
||||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
|
||||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
|
||||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
|
||||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
|
||||||
orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply);
|
|
||||||
OBJ_RELEASE(reply);
|
|
||||||
OBJ_RELEASE(sig);
|
|
||||||
}
|
|
@ -1,35 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_STATE_DVM_EXPORT_H
|
|
||||||
#define MCA_STATE_DVM_EXPORT_H
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
#include "orte/mca/state/state.h"
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local Component structures
|
|
||||||
*/
|
|
||||||
|
|
||||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dvm_component;
|
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dvm_module;
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* MCA_STATE_DVM_EXPORT_H */
|
|
@ -1,83 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
|
|
||||||
#include "orte/mca/state/state.h"
|
|
||||||
#include "orte/mca/state/base/base.h"
|
|
||||||
#include "state_dvm.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Public string for version number
|
|
||||||
*/
|
|
||||||
const char *orte_state_dvm_component_version_string =
|
|
||||||
"ORTE STATE dvm MCA component version " ORTE_VERSION;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Local functionality
|
|
||||||
*/
|
|
||||||
static int state_dvm_open(void);
|
|
||||||
static int state_dvm_close(void);
|
|
||||||
static int state_dvm_component_query(mca_base_module_t **module, int *priority);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Instantiate the public struct with all of our public information
|
|
||||||
* and pointer to our public functions in it
|
|
||||||
*/
|
|
||||||
orte_state_base_component_t mca_state_dvm_component =
|
|
||||||
{
|
|
||||||
/* Handle the general mca_component_t struct containing
|
|
||||||
* meta information about the component
|
|
||||||
*/
|
|
||||||
.base_version = {
|
|
||||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
|
||||||
/* Component name and version */
|
|
||||||
.mca_component_name = "dvm",
|
|
||||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
|
||||||
ORTE_RELEASE_VERSION),
|
|
||||||
|
|
||||||
/* Component open and close functions */
|
|
||||||
.mca_open_component = state_dvm_open,
|
|
||||||
.mca_close_component = state_dvm_close,
|
|
||||||
.mca_query_component = state_dvm_component_query,
|
|
||||||
},
|
|
||||||
.base_data = {
|
|
||||||
/* The component is checkpoint ready */
|
|
||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
static int state_dvm_open(void)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int state_dvm_close(void)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int state_dvm_component_query(mca_base_module_t **module, int *priority)
|
|
||||||
{
|
|
||||||
/* used by DVM masters */
|
|
||||||
if (ORTE_PROC_IS_MASTER) {
|
|
||||||
*priority = 100;
|
|
||||||
*module = (mca_base_module_t *)&orte_state_dvm_module;
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
*priority = 0;
|
|
||||||
*module = NULL;
|
|
||||||
return ORTE_ERR_NOT_AVAILABLE;
|
|
||||||
}
|
|
@ -42,12 +42,4 @@ DIST_SUBDIRS += \
|
|||||||
tools/wrappers \
|
tools/wrappers \
|
||||||
tools/orte-top \
|
tools/orte-top \
|
||||||
tools/orte-info \
|
tools/orte-info \
|
||||||
tools/orte-server \
|
tools/orte-server
|
||||||
tools/orte-dvm \
|
|
||||||
tools/ompi-prun
|
|
||||||
|
|
||||||
if OPAL_WANT_PRUN
|
|
||||||
SUBDIRS += \
|
|
||||||
tools/ompi-prun \
|
|
||||||
tools/orte-dvm
|
|
||||||
endif
|
|
||||||
|
@ -1,59 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
||||||
# University Research and Technology
|
|
||||||
# Corporation. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
||||||
# of Tennessee Research Foundation. All rights
|
|
||||||
# reserved.
|
|
||||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# This is not quite in the Automake spirit, but we have to do it.
|
|
||||||
# Since the totalview portion of the library must be built with -g, we
|
|
||||||
# must eliminate the CFLAGS that are passed in here by default (which
|
|
||||||
# may already have debugging and/or optimization flags). We use
|
|
||||||
# post-processed forms of the CFLAGS in the library targets down
|
|
||||||
# below.
|
|
||||||
|
|
||||||
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
|
|
||||||
|
|
||||||
include $(top_srcdir)/Makefile.ompi-rules
|
|
||||||
|
|
||||||
man_pages = ompi-prun.1
|
|
||||||
EXTRA_DIST = $(man_pages:.1=.1in)
|
|
||||||
|
|
||||||
if OPAL_INSTALL_BINARIES
|
|
||||||
|
|
||||||
bin_PROGRAMS = ompi-prun
|
|
||||||
|
|
||||||
nodist_man_MANS = $(man_pages)
|
|
||||||
|
|
||||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
|
||||||
# changes; a "good enough" way to know if configure was run again (and
|
|
||||||
# therefore the release date or version may have changed)
|
|
||||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
|
||||||
|
|
||||||
endif # OPAL_INSTALL_BINARIES
|
|
||||||
|
|
||||||
ompi_prun_SOURCES = \
|
|
||||||
main.c \
|
|
||||||
prun.c \
|
|
||||||
prun.h
|
|
||||||
|
|
||||||
ompi_prun_LDADD = \
|
|
||||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
|
||||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
|
||||||
|
|
||||||
distclean-local:
|
|
||||||
rm -f $(man_pages)
|
|
@ -1,33 +0,0 @@
|
|||||||
/***************************************************************************
|
|
||||||
* *
|
|
||||||
* Open MPI: Open Source High Performance Computing *
|
|
||||||
* *
|
|
||||||
* http://www.open-mpi.org/ *
|
|
||||||
* *
|
|
||||||
***************************************************************************/
|
|
||||||
|
|
||||||
#include "prun.h"
|
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
return prun(argc, argv);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
||||||
* University Research and Technology
|
|
||||||
* Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,228 +0,0 @@
|
|||||||
#! /bin/sh
|
|
||||||
|
|
||||||
# prun - temporary wrapper script for .libs/prun
|
|
||||||
# Generated by libtool (GNU libtool) 2.4.6
|
|
||||||
#
|
|
||||||
# The prun program cannot be directly executed until all the libtool
|
|
||||||
# libraries that it depends on are installed.
|
|
||||||
#
|
|
||||||
# This wrapper script should never be moved out of the build directory.
|
|
||||||
# If it is, it will not operate correctly.
|
|
||||||
|
|
||||||
# Sed substitution that helps us do robust quoting. It backslashifies
|
|
||||||
# metacharacters that are still active within double-quoted strings.
|
|
||||||
sed_quote_subst='s|\([`"$\\]\)|\\\1|g'
|
|
||||||
|
|
||||||
# Be Bourne compatible
|
|
||||||
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
|
|
||||||
emulate sh
|
|
||||||
NULLCMD=:
|
|
||||||
# Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
|
|
||||||
# is contrary to our usage. Disable this feature.
|
|
||||||
alias -g '${1+"$@"}'='"$@"'
|
|
||||||
setopt NO_GLOB_SUBST
|
|
||||||
else
|
|
||||||
case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
|
|
||||||
fi
|
|
||||||
BIN_SH=xpg4; export BIN_SH # for Tru64
|
|
||||||
DUALCASE=1; export DUALCASE # for MKS sh
|
|
||||||
|
|
||||||
# The HP-UX ksh and POSIX shell print the target directory to stdout
|
|
||||||
# if CDPATH is set.
|
|
||||||
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
|
|
||||||
|
|
||||||
relink_command="(cd /home/common/openmpi/foobar/orte/tools/prun; LIBRARY_PATH=/opt/local/lib; export LIBRARY_PATH; { test -z \"\${COMPILER_PATH+set}\" || unset COMPILER_PATH || { COMPILER_PATH=; export COMPILER_PATH; }; }; { test -z \"\${GCC_EXEC_PREFIX+set}\" || unset GCC_EXEC_PREFIX || { GCC_EXEC_PREFIX=; export GCC_EXEC_PREFIX; }; }; { test -z \"\${LD_RUN_PATH+set}\" || unset LD_RUN_PATH || { LD_RUN_PATH=; export LD_RUN_PATH; }; }; LD_LIBRARY_PATH=/home/common/openmpi/build/foobar/lib:/home/common/local/lib:/home/common/pmix/build/prrte/lib; export LD_LIBRARY_PATH; PATH=/home/common/openmpi/build/foobar/bin:/home/common/local/bin:/home/common/pmix/build/prrte/bin:/home/common/local/sbin:/usr/lib64/qt-3.3/bin:/home/rhc/perl5/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/rhc/.local/bin:/home/rhc/bin; export PATH; gcc -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -fno-strict-aliasing -mcx16 -pthread -g -o \$progdir/\$file main.o prun.o ../../../orte/.libs/libopen-rte.so /home/common/openmpi/foobar/opal/.libs/libopen-pal.so ../../../opal/.libs/libopen-pal.so -ldl -ludev -lrt -lm -lutil -lz -pthread -Wl,-rpath -Wl,/home/common/openmpi/foobar/orte/.libs -Wl,-rpath -Wl,/home/common/openmpi/foobar/opal/.libs -Wl,-rpath -Wl,/home/common/openmpi/build/foobar/lib)"
|
|
||||||
|
|
||||||
# This environment variable determines our operation mode.
|
|
||||||
if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then
|
|
||||||
# install mode needs the following variables:
|
|
||||||
generated_by_libtool_version='2.4.6'
|
|
||||||
notinst_deplibs=' ../../../orte/libopen-rte.la /home/common/openmpi/foobar/opal/libopen-pal.la ../../../opal/libopen-pal.la'
|
|
||||||
else
|
|
||||||
# When we are sourced in execute mode, $file and $ECHO are already set.
|
|
||||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
|
||||||
file="$0"
|
|
||||||
|
|
||||||
# A function that is used when there is no print builtin or printf.
|
|
||||||
func_fallback_echo ()
|
|
||||||
{
|
|
||||||
eval 'cat <<_LTECHO_EOF
|
|
||||||
$1
|
|
||||||
_LTECHO_EOF'
|
|
||||||
}
|
|
||||||
ECHO="printf %s\\n"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Very basic option parsing. These options are (a) specific to
|
|
||||||
# the libtool wrapper, (b) are identical between the wrapper
|
|
||||||
# /script/ and the wrapper /executable/ that is used only on
|
|
||||||
# windows platforms, and (c) all begin with the string --lt-
|
|
||||||
# (application programs are unlikely to have options that match
|
|
||||||
# this pattern).
|
|
||||||
#
|
|
||||||
# There are only two supported options: --lt-debug and
|
|
||||||
# --lt-dump-script. There is, deliberately, no --lt-help.
|
|
||||||
#
|
|
||||||
# The first argument to this parsing function should be the
|
|
||||||
# script's ../../../libtool value, followed by no.
|
|
||||||
lt_option_debug=
|
|
||||||
func_parse_lt_options ()
|
|
||||||
{
|
|
||||||
lt_script_arg0=$0
|
|
||||||
shift
|
|
||||||
for lt_opt
|
|
||||||
do
|
|
||||||
case "$lt_opt" in
|
|
||||||
--lt-debug) lt_option_debug=1 ;;
|
|
||||||
--lt-dump-script)
|
|
||||||
lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'`
|
|
||||||
test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=.
|
|
||||||
lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'`
|
|
||||||
cat "$lt_dump_D/$lt_dump_F"
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
--lt-*)
|
|
||||||
$ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
# Print the debug banner immediately:
|
|
||||||
if test -n "$lt_option_debug"; then
|
|
||||||
echo "prun:prun:$LINENO: libtool wrapper (GNU libtool) 2.4.6" 1>&2
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Used when --lt-debug. Prints its arguments to stdout
|
|
||||||
# (redirection is the responsibility of the caller)
|
|
||||||
func_lt_dump_args ()
|
|
||||||
{
|
|
||||||
lt_dump_args_N=1;
|
|
||||||
for lt_arg
|
|
||||||
do
|
|
||||||
$ECHO "prun:prun:$LINENO: newargv[$lt_dump_args_N]: $lt_arg"
|
|
||||||
lt_dump_args_N=`expr $lt_dump_args_N + 1`
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
# Core function for launching the target application
|
|
||||||
func_exec_program_core ()
|
|
||||||
{
|
|
||||||
|
|
||||||
if test -n "$lt_option_debug"; then
|
|
||||||
$ECHO "prun:prun:$LINENO: newargv[0]: $progdir/$program" 1>&2
|
|
||||||
func_lt_dump_args ${1+"$@"} 1>&2
|
|
||||||
fi
|
|
||||||
exec "$progdir/$program" ${1+"$@"}
|
|
||||||
|
|
||||||
$ECHO "$0: cannot exec $program $*" 1>&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# A function to encapsulate launching the target application
|
|
||||||
# Strips options in the --lt-* namespace from $@ and
|
|
||||||
# launches target application with the remaining arguments.
|
|
||||||
func_exec_program ()
|
|
||||||
{
|
|
||||||
case " $* " in
|
|
||||||
*\ --lt-*)
|
|
||||||
for lt_wr_arg
|
|
||||||
do
|
|
||||||
case $lt_wr_arg in
|
|
||||||
--lt-*) ;;
|
|
||||||
*) set x "$@" "$lt_wr_arg"; shift;;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done ;;
|
|
||||||
esac
|
|
||||||
func_exec_program_core ${1+"$@"}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Parse options
|
|
||||||
func_parse_lt_options "$0" ${1+"$@"}
|
|
||||||
|
|
||||||
# Find the directory that this script lives in.
|
|
||||||
thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
|
|
||||||
test "x$thisdir" = "x$file" && thisdir=.
|
|
||||||
|
|
||||||
# Follow symbolic links until we get to the real thisdir.
|
|
||||||
file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'`
|
|
||||||
while test -n "$file"; do
|
|
||||||
destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
|
|
||||||
|
|
||||||
# If there was a directory component, then change thisdir.
|
|
||||||
if test "x$destdir" != "x$file"; then
|
|
||||||
case "$destdir" in
|
|
||||||
[\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;;
|
|
||||||
*) thisdir="$thisdir/$destdir" ;;
|
|
||||||
esac
|
|
||||||
fi
|
|
||||||
|
|
||||||
file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'`
|
|
||||||
file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'`
|
|
||||||
done
|
|
||||||
|
|
||||||
# Usually 'no', except on cygwin/mingw when embedded into
|
|
||||||
# the cwrapper.
|
|
||||||
WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no
|
|
||||||
if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then
|
|
||||||
# special case for '.'
|
|
||||||
if test "$thisdir" = "."; then
|
|
||||||
thisdir=`pwd`
|
|
||||||
fi
|
|
||||||
# remove .libs from thisdir
|
|
||||||
case "$thisdir" in
|
|
||||||
*[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;;
|
|
||||||
.libs ) thisdir=. ;;
|
|
||||||
esac
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Try to get the absolute directory name.
|
|
||||||
absdir=`cd "$thisdir" && pwd`
|
|
||||||
test -n "$absdir" && thisdir="$absdir"
|
|
||||||
|
|
||||||
program=lt-'prun'
|
|
||||||
progdir="$thisdir/.libs"
|
|
||||||
|
|
||||||
if test ! -f "$progdir/$program" ||
|
|
||||||
{ file=`ls -1dt "$progdir/$program" "$progdir/../$program" 2>/dev/null | /usr/bin/sed 1q`; \
|
|
||||||
test "X$file" != "X$progdir/$program"; }; then
|
|
||||||
|
|
||||||
file="$$-$program"
|
|
||||||
|
|
||||||
if test ! -d "$progdir"; then
|
|
||||||
mkdir "$progdir"
|
|
||||||
else
|
|
||||||
rm -f "$progdir/$file"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# relink executable if necessary
|
|
||||||
if test -n "$relink_command"; then
|
|
||||||
if relink_command_output=`eval $relink_command 2>&1`; then :
|
|
||||||
else
|
|
||||||
$ECHO "$relink_command_output" >&2
|
|
||||||
rm -f "$progdir/$file"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
mv -f "$progdir/$file" "$progdir/$program" 2>/dev/null ||
|
|
||||||
{ rm -f "$progdir/$program";
|
|
||||||
mv -f "$progdir/$file" "$progdir/$program"; }
|
|
||||||
rm -f "$progdir/$file"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if test -f "$progdir/$program"; then
|
|
||||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
|
||||||
# Run the actual program with our arguments.
|
|
||||||
func_exec_program ${1+"$@"}
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
# The program doesn't exist.
|
|
||||||
$ECHO "$0: error: '$progdir/$program' does not exist" 1>&2
|
|
||||||
$ECHO "This script is just a wrapper for $program." 1>&2
|
|
||||||
$ECHO "See the libtool documentation for more information." 1>&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,37 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
||||||
* University Research and Technology
|
|
||||||
* Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
|
||||||
* All rights reserved
|
|
||||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef PRUN_H
|
|
||||||
#define PRUN_H
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Main body of prun functionality
|
|
||||||
*/
|
|
||||||
int prun(int argc, char *argv[]);
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif /* PRUN_H */
|
|
@ -1,57 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
||||||
# University Research and Technology
|
|
||||||
# Corporation. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
||||||
# of Tennessee Research Foundation. All rights
|
|
||||||
# reserved.
|
|
||||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
|
||||||
# University of Stuttgart. All rights reserved.
|
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
# All rights reserved.
|
|
||||||
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
|
||||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
|
||||||
# $COPYRIGHT$
|
|
||||||
#
|
|
||||||
# Additional copyrights may follow
|
|
||||||
#
|
|
||||||
# $HEADER$
|
|
||||||
#
|
|
||||||
|
|
||||||
# This is not quite in the Automake spirit, but we have to do it.
|
|
||||||
# Since the totalview portion of the library must be built with -g, we
|
|
||||||
# must eliminate the CFLAGS that are passed in here by default (which
|
|
||||||
# may already have debugging and/or optimization flags). We use
|
|
||||||
# post-processed forms of the CFLAGS in the library targets down
|
|
||||||
# below.
|
|
||||||
|
|
||||||
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
|
|
||||||
|
|
||||||
include $(top_srcdir)/Makefile.ompi-rules
|
|
||||||
|
|
||||||
man_pages = orte-dvm.1
|
|
||||||
EXTRA_DIST = $(man_pages:.1=.1in)
|
|
||||||
|
|
||||||
if OPAL_INSTALL_BINARIES
|
|
||||||
|
|
||||||
bin_PROGRAMS = orte-dvm
|
|
||||||
|
|
||||||
nodist_man_MANS = $(man_pages)
|
|
||||||
|
|
||||||
# Ensure that the man pages are rebuilt if the opal_config.h file
|
|
||||||
# changes; a "good enough" way to know if configure was run again (and
|
|
||||||
# therefore the release date or version may have changed)
|
|
||||||
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
|
|
||||||
|
|
||||||
endif # OPAL_INSTALL_BINARIES
|
|
||||||
|
|
||||||
orte_dvm_SOURCES = \
|
|
||||||
orte-dvm.c
|
|
||||||
|
|
||||||
orte_dvm_LDADD = \
|
|
||||||
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
|
|
||||||
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
|
|
||||||
|
|
||||||
distclean-local:
|
|
||||||
rm -f $(man_pages)
|
|
@ -1,193 +0,0 @@
|
|||||||
.\" -*- nroff -*-
|
|
||||||
.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
|
||||||
.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
|
||||||
.\" Copyright (c) 2015 Intel, Inc. All rights reserved
|
|
||||||
.\" $COPYRIGHT$
|
|
||||||
.\"
|
|
||||||
.\" Man page for ORTE's orte-dvm command
|
|
||||||
.\"
|
|
||||||
.\" .TH name section center-footer left-footer center-header
|
|
||||||
.TH ORTE-DVM 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
|
|
||||||
.\" **************************
|
|
||||||
.\" Name Section
|
|
||||||
.\" **************************
|
|
||||||
.SH NAME
|
|
||||||
.
|
|
||||||
orte-dvm, ompi-dvm \- Establish a Distributed Virtual Machine (DVM).
|
|
||||||
|
|
||||||
.B Note:
|
|
||||||
\fIorte-dvm\fP and \fIompi-dvm\fP are synonyms for each
|
|
||||||
other. Using either of the names will produce the same behavior.
|
|
||||||
.
|
|
||||||
.\" **************************
|
|
||||||
.\" Synopsis Section
|
|
||||||
.\" **************************
|
|
||||||
.SH SYNOPSIS
|
|
||||||
.
|
|
||||||
.PP
|
|
||||||
.B orte-dvm
|
|
||||||
[ options ]
|
|
||||||
.P
|
|
||||||
|
|
||||||
Invoking \fIorte-dvm\fP via an absolute path
|
|
||||||
name is equivalent to specifying the \fI--prefix\fP option with a
|
|
||||||
\fI<dir>\fR value equivalent to the directory where \fIorte-dvm\fR
|
|
||||||
resides, minus its last subdirectory. For example:
|
|
||||||
|
|
||||||
\fB%\fP /usr/local/bin/orte-dvm ...
|
|
||||||
|
|
||||||
is equivalent to
|
|
||||||
|
|
||||||
\fB%\fP orte-dvm --prefix /usr/local
|
|
||||||
|
|
||||||
.
|
|
||||||
.\" **************************
|
|
||||||
.\" Quick Summary Section
|
|
||||||
.\" **************************
|
|
||||||
.SH QUICK SUMMARY
|
|
||||||
.
|
|
||||||
\fIorte-dvm\fP will establish a DVM that can be used to execute subsequent
|
|
||||||
applications. Use of \fIorte-dvm\fP can be advantageous, for example, when you want to
|
|
||||||
execute a number of short-lived tasks. In such cases, the time required to start
|
|
||||||
the ORTE DVM can be a significant fraction of the time to execute the
|
|
||||||
overall application. Thus, creating a persistent DVM can speed the overall
|
|
||||||
execution. In addition, a persistent DVM will support executing multiple parallel
|
|
||||||
applications while maintaining separation between their respective cores.
|
|
||||||
.\" **************************
|
|
||||||
.\" Options Section
|
|
||||||
.\" **************************
|
|
||||||
.SH OPTIONS
|
|
||||||
.
|
|
||||||
.\"
|
|
||||||
.\" Start options listing
|
|
||||||
.\" Indent 10 characters from start of first column to start of second column
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -h\fR,\fP --help
|
|
||||||
Display help for this command
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -V\fR,\fP --version
|
|
||||||
Print version number. If no other arguments are given, this will also
|
|
||||||
cause orte-dvm to exit.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.P
|
|
||||||
Use one of the following options to specify which hosts (nodes) of the cluster to use
|
|
||||||
for the DVM.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -H\fR,\fP -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
|
|
||||||
List of hosts for the DVM.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B
|
|
||||||
-hostfile\fR,\fP --hostfile \fR<hostfile>\fP
|
|
||||||
Provide a hostfile to use.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
|
|
||||||
Synonym for \fI-hostfile\fP.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B --prefix \fR<dir>\fP
|
|
||||||
Prefix directory that will be used to set the \fIPATH\fR and
|
|
||||||
\fILD_LIBRARY_PATH\fR on the remote node before invoking the ORTE daemon.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.P
|
|
||||||
Setting MCA parameters:
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
|
|
||||||
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
|
|
||||||
the parameter name; \fI<value>\fP is the parameter value.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -mca\fR,\fP --mca <key> <value>
|
|
||||||
Send arguments to various MCA modules. See the "MCA" section, below.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -report-uri\fR,\fP --report-uri <channel>
|
|
||||||
Print out orte-dvm's URI during startup. The channel must be either a '-' to indicate that
|
|
||||||
the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr,
|
|
||||||
or a filename to which the URI is to be written.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.P
|
|
||||||
The following options are useful for developers; they are not generally
|
|
||||||
useful to most ORTE and/or MPI users:
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B -d\fR,\fP --debug-devel
|
|
||||||
Enable debugging of the ORTE layer.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.TP
|
|
||||||
.B --debug-daemons-file
|
|
||||||
Enable debugging of the ORTE daemons in the DVM, storing
|
|
||||||
output in files.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.P
|
|
||||||
There may be other options listed with \fIorte-dvm --help\fP.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.\" **************************
|
|
||||||
.\" Description Section
|
|
||||||
.\" **************************
|
|
||||||
.SH DESCRIPTION
|
|
||||||
.
|
|
||||||
\fIorte-dvm\fP starts a Distributed Virtual Machine (DVM) by launching
|
|
||||||
a daemon on each node of the allocation, as modified or specified by
|
|
||||||
the \fI-host\fP and \fI-hostfile\fP options. Applications can subsequently
|
|
||||||
be executed using the \fIorte-submit\fP command.
|
|
||||||
.
|
|
||||||
The DVM remains in operation until receiving the \fIorte-submit -terminate\fP
|
|
||||||
command.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.SS Specifying Host Nodes
|
|
||||||
.
|
|
||||||
Host nodes can be identified on the \fIorte-dvm\fP command line with the \fI-host\fP
|
|
||||||
option or in a hostfile.
|
|
||||||
.
|
|
||||||
.PP
|
|
||||||
For example,
|
|
||||||
.
|
|
||||||
.TP 4
|
|
||||||
orte-dvm -H aa,aa,bb ./a.out
|
|
||||||
launches two processes on node aa and one on bb.
|
|
||||||
.
|
|
||||||
.PP
|
|
||||||
Or, consider the hostfile
|
|
||||||
.
|
|
||||||
|
|
||||||
\fB%\fP cat myhostfile
|
|
||||||
aa slots=2
|
|
||||||
bb slots=2
|
|
||||||
cc slots=2
|
|
||||||
|
|
||||||
.
|
|
||||||
.PP
|
|
||||||
Here, we list not only the host names (aa, bb, and cc) but also how many "slots"
|
|
||||||
there are for each. Slots indicate how many processes can potentially execute
|
|
||||||
on a node. For best performance, the number of slots may be chosen to be the
|
|
||||||
number of cores on the node or the number of processor sockets. If the hostfile
|
|
||||||
does not provide slots information, a default of 1 is assumed.
|
|
||||||
When running under resource managers (e.g., SLURM, Torque, etc.),
|
|
||||||
Open MPI will obtain both the hostnames and the number of slots directly
|
|
||||||
from the resource manager.
|
|
||||||
.
|
|
||||||
.
|
|
@ -1,482 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
||||||
* University Research and Technology
|
|
||||||
* Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
||||||
* University of Stuttgart. All rights reserved.
|
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
||||||
* All rights reserved.
|
|
||||||
* Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved
|
|
||||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
|
||||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
#include "orte/constants.h"
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#ifdef HAVE_STRINGS_H
|
|
||||||
#include <strings.h>
|
|
||||||
#endif /* HAVE_STRINGS_H */
|
|
||||||
#ifdef HAVE_UNISTD_H
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif
|
|
||||||
#ifdef HAVE_SYS_PARAM_H
|
|
||||||
#include <sys/param.h>
|
|
||||||
#endif
|
|
||||||
#include <errno.h>
|
|
||||||
#include <signal.h>
|
|
||||||
#include <ctype.h>
|
|
||||||
#ifdef HAVE_SYS_TYPES_H
|
|
||||||
#include <sys/types.h>
|
|
||||||
#endif /* HAVE_SYS_TYPES_H */
|
|
||||||
#ifdef HAVE_SYS_WAIT_H
|
|
||||||
#include <sys/wait.h>
|
|
||||||
#endif /* HAVE_SYS_WAIT_H */
|
|
||||||
#ifdef HAVE_SYS_TIME_H
|
|
||||||
#include <sys/time.h>
|
|
||||||
#endif /* HAVE_SYS_TIME_H */
|
|
||||||
#include <fcntl.h>
|
|
||||||
#ifdef HAVE_SYS_STAT_H
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "opal/mca/event/event.h"
|
|
||||||
#include "opal/mca/installdirs/installdirs.h"
|
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
#include "opal/mca/pmix/pmix.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/util/basename.h"
|
|
||||||
#include "opal/util/cmd_line.h"
|
|
||||||
#include "opal/util/opal_environ.h"
|
|
||||||
#include "opal/util/opal_getcwd.h"
|
|
||||||
#include "opal/util/show_help.h"
|
|
||||||
#include "opal/util/fd.h"
|
|
||||||
#include "opal/util/daemon_init.h"
|
|
||||||
|
|
||||||
#include "opal/version.h"
|
|
||||||
#include "opal/runtime/opal.h"
|
|
||||||
#include "opal/runtime/opal_info_support.h"
|
|
||||||
#include "opal/util/os_path.h"
|
|
||||||
#include "opal/util/path.h"
|
|
||||||
#include "opal/class/opal_pointer_array.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
|
||||||
#include "orte/mca/odls/odls.h"
|
|
||||||
#include "orte/mca/oob/base/base.h"
|
|
||||||
#include "orte/mca/rml/rml.h"
|
|
||||||
#include "orte/mca/rml/base/rml_contact.h"
|
|
||||||
#include "orte/mca/state/state.h"
|
|
||||||
|
|
||||||
#include "orte/runtime/runtime.h"
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
#include "orte/util/threads.h"
|
|
||||||
|
|
||||||
#include "orte/orted/orted.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Globals
|
|
||||||
*/
|
|
||||||
/* Build-time default (from ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT): when true,
 * main() sets a prefix on the daemon app even if --prefix was not given,
 * falling back to opal_install_dirs.prefix. */
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
|
||||||
|
|
||||||
/*
 * Storage for the command line options that this tool handles itself.
 * cmd_line_init[] points the parser at these members; main() zeroes the
 * whole structure before parsing.
 */
static struct {
    /* informational requests */
    bool help;                  /* -h / --help */
    bool version;               /* -V / --version */

    /* process-level behaviour */
    bool run_as_root;           /* --allow-run-as-root */
    bool set_sid;               /* --set-sid */
    bool daemonize;             /* --daemonize */

    /* PMIx server behaviour */
    bool system_server;         /* --system-server */
    bool remote_connections;    /* --remote-tools */

    /* string-valued options */
    char *prefix;               /* --prefix */
    char *report_uri;           /* --report-uri */
} myglobals;
|
|
||||||
|
|
||||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
|
||||||
/* Various "obvious" options */
|
|
||||||
{ NULL, 'h', NULL, "help", 0,
|
|
||||||
&myglobals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"This help message" },
|
|
||||||
{ NULL, 'V', NULL, "version", 0,
|
|
||||||
&myglobals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Print version and exit" },
|
|
||||||
|
|
||||||
{ NULL, '\0', "prefix", "prefix", 1,
|
|
||||||
&myglobals.prefix, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Prefix to be used to look for ORTE executables" },
|
|
||||||
|
|
||||||
{ "orte_daemonize", '\0', NULL, "daemonize", 0,
|
|
||||||
&myglobals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Daemonize the orte-dvm into the background" },
|
|
||||||
|
|
||||||
{ NULL, '\0', NULL, "set-sid", 0,
|
|
||||||
&myglobals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Direct the orte-dvm to separate from the current session"},
|
|
||||||
|
|
||||||
{ "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Debug daemons" },
|
|
||||||
|
|
||||||
{ "orte_debug", 'd', "debug-devel", "debug-devel", 0,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Enable debugging of OpenRTE" },
|
|
||||||
|
|
||||||
{ NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0,
|
|
||||||
&myglobals.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Allow execution as root (STRONGLY DISCOURAGED)" },
|
|
||||||
|
|
||||||
/* Specify the launch agent to be used */
|
|
||||||
{ "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Command used to start processes on remote nodes (default: orted)" },
|
|
||||||
|
|
||||||
/* maximum size of VM - typically used to subdivide an allocation */
|
|
||||||
{ "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
|
||||||
"Maximum size of VM" },
|
|
||||||
|
|
||||||
/* Set a hostfile */
|
|
||||||
{ NULL, '\0', "hostfile", "hostfile", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Provide a hostfile" },
|
|
||||||
{ NULL, '\0', "machinefile", "machinefile", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Provide a hostfile" },
|
|
||||||
{ "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Provide a default hostfile" },
|
|
||||||
|
|
||||||
{ NULL, 'H', "host", "host", 1,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"List of hosts to invoke processes on" },
|
|
||||||
|
|
||||||
{ NULL, '\0', "system-server", "system-server", 0,
|
|
||||||
&myglobals.system_server, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Provide a system-level server connection point - only one allowed per node" },
|
|
||||||
|
|
||||||
{ NULL, '\0', "report-uri", "report-uri", 1,
|
|
||||||
&myglobals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
|
||||||
"Printout URI on stdout [-], stderr [+], or a file [anything else]",
|
|
||||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
|
||||||
|
|
||||||
{ NULL, '\0', "remote-tools", "remote-tools", 0,
|
|
||||||
&myglobals.remote_connections, OPAL_CMD_LINE_TYPE_BOOL,
|
|
||||||
"Enable connections from remote tools" },
|
|
||||||
|
|
||||||
/* End of list */
|
|
||||||
{ NULL, '\0', NULL, NULL, 0,
|
|
||||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
|
||||||
};
|
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
int rc, i, j;
|
|
||||||
opal_cmd_line_t cmd_line;
|
|
||||||
char *param, *value;
|
|
||||||
orte_job_t *jdata=NULL;
|
|
||||||
orte_app_context_t *app;
|
|
||||||
|
|
||||||
/* Setup and parse the command line */
|
|
||||||
memset(&myglobals, 0, sizeof(myglobals));
|
|
||||||
/* find our basename (the name of the executable) so that we can
|
|
||||||
use it in pretty-print error messages */
|
|
||||||
orte_basename = opal_basename(argv[0]);
|
|
||||||
|
|
||||||
opal_cmd_line_create(&cmd_line, cmd_line_init);
|
|
||||||
mca_base_cmd_line_setup(&cmd_line);
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, false,
|
|
||||||
argc, argv)) ) {
|
|
||||||
if (OPAL_ERR_SILENT != rc) {
|
|
||||||
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
|
||||||
opal_strerror(rc));
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* print version if requested. Do this before check for help so
|
|
||||||
that --version --help works as one might expect. */
|
|
||||||
if (myglobals.version) {
|
|
||||||
char *str;
|
|
||||||
str = opal_info_make_version_str("all",
|
|
||||||
OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
|
||||||
OPAL_RELEASE_VERSION,
|
|
||||||
OPAL_GREEK_VERSION,
|
|
||||||
OPAL_REPO_REV);
|
|
||||||
if (NULL != str) {
|
|
||||||
fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
|
|
||||||
orte_basename, str, PACKAGE_BUGREPORT);
|
|
||||||
free(str);
|
|
||||||
}
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check if we are running as root - if we are, then only allow
|
|
||||||
* us to proceed if the allow-run-as-root flag was given. Otherwise,
|
|
||||||
* exit with a giant warning flag
|
|
||||||
*/
|
|
||||||
if (0 == geteuid() && !myglobals.run_as_root) {
|
|
||||||
/* show_help is not yet available, so print an error manually */
|
|
||||||
fprintf(stderr, "--------------------------------------------------------------------------\n");
|
|
||||||
if (myglobals.help) {
|
|
||||||
fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename);
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stderr, "Running at root is *strongly* discouraged as any mistake (e.g., in\n");
|
|
||||||
fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n");
|
|
||||||
fprintf(stderr, "file system, leaving your system in an unusable state.\n\n");
|
|
||||||
|
|
||||||
fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename);
|
|
||||||
|
|
||||||
fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
|
|
||||||
fprintf(stderr, "option to your command line. However, we reiterate our strong advice\n");
|
|
||||||
fprintf(stderr, "against doing so - please do so at your own risk.\n");
|
|
||||||
fprintf(stderr, "--------------------------------------------------------------------------\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Since this process can now handle MCA/GMCA parameters, make sure to
|
|
||||||
* process them.
|
|
||||||
* NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
|
|
||||||
* opal_init_util() since mca_base_cmd_line_process_args() does *not*
|
|
||||||
* depend upon opal_init_util() functionality.
|
|
||||||
*/
|
|
||||||
if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Need to initialize OPAL so that install_dirs are filled in */
|
|
||||||
if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Check for help request */
|
|
||||||
if (myglobals.help) {
|
|
||||||
char *str, *args = NULL;
|
|
||||||
char *project_name = NULL;
|
|
||||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
|
||||||
project_name = "Open MPI";
|
|
||||||
} else {
|
|
||||||
project_name = "OpenRTE";
|
|
||||||
}
|
|
||||||
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
||||||
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
|
|
||||||
orte_basename, project_name, OPAL_VERSION,
|
|
||||||
orte_basename, args,
|
|
||||||
PACKAGE_BUGREPORT);
|
|
||||||
if (NULL != str) {
|
|
||||||
printf("%s", str);
|
|
||||||
free(str);
|
|
||||||
}
|
|
||||||
free(args);
|
|
||||||
|
|
||||||
/* If someone asks for help, that should be all we do */
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (myglobals.system_server) {
|
|
||||||
/* we should act as system-level PMIx server */
|
|
||||||
opal_setenv(OPAL_MCA_PREFIX"pmix_system_server", "1", true, &environ);
|
|
||||||
}
|
|
||||||
/* always act as session-level PMIx server */
|
|
||||||
opal_setenv(OPAL_MCA_PREFIX"pmix_session_server", "1", true, &environ);
|
|
||||||
/* if we were asked to report a uri, set the MCA param to do so */
|
|
||||||
if (NULL != myglobals.report_uri) {
|
|
||||||
opal_setenv("PMIX_MCA_ptl_tcp_report_uri", myglobals.report_uri, true, &environ);
|
|
||||||
}
|
|
||||||
if (myglobals.remote_connections) {
|
|
||||||
opal_setenv("PMIX_MCA_ptl_tcp_remote_connections", "1", true, &environ);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Setup MCA params */
|
|
||||||
orte_register_params();
|
|
||||||
|
|
||||||
/* save the environment for launch purposes. This MUST be
|
|
||||||
* done so that we can pass it to any local procs we
|
|
||||||
* spawn - otherwise, those local procs won't see any
|
|
||||||
* non-MCA envars were set in the enviro prior to calling
|
|
||||||
* orterun
|
|
||||||
*/
|
|
||||||
orte_launch_environ = opal_argv_copy(environ);
|
|
||||||
|
|
||||||
#if defined(HAVE_SETSID)
|
|
||||||
/* see if we were directed to separate from current session */
|
|
||||||
if (myglobals.set_sid) {
|
|
||||||
setsid();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* detach from controlling terminal
|
|
||||||
* otherwise, remain attached so output can get to us
|
|
||||||
*/
|
|
||||||
if(!orte_debug_flag &&
|
|
||||||
!orte_debug_daemons_flag &&
|
|
||||||
myglobals.daemonize) {
|
|
||||||
opal_daemon_init(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Intialize our Open RTE environment */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_MASTER))) {
|
|
||||||
/* cannot call ORTE_ERROR_LOG as it could be the errmgr
|
|
||||||
* never got loaded!
|
|
||||||
*/
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
/* finalize OPAL. As it was opened again from orte_init->opal_init
|
|
||||||
* we continue to have a reference count on it. So we have to finalize it twice...
|
|
||||||
*/
|
|
||||||
opal_finalize();
|
|
||||||
|
|
||||||
/* get the daemon job object - was created by ess/hnp component */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
|
||||||
orte_show_help("help-orterun.txt", "bad-job-object", true,
|
|
||||||
orte_basename);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
/* also should have created a daemon "app" */
|
|
||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
|
|
||||||
orte_show_help("help-orterun.txt", "bad-app-object", true,
|
|
||||||
orte_basename);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Did the user specify a prefix, or want prefix by default? */
|
|
||||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
|
|
||||||
size_t param_len;
|
|
||||||
/* if both the prefix was given and we have a prefix
|
|
||||||
* given above, check to see if they match
|
|
||||||
*/
|
|
||||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
|
|
||||||
NULL != myglobals.prefix) {
|
|
||||||
/* if they don't match, then that merits a warning */
|
|
||||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
|
||||||
/* ensure we strip any trailing '/' */
|
|
||||||
if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
|
|
||||||
param[strlen(param)-1] = '\0';
|
|
||||||
}
|
|
||||||
value = strdup(myglobals.prefix);
|
|
||||||
if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
|
|
||||||
value[strlen(value)-1] = '\0';
|
|
||||||
}
|
|
||||||
if (0 != strcmp(param, value)) {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
|
|
||||||
true, orte_basename, value, param);
|
|
||||||
/* let the global-level prefix take precedence since we
|
|
||||||
* know that one is being used
|
|
||||||
*/
|
|
||||||
free(param);
|
|
||||||
param = strdup(myglobals.prefix);
|
|
||||||
}
|
|
||||||
free(value);
|
|
||||||
} else if (NULL != myglobals.prefix) {
|
|
||||||
param = myglobals.prefix;
|
|
||||||
} else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
|
|
||||||
/* must be --prefix alone */
|
|
||||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
|
||||||
} else {
|
|
||||||
/* --enable-orterun-prefix-default was given to orterun */
|
|
||||||
param = strdup(opal_install_dirs.prefix);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (NULL != param) {
|
|
||||||
/* "Parse" the param, aka remove superfluous path_sep. */
|
|
||||||
param_len = strlen(param);
|
|
||||||
while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
|
|
||||||
param[param_len-1] = '\0';
|
|
||||||
param_len--;
|
|
||||||
if (0 == param_len) {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:empty-prefix",
|
|
||||||
true, orte_basename, orte_basename);
|
|
||||||
return ORTE_ERR_FATAL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
|
|
||||||
free(param);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Did the user specify a hostfile. Need to check for both
|
|
||||||
* hostfile and machine file.
|
|
||||||
* We can only deal with one hostfile per app context, otherwise give an error.
|
|
||||||
*/
|
|
||||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
|
|
||||||
if(1 < j) {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
|
||||||
true, orte_basename, NULL);
|
|
||||||
return ORTE_ERR_FATAL;
|
|
||||||
} else {
|
|
||||||
value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
|
|
||||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
|
|
||||||
if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
|
|
||||||
true, orte_basename, NULL);
|
|
||||||
return ORTE_ERR_FATAL;
|
|
||||||
} else {
|
|
||||||
value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
|
|
||||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Did the user specify any hosts? */
|
|
||||||
if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
|
|
||||||
char **targ=NULL, *tval;
|
|
||||||
for (i = 0; i < j; ++i) {
|
|
||||||
value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
|
|
||||||
opal_argv_append_nosize(&targ, value);
|
|
||||||
}
|
|
||||||
tval = opal_argv_join(targ, ',');
|
|
||||||
orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
|
|
||||||
opal_argv_free(targ);
|
|
||||||
free(tval);
|
|
||||||
}
|
|
||||||
OBJ_DESTRUCT(&cmd_line);
|
|
||||||
|
|
||||||
/* setup to listen for commands sent specifically to me, even though I would probably
|
|
||||||
* be the one sending them! Unfortunately, since I am a participating daemon,
|
|
||||||
* there are times I need to send a command to "all daemons", and that means *I* have
|
|
||||||
* to receive it too
|
|
||||||
*/
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
|
|
||||||
ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
|
|
||||||
|
|
||||||
/* spawn the DVM - we skip the initial steps as this
|
|
||||||
* isn't a user-level application */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);
|
|
||||||
|
|
||||||
/* loop the event lib until an exit event is detected */
|
|
||||||
while (orte_event_base_active) {
|
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
|
||||||
}
|
|
||||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
|
||||||
|
|
||||||
/* cleanup and leave */
|
|
||||||
orte_finalize();
|
|
||||||
|
|
||||||
if (orte_debug_flag) {
|
|
||||||
fprintf(stderr, "exiting with status %d\n", orte_exit_status);
|
|
||||||
}
|
|
||||||
exit(orte_exit_status);
|
|
||||||
}
|
|
Загрузка…
Ссылка в новой задаче
Block a user