1
1

Merge pull request #5944 from rhc54/topic/psrvr

Remove the stale orte-dvm code
Этот коммит содержится в:
Ralph Castain 2018-10-17 16:12:14 -07:00 коммит произвёл GitHub
родитель 7730db9982 1bd772e8eb
Коммит 6213d23f0b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
24 изменённых файлов: 2 добавлений и 7355 удалений

Просмотреть файл

@ -248,14 +248,12 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
AC_MSG_ERROR([Cannot continue])])
AC_MSG_CHECKING([if user requested internal PMIx support($with_pmix)])
opal_prun_happy=no
opal_external_pmix_happy=no
opal_external_have_pmix1=0
AS_IF([test "$with_pmix" = "internal"],
[AC_MSG_RESULT([yes])
opal_external_pmix_happy=no
opal_prun_happy=yes
opal_external_pmix_version=internal],
[AC_MSG_RESULT([no])
@ -376,7 +374,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
[AC_MSG_RESULT([found])
opal_external_pmix_version=2x
opal_external_pmix_version_found=1
opal_prun_happy=yes
opal_external_pmix_happy=yes],
[AC_MSG_RESULT([not found])])])
@ -436,7 +433,6 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
AC_DEFINE_UNQUOTED([OPAL_PMIX_V1],[$opal_external_have_pmix1],
[Whether the external PMIx library is v1])
AM_CONDITIONAL([OPAL_WANT_PRUN], [test "$opal_prun_happy" = "yes"])
AS_IF([test "$opal_external_pmix_happy" = "yes"],
[AS_IF([test "$opal_external_pmix_version" = "1x"],

Просмотреть файл

@ -30,7 +30,5 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
orte/tools/orte-top/Makefile
orte/tools/orte-info/Makefile
orte/tools/orte-server/Makefile
orte/tools/orte-dvm/Makefile
orte/tools/ompi-prun/Makefile
])
])

Просмотреть файл

@ -2,7 +2,7 @@
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -30,12 +30,6 @@ libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1
if OPAL_WANT_PRUN
if WANT_INSTALL_HEADERS
man_pages += ompi-dvm.1
endif
endif
if OPAL_INSTALL_BINARIES
nodist_man_MANS = $(man_pages)
@ -46,9 +40,6 @@ install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
if OPAL_WANT_PRUN
(cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT))
endif
uninstall-local:
rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
@ -57,9 +48,6 @@ uninstall-local:
$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
if OPAL_WANT_PRUN
rm -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT)
endif
endif # OPAL_INSTALL_BINARIES
@ -96,10 +84,5 @@ $(top_builddir)/orte/tools/orte-server/orte-server.1:
ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
if OPAL_WANT_PRUN
ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1
cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1
endif
clean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,37 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Intel, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the dvm errmgr component; shared by both the DSO and
# the static variants declared below.
sources = \
errmgr_dvm.h \
errmgr_dvm_component.c \
errmgr_dvm.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# selected by the MCA_BUILD_orte_errmgr_dvm_DSO conditional.
if MCA_BUILD_orte_errmgr_dvm_DSO
component_noinst =
component_install = mca_errmgr_dvm.la
else
component_noinst = libmca_errmgr_dvm.la
component_install =
endif
# DSO variant: listed under the mcacomponent prefix, whose install
# directory is $(ortelibdir), and linked against the open-rte library.
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_dvm_la_SOURCES = $(sources)
mca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
mca_errmgr_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
# Static variant: a noinst (not installed) libtool library built from the
# same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_dvm_la_SOURCES =$(sources)
libmca_errmgr_dvm_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,632 +0,0 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/state/state.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_dvm.h"
static int init(void);
static int finalize(void);
/******************
* dvm module
******************/
/* Function table published by the dvm errmgr component. */
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
    /* lifecycle hooks implemented in this file */
    .init = init,
    .finalize = finalize,

    /* remaining entry points are the errmgr base implementations */
    .logfn = orte_errmgr_base_log,
    .abort = orte_errmgr_base_abort,
    .abort_peers = orte_errmgr_base_abort_peers,
};
/*
* Local functions
*/
static void job_errors(int fd, short args, void *cbdata);
static void proc_errors(int fd, short args, void *cbdata);
/*
 * Module init: hook this component's error handlers into the ORTE state
 * machine. Always reports ORTE_SUCCESS.
 */
static int init(void)
{
    /* job-level errors are routed to job_errors */
    orte_state.add_job_state(ORTE_JOB_STATE_ERROR,
                             job_errors, ORTE_ERROR_PRI);

    /* a lost connection is handled at MSG priority so that any last
     * messages from the proc can still be processed
     */
    orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED,
                              proc_errors, ORTE_MSG_PRI);

    /* every other proc-level error is routed to proc_errors */
    orte_state.add_proc_state(ORTE_PROC_STATE_ERROR,
                              proc_errors, ORTE_ERROR_PRI);

    return ORTE_SUCCESS;
}
/* Module finalize: nothing to tear down, so this always succeeds. */
static int finalize(void)
{
    return ORTE_SUCCESS;
}
/*
 * Ask the PLM to terminate every process of the given job.
 *
 * The request is expressed as a one-element pointer array holding a
 * stack-constructed proc object whose vpid is the wildcard.
 */
static void _terminate_job(orte_jobid_t jobid)
{
    orte_proc_t wildcard;
    opal_pointer_array_t targets;

    /* proc object naming "all vpids" of the job */
    OBJ_CONSTRUCT(&wildcard, orte_proc_t);
    wildcard.name.jobid = jobid;
    wildcard.name.vpid = ORTE_VPID_WILDCARD;

    /* wrap it in the array form terminate_procs expects */
    OBJ_CONSTRUCT(&targets, opal_pointer_array_t);
    opal_pointer_array_init(&targets, 1, 1, 1);
    opal_pointer_array_add(&targets, &wildcard);

    orte_plm.terminate_procs(&targets);

    /* both objects live on the stack - destruct, do not release */
    OBJ_DESTRUCT(&targets);
    OBJ_DESTRUCT(&wildcard);
}
/*
 * State-machine callback for ORTE_JOB_STATE_ERROR (registered in init()).
 *
 * fd, args - event-callback arguments, not used here
 * cbdata   - the orte_state_caddy_t describing the job and its new state;
 *            this function always releases it before returning
 *
 * If the failed job is our own (daemon) job, the DVM itself is broken and
 * the job is driven to ORTE_JOB_STATE_TERMINATED. For any other job the
 * originator is sent a LAUNCH_RESP message carrying the job state, the
 * jobid and (when set) the room number, and whatever is left of the job
 * is terminated; the DVM keeps running.
 */
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    opal_buffer_t *answer;
    int32_t rc, ret;
    int room, *rmptr;

    ORTE_ACQUIRE_OBJECT(caddy);

    /*
     * if orte is trying to shutdown, just let it - but still drop the
     * caddy so it is not leaked
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* if the jdata is NULL, then we ignore it as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto cleanup;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:dvm: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* if the daemon job aborted and we haven't heard from everyone yet,
         * then this could well have been caused by a daemon not finding
         * a way back to us. In this case, output a message indicating a daemon
         * died without reporting. Otherwise, say nothing as we
         * likely already output an error message */
        if (ORTE_JOB_STATE_ABORTED == jobstate &&
            jdata->num_procs != jdata->num_reported) {
            orte_routing_is_enabled = false;
            orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
        }
        /* there really isn't much else we can do since the problem
         * is in the DVM itself, so best just to terminate */
        jdata->num_terminated = jdata->num_procs;
        /* activate the terminated state so we can exit */
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        goto cleanup;
    }

    /* all other cases involve jobs submitted to the DVM - therefore,
     * we only inform the submitter of the problem, but do NOT terminate
     * the DVM itself */
    rc = jobstate;
    answer = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        /* the buffer was never handed to the RML, so it is still ours */
        OBJ_RELEASE(answer);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(answer);
        goto cleanup;
    }
    /* pack the room number - room is only read when the attribute lookup
     * reports that it was found */
    rmptr = &room;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
            goto cleanup;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:dvm sending notification of job %s failure to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         ORTE_NAME_PRINT(&jdata->originator)));

    /* NOTE(review): on success the buffer is presumably released by
     * orte_rml_send_callback - confirm; on failure we release it here */
    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                           &jdata->originator, answer,
                                           ORTE_RML_TAG_LAUNCH_RESP,
                                           orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(answer);
    }

    /* ensure we terminate any processes left running in the DVM */
    _terminate_job(jdata->jobid);

cleanup:
    OBJ_RELEASE(caddy);
}
static void proc_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata;
orte_proc_t *pptr, *proct;
orte_process_name_t *proc = &caddy->name;
orte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
char *rtmod;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state)));
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
goto cleanup;
}
/* get the job object */
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
/* could be a race condition */
goto cleanup;
}
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
/* get the management conduit's routed module */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
/* we MUST handle a communication failure before doing anything else
* as it requires some special care to avoid normal termination issues
* for local application procs
*/
if (ORTE_PROC_STATE_COMM_FAILED == state) {
/* is this to a daemon? */
if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
/* nope - ignore it */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure to non-daemon proc - ignoring it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto cleanup;
}
/* if this is my own connection, ignore it */
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure on my own connection - ignoring it",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto cleanup;
}
/* mark the daemon as gone */
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
/* update the state */
pptr->state = state;
/* adjust our num_procs */
--orte_process_info.num_procs;
/* if we have ordered orteds to terminate or abort
* is in progress, record it */
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: daemons terminating - recording daemon %s as gone",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* remove from dependent routes, if it is one */
orte_routed.route_lost(rtmod, proc);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
/* at least one is still alive */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: at least one proc (%s) still alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proct->name)));
goto cleanup;
}
}
/* call our appropriate exit procedure */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr_dvm: all routes and children gone - ordering exit",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes(rtmod)));
}
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: daemon %s - aborting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* record the first one to fail */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* output an error message so the user knows what happened */
orte_show_help("help-errmgr-base.txt", "node-died", true,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename,
ORTE_NAME_PRINT(proc),
pptr->node->name);
/* mark the daemon job as failed */
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* just in case the exit code hadn't been set, do it here - this
* won't override any reported exit code */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
}
goto cleanup;
}
/* update the proc state - can get multiple reports on a proc
* depending on circumstances, so ensure we only do this once
*/
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
pptr->state = state;
}
/* if we were ordered to terminate, mark this proc as dead and see if
* any of our routes or local children remain alive - if not, then
* terminate ourselves. */
if (orte_orteds_term_ordered) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
goto keep_going;
}
}
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes(rtmod)) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:dvm all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
}
keep_going:
/* ensure we record the failed proc properly so we can report
* the error once we terminate
*/
switch (state) {
case ORTE_PROC_STATE_KILLED_BY_CMD:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s killed by cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* we ordered this proc to die, so it isn't an abnormal termination
* and we don't flag it as such
*/
if (jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
/* don't abort the job as this isn't an abnormal termination */
break;
case ORTE_PROC_STATE_ABORTED:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s aborted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_ABORTED;
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* kill the job */
_terminate_job(jdata->jobid);
}
break;
case ORTE_PROC_STATE_ABORTED_BY_SIG:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s aborted by signal",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* kill the job */
_terminate_job(jdata->jobid);
}
break;
case ORTE_PROC_STATE_TERM_WO_SYNC:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s terminated without sync",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* now treat a special case - if the proc exit'd without a required
* sync, it may have done so with a zero exit code. We want to ensure
* that the user realizes there was an error, so in this -one- case,
* we overwrite the process' exit code with the default error code
*/
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* kill the job */
_terminate_job(jdata->jobid);
}
break;
case ORTE_PROC_STATE_FAILED_TO_START:
case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state)));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
opal_buffer_t *answer;
int id, *idptr, ret;
if (ORTE_PROC_STATE_FAILED_TO_START) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
} else {
jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
}
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
/* send a notification to the requestor - indicate that this is a spawn response */
answer = OBJ_NEW(opal_buffer_t);
/* pack the return status */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &pptr->exit_code, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* pack the jobid to be returned */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
idptr = &id;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&idptr, OPAL_INT)) {
/* pack the sender's index to the tracking object */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, idptr, 1, OPAL_INT))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
/* we need to send the requestor more info about what happened */
opal_dss.pack(answer, &jdata->state, 1, ORTE_JOB_STATE_T);
opal_dss.pack(answer, &pptr, 1, ORTE_PROC);
opal_dss.pack(answer, &pptr->node, 1, ORTE_NODE);
}
/* return response */
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
/* record that we notified about this job */
jdata->state = ORTE_JOB_STATE_NOTIFIED;
CLEANUP:
/* kill the job */
_terminate_job(jdata->jobid);
}
/* if this was a daemon, report it */
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
/* output a message indicating we failed to launch a daemon */
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
}
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
break;
case ORTE_PROC_STATE_CALLED_ABORT:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s called abort with exit code %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc), pptr->exit_code));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
/* point to the first proc to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* kill the job */
_terminate_job(jdata->jobid);
}
break;
case ORTE_PROC_STATE_TERM_NON_ZERO:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s exited with non-zero status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
pptr->exit_code));
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* track the number of non-zero exits */
i32 = 0;
i32ptr = &i32;
orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
++i32;
orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
if (orte_abort_non_zero_exit) {
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
/* kill the job */
_terminate_job(jdata->jobid);
}
} else {
/* user requested we consider this normal termination */
if (jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
}
break;
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s heartbeat failed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
/* point to the first rank to cause the problem */
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* kill the job */
_terminate_job(jdata->jobid);
}
/* remove from dependent routes, if it is one */
orte_routed.route_lost(rtmod, proc);
break;
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: unable to send message to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
break;
default:
/* shouldn't get this, but terminate job if required */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: proc %s default error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state)));
if (jdata->num_terminated == jdata->num_procs) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
break;
}
/* if the waitpid fired, be sure to let the state machine know */
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
}
cleanup:
OBJ_RELEASE(caddy);
}

Просмотреть файл

@ -1,39 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_dvm_EXPORT_H
#define MCA_ERRMGR_dvm_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_dvm_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_dvm_module;
END_C_DECLS
#endif /* MCA_ERRMGR_dvm_EXPORT_H */

Просмотреть файл

@ -1,102 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_dvm.h"
/*
* Public string for version number
*/
/* Human-readable identification of this component, suffixed with the
 * ORTE version it was built against. */
const char *orte_errmgr_dvm_component_version_string = "ORTE ERRMGR dvm MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int dvm_register(void);
static int dvm_open(void);
static int dvm_close(void);
static int dvm_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/* Public component descriptor for the dvm errmgr component. */
orte_errmgr_base_component_t mca_errmgr_dvm_component = {
    /* generic MCA component header: framework version, component
     * name/version and the lifecycle callbacks defined in this file
     */
    .base_version = {
        ORTE_ERRMGR_BASE_VERSION_3_0_0,

        /* Component name and version */
        .mca_component_name = "dvm",
        MCA_BASE_MAKE_VERSION(component,
                              ORTE_MAJOR_VERSION,
                              ORTE_MINOR_VERSION,
                              ORTE_RELEASE_VERSION),

        /* open / close / query / parameter-registration hooks */
        .mca_open_component = dvm_open,
        .mca_close_component = dvm_close,
        .mca_query_component = dvm_component_query,
        .mca_register_component_params = dvm_register,
    },

    .base_data = {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    },
};
/* Selection priority handed back by dvm_component_query(). */
static int my_priority;

/*
 * Register the component's "priority" MCA variable, backed by
 * my_priority and defaulting to 1000. The result of the registration
 * call is deliberately discarded; this always returns ORTE_SUCCESS.
 */
static int dvm_register(void)
{
    /* default must be in place before the variable is registered */
    my_priority = 1000;

    (void) mca_base_component_var_register(&mca_errmgr_dvm_component.base_version,
                                           "priority",
                                           "Priority of the dvm errmgr component",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &my_priority);

    return ORTE_SUCCESS;
}
/* Component open hook: no resources to set up, always succeeds. */
static int dvm_open(void)
{
    return ORTE_SUCCESS;
}
/* Component close hook: nothing to release, always succeeds. */
static int dvm_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query hook.
 *
 * module   - out: the dvm errmgr module, or NULL when not selectable
 * priority - out: my_priority, or -1 when not selectable
 *
 * Returns ORTE_SUCCESS only when ORTE_PROC_IS_MASTER holds (DVM
 * masters); otherwise ORTE_ERROR.
 */
static int dvm_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not a DVM master must not pick this component */
    if (!(ORTE_PROC_IS_MASTER)) {
        *module = NULL;
        *priority = -1;
        return ORTE_ERROR;
    }

    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_dvm_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: INTEL
status: active

Просмотреть файл

@ -1,36 +0,0 @@
#
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the dvm state component; shared by both the DSO and
# the static variants declared below.
sources = \
state_dvm.h \
state_dvm_component.c \
state_dvm.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# selected by the MCA_BUILD_orte_state_dvm_DSO conditional.
if MCA_BUILD_orte_state_dvm_DSO
component_noinst =
component_install = mca_state_dvm.la
else
component_noinst = libmca_state_dvm.la
component_install =
endif
# DSO variant: listed under the mcacomponent prefix, whose install
# directory is $(ortelibdir), and linked against the open-rte library.
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_dvm_la_SOURCES = $(sources)
mca_state_dvm_la_LDFLAGS = -module -avoid-version
mca_state_dvm_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
# Static variant: a noinst (not installed) libtool library built from the
# same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_dvm_la_SOURCES =$(sources)
libmca_state_dvm_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,7 +0,0 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: INTEL
status: active

Просмотреть файл

@ -1,688 +0,0 @@
/*
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <string.h>
#include "opal/util/output.h"
#include "opal/mca/pmix/pmix.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/regx/regx.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "orte/mca/state/base/state_private.h"
#include "state_dvm.h"
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
/* local functions */
static void init_complete(int fd, short args, void *cbdata);
static void vm_ready(int fd, short args, void *cbata);
static void check_complete(int fd, short args, void *cbdata);
static void cleanup_job(int fd, short args, void *cbdata);
/******************
* DVM module - used when mpirun is persistent
******************/
/* Function table for the dvm state module. Only init/finalize are local
 * to this file; the job- and proc-state entry points are the state base
 * implementations. Entries are positional, so their order must follow
 * the field order of orte_state_base_module_t. */
orte_state_base_module_t orte_state_dvm_module = {
    /* module lifecycle */
    init,
    finalize,

    /* job state machine */
    orte_state_base_activate_job_state,
    orte_state_base_add_job_state,
    orte_state_base_set_job_state_callback,
    orte_state_base_set_job_state_priority,
    orte_state_base_remove_job_state,

    /* proc state machine */
    orte_state_base_activate_proc_state,
    orte_state_base_add_proc_state,
    orte_state_base_set_proc_state_callback,
    orte_state_base_set_proc_state_priority,
    orte_state_base_remove_proc_state
};
static void dvm_notify(int sd, short args, void *cbdata);
/* Default job state machine sequence - individual
 * plm's must add a state for launching daemons.
 *
 * This table is paired BY INDEX with launch_callbacks: init() registers
 * launch_states[i] together with launch_callbacks[i], so the two arrays
 * must always have the same length and ordering.
 */
static orte_job_state_t launch_states[] = {
ORTE_JOB_STATE_INIT,
ORTE_JOB_STATE_INIT_COMPLETE,
ORTE_JOB_STATE_ALLOCATE,
ORTE_JOB_STATE_ALLOCATION_COMPLETE,
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
ORTE_JOB_STATE_DAEMONS_REPORTED,
ORTE_JOB_STATE_VM_READY,
ORTE_JOB_STATE_MAP,
ORTE_JOB_STATE_MAP_COMPLETE,
ORTE_JOB_STATE_SYSTEM_PREP,
ORTE_JOB_STATE_LAUNCH_APPS,
ORTE_JOB_STATE_SEND_LAUNCH_MSG,
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
ORTE_JOB_STATE_RUNNING,
ORTE_JOB_STATE_REGISTERED,
/* termination states */
ORTE_JOB_STATE_TERMINATED,
ORTE_JOB_STATE_NOTIFY_COMPLETED,
ORTE_JOB_STATE_NOTIFIED,
ORTE_JOB_STATE_ALL_JOBS_COMPLETE
};
/* Callbacks for the job state machine. Entry i is the handler that init()
 * registers for launch_states[i]; keep both tables the same length and in
 * the same order. The handlers defined in this file are init_complete,
 * vm_ready, check_complete, dvm_notify and cleanup_job.
 */
static orte_state_cbfunc_t launch_callbacks[] = {
orte_plm_base_setup_job,
init_complete,
orte_ras_base_allocate,
orte_plm_base_allocation_complete,
orte_plm_base_daemons_launched,
orte_plm_base_daemons_reported,
vm_ready,
orte_rmaps_base_map_job,
orte_plm_base_mapping_complete,
orte_plm_base_complete_setup,
orte_plm_base_launch_apps,
orte_plm_base_send_launch_msg,
orte_state_base_local_launch_complete,
orte_plm_base_post_launch,
orte_plm_base_registered,
/* termination handlers (TERMINATED, NOTIFY_COMPLETED, NOTIFIED, ALL_JOBS_COMPLETE) */
check_complete,
dvm_notify,
cleanup_job,
orte_quit
};
/* Proc states tracked by this module. Paired by index with proc_callbacks:
 * init() registers proc_states[i] together with proc_callbacks[i].
 */
static orte_proc_state_t proc_states[] = {
ORTE_PROC_STATE_RUNNING,
ORTE_PROC_STATE_REGISTERED,
ORTE_PROC_STATE_IOF_COMPLETE,
ORTE_PROC_STATE_WAITPID_FIRED,
ORTE_PROC_STATE_TERMINATED
};
/* Handlers for the proc states above, one per entry of proc_states (same
 * index). Every tracked proc state is handled by orte_state_base_track_procs.
 */
static orte_state_cbfunc_t proc_callbacks[] = {
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs
};
/*
 * Job-state callback registered by init() for ORTE_JOB_STATE_FORCED_EXIT.
 *
 * Asks the PLM to terminate the orteds and then drops the state caddy.
 *
 * fd, args - unused event-library arguments
 * cbdata   - the orte_state_caddy_t for this state activation; released here
 */
static void force_quit(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* acquire the caddy before touching it, as the other
     * state callbacks in this file do */
    ORTE_ACQUIRE_OBJECT(caddy);

    /* give us a chance to stop the orteds */
    orte_plm.terminate_orteds();

    OBJ_RELEASE(caddy);
}
/************************
* API Definitions
************************/
static int init(void)
{
int i, rc;
int num_states;
/* setup the state machines */
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
/* setup the job state machine */
num_states = sizeof(launch_states) / sizeof(orte_job_state_t);
for (i=0; i < num_states; i++) {
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i],
launch_callbacks[i],
ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
}
/* add the termination response */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
orte_quit, ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add a default error response */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
force_quit, ORTE_ERROR_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add callback to report progress, if requested */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
orte_state_base_report_progress, ORTE_ERROR_PRI))) {
ORTE_ERROR_LOG(rc);
}
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
orte_state_base_print_job_state_machine();
}
/* populate the proc state machine to allow us to
* track proc lifecycle changes
*/
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
for (i=0; i < num_states; i++) {
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
proc_callbacks[i],
ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
}
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
orte_state_base_print_proc_state_machine();
}
return ORTE_SUCCESS;
}
/*
 * Module finalize: release every entry of the proc state machine and
 * destruct the list. Always returns ORTE_SUCCESS.
 *
 * NOTE(review): orte_job_states is constructed in init() but is not torn
 * down here - confirm whether that is intentional.
 */
static int finalize(void)
{
    opal_list_item_t *entry;

    /* drain the proc state machine one entry at a time */
    for (entry = opal_list_remove_first(&orte_proc_states);
         NULL != entry;
         entry = opal_list_remove_first(&orte_proc_states)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_proc_states);

    return ORTE_SUCCESS;
}
/*
 * Completion callback handed to orte_filem.preposition_files() by vm_ready.
 *
 * status - result of the file prepositioning
 * cbdata - the orte_job_t whose files were positioned
 *
 * On success the job is advanced to ORTE_JOB_STATE_MAP; otherwise a forced
 * termination is requested with the failing status.
 */
static void files_ready(int status, void *cbdata)
{
    orte_job_t *job = (orte_job_t*)cbdata;

    if (ORTE_SUCCESS == status) {
        ORTE_ACTIVATE_JOB_STATE(job, ORTE_JOB_STATE_MAP);
        return;
    }

    ORTE_FORCED_TERMINATE(status);
}
static void init_complete(int sd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* nothing to do here but move along - if it is the
* daemon job, then next step is allocate */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
OBJ_RELEASE(caddy);
}
static void vm_ready(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
int rc;
opal_buffer_t *buf;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_NIDMAP_CMD;
orte_grpcomm_signature_t *sig;
opal_buffer_t *wireup;
orte_job_t *jptr;
orte_proc_t *dmn;
opal_byte_object_t bo, *boptr;
int8_t flag;
int32_t numbytes, v;
char *nidmap;
opal_list_t *modex;
opal_value_t *val, *kv;
ORTE_ACQUIRE_OBJECT(caddy);
/* if this is my job, then we are done */
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
/* if there is only one daemon in the job, then there
* is just a little bit to do */
if (1 == orte_process_info.num_procs) {
if (!orte_nidmap_communicated) {
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &orte_node_regex))) {
ORTE_ERROR_LOG(rc);
return;
}
orte_nidmap_communicated = true;
}
} else {
/* send the daemon map to every daemon in this DVM - we
* do this here so we don't have to do it for every
* job we are going to launch */
buf = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD);
/* if we couldn't provide the allocation regex on the orted
* cmd line, then we need to provide all the info here */
if (!orte_nidmap_communicated) {
if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &nidmap))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
orte_nidmap_communicated = true;
} else {
nidmap = NULL;
}
opal_dss.pack(buf, &nidmap, 1, OPAL_STRING);
if (NULL != nidmap) {
free(nidmap);
}
/* provide the info on the capabilities of each node */
if (!orte_node_info_communicated) {
flag = 1;
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
if (ORTE_SUCCESS != (rc = orte_regx.encode_nodemap(buf))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
orte_node_info_communicated = true;
/* get wireup info for daemons */
jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
wireup = OBJ_NEW(opal_buffer_t);
for (v=0; v < jptr->procs->size; v++) {
if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
continue;
}
val = NULL;
if (opal_pmix.legacy_get()) {
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
} else {
/* pack the name of the daemon */
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
/* pack the URI */
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
OBJ_RELEASE(val);
}
} else {
if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
} else {
/* pack the name of the daemon */
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
/* the data is returned as a list of key-value pairs in the opal_value_t */
if (OPAL_PTR != val->type) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
modex = (opal_list_t*)val->data.ptr;
numbytes = (int32_t)opal_list_get_size(modex);
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(wireup);
return;
}
}
OPAL_LIST_RELEASE(modex);
OBJ_RELEASE(val);
}
}
}
/* put it in a byte object for xmission */
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
/* pack the byte object - zero-byte objects are fine */
bo.size = numbytes;
boptr = &bo;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
OBJ_RELEASE(buf);
return;
}
/* release the data since it has now been copied into our buffer */
if (NULL != bo.bytes) {
free(bo.bytes);
}
OBJ_RELEASE(wireup);
} else {
flag = 0;
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
}
/* goes to all daemons */
sig = OBJ_NEW(orte_grpcomm_signature_t);
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buf))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(sig);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
return;
}
OBJ_RELEASE(buf);
}
/* notify that the vm is ready */
fprintf(stdout, "DVM ready\n"); fflush(stdout);
OBJ_RELEASE(caddy);
return;
}
/* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
/* position any required files */
if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* cleanup */
OBJ_RELEASE(caddy);
}
/*
 * Job-state callback for ORTE_JOB_STATE_TERMINATED.
 *
 * With no job, or with our own (daemon) job, only checks whether any routes
 * to daemons remain and, if none do, activates
 * ORTE_JOB_STATE_DAEMONS_TERMINATED.
 *
 * For an application job: marks it terminated (without overriding an
 * abnormal-termination state), tells the IOF and PMIx subsystems that the
 * job is complete, releases the job's procs and nodes from its map, and
 * finally activates either DEBUGGER_DETACH (debugger daemon jobs) or
 * NOTIFY_COMPLETED.
 *
 * fd, args - unused event-library arguments
 * cbdata   - the orte_state_caddy_t for this state activation; released here
 */
static void check_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_proc_t *proc;
    int i;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    char *rtmod;

    ORTE_ACQUIRE_OBJECT(caddy);
    jdata = caddy->jdata;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:dvm:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:dvm:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        rtmod = orte_rml.get_routed(orte_mgmt_conduit);
        if (0 == orte_routed.num_routes(rtmod)) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* mark the job as terminated, but don't override any
     * abnormal termination flags
     */
    if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
        jdata->state = ORTE_JOB_STATE_TERMINATED;
    }

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    /* tell the PMIx subsystem the job is complete */
    if (NULL != opal_pmix.server_deregister_nspace) {
        opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job
     */
    if (NULL != jdata->map) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:dvm releasing procs from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_TOOL)) {
                    node->slots_inuse--;
                    node->num_procs--;
                }
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:dvm releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* flag that the node is no longer in a map - this must happen
             * BEFORE the release below, as the node must not be touched
             * once our reference to it has been dropped */
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            /* maintain accounting */
            OBJ_RELEASE(node);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        /* this was a debugger daemon. notify that a debugger has detached */
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
    } else if (jdata->state != ORTE_JOB_STATE_NOTIFIED) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:dvm:check_job_completed state is terminated - activating notify",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
        /* mark the job as notified */
        jdata->state = ORTE_JOB_STATE_NOTIFIED;
    }

    OBJ_RELEASE(caddy);
}
/*
 * Job-state callback for ORTE_JOB_STATE_NOTIFIED.
 *
 * Clears the job's slot in the orte_job_data hash table (stores NULL under
 * its jobid) and releases the caddy.
 */
static void cleanup_job(int sd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    ORTE_ACQUIRE_OBJECT(caddy);

    /* remove this object from the job array */
    opal_hash_table_set_value_uint32(orte_job_data, caddy->jdata->jobid, NULL);

    OBJ_RELEASE(caddy);
}
/* Context handed by dvm_notify to opal_pmix.server_notify_event and
 * received back in notify_complete, which releases both members and
 * frees the struct. */
typedef struct {
/* info list passed along with the event */
opal_list_t *info;
/* the terminated job; retained by dvm_notify, released in notify_complete */
orte_job_t *jdata;
} mycaddy_t;
static void notify_complete(int status, void *cbdata)
{
mycaddy_t *mycaddy = (mycaddy_t*)cbdata;
OPAL_LIST_RELEASE(mycaddy->info);
ORTE_ACTIVATE_JOB_STATE(mycaddy->jdata, ORTE_JOB_STATE_NOTIFIED);
OBJ_RELEASE(mycaddy->jdata);
free(mycaddy);
}
static void dvm_notify(int sd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_proc_t *pptr=NULL;
int ret;
opal_buffer_t *reply;
orte_daemon_cmd_flag_t command;
orte_grpcomm_signature_t *sig;
bool notify = true;
opal_list_t *info;
opal_value_t *val;
opal_process_name_t pname, *proc, pnotify;
mycaddy_t *mycaddy;
/* see if there was any problem */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) {
ret = pptr->exit_code;
/* or whether we got cancelled by the user */
} else if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CANCELLED, NULL, OPAL_BOOL)) {
ret = ORTE_ERR_JOB_CANCELLED;
} else {
ret = ORTE_SUCCESS;
}
if (0 == ret && orte_get_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION, NULL, OPAL_BOOL)) {
notify = false;
}
/* if the jobid matches that of the requestor, then don't notify */
proc = &pnotify;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&proc, OPAL_NAME)) {
if (pnotify.jobid == jdata->jobid) {
notify = false;
}
}
if (notify) {
/* the source is the job that terminated */
pname.jobid = jdata->jobid;
pname.vpid = OPAL_VPID_WILDCARD;
info = OBJ_NEW(opal_list_t);
/* ensure this only goes to the job terminated event handler */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
val->type = OPAL_BOOL;
val->data.flag = true;
opal_list_append(info, &val->super);
/* tell the server not to cache the event as subsequent jobs
* do not need to know about it */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_EVENT_DO_NOT_CACHE);
val->type = OPAL_BOOL;
val->data.flag = true;
opal_list_append(info, &val->super);
/* provide the status */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_JOB_TERM_STATUS);
val->type = OPAL_STATUS;
val->data.status = ret;
opal_list_append(info, &val->super);
/* tell the requestor which job or proc */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_PROCID);
val->type = OPAL_NAME;
val->data.name.jobid = jdata->jobid;
if (NULL != pptr) {
val->data.name.vpid = pptr->name.vpid;
} else {
val->data.name.vpid = ORTE_VPID_WILDCARD;
}
opal_list_append(info, &val->super);
/* pass along the proc to be notified */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
val->type = OPAL_NAME;
val->data.name.jobid = pnotify.jobid;
val->data.name.vpid = pnotify.vpid;
opal_list_append(info, &val->super);
/* setup the caddy */
mycaddy = (mycaddy_t*)malloc(sizeof(mycaddy_t));
mycaddy->info = info;
OBJ_RETAIN(jdata);
mycaddy->jdata = jdata;
opal_pmix.server_notify_event(OPAL_ERR_JOB_TERMINATED, &pname,
info, notify_complete, mycaddy);
}
/* now ensure that _all_ daemons know that this job has terminated so even
* those that did not participate in it will know to cleanup the resources
* they assigned to the job. This is necessary now that the mapping function
* has been moved to the backend daemons - otherwise, non-participating daemons
* retain the slot assignments on the participating daemons, and then incorrectly
* map subsequent jobs thinking those nodes are still "busy" */
reply = OBJ_NEW(opal_buffer_t);
command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD);
opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
sig = OBJ_NEW(orte_grpcomm_signature_t);
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply);
OBJ_RELEASE(reply);
OBJ_RELEASE(sig);
}

Просмотреть файл

@ -1,35 +0,0 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Public declarations of the "dvm" component of the ORTE state framework:
 * the component structure and the module (function table) it provides.
 */
#ifndef MCA_STATE_DVM_EXPORT_H
#define MCA_STATE_DVM_EXPORT_H
#include "orte_config.h"
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
 * Local Component structures
 */
/* the component descriptor, defined in state_dvm_component.c */
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dvm_component;
/* the module returned by the component's query function */
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dvm_module;
END_C_DECLS
#endif /* MCA_STATE_DVM_EXPORT_H */

Просмотреть файл

@ -1,83 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "state_dvm.h"
/*
 * Public string for version number: a fixed prefix concatenated at
 * compile time with ORTE_VERSION
 */
const char *orte_state_dvm_component_version_string =
"ORTE STATE dvm MCA component version " ORTE_VERSION;
/*
 * Local functionality - the open/close/query entry points stored in the
 * component structure below
 */
static int state_dvm_open(void);
static int state_dvm_close(void);
static int state_dvm_component_query(mca_base_module_t **module, int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointer to our public functions in it
 */
orte_state_base_component_t mca_state_dvm_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component
 */
.base_version = {
ORTE_STATE_BASE_VERSION_1_0_0,
/* Component name and version */
.mca_component_name = "dvm",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = state_dvm_open,
.mca_close_component = state_dvm_close,
/* selection entry point: offers orte_state_dvm_module when applicable */
.mca_query_component = state_dvm_component_query,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/*
 * Component open hook: nothing to set up, always reports success.
 */
static int
state_dvm_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component close hook: nothing to tear down, always reports success.
 */
static int
state_dvm_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query: decide whether this component can be used.
 *
 * module   - receives the dvm module, or NULL when not available
 * priority - receives 100 when available, 0 otherwise
 *
 * Returns ORTE_SUCCESS for DVM masters (ORTE_PROC_IS_MASTER) and
 * ORTE_ERR_NOT_AVAILABLE for every other kind of process.
 */
static int state_dvm_component_query(mca_base_module_t **module, int *priority)
{
    if (!ORTE_PROC_IS_MASTER) {
        /* only DVM masters use this component */
        *priority = 0;
        *module = NULL;
        return ORTE_ERR_NOT_AVAILABLE;
    }

    *priority = 100;
    *module = (mca_base_module_t *)&orte_state_dvm_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -42,12 +42,4 @@ DIST_SUBDIRS += \
tools/wrappers \
tools/orte-top \
tools/orte-info \
tools/orte-server \
tools/orte-dvm \
tools/ompi-prun
if OPAL_WANT_PRUN
SUBDIRS += \
tools/ompi-prun \
tools/orte-dvm
endif
tools/orte-server

Просмотреть файл

@ -1,59 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is not quite in the Automake spirit, but we have to do it.
# Since the totalview portion of the library must be built with -g, we
# must eliminate the CFLAGS that are passed in here by default (which
# may already have debugging and/or optimization flags). We use
# post-processed forms of the CFLAGS in the library targets down
# below.
# NOTE(review): this directory only builds the ompi-prun tool; the
# rationale above looks inherited from another Makefile.am - confirm it
# still applies here.
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
include $(top_srcdir)/Makefile.ompi-rules
# The man page is distributed as its ".1in" template (see EXTRA_DIST).
man_pages = ompi-prun.1
EXTRA_DIST = $(man_pages:.1=.1in)
# The binary and its man page are only installed when binaries are wanted.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = ompi-prun
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
endif # OPAL_INSTALL_BINARIES
# Sources of the ompi-prun tool.
ompi_prun_SOURCES = \
main.c \
prun.c \
prun.h
# The tool links against the ORTE and OPAL libraries from the build tree.
ompi_prun_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
# Remove the man pages on "make distclean".
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,33 +0,0 @@
/***************************************************************************
* *
* Open MPI: Open Source High Performance Computing *
* *
* http://www.open-mpi.org/ *
* *
***************************************************************************/
#include "prun.h"
/*
 * Entry point of the ompi-prun tool: hands the command line to prun()
 * unchanged and uses its return value as the process exit status.
 */
int main(int argc, char *argv[])
{
    int exit_status = prun(argc, argv);

    return exit_status;
}
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,228 +0,0 @@
#! /bin/sh
# prun - temporary wrapper script for .libs/prun
# Generated by libtool (GNU libtool) 2.4.6
#
# The prun program cannot be directly executed until all the libtool
# libraries that it depends on are installed.
#
# This wrapper script should never be moved out of the build directory.
# If it is, it will not operate correctly.
# Sed substitution that helps us do robust quoting. It backslashifies
# metacharacters that are still active within double-quoted strings.
sed_quote_subst='s|\([`"$\\]\)|\\\1|g'
# Be Bourne compatible
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
emulate sh
NULLCMD=:
# Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
# is contrary to our usage. Disable this feature.
alias -g '${1+"$@"}'='"$@"'
setopt NO_GLOB_SUBST
else
case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
fi
BIN_SH=xpg4; export BIN_SH # for Tru64
DUALCASE=1; export DUALCASE # for MKS sh
# The HP-UX ksh and POSIX shell print the target directory to stdout
# if CDPATH is set.
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
relink_command="(cd /home/common/openmpi/foobar/orte/tools/prun; LIBRARY_PATH=/opt/local/lib; export LIBRARY_PATH; { test -z \"\${COMPILER_PATH+set}\" || unset COMPILER_PATH || { COMPILER_PATH=; export COMPILER_PATH; }; }; { test -z \"\${GCC_EXEC_PREFIX+set}\" || unset GCC_EXEC_PREFIX || { GCC_EXEC_PREFIX=; export GCC_EXEC_PREFIX; }; }; { test -z \"\${LD_RUN_PATH+set}\" || unset LD_RUN_PATH || { LD_RUN_PATH=; export LD_RUN_PATH; }; }; LD_LIBRARY_PATH=/home/common/openmpi/build/foobar/lib:/home/common/local/lib:/home/common/pmix/build/prrte/lib; export LD_LIBRARY_PATH; PATH=/home/common/openmpi/build/foobar/bin:/home/common/local/bin:/home/common/pmix/build/prrte/bin:/home/common/local/sbin:/usr/lib64/qt-3.3/bin:/home/rhc/perl5/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/rhc/.local/bin:/home/rhc/bin; export PATH; gcc -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -fno-strict-aliasing -mcx16 -pthread -g -o \$progdir/\$file main.o prun.o ../../../orte/.libs/libopen-rte.so /home/common/openmpi/foobar/opal/.libs/libopen-pal.so ../../../opal/.libs/libopen-pal.so -ldl -ludev -lrt -lm -lutil -lz -pthread -Wl,-rpath -Wl,/home/common/openmpi/foobar/orte/.libs -Wl,-rpath -Wl,/home/common/openmpi/foobar/opal/.libs -Wl,-rpath -Wl,/home/common/openmpi/build/foobar/lib)"
# This environment variable determines our operation mode.
if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then
# install mode needs the following variables:
generated_by_libtool_version='2.4.6'
notinst_deplibs=' ../../../orte/libopen-rte.la /home/common/openmpi/foobar/opal/libopen-pal.la ../../../opal/libopen-pal.la'
else
# When we are sourced in execute mode, $file and $ECHO are already set.
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
file="$0"
# A function that is used when there is no print builtin or printf.
# Prints its first argument followed by a newline by feeding it to cat
# through a here-document.
func_fallback_echo ()
{
eval 'cat <<_LTECHO_EOF
$1
_LTECHO_EOF'
}
ECHO="printf %s\\n"
fi
# Very basic option parsing. These options are (a) specific to
# the libtool wrapper, (b) are identical between the wrapper
# /script/ and the wrapper /executable/ that is used only on
# windows platforms, and (c) all begin with the string --lt-
# (application programs are unlikely to have options that match
# this pattern).
#
# There are only two supported options: --lt-debug and
# --lt-dump-script. There is, deliberately, no --lt-help.
#
# The first argument to this parsing function should be the
# script's ../../../libtool value, followed by no.
lt_option_debug=
# Scan the arguments for wrapper-specific --lt-* options.
#   $1    the script's own path (as invoked)
#   $2..  the arguments the wrapper was invoked with
# --lt-debug sets lt_option_debug, --lt-dump-script prints this script and
# exits 0, any other --lt-* option is reported on stderr and exits 1.
func_parse_lt_options ()
{
lt_script_arg0=$0
shift
for lt_opt
do
case "$lt_opt" in
--lt-debug) lt_option_debug=1 ;;
--lt-dump-script)
# split the script path into directory (lt_dump_D) and file (lt_dump_F)
lt_dump_D=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%/[^/]*$%%'`
test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=.
lt_dump_F=`$ECHO "X$lt_script_arg0" | /usr/bin/sed -e 's/^X//' -e 's%^.*/%%'`
cat "$lt_dump_D/$lt_dump_F"
exit 0
;;
--lt-*)
$ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2
exit 1
;;
esac
done
# Print the debug banner immediately:
if test -n "$lt_option_debug"; then
echo "prun:prun:$LINENO: libtool wrapper (GNU libtool) 2.4.6" 1>&2
fi
}
# Used when --lt-debug. Prints its arguments to stdout
# (redirection is the responsibility of the caller)
# Each argument is printed on its own line as "newargv[N]: ARG", with N
# counting up from 1.
func_lt_dump_args ()
{
lt_dump_args_N=1;
for lt_arg
do
$ECHO "prun:prun:$LINENO: newargv[$lt_dump_args_N]: $lt_arg"
lt_dump_args_N=`expr $lt_dump_args_N + 1`
done
}
# Core function for launching the target application
# Replaces this shell with $progdir/$program, passing the arguments
# through. In debug mode the command line is first dumped to stderr.
func_exec_program_core ()
{
if test -n "$lt_option_debug"; then
$ECHO "prun:prun:$LINENO: newargv[0]: $progdir/$program" 1>&2
func_lt_dump_args ${1+"$@"} 1>&2
fi
exec "$progdir/$program" ${1+"$@"}
# only reached when the exec itself failed
$ECHO "$0: cannot exec $program $*" 1>&2
exit 1
}
# A function to encapsulate launching the target application
# Strips options in the --lt-* namespace from $@ and
# launches target application with the remaining arguments.
func_exec_program ()
{
case " $* " in
*\ --lt-*)
# Rebuild the positional parameters in place: each kept argument is
# appended to the end of the list, and every original argument is shifted
# off the front once it has been examined.
for lt_wr_arg
do
case $lt_wr_arg in
--lt-*) ;;
*) set x "$@" "$lt_wr_arg"; shift;;
esac
shift
done ;;
esac
func_exec_program_core ${1+"$@"}
}
# Parse options
func_parse_lt_options "$0" ${1+"$@"}
# Find the directory that this script lives in.
thisdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
test "x$thisdir" = "x$file" && thisdir=.
# Follow symbolic links until we get to the real thisdir.
file=`ls -ld "$file" | /usr/bin/sed -n 's/.*-> //p'`
while test -n "$file"; do
destdir=`$ECHO "$file" | /usr/bin/sed 's%/[^/]*$%%'`
# If there was a directory component, then change thisdir.
if test "x$destdir" != "x$file"; then
case "$destdir" in
[\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;;
*) thisdir="$thisdir/$destdir" ;;
esac
fi
file=`$ECHO "$file" | /usr/bin/sed 's%^.*/%%'`
file=`ls -ld "$thisdir/$file" | /usr/bin/sed -n 's/.*-> //p'`
done
# Usually 'no', except on cygwin/mingw when embedded into
# the cwrapper.
WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=no
if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then
# special case for '.'
if test "$thisdir" = "."; then
thisdir=`pwd`
fi
# remove .libs from thisdir
case "$thisdir" in
*[\\/].libs ) thisdir=`$ECHO "$thisdir" | /usr/bin/sed 's%[\\/][^\\/]*$%%'` ;;
.libs ) thisdir=. ;;
esac
fi
# Try to get the absolute directory name.
absdir=`cd "$thisdir" && pwd`
test -n "$absdir" && thisdir="$absdir"
program=lt-'prun'
progdir="$thisdir/.libs"
if test ! -f "$progdir/$program" ||
{ file=`ls -1dt "$progdir/$program" "$progdir/../$program" 2>/dev/null | /usr/bin/sed 1q`; \
test "X$file" != "X$progdir/$program"; }; then
file="$$-$program"
if test ! -d "$progdir"; then
mkdir "$progdir"
else
rm -f "$progdir/$file"
fi
# relink executable if necessary
if test -n "$relink_command"; then
if relink_command_output=`eval $relink_command 2>&1`; then :
else
$ECHO "$relink_command_output" >&2
rm -f "$progdir/$file"
exit 1
fi
fi
mv -f "$progdir/$file" "$progdir/$program" 2>/dev/null ||
{ rm -f "$progdir/$program";
mv -f "$progdir/$file" "$progdir/$program"; }
rm -f "$progdir/$file"
fi
if test -f "$progdir/$program"; then
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
# Run the actual program with our arguments.
func_exec_program ${1+"$@"}
fi
else
# The program doesn't exist.
$ECHO "$0: error: '$progdir/$program' does not exist" 1>&2
$ECHO "This script is just a wrapper for $program." 1>&2
$ECHO "See the libtool documentation for more information." 1>&2
exit 1
fi
fi

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,37 +0,0 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef PRUN_H
#define PRUN_H
#include "orte_config.h"
BEGIN_C_DECLS
/**
 * Main body of prun functionality
 *
 * @param argc  argument count, as received by main()
 * @param argv  argument vector, as received by main()
 * @return value used by main() as the process exit status
 */
int prun(int argc, char *argv[]);
END_C_DECLS
#endif /* PRUN_H */

Просмотреть файл

@ -1,57 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is not quite in the Automake spirit, but we have to do it.
# Since the totalview portion of the library must be built with -g, we
# must eliminate the CFLAGS that are passed in here by default (which
# may already have debugging and/or optimization flags). We use
# post-processed forms of the CFLAGS in the library targets down
# below.
# Override the default CFLAGS with the optimization-free flag set plus
# the debugger flags.
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
include $(top_srcdir)/Makefile.ompi-rules
# Man page for this tool; the matching ".1in" file (same name with the
# ".1" suffix replaced) is what gets added to the distribution.
man_pages = orte-dvm.1
EXTRA_DIST = $(man_pages:.1=.1in)
# The orte-dvm program and its man page are only declared when the
# OPAL_INSTALL_BINARIES conditional is true.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-dvm
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
endif # OPAL_INSTALL_BINARIES
# Single source file, linked against the ORTE and OPAL libraries from
# the build tree.
orte_dvm_SOURCES = \
orte-dvm.c
orte_dvm_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
# The man page is not distributed (nodist_), so remove it on distclean.
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -1,193 +0,0 @@
.\" -*- nroff -*-
.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
.\" Copyright (c) 2015 Intel, Inc. All rights reserved
.\" $COPYRIGHT$
.\"
.\" Man page for ORTE's orte-dvm command
.\"
.\" .TH name section center-footer left-footer center-header
.TH ORTE-DVM 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
orte-dvm, ompi-dvm \- Establish a Distributed Virtual Machine (DVM).
.B Note:
\fIorte-dvm\fP and \fIompi-dvm\fP are synonyms for each
other. Using either of the names will produce the same behavior.
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.PP
.B orte-dvm
[ options ]
.P
Invoking \fIorte-dvm\fP via an absolute path
name is equivalent to specifying the \fI--prefix\fP option with a
\fI<dir>\fR value equivalent to the directory where \fIorte-dvm\fR
resides, minus its last subdirectory. For example:
\fB%\fP /usr/local/bin/orte-dvm ...
is equivalent to
\fB%\fP orte-dvm --prefix /usr/local
.
.\" **************************
.\" Quick Summary Section
.\" **************************
.SH QUICK SUMMARY
.
\fIorte-dvm\fP will establish a DVM that can be used to execute subsequent
applications. Use of \fIorte-dvm\fP can be advantageous, for example, when you want to
execute a number of short-lived tasks. In such cases, the time required to start
the ORTE DVM can be a significant fraction of the time to execute the
overall application. Thus, creating a persistent DVM can speed the overall
execution. In addition, a persistent DVM will support executing multiple parallel
applications while maintaining separation between their respective cores.
.\" **************************
.\" Options Section
.\" **************************
.SH OPTIONS
.
.\"
.\" Start options listing
.\" Indent 10 characters from start of first column to start of second column
.
.TP
.B -h\fR,\fP --help
Display help for this command
.
.
.TP
.B -V\fR,\fP --version
Print version number. If no other arguments are given, this will also
cause orte-dvm to exit.
.
.
.P
Use one of the following options to specify which hosts (nodes) of the cluster to use
for the DVM.
.
.
.TP
.B -H\fR,\fP -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
List of hosts for the DVM.
.
.
.TP
.B
-hostfile\fR,\fP --hostfile \fR<hostfile>\fP
Provide a hostfile to use.
.
.
.TP
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
Synonym for \fI-hostfile\fP.
.
.
.TP
.B --prefix \fR<dir>\fP
Prefix directory that will be used to set the \fIPATH\fR and
\fILD_LIBRARY_PATH\fR on the remote node before invoking the ORTE daemon.
.
.
.P
Setting MCA parameters:
.
.
.TP
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
the parameter name; \fI<value>\fP is the parameter value.
.
.
.TP
.B -mca\fR,\fP --mca <key> <value>
Send arguments to various MCA modules. See the "MCA" section, below.
.
.
.
.
.TP
.B -report-uri\fR,\fP --report-uri <channel>
Print out orte-dvm's URI during startup. The channel must be either a '-' to indicate that
the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr,
or a filename to which the URI is to be written.
.
.
.P
The following options are useful for developers; they are not generally
useful to most ORTE and/or MPI users:
.
.TP
.B -d\fR,\fP --debug-devel
Enable debugging of the ORTE layer.
.
.
.TP
.B --debug-daemons-file
Enable debugging of the ORTE daemons in the DVM, storing
output in files.
.
.
.P
There may be other options listed with \fIorte-dvm --help\fP.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
\fIorte-dvm\fP starts a Distributed Virtual Machine (DVM) by launching
a daemon on each node of the allocation, as modified or specified by
the \fI-host\fP and \fI-hostfile\fP options. Applications can subsequently
be executed using the \fIorte-submit\fP command.
.
The DVM remains in operation until receiving the \fIorte-submit -terminate\fP
command.
.
.
.
.SS Specifying Host Nodes
.
Host nodes can be identified on the \fIorte-dvm\fP command line with the \fI-host\fP
option or in a hostfile.
.
.PP
For example,
.
.TP 4
orte-dvm -H aa,aa,bb ./a.out
launches two processes on node aa and one on bb.
.
.PP
Or, consider the hostfile
.
\fB%\fP cat myhostfile
aa slots=2
bb slots=2
cc slots=2
.
.PP
Here, we list not only the host names (aa, bb, and cc) but also how many "slots"
there are for each. Slots indicate how many processes can potentially execute
on a node. For best performance, the number of slots may be chosen to be the
number of cores on the node or the number of processor sockets. If the hostfile
does not provide slots information, a default of 1 is assumed.
When running under resource managers (e.g., SLURM, Torque, etc.),
Open MPI will obtain both the hostnames and the number of slots directly
from the resource manager.
.
.

Просмотреть файл

@ -1,482 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <errno.h>
#include <signal.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <fcntl.h>
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#include "opal/mca/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
#include "opal/util/cmd_line.h"
#include "opal/util/opal_environ.h"
#include "opal/util/opal_getcwd.h"
#include "opal/util/show_help.h"
#include "opal/util/fd.h"
#include "opal/util/daemon_init.h"
#include "opal/version.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_info_support.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/orted/orted.h"
/*
 * Build-time default: when ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT is
 * non-zero, main() applies opal_install_dirs.prefix as the prefix even
 * if no --prefix option was given.
 */
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
/*
 * Command-line state for this tool. main() zeroes the whole struct
 * with memset() before parsing; each field is the destination of the
 * cmd_line_init entry named in its comment.
 */
static struct {
bool help; /* -h / --help */
bool version; /* -V / --version */
char *prefix; /* --prefix <dir> */
bool run_as_root; /* --allow-run-as-root */
bool set_sid; /* --set-sid */
bool daemonize; /* --daemonize */
bool system_server; /* --system-server */
char *report_uri; /* --report-uri <channel> */
bool remote_connections; /* --remote-tools */
} myglobals;
/*
 * Command-line option table handed to opal_cmd_line_create() in main().
 *
 * NOTE(review): judging from the entries below, the positional fields
 * appear to be { MCA parameter name, short option character,
 * single-dash name, long name, number of arguments, destination
 * variable, value type, help text [, output type] } - confirm against
 * the opal_cmd_line_init_t declaration in opal/util/cmd_line.h.
 *
 * Entries with a NULL destination are not stored in myglobals; main()
 * reads hostfile/machinefile/host back through the opal_cmd_line_*
 * query functions.
 */
static opal_cmd_line_init_t cmd_line_init[] = {
/* Various "obvious" options */
{ NULL, 'h', NULL, "help", 0,
&myglobals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, 'V', NULL, "version", 0,
&myglobals.version, OPAL_CMD_LINE_TYPE_BOOL,
"Print version and exit" },
{ NULL, '\0', "prefix", "prefix", 1,
&myglobals.prefix, OPAL_CMD_LINE_TYPE_STRING,
"Prefix to be used to look for ORTE executables" },
{ "orte_daemonize", '\0', NULL, "daemonize", 0,
&myglobals.daemonize, OPAL_CMD_LINE_TYPE_BOOL,
"Daemonize the orte-dvm into the background" },
{ NULL, '\0', NULL, "set-sid", 0,
&myglobals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
"Direct the orte-dvm to separate from the current session"},
{ "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Debug daemons" },
{ "orte_debug", 'd', "debug-devel", "debug-devel", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable debugging of OpenRTE" },
/* main() refuses to run with an effective uid of 0 unless this is set */
{ NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0,
&myglobals.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
"Allow execution as root (STRONGLY DISCOURAGED)" },
/* Specify the launch agent to be used */
{ "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Command used to start processes on remote nodes (default: orted)" },
/* maximum size of VM - typically used to subdivide an allocation */
{ "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Maximum size of VM" },
/* Set a hostfile */
{ NULL, '\0', "hostfile", "hostfile", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile" },
/* machinefile is handled by main() as an alternative spelling of
 * hostfile; giving both is reported as an error there */
{ NULL, '\0', "machinefile", "machinefile", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile" },
{ "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide a default hostfile" },
/* may be given several times; main() joins all values with ',' */
{ NULL, 'H', "host", "host", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of hosts to invoke processes on" },
/* the next three are forwarded by main() as PMIx-related environment
 * variables */
{ NULL, '\0', "system-server", "system-server", 0,
&myglobals.system_server, OPAL_CMD_LINE_TYPE_BOOL,
"Provide a system-level server connection point - only one allowed per node" },
{ NULL, '\0', "report-uri", "report-uri", 1,
&myglobals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]",
OPAL_CMD_LINE_OTYPE_DEBUG },
{ NULL, '\0', "remote-tools", "remote-tools", 0,
&myglobals.remote_connections, OPAL_CMD_LINE_TYPE_BOOL,
"Enable connections from remote tools" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
/**
 * Entry point for orte-dvm.
 *
 * Parses the command line, handles --version / --help and the
 * run-as-root check, exports the PMIx server settings into the
 * environment, initializes ORTE as ORTE_PROC_MASTER, records the
 * prefix / hostfile / host settings on the daemon app context, and
 * then runs the event loop until orte_event_base_active is cleared.
 *
 * @param argc  number of command-line arguments
 * @param argv  command-line argument vector; argv[0] supplies the
 *              basename used in diagnostic messages
 *
 * @return The normal exit path is exit(orte_exit_status).  The error
 *         code is returned directly when command-line parsing or
 *         orte_init() fails, and ORTE_ERR_FATAL is returned for an
 *         empty prefix, an allocation failure while handling the
 *         prefix, or conflicting hostfile options.
 */
int main(int argc, char *argv[])
{
    int rc, i, j;
    opal_cmd_line_t cmd_line;
    char *param, *value;
    orte_job_t *jdata=NULL;
    orte_app_context_t *app;

    /* Setup and parse the command line */
    memset(&myglobals, 0, sizeof(myglobals));

    /* find our basename (the name of the executable) so that we can
       use it in pretty-print error messages */
    orte_basename = opal_basename(argv[0]);

    opal_cmd_line_create(&cmd_line, cmd_line_init);
    mca_base_cmd_line_setup(&cmd_line);
    if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, false,
                                                  argc, argv)) ) {
        if (OPAL_ERR_SILENT != rc) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(rc));
        }
        return rc;
    }

    /* print version if requested.  Do this before check for help so
       that --version --help works as one might expect. */
    if (myglobals.version) {
        char *str;
        str = opal_info_make_version_str("all",
                                         OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
                                         OPAL_RELEASE_VERSION,
                                         OPAL_GREEK_VERSION,
                                         OPAL_REPO_REV);
        if (NULL != str) {
            fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
                    orte_basename, str, PACKAGE_BUGREPORT);
            free(str);
        }
        exit(0);
    }

    /* check if we are running as root - if we are, then only allow
     * us to proceed if the allow-run-as-root flag was given. Otherwise,
     * exit with a giant warning flag
     */
    if (0 == geteuid() && !myglobals.run_as_root) {
        /* show_help is not yet available, so print an error manually */
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        if (myglobals.help) {
            fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename);
        } else {
            fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename);
        }

        fprintf(stderr, "Running at root is *strongly* discouraged as any mistake (e.g., in\n");
        fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n");
        fprintf(stderr, "file system, leaving your system in an unusable state.\n\n");

        fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename);

        fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
        fprintf(stderr, "option to your command line.  However, we reiterate our strong advice\n");
        fprintf(stderr, "against doing so - please do so at your own risk.\n");
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        exit(1);
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     * NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
        exit(1);
    }

    /* Need to initialize OPAL so that install_dirs are filled in */
    if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
        exit(1);
    }

    /* Check for help request */
    if (myglobals.help) {
        char *str, *args = NULL;
        const char *project_name = NULL;

        if (0 == strcmp(orte_basename, "mpirun")) {
            project_name = "Open MPI";
        } else {
            project_name = "OpenRTE";
        }
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
                                    orte_basename, project_name, OPAL_VERSION,
                                    orte_basename, args,
                                    PACKAGE_BUGREPORT);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);

        /* If someone asks for help, that should be all we do */
        exit(0);
    }

    if (myglobals.system_server) {
        /* we should act as system-level PMIx server */
        opal_setenv(OPAL_MCA_PREFIX"pmix_system_server", "1", true, &environ);
    }
    /* always act as session-level PMIx server */
    opal_setenv(OPAL_MCA_PREFIX"pmix_session_server", "1", true, &environ);
    /* if we were asked to report a uri, set the MCA param to do so */
    if (NULL != myglobals.report_uri) {
        opal_setenv("PMIX_MCA_ptl_tcp_report_uri", myglobals.report_uri, true, &environ);
    }
    if (myglobals.remote_connections) {
        opal_setenv("PMIX_MCA_ptl_tcp_remote_connections", "1", true, &environ);
    }

    /* Setup MCA params */
    orte_register_params();

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the environment prior
     * to calling this tool
     */
    orte_launch_environ = opal_argv_copy(environ);

#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (myglobals.set_sid) {
        setsid();
    }
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       myglobals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Initialize our Open RTE environment */
    if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_MASTER))) {
        /* cannot call ORTE_ERROR_LOG as it could be the errmgr
         * never got loaded!
         */
        return rc;
    }
    /* finalize OPAL. As it was opened again from orte_init->opal_init
     * we continue to have a reference count on it. So we have to finalize it twice...
     */
    opal_finalize();

    /* get the daemon job object - was created by ess/hnp component */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        orte_show_help("help-orterun.txt", "bad-job-object", true,
                       orte_basename);
        /* NOTE(review): this error path exits with status 0; kept as-is
         * for compatibility, but a non-zero status looks more
         * appropriate - confirm before changing. */
        exit(0);
    }
    /* also should have created a daemon "app" */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        orte_show_help("help-orterun.txt", "bad-app-object", true,
                       orte_basename);
        /* NOTE(review): same status-0 exit on an error path as above */
        exit(0);
    }

    /* Did the user specify a prefix, or want prefix by default? */
    if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
        size_t param_len;
        size_t value_len;

        /* if both the prefix was given and we have a prefix
         * given above, check to see if they match
         */
        if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
            NULL != myglobals.prefix) {
            /* if they don't match, then that merits a warning */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
            value = strdup(myglobals.prefix);
            if (NULL == param || NULL == value) {
                /* both copies are dereferenced below, so bail out
                 * rather than crash on an allocation failure */
                fprintf(stderr, "%s: out of memory\n", orte_basename);
                free(param);
                free(value);
                return ORTE_ERR_FATAL;
            }
            /* ensure we strip any trailing '/'; the length guard keeps
             * an empty prefix ("") from indexing before the buffer */
            param_len = strlen(param);
            if (0 < param_len &&
                0 == strcmp(OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
            }
            value_len = strlen(value);
            if (0 < value_len &&
                0 == strcmp(OPAL_PATH_SEP, &(value[value_len-1]))) {
                value[value_len-1] = '\0';
            }
            if (0 != strcmp(param, value)) {
                orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
                               true, orte_basename, value, param);
                /* let the global-level prefix take precedence since we
                 * know that one is being used
                 */
                free(param);
                param = strdup(myglobals.prefix);
            }
            free(value);
        } else if (NULL != myglobals.prefix) {
            /* work on a private copy: param is modified and freed
             * below, and must not alias myglobals.prefix */
            param = strdup(myglobals.prefix);
        } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
            /* must be --prefix alone */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
        } else {
            /* --enable-orterun-prefix-default was given to orterun */
            param = strdup(opal_install_dirs.prefix);
        }

        if (NULL != param) {
            /* "Parse" the param, aka remove superfluous path_sep. */
            param_len = strlen(param);
            while (0 < param_len &&
                   0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
                param_len--;
            }
            /* nothing left (either the prefix was empty to begin with
             * or it consisted solely of path separators) */
            if (0 == param_len) {
                orte_show_help("help-orterun.txt", "orterun:empty-prefix",
                               true, orte_basename, orte_basename);
                free(param);
                return ORTE_ERR_FATAL;
            }
            orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
            free(param);
        }
    }

    /* Did the user specify a hostfile. Need to check for both
     * hostfile and machine file.
     * We can only deal with one hostfile per app context, otherwise give an error.
     */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
        if(1 < j) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, orte_basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
        /* a machinefile conflicts with a hostfile that was already set */
        if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, orte_basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }

    /* Did the user specify any hosts? */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
        char **targ=NULL, *tval;
        /* collect every -host instance and store them as one
         * comma-separated list */
        for (i = 0; i < j; ++i) {
            value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
            opal_argv_append_nosize(&targ, value);
        }
        tval = opal_argv_join(targ, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
        opal_argv_free(targ);
        free(tval);
    }
    OBJ_DESTRUCT(&cmd_line);

    /* setup to listen for commands sent specifically to me, even though I would probably
     * be the one sending them! Unfortunately, since I am a participating daemon,
     * there are times I need to send a command to "all daemons", and that means *I* have
     * to receive it too
     */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* spawn the DVM - we skip the initial steps as this
     * isn't a user-level application */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }
    ORTE_ACQUIRE_OBJECT(orte_event_base_active);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}