1
1

Merge pull request #370 from rhc54/topic/dvm

Enable a persistent DVM
Этот коммит содержится в:
rhc54 2015-02-01 17:37:10 -08:00
родитель cb66aeb697 4dba298e6e
Коммит 95660822b9
24 изменённых файлов: 4476 добавлений и 145 удалений

6
.gitignore поставляемый
Просмотреть файл

@ -476,6 +476,9 @@ orte/tools/orte-checkpoint/orte-checkpoint.1
orte/tools/orte-checkpoint/ompi-checkpoint.1
orte/tools/orte-clean/orte-clean
orte/tools/orte-clean/orte-clean.1
orte/tools/orte-dvm/orte-dvm
orte/tools/orte-dvm/orte-dvm.1
ompi/mca/rte/orte/ompi-dvm.1
orte/tools/orte-info/orte-info
orte/tools/orte-info/orte-info.1
orte/tools/orte-migrate/orte-migrate
@ -488,6 +491,9 @@ orte/tools/orte-restart/orte-restart.1
orte/tools/orte-restart/ompi-restart.1
orte/tools/orte-server/orte-server
orte/tools/orte-server/orte-server.1
orte/tools/orte-submit/orte-submit
orte/tools/orte-submit/orte-submit.1
ompi/mca/rte/orte/ompi-submit.1
orte/tools/orte-top/orte-top
orte/tools/orte-top/orte-top.1
orte/tools/orted/orted

Просмотреть файл

@ -5,7 +5,8 @@
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
# reserved.
# reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -32,5 +33,7 @@ AC_DEFUN([ORTE_CONFIG_FILES],[
orte/tools/orte-migrate/Makefile
orte/tools/orte-info/Makefile
orte/tools/orte-server/Makefile
orte/tools/orte-submit/Makefile
orte/tools/orte-dvm/Makefile
])
])

Просмотреть файл

@ -27,7 +27,7 @@ libmca_rte_orte_la_SOURCES =$(sources) $(headers)
libmca_rte_orte_la_LDFLAGS = -module -avoid-version
libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1
man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 ompi-submit.1
if WANT_FT
man_pages += ompi-checkpoint.1 ompi-restart.1
@ -43,6 +43,8 @@ install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-submit$(EXEEXT); $(LN_S) orte-submit$(EXEEXT) ompi-submit$(EXEEXT))
if WANT_FT
(cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT))
@ -55,7 +57,9 @@ uninstall-local:
$(DESTDIR)$(bindir)/ompi-ps$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-submit$(EXEEXT)
if WANT_FT
rm -f $(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-restart$(EXEEXT) \
@ -115,5 +119,11 @@ $(top_builddir)/orte/tools/orte-server/orte-server.1:
ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1
cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1
ompi-submit.1: $(top_builddir)/orte/tools/orte-submit/orte-submit.1
cp -f $(top_builddir)/orte/tools/orte-submit/orte-submit.1 ompi-submit.1
clean-local:
rm -f $(man_pages)

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
*
* $COPYRIGHT$
@ -36,19 +36,22 @@
#include "opal/mca/event/event.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/util/arch.h"
#include "opal/util/proc.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/rml/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/state/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/base/base.h"
#endif
#include "orte/mca/schizo/base/base.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
@ -59,6 +62,8 @@
#include "orte/mca/ess/base/base.h"
static bool progress_thread_running = false;
int orte_ess_base_tool_setup(void)
{
int ret;
@ -79,6 +84,9 @@ int orte_ess_base_tool_setup(void)
* so it will do the right things.
*/
orte_process_info.proc_type |= ORTE_PROC_NON_MPI;
/* get a separate orte event base */
orte_event_base = opal_start_progress_thread("orte", true);
progress_thread_running = true;
}
/* open and setup the state machine */
@ -93,6 +101,18 @@ int orte_ess_base_tool_setup(void)
goto error;
}
/* open and setup the error manager */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}
/* Setup the communication infrastructure */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -177,6 +197,15 @@ int orte_ess_base_tool_setup(void)
error = "orte_iof_base_select";
goto error;
}
/* if we were given an HNP, then also setup the PLM in case this
* tool wants to request that we spawn something for it */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
/* we don't select the plm framework as we only want the
* base proxy functions */
}
#if OPAL_ENABLE_FT_CR == 1
@ -208,7 +237,19 @@ int orte_ess_base_tool_setup(void)
/* Tools do not need all the OPAL CR stuff */
opal_cr_set_enabled(false);
#endif
/* setup schizo in case we are parsing cmd lines */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_schizo_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_schizo_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_schizo_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_schizo_base_select";
goto error;
}
return ORTE_SUCCESS;
error:
@ -237,6 +278,13 @@ int orte_ess_base_tool_finalize(void)
}
(void) mca_base_framework_close(&orte_routed_base_framework);
(void) mca_base_framework_close(&orte_rml_base_framework);
(void) mca_base_framework_close(&orte_schizo_base_framework);
(void) mca_base_framework_close(&orte_errmgr_base_framework);
/* release the event base */
if (progress_thread_running) {
opal_stop_progress_thread("orte", true);
progress_thread_running = false;
}
return ORTE_SUCCESS;
}

20
orte/mca/ess/env/ess_env_component.c поставляемый
Просмотреть файл

@ -66,21 +66,11 @@ orte_ess_env_component_open(void)
int orte_ess_env_component_query(mca_base_module_t **module, int *priority)
{
/* we are the env module, so set the priority to
* be higher than the tool component so that a
* tool launched as a distributed set of procs
* (i.e., a "tool with name") will select this
* module, but low enough that any other environment
* will override us
*/
/* if we don't have a path back to the HNP, then we
* were not launched by mpirun, so don't pick us as
* it would be impossible for the correct env vars
* to have been set!
*/
if (NULL != orte_process_info.my_hnp_uri) {
*priority = 20;
/* we are the env module, only used by daemons that are
* launched by ssh so allow any enviro-specifc modules
* to override us */
if (ORTE_PROC_IS_DAEMON) {
*priority = 1;
*module = (mca_base_module_t *)&orte_ess_env_module;
return ORTE_SUCCESS;
}

81
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -112,64 +112,22 @@ static int rte_init(void)
/* if I am a daemon, complete my setup using the
* default procedure
*/
if (ORTE_PROC_IS_DAEMON) {
if (NULL != orte_node_regex) {
/* extract the nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
error = "orte_regex_extract_node_names";
goto error;
}
}
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_orted_setup";
if (NULL != orte_node_regex) {
/* extract the nodes */
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
error = "orte_regex_extract_node_names";
goto error;
}
opal_argv_free(hosts);
return ORTE_SUCCESS;
}
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto error;
}
return ORTE_SUCCESS;
}
/* use the default procedure to finish my setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(true))) {
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_app_setup";
error = "orte_ess_base_orted_setup";
goto error;
}
/* setup process binding */
if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
error = "proc_binding";
goto error;
}
/* if we are an ORTE app - and not an MPI app - then
* we need to exchange our connection info here.
* MPI_Init has its own modex, so we don't need to do
* two of them. However, if we don't do a modex at all,
* then processes have no way to communicate
*
* NOTE: only do this when the process originally launches.
* Cannot do this on a restart as the rest of the processes
* in the job won't be executing this step, so we would hang
*/
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
opal_pmix.fence(NULL, 0);
}
opal_argv_free(hosts);
return ORTE_SUCCESS;
error:
error:
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
@ -183,29 +141,10 @@ static int rte_finalize(void)
{
int ret;
/* if I am a daemon, finalize using the default procedure */
if (ORTE_PROC_IS_DAEMON) {
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
ORTE_ERROR_LOG(ret);
}
return ret;
} else if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) {
ORTE_ERROR_LOG(ret);
}
/* as a tool, I didn't create a nidmap - so just return now */
return ret;
}
/* otherwise, I must be an application process
* use the default procedure to finish
*/
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
ORTE_ERROR_LOG(ret);
}
return ORTE_SUCCESS;
return ret;
}
static int env_set_name(void)

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -19,6 +20,7 @@
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/hash_string.h"
#include <sys/types.h>
#include <stdio.h>
@ -57,28 +59,50 @@ static int rte_init(void)
{
int ret;
char *error = NULL;
orte_jobid_t jobid;
orte_vpid_t vpid;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog";
goto error;
}
/* If we are a tool with no name, then responsibility for
* defining the name falls to the PLM component for our
* respective environment.
* Just call the base function for this.
*
* NOTE: Tools with names - i.e., tools consisting of a
* distributed set of processes - will select and use
* the appropriate enviro-specific module and -not- this one!
*/
if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_set_hnp_name";
goto error;
if (NULL != orte_ess_base_jobid &&
NULL != orte_ess_base_vpid) {
opal_output_verbose(2, orte_ess_base_framework.framework_output,
"ess:tool:obtaining name from environment");
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) {
return(ret);
}
ORTE_PROC_MY_NAME->jobid = jobid;
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, orte_ess_base_vpid))) {
return(ret);
}
ORTE_PROC_MY_NAME->vpid = vpid;
} else {
/* If we are a tool with no name, then define it here */
uint16_t jobfam;
uint32_t hash32;
uint32_t bias;
opal_output_verbose(2, orte_ess_base_framework.framework_output,
"ess:tool:computing name");
/* hash the nodename */
OPAL_HASH_STR(orte_process_info.nodename, hash32);
bias = (uint32_t)orte_process_info.pid;
/* fold in the bias */
hash32 = hash32 ^ bias;
/* now compress to 16-bits */
jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
}
/* do the rest of the standard tool init */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
ORTE_ERROR_LOG(ret);
@ -88,7 +112,7 @@ static int rte_init(void)
return ORTE_SUCCESS;
error:
error:
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",

Просмотреть файл

@ -164,28 +164,26 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
jdata->originator.vpid = sender->vpid;
/* get the parent's job object */
if (NULL == (parent = orte_get_job_data_object(sender->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto ANSWER_LAUNCH;
if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
/* if the prefix was set in the parent's job, we need to transfer
* that prefix to the child's app_context so any further launch of
* orteds can find the correct binary. There always has to be at
* least one app_context in both parent and child, so we don't
* need to check that here. However, be sure not to overwrite
* the prefix if the user already provided it!
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
prefix_dir = NULL;
if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
!orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
}
if (NULL != prefix_dir) {
free(prefix_dir);
}
}
/* if the prefix was set in the parent's job, we need to transfer
* that prefix to the child's app_context so any further launch of
* orteds can find the correct binary. There always has to be at
* least one app_context in both parent and child, so we don't
* need to check that here. However, be sure not to overwrite
* the prefix if the user already provided it!
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
prefix_dir = NULL;
if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
!orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
}
if (NULL != prefix_dir) {
free(prefix_dir);
}
/* if the user asked to forward any envars, cycle through the app contexts
* in the comm_spawn request and add them
*/
@ -210,18 +208,20 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
goto ANSWER_LAUNCH;
}
if( NULL == parent->bookmark ) {
/* find the sender's node in the job map */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
/* set the bookmark so the child starts from that place - this means
* that the first child process could be co-located with the proc
* that called comm_spawn, assuming slots remain on that node. Otherwise,
* the procs will start on the next available node
*/
jdata->bookmark = proc->node;
if (NULL != parent) {
if (NULL == parent->bookmark) {
/* find the sender's node in the job map */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
/* set the bookmark so the child starts from that place - this means
* that the first child process could be co-located with the proc
* that called comm_spawn, assuming slots remain on that node. Otherwise,
* the procs will start on the next available node
*/
jdata->bookmark = proc->node;
}
} else {
jdata->bookmark = parent->bookmark;
}
} else {
jdata->bookmark = parent->bookmark;
}
/* launch it */

Просмотреть файл

@ -270,7 +270,7 @@ int orte_rml_oob_send_buffer_nb(orte_process_name_t* peer,
return ORTE_ERR_BAD_PARAM;
}
if( NULL == peer ||
if (NULL == peer ||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
/* cannot send to an invalid peer */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);

Просмотреть файл

@ -71,6 +71,7 @@ orte_routed_module_t orte_routed_direct_module = {
#endif
};
static orte_process_name_t mylifeline;
static orte_process_name_t *lifeline = NULL;
static opal_list_t my_children;
@ -509,7 +510,12 @@ static bool route_is_defined(const orte_process_name_t *target)
/*
 * Record the process that acts as this process's lifeline.
 *
 * The name pointed to by proc is copied into the file-static
 * `mylifeline`, and the file-static `lifeline` pointer is aimed at that
 * private copy, so the caller's storage is not referenced after return.
 * Always returns ORTE_SUCCESS.
 *
 * NOTE(review): proc is dereferenced without a NULL check - confirm that
 * every caller passes a valid name.
 */
static int set_lifeline(orte_process_name_t *proc)
{
/* report the lifeline we are about to record (verbosity level 2) */
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed:direct: set lifeline to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* keep our own copy and publish it through the lifeline pointer */
mylifeline = *proc;
lifeline = &mylifeline;
return ORTE_SUCCESS;
}

34
orte/mca/state/dvm/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the dvm state component.
sources = state_dvm.h state_dvm_component.c state_dvm.c

# The component is built either as an installable DSO named
# mca_<type>_<name>.la or as a non-installed convenience library named
# libmca_<type>_<name>.la (static builds).  Exactly one of the two
# variables below is non-empty for a given configuration.
if MCA_BUILD_orte_state_dvm_DSO
component_install = mca_state_dvm.la
component_noinst =
else
component_install =
component_noinst = libmca_state_dvm.la
endif

# Static flavour: convenience library, never installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_dvm_la_SOURCES = $(sources)
libmca_state_dvm_la_LDFLAGS = -module -avoid-version

# DSO flavour: installed into the ORTE component directory.
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_dvm_la_SOURCES = $(sources)
mca_state_dvm_la_LDFLAGS = -module -avoid-version

498
orte/mca/state/dvm/state_dvm.c Обычный файл
Просмотреть файл

@ -0,0 +1,498 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_quit.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "orte/mca/state/base/state_private.h"
#include "state_dvm.h"
/*
* Module functions: Global
*/
/* module entry points placed in orte_state_dvm_module below */
static int init(void);
static int finalize(void);

/* local functions: job-state callbacks referenced by launch_callbacks[] */
static void vm_ready(int fd, short args, void *cbdata);
void check_complete(int fd, short args, void *cbdata);
/******************
* DVM module - used when mpirun is persistent
******************/
/*
 * Module function table for the DVM state component.
 *
 * Only the first two entries (init/finalize) are implemented in this
 * file; every other entry is the corresponding orte_state_base_*
 * function.  The initializer is positional, so the order of the entries
 * must follow the field order of orte_state_base_module_t (declared
 * elsewhere - not visible from this file).
 */
orte_state_base_module_t orte_state_dvm_module = {
/* module setup / teardown defined in this file */
init,
finalize,
/* job state machine operations - base implementations */
orte_state_base_activate_job_state,
orte_state_base_add_job_state,
orte_state_base_set_job_state_callback,
orte_state_base_set_job_state_priority,
orte_state_base_remove_job_state,
/* proc state machine operations - base implementations */
orte_state_base_activate_proc_state,
orte_state_base_add_proc_state,
orte_state_base_set_proc_state_callback,
orte_state_base_set_proc_state_priority,
orte_state_base_remove_proc_state
};
/* Default job state machine sequence - individual
 * plm's must add a state for launching daemons.
 *
 * This array is walked in lock-step with launch_callbacks[] by init():
 * entry i here is registered with launch_callbacks[i].  The two arrays
 * must therefore always have the same number of entries, in the same
 * order.
 */
static orte_job_state_t launch_states[] = {
ORTE_JOB_STATE_INIT,
ORTE_JOB_STATE_INIT_COMPLETE,
ORTE_JOB_STATE_ALLOCATE,
ORTE_JOB_STATE_ALLOCATION_COMPLETE,
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
ORTE_JOB_STATE_DAEMONS_REPORTED,
ORTE_JOB_STATE_VM_READY,
ORTE_JOB_STATE_MAP,
ORTE_JOB_STATE_MAP_COMPLETE,
ORTE_JOB_STATE_SYSTEM_PREP,
ORTE_JOB_STATE_LAUNCH_APPS,
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
ORTE_JOB_STATE_RUNNING,
ORTE_JOB_STATE_REGISTERED,
/* termination states */
ORTE_JOB_STATE_TERMINATED,
ORTE_JOB_STATE_NOTIFY_COMPLETED,
ORTE_JOB_STATE_ALL_JOBS_COMPLETE
};
/*
 * Callbacks for the job state machine.  Entry i is the handler that
 * init() registers for launch_states[i]; the comment above each entry
 * names the state it is paired with.  Keep both arrays the same length.
 */
static orte_state_cbfunc_t launch_callbacks[] = {
/* ORTE_JOB_STATE_INIT */
orte_plm_base_setup_job,
/* ORTE_JOB_STATE_INIT_COMPLETE */
orte_plm_base_setup_job_complete,
/* ORTE_JOB_STATE_ALLOCATE */
orte_ras_base_allocate,
/* ORTE_JOB_STATE_ALLOCATION_COMPLETE */
orte_plm_base_allocation_complete,
/* ORTE_JOB_STATE_DAEMONS_LAUNCHED */
orte_plm_base_daemons_launched,
/* ORTE_JOB_STATE_DAEMONS_REPORTED */
orte_plm_base_daemons_reported,
/* ORTE_JOB_STATE_VM_READY - DVM-specific handler defined in this file */
vm_ready,
/* ORTE_JOB_STATE_MAP */
orte_rmaps_base_map_job,
/* ORTE_JOB_STATE_MAP_COMPLETE */
orte_plm_base_mapping_complete,
/* ORTE_JOB_STATE_SYSTEM_PREP */
orte_plm_base_complete_setup,
/* ORTE_JOB_STATE_LAUNCH_APPS */
orte_plm_base_launch_apps,
/* ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE */
orte_state_base_local_launch_complete,
/* ORTE_JOB_STATE_RUNNING */
orte_plm_base_post_launch,
/* ORTE_JOB_STATE_REGISTERED */
orte_plm_base_registered,
/* ORTE_JOB_STATE_TERMINATED - DVM-specific handler defined in this file */
check_complete,
/* ORTE_JOB_STATE_NOTIFY_COMPLETED */
orte_state_base_cleanup_job,
/* ORTE_JOB_STATE_ALL_JOBS_COMPLETE */
orte_quit
};
/*
 * Process states tracked by this module.  init() registers entry i
 * together with proc_callbacks[i], so the two arrays must stay the
 * same length.
 */
static orte_proc_state_t proc_states[] = {
ORTE_PROC_STATE_RUNNING,
ORTE_PROC_STATE_REGISTERED,
ORTE_PROC_STATE_IOF_COMPLETE,
ORTE_PROC_STATE_WAITPID_FIRED,
ORTE_PROC_STATE_TERMINATED
};
/*
 * Callbacks for the proc state machine, paired by index with
 * proc_states[].  Every tracked proc state is routed to the same base
 * handler, orte_state_base_track_procs.
 */
static orte_state_cbfunc_t proc_callbacks[] = {
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs,
orte_state_base_track_procs
};
/*
 * Job-state callback that init() registers for
 * ORTE_JOB_STATE_FORCED_EXIT.
 *
 * fd and args are unused.  cbdata is the orte_state_caddy_t that
 * carried the state activation; it is released here.
 */
static void force_quit(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *req;

    req = (orte_state_caddy_t*)cbdata;

    /* ask the PLM to stop the orteds so they get a chance to exit */
    orte_plm.terminate_orteds();

    /* this callback owns the caddy - drop our reference */
    OBJ_RELEASE(req);
}
/************************
* API Definitions
************************/
static int init(void)
{
int i, rc;
int num_states;
/* setup the state machines */
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
/* setup the job state machine */
num_states = sizeof(launch_states) / sizeof(orte_job_state_t);
for (i=0; i < num_states; i++) {
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i],
launch_callbacks[i],
ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
}
/* add the termination response */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED,
orte_quit, ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add a default error response */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
force_quit, ORTE_ERROR_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add callback to report progress, if requested */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
orte_state_base_report_progress, ORTE_ERROR_PRI))) {
ORTE_ERROR_LOG(rc);
}
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
orte_state_base_print_job_state_machine();
}
/* populate the proc state machine to allow us to
* track proc lifecycle changes
*/
num_states = sizeof(proc_states) / sizeof(orte_proc_state_t);
for (i=0; i < num_states; i++) {
if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i],
proc_callbacks[i],
ORTE_SYS_PRI))) {
ORTE_ERROR_LOG(rc);
}
}
if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) {
orte_state_base_print_proc_state_machine();
}
return ORTE_SUCCESS;
}
/*
 * Module finalize: tear down the proc state machine.
 *
 * Every entry still on orte_proc_states is removed and released, then
 * the list itself is destructed.  Always returns ORTE_SUCCESS.
 *
 * NOTE(review): init() also constructs orte_job_states, which is not
 * drained or destructed here - confirm whether that is intentional
 * (e.g. the list is still needed after finalize) or a leak.
 */
static int finalize(void)
{
    opal_list_item_t *entry;

    /* cleanup the proc state machine */
    for (entry = opal_list_remove_first(&orte_proc_states);
         NULL != entry;
         entry = opal_list_remove_first(&orte_proc_states)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_proc_states);

    return ORTE_SUCCESS;
}
/*
 * Completion callback passed to orte_filem.preposition_files() by
 * vm_ready().
 *
 * status  result of the file prepositioning
 * cbdata  the orte_job_t whose files were being positioned
 *
 * On success the job is advanced to ORTE_JOB_STATE_MAP; on any other
 * status a forced termination is requested with that status.
 */
static void files_ready(int status, void *cbdata)
{
    orte_job_t *job = (orte_job_t*)cbdata;

    if (ORTE_SUCCESS == status) {
        /* files are in place - move on to mapping */
        ORTE_ACTIVATE_JOB_STATE(job, ORTE_JOB_STATE_MAP);
        return;
    }

    /* prepositioning failed - give up */
    ORTE_FORCED_TERMINATE(status);
}
static void vm_ready(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
/* if this is my job, then we are done */
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
/* notify that the vm is ready */
opal_output(0, "DVM ready");
OBJ_RELEASE(caddy);
return;
}
/* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
/* position any required files */
if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* cleanup */
OBJ_RELEASE(caddy);
}
/*
 * Job-state callback that init() registers for ORTE_JOB_STATE_TERMINATED.
 *
 * fd and args are unused; cbdata is the orte_state_caddy_t for the
 * activation and is released before returning on every path.
 *
 * Outline:
 *  1. If there is no job, or the job is this process's own job, only
 *     check whether the daemons are gone (no routes left) and, if so,
 *     activate ORTE_JOB_STATE_DAEMONS_TERMINATED.
 *  2. Otherwise mark the job terminated (without overriding an abnormal
 *     state), tell the IOF, and warn about non-zero exit codes.
 *  3. Unless the job is continuous/recoverable, release the procs and
 *     nodes recorded in the job's map.
 *  4. Scan all jobs.  When the checked job itself is found in
 *     ORTE_JOB_STATE_TERMINATED, ORTE_JOB_STATE_NOTIFY_COMPLETED is
 *     activated and one_still_alive is set, so the daemons are not
 *     ordered to terminate on this pass.
 *  5. Only if nothing was flagged alive is daemon termination ordered.
 */
void check_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;
    /* NOTE(review): never modified below, so the exit status update
     * in the non-zero-exit branch always passes 0 - confirm intent */
    orte_vpid_t lowest=0;
    int32_t i32, *i32ptr;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:base:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_DAEMONS;
    } else {
        /* mark the job as terminated, but don't override any
         * abnormal termination flags
         */
        if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
            jdata->state = ORTE_JOB_STATE_TERMINATED;
        }
    }

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    /* orte_get_attribute writes the count through i32ptr into i32 */
    i32ptr = &i32;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) {
        if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* update the exit code */
            ORTE_UPDATE_EXIT_STATUS(lowest);
        }

        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "While %s job %s terminated normally, %d %s. Further examination may be required.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    i32, (1 == i32) ? "process returned\na non-zero exit code." :
                    "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jdata->state)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here
     */
    if (NULL != jdata &&
        (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
        goto CHECK_ALIVE;
    }

    /* if the job that is being checked is the HNP, then we are
     * trying to terminate the orteds. In that situation, we
     * do -not- check all jobs - we simply notify the HNP
     * that the orteds are complete. Also check special case
     * if jdata is NULL - we want
     * to definitely declare the job done if the orteds
     * have completed, no matter what else may be happening.
     * This can happen if a ctrl-c hits in the "wrong" place
     * while launching
     */
 CHECK_DAEMONS:
    if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        if (0 == orte_routed.num_routes()) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job
     */
    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s releasing procs from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                node->slots_inuse--;
                node->num_procs--;
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* flag that the node is no longer in a map - this must be
             * done BEFORE dropping our reference, as the release may
             * be the last one and the node must not be touched after
             */
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            /* maintain accounting */
            OBJ_RELEASE(node);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

 CHECK_ALIVE:
    /* now check to see if all jobs are done - trigger notification of this jdata
     * object when we find it
     */
    one_still_alive = false;
    for (j=1; j < orte_job_data->size; j++) {
        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
            /* since we are releasing jdata objects as we
             * go, we can no longer assume that the job_data
             * array is left justified
             */
            continue;
        }
        /* if this is the job we are checking AND it normally terminated,
         * then activate the "notify_completed" state - this will release
         * the job state, but is provided so that the HNP main code can
         * take alternative actions if desired. If the state is killed_by_cmd,
         * then go ahead and release it. We cannot release it if it
         * abnormally terminated as mpirun needs the info so it can
         * report appropriately to the user
         *
         * NOTE: do not release the primary job (j=1) so we
         * can pretty-print completion message
         */
        if (NULL != jdata && job->jobid == jdata->jobid) {
            if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is terminated - activating notify",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
                /* forces the early return below, so daemon termination
                 * is not ordered on this pass */
                one_still_alive = true;
            } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
                       jdata->state == ORTE_JOB_STATE_NOTIFIED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is killed or notified - cleaning up",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained!
                 */
                if (1 < j) {
                    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                        /* this was a debugger daemon. notify that a debugger has detached */
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
                    }
                    opal_pointer_array_set_item(orte_job_data, j, NULL);  /* ensure the array has a NULL */
                    OBJ_RELEASE(jdata);
                }
            }
            continue;
        }
        /* if the job is flagged to not be monitored, skip it */
        if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
            continue;
        }
        /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated!
         */
        if (job->num_terminated < job->num_procs) {
            /* we have at least one job that is not done yet - we cannot
             * just return, though, as we need to ensure we cleanout the
             * job data for the job that just completed
             */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is not terminated (%d:%d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs));
            one_still_alive = true;
        }
        else {
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs,
                                 (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
        }
    }
    /* if a job is still alive, we just return */
    if (one_still_alive) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_completed at least one job is not terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }
    /* if we get here, then all jobs are done, so terminate */
    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed all jobs terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* stop the job timeout event, if set */
    if (NULL != orte_mpiexec_timeout) {
        OBJ_RELEASE(orte_mpiexec_timeout);
        orte_mpiexec_timeout = NULL;
    }

    /* set the exit status to 0 - this will only happen if it
     * wasn't already set by an error condition
     */
    ORTE_UPDATE_EXIT_STATUS(0);

    /* order daemon termination - this tells us to cleanup
     * our local procs as well as telling remote daemons
     * to die
     */
    orte_plm.terminate_orteds();

    OBJ_RELEASE(caddy);
}

35
orte/mca/state/dvm/state_dvm.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Public declarations for the "dvm" state component: the component
* descriptor (mca_state_dvm_component) and the state module
* (orte_state_dvm_module) that the component's query function returns.
*/
#ifndef MCA_STATE_DVM_EXPORT_H
#define MCA_STATE_DVM_EXPORT_H
#include "orte_config.h"
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/* Component descriptor; defined in state_dvm_component.c */
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dvm_component;
/* State module handed back by the component query function */
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dvm_module;
END_C_DECLS
#endif /* MCA_STATE_DVM_EXPORT_H */

76
orte/mca/state/dvm/state_dvm_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,76 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "state_dvm.h"
/*
* Public string for version number
*/
const char *orte_state_dvm_component_version_string =
"ORTE STATE dvm MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int state_dvm_open(void);
static int state_dvm_close(void);
static int state_dvm_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/* Component descriptor for the "dvm" state component: names the
* component, records the ORTE version it was built against, and wires
* in the open/close/query entry points defined in this file.
*/
orte_state_base_component_t mca_state_dvm_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component
*/
{
ORTE_STATE_BASE_VERSION_1_0_0,
/* Component name and version */
"dvm",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
state_dvm_open,
state_dvm_close,
/* Query function: reports our priority and returns our module */
state_dvm_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/* Component open hook: nothing to set up, always returns ORTE_SUCCESS. */
static int state_dvm_open(void)
{
return ORTE_SUCCESS;
}
/* Component close hook: nothing to tear down, always returns ORTE_SUCCESS. */
static int state_dvm_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query hook.
 *
 * Hands back the dvm state module together with a priority of 0.
 * The priority is deliberately the lowest possible: this component is
 * meant to be chosen only when it is requested explicitly through the
 * environment, never by winning a priority contest.
 *
 * @param module   [out] receives a pointer to orte_state_dvm_module
 * @param priority [out] receives 0
 * @return ORTE_SUCCESS always
 */
static int state_dvm_component_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_state_dvm_module;
    *priority = 0;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -13,7 +13,7 @@
# Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -35,7 +35,9 @@ SUBDIRS += \
tools/orte-top \
tools/orte-info \
tools/orte-migrate \
tools/orte-server
tools/orte-server \
tools/orte-submit \
tools/orte-dvm
DIST_SUBDIRS += \
tools/orte-checkpoint \
@ -48,5 +50,7 @@ DIST_SUBDIRS += \
tools/orte-top \
tools/orte-info \
tools/orte-migrate \
tools/orte-server
tools/orte-server \
tools/orte-submit \
tools/orte-dvm

57
orte/tools/orte-dvm/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is not quite in the Automake spirit, but we have to do it.
# Since the totalview portion of the library must be built with -g, we
# must eliminate the CFLAGS that are passed in here by default (which
# may already have debugging and/or optimization flags). We use
# post-processed forms of the CFLAGS in the library targets down
# below.
# NOTE(review): the only source listed below is orte-dvm.c; confirm
# whether this tool really contains debugger-related code that needs
# the CFLAGS override, or whether this rationale was inherited from
# another tool's Makefile.am.
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
# Shared project make rules (presumably including the rule that turns
# a .1in template into a .1 man page -- confirm).
include $(top_srcdir)/Makefile.ompi-rules
# Generated man page for this tool.
man_pages = orte-dvm.1
# Ship the .1in template (orte-dvm.1in) in the distribution tarball.
EXTRA_DIST = $(man_pages:.1=.1in)
# The binary and its man page are only built/installed when
# OPAL_INSTALL_BINARIES is enabled.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-dvm
# nodist_: the .1 file is generated, so it is installed but not distributed.
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
endif # OPAL_INSTALL_BINARIES
orte_dvm_SOURCES = \
orte-dvm.c
# Link against the ORTE and OPAL libraries from the build tree.
orte_dvm_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
# Remove the generated man page on "make distclean".
distclean-local:
rm -f $(man_pages)

193
orte/tools/orte-dvm/orte-dvm.1in Обычный файл
Просмотреть файл

@ -0,0 +1,193 @@
.\" -*- nroff -*-
.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
.\" Copyright (c) 2015 Intel, Inc. All rights reserved
.\" $COPYRIGHT$
.\"
.\" Man page for ORTE's orte-dvm command
.\"
.\" .TH name section center-footer left-footer center-header
.TH ORTE-DVM 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
orte-dvm, ompi-dvm \- Establish a Distributed Virtual Machine (DVM).
.B Note:
\fIorte-dvm\fP and \fIompi-dvm\fP are synonyms for each
other. Using either of the names will produce the same behavior.
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.PP
.B orte-dvm
[ options ]
.P
Invoking \fIorte-dvm\fP via an absolute path
name is equivalent to specifying the \fI--prefix\fP option with a
\fI<dir>\fR value equivalent to the directory where \fIorte-dvm\fR
resides, minus its last subdirectory. For example:
\fB%\fP /usr/local/bin/orte-dvm ...
is equivalent to
\fB%\fP orte-dvm --prefix /usr/local
.
.\" **************************
.\" Quick Summary Section
.\" **************************
.SH QUICK SUMMARY
.
\fIorte-dvm\fP will establish a DVM that can be used to execute subsequent
applications. Use of \fIorte-dvm\fP can be advantageous, for example, when you want to
execute a number of short-lived tasks. In such cases, the time required to start
the ORTE DVM can be a significant fraction of the time to execute the
overall application. Thus, creating a persistent DVM can speed the overall
execution. In addition, a persistent DVM will support executing multiple parallel
applications while maintaining separation between their respective cores.
.\" **************************
.\" Options Section
.\" **************************
.SH OPTIONS
.
.\"
.\" Start options listing
.\" Indent 10 characters from start of first column to start of second column
.
.TP
.B -h\fR,\fP --help
Display help for this command
.
.
.TP
.B -V\fR,\fP --version
Print version number. If no other arguments are given, this will also
cause orte-dvm to exit.
.
.
.P
Use one of the following options to specify which hosts (nodes) of the cluster to use
for the DVM.
.
.
.TP
.B -H\fR,\fP -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP
List of hosts for the DVM.
.
.
.TP
.B
-hostfile\fR,\fP --hostfile \fR<hostfile>\fP
Provide a hostfile to use.
.
.
.TP
.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP
Synonym for \fI-hostfile\fP.
.
.
.TP
.B --prefix \fR<dir>\fP
Prefix directory that will be used to set the \fIPATH\fR and
\fILD_LIBRARY_PATH\fR on the remote node before invoking the ORTE daemon.
.
.
.P
Setting MCA parameters:
.
.
.TP
.B -gmca\fR,\fP --gmca \fR<key> <value>\fP
Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is
the parameter name; \fI<value>\fP is the parameter value.
.
.
.TP
.B -mca\fR,\fP --mca <key> <value>
Send arguments to various MCA modules. See the "MCA" section, below.
.
.
.
.
.TP
.B -report-uri\fR,\fP --report-uri <channel>
Print out orte-dvm's URI during startup. The channel must be either a '-' to indicate that
the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr,
or a filename to which the URI is to be written.
.
.
.P
The following options are useful for developers; they are not generally
useful to most ORTE and/or MPI users:
.
.TP
.B -d\fR,\fP --debug-devel
Enable debugging of the ORTE layer.
.
.
.TP
.B --debug-daemons-file
Enable debugging of the ORTE daemons in the DVM, storing
output in files.
.
.
.P
There may be other options listed with \fIorte-dvm --help\fP.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
\fIorte-dvm\fP starts a Distributed Virtual Machine (DVM) by launching
a daemon on each node of the allocation, as modified or specified by
the \fI-host\fP and \fI-hostfile\fP options. Applications can subsequently
be executed using the \fIorte-submit\fP command.
.
The DVM remains in operation until receiving the \fIorte-submit -terminate\fP
command.
.
.
.
.SS Specifying Host Nodes
.
Host nodes can be identified on the \fIorte-dvm\fP command line with the \fI-host\fP
option or in a hostfile.
.
.PP
For example,
.
.TP 4
orte-dvm -H aa,bb
starts a DVM with one daemon on node aa and one on node bb.
.
.PP
Or, consider the hostfile
.
\fB%\fP cat myhostfile
aa slots=2
bb slots=2
cc slots=2
.
.PP
Here, we list not only the host names (aa, bb, and cc) but also how many "slots"
there are for each. Slots indicate how many processes can potentially execute
on a node. For best performance, the number of slots may be chosen to be the
number of cores on the node or the number of processor sockets. If the hostfile
does not provide slots information, a default of 1 is assumed.
When running under resource managers (e.g., SLURM, Torque, etc.),
Open MPI will obtain both the hostnames and the number of slots directly
from the resource manager.
.
.

446
orte/tools/orte-dvm/orte-dvm.c Обычный файл
Просмотреть файл

@ -0,0 +1,446 @@
/* -*- C -*-
*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <errno.h>
#include <signal.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <fcntl.h>
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#include "opal/mca/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
#include "opal/util/cmd_line.h"
#include "opal/util/opal_environ.h"
#include "opal/util/opal_getcwd.h"
#include "opal/util/show_help.h"
#include "opal/util/fd.h"
#include "opal/version.h"
#include "opal/runtime/opal.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/orted/orted.h"
/*
* Globals
*/
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
/*
* Globals
*/
/* Values filled in from the command line (see cmd_line_init) plus the
* executable's basename; zeroed at the top of main().
*/
static struct {
bool help; /* -h / --help was given */
bool version; /* -V / --version was given */
char *report_uri; /* --report-uri target: "-", "+", or a filename */
char *basename; /* opal_basename(argv[0]), used in messages */
char *prefix; /* --prefix value */
} myglobals;
/* Options understood by this tool itself; each entry stores its value
* directly into the matching myglobals field. The table is terminated
* by an all-NULL entry of type OPAL_CMD_LINE_TYPE_NULL.
*/
static opal_cmd_line_init_t cmd_line_init[] = {
/* Various "obvious" options */
{ NULL, 'h', NULL, "help", 0,
&myglobals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
{ NULL, 'V', NULL, "version", 0,
&myglobals.version, OPAL_CMD_LINE_TYPE_BOOL,
"Print version and exit" },
/* takes one argument: where to print the URI */
{ NULL, '\0', "report-uri", "report-uri", 1,
&myglobals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
/* takes one argument: the installation prefix */
{ NULL, '\0', "prefix", "prefix", 1,
&myglobals.prefix, OPAL_CMD_LINE_TYPE_STRING,
"Prefix to be used to look for ORTE executables" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
static void notify_requestor(int sd, short args, void *cbdata);
/*
 * Entry point for orte-dvm: start a persistent Distributed Virtual
 * Machine and serve it until the event base is deactivated.
 *
 * Steps:
 *   1. parse the command line and process MCA parameters;
 *   2. handle --version / --help and exit;
 *   3. initialize ORTE as the HNP with the "dvm" state machine selected;
 *   4. report our contact URI (stdout, stderr, or a file);
 *   5. copy --prefix / --hostfile / --machinefile / --host onto the
 *      daemon app context;
 *   6. register the daemon command receive and the job-completion
 *      notifier, then activate the daemon job at the ALLOCATE state;
 *   7. run the event loop, finalize, and exit with orte_exit_status.
 *
 * Returns/exits non-zero on any setup failure.
 */
int main(int argc, char *argv[])
{
    int rc, i, j;
    opal_cmd_line_t cmd_line;
    char *param, *value;
    orte_job_t *jdata=NULL;
    orte_app_context_t *app;
    char *uri;

    /* Setup and parse the command line */
    memset(&myglobals, 0, sizeof(myglobals));
    /* find our basename (the name of the executable) so that we can
       use it in pretty-print error messages */
    myglobals.basename = opal_basename(argv[0]);

    opal_cmd_line_create(&cmd_line, cmd_line_init);
    mca_base_cmd_line_setup(&cmd_line);
    if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true,
                                                  argc, argv)) ) {
        if (OPAL_ERR_SILENT != rc) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(rc));
        }
        return rc;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     * NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
        exit(1);
    }

    /* Need to initialize OPAL so that install_dirs are filled in */
    if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
        exit(1);
    }

    /* Check for some "global" command line params */
    /* print version if requested.  Do this before check for help so
       that --version --help works as one might expect. */
    if (myglobals.version) {
        char *str;
        char *project_name = NULL;

        if (0 == strcmp(myglobals.basename, "ompi-dvm")) {
            project_name = "Open MPI";
        } else {
            project_name = "OpenRTE";
        }
        str = opal_show_help_string("help-orterun.txt", "orterun:version",
                                    false,
                                    myglobals.basename, project_name, OPAL_VERSION,
                                    PACKAGE_BUGREPORT);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        exit(0);
    }

    /* Check for help request */
    if (myglobals.help) {
        char *str, *args = NULL;
        char *project_name = NULL;

        /* use the same alias test as the --version branch above so that
         * both report the same project name for the same executable */
        if (0 == strcmp(myglobals.basename, "ompi-dvm")) {
            project_name = "Open MPI";
        } else {
            project_name = "OpenRTE";
        }
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
                                    myglobals.basename, project_name, OPAL_VERSION,
                                    myglobals.basename, args,
                                    PACKAGE_BUGREPORT);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);

        /* If someone asks for help, that should be all we do */
        exit(0);
    }

    /* flag that I am the HNP */
    orte_process_info.proc_type = ORTE_PROC_HNP;

    /* Setup MCA params */
    orte_register_params();

    /* specify the DVM state machine */
    putenv("OMPI_MCA_state=dvm");

    /* Initialize our Open RTE environment */
    if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
        /* cannot call ORTE_ERROR_LOG as it could be the errmgr
         * never got loaded!
         */
        return rc;
    }
    /* finalize OPAL. As it was opened again from orte_init->opal_init
     * we continue to have a reference count on it. So we have to finalize it twice...
     */
    opal_finalize();

    /* check for request to report uri */
    uri = orte_rml.get_contact_info();
    if (NULL != myglobals.report_uri) {
        FILE *fp;
        if (0 == strcmp(myglobals.report_uri, "-")) {
            /* if '-', then output to stdout */
            printf("VMURI: %s\n", uri);
            /* we are about to sit in the event loop indefinitely, so
             * push the URI out now in case stdout is not a terminal */
            fflush(stdout);
        } else if (0 == strcmp(myglobals.report_uri, "+")) {
            /* if '+', output to stderr */
            fprintf(stderr, "VMURI: %s\n", uri);
        } else {
            fp = fopen(myglobals.report_uri, "w");
            if (NULL == fp) {
                /* the thing we failed to write is the URI */
                orte_show_help("help-orterun.txt", "orterun:write_file", false,
                               myglobals.basename, "uri", myglobals.report_uri);
                free(uri);
                exit(1);
            }
            fprintf(fp, "%s\n", uri);
            fclose(fp);
        }
    } else {
        printf("VMURI: %s\n", uri);
        /* see above: make sure a piped reader sees it immediately */
        fflush(stdout);
    }
    /* the contact string is ours to free regardless of where it went */
    free(uri);

    /* get the daemon job object - was created by ess/hnp component */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        orte_show_help("help-orterun.txt", "bad-job-object", true,
                       myglobals.basename);
        exit(1);
    }
    /* also should have created a daemon "app" */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        orte_show_help("help-orterun.txt", "bad-app-object", true,
                       myglobals.basename);
        exit(1);
    }

    /* Did the user specify a prefix, or want prefix by default? */
    if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
        size_t param_len;
        char *given;

        /* every branch below leaves param either NULL or pointing at a
         * private heap copy, so the single free() at the end is safe */
        param = NULL;

        /* if both the prefix was given and we have a prefix
         * given above, check to see if they match
         */
        if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
            NULL != myglobals.prefix) {
            /* if they don't match, then that merits a warning */
            given = opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0);
            if (NULL == given) {
                given = myglobals.prefix;
            }
            param = strdup(given);
            /* ensure we strip any trailing '/' - but never index
             * before the start of an empty string */
            param_len = strlen(param);
            if (0 < param_len &&
                0 == strcmp(OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
            }
            value = strdup(myglobals.prefix);
            param_len = strlen(value);
            if (0 < param_len &&
                0 == strcmp(OPAL_PATH_SEP, &(value[param_len-1]))) {
                value[param_len-1] = '\0';
            }
            if (0 != strcmp(param, value)) {
                orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
                               true, myglobals.basename, value, param);
                /* let the global-level prefix take precedence since we
                 * know that one is being used
                 */
                free(param);
                param = strdup(myglobals.prefix);
            }
            free(value);
        } else if (NULL != myglobals.prefix) {
            param = strdup(myglobals.prefix);
        } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
            /* must be --prefix alone */
            given = opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0);
            if (NULL != given) {
                param = strdup(given);
            }
        } else {
            /* --enable-orterun-prefix-default was given to orterun */
            if (NULL != opal_install_dirs.prefix) {
                param = strdup(opal_install_dirs.prefix);
            }
        }

        if (NULL != param) {
            /* "Parse" the param, aka remove superfluous path_sep. */
            param_len = strlen(param);
            while (0 < param_len &&
                   0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
                param_len--;
            }
            /* nothing left (or nothing given): refuse an empty prefix */
            if (0 == param_len) {
                orte_show_help("help-orterun.txt", "orterun:empty-prefix",
                               true, myglobals.basename, myglobals.basename);
                free(param);
                return ORTE_ERR_FATAL;
            }
            orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
            free(param);
        }
    }

    /* Did the user specify a hostfile. Need to check for both
     * hostfile and machine file.
     * We can only deal with one hostfile per app context, otherwise give an error.
     */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
        if(1 < j) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
        if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }

    /* Did the user specify any hosts? */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
        char **targ=NULL, *tval;

        /* collect every -host instance and join them into one
         * comma-separated list */
        for (i = 0; i < j; ++i) {
            value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
            opal_argv_append_nosize(&targ, value);
        }
        tval = opal_argv_join(targ, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
        opal_argv_free(targ);
        free(tval);
    }
    OBJ_DESTRUCT(&cmd_line);

    /* setup to listen for commands sent specifically to me, even though I would probably
     * be the one sending them! Unfortunately, since I am a participating daemon,
     * there are times I need to send a command to "all daemons", and that means *I* have
     * to receive it too
     */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* override the notify_completed state so we can send a message
     * back to anyone who submits a job to us telling them the job
     * completed */
    if (ORTE_SUCCESS != (rc = orte_state.set_job_state_callback(ORTE_JOB_STATE_NOTIFY_COMPLETED, notify_requestor))) {
        ORTE_ERROR_LOG(rc);
        ORTE_UPDATE_EXIT_STATUS(rc);
        exit(orte_exit_status);
    }

    /* spawn the DVM - we skip the initial steps as this
     * isn't a user-level application */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* cleanup and leave */
    orte_finalize();
    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
/*
 * Completion callback for the "job done" message sent by
 * notify_requestor().
 *
 * Releases the message buffer, drops the finished job from the global
 * orte_job_data array, and releases the job object that was passed in
 * as cbdata. The status, peer and tag arguments are not examined.
 */
static void send_callback(int status, orte_process_name_t *peer,
                          opal_buffer_t* buffer, orte_rml_tag_t tag,
                          void* cbdata)
{
    orte_job_t *finished = (orte_job_t*) cbdata;

    /* the reply has been handed off - nothing more to do with it */
    OBJ_RELEASE(buffer);

    /* forget the job: clear its slot in the job array, then drop it */
    opal_pointer_array_set_item(orte_job_data,
                                ORTE_LOCAL_JOBID(finished->jobid),
                                NULL);
    OBJ_RELEASE(finished);
}
static void notify_requestor(int sd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_proc_t *pptr;
int ret;
opal_buffer_t *reply;
opal_output(0, "%s dvm: job %s has completed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));
/* notify the requestor */
reply = OBJ_NEW(opal_buffer_t);
/* see if there was any problem */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) {
ret = pptr->exit_code;
} else {
ret = 0;
}
opal_dss.pack(reply, &ret, 1, OPAL_INT);
orte_rml.send_buffer_nb(&jdata->originator, reply, ORTE_RML_TAG_TOOL, send_callback, jdata);
/* we cannot cleanup the job object as we might
* hit an error during transmission, so clean it
* up in the send callback */
OBJ_RELEASE(caddy);
}

57
orte/tools/orte-submit/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is not quite in the Automake spirit, but we have to do it.
# Since the totalview portion of the library must be built with -g, we
# must eliminate the CFLAGS that are passed in here by default (which
# may already have debugging and/or optimization flags). We use
# post-processed forms of the CFLAGS in the library targets down
# below.
# NOTE(review): the only source listed below is orte-submit.c; confirm
# whether this tool contains debugger-related code that needs the
# CFLAGS override, or whether this rationale was inherited from
# another tool's Makefile.am.
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
# Shared project make rules (presumably including the rule that turns
# a .1in template into a .1 man page -- confirm).
include $(top_srcdir)/Makefile.ompi-rules
# Generated man page for this tool.
man_pages = orte-submit.1
# Ship the .1in template (orte-submit.1in) in the distribution tarball.
EXTRA_DIST = $(man_pages:.1=.1in)
# The binary and its man page are only built/installed when
# OPAL_INSTALL_BINARIES is enabled.
if OPAL_INSTALL_BINARIES
bin_PROGRAMS = orte-submit
# nodist_: the .1 file is generated, so it is installed but not distributed.
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
endif # OPAL_INSTALL_BINARIES
orte_submit_SOURCES = \
orte-submit.c
# Link against the ORTE and OPAL libraries from the build tree.
orte_submit_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
# Remove the generated man page on "make distclean".
distclean-local:
rm -f $(man_pages)

1430
orte/tools/orte-submit/orte-submit.1in Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

1468
orte/tools/orte-submit/orte-submit.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -11,6 +11,7 @@
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

Просмотреть файл

@ -549,6 +549,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
&orterun_globals.personality, OPAL_CMD_LINE_TYPE_STRING,
"Programming model/language being used (default=\"ompi\")" },
{ NULL, '\0', "dvm", "dvm", 0,
&orterun_globals.dvm, OPAL_CMD_LINE_TYPE_BOOL,
"Programming model/language being used (default=\"ompi\")" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -1131,6 +1135,7 @@ static int init_globals(void)
orterun_globals.index_argv = false;
orterun_globals.run_as_root = false;
orterun_globals.personality = NULL;
orterun_globals.dvm = false;
}
/* Reset the other fields every time */

Просмотреть файл

@ -65,6 +65,7 @@ struct orterun_globals_t {
bool index_argv;
bool run_as_root;
char *personality;
bool dvm;
};
/**