diff --git a/.gitignore b/.gitignore index d79ef330c4..ec6546a8de 100644 --- a/.gitignore +++ b/.gitignore @@ -476,6 +476,9 @@ orte/tools/orte-checkpoint/orte-checkpoint.1 orte/tools/orte-checkpoint/ompi-checkpoint.1 orte/tools/orte-clean/orte-clean orte/tools/orte-clean/orte-clean.1 +orte/tools/orte-dvm/orte-dvm +orte/tools/orte-dvm/orte-dvm.1 +ompi/mca/rte/orte/ompi-dvm.1 orte/tools/orte-info/orte-info orte/tools/orte-info/orte-info.1 orte/tools/orte-migrate/orte-migrate @@ -488,6 +491,9 @@ orte/tools/orte-restart/orte-restart.1 orte/tools/orte-restart/ompi-restart.1 orte/tools/orte-server/orte-server orte/tools/orte-server/orte-server.1 +orte/tools/orte-submit/orte-submit +orte/tools/orte-submit/orte-submit.1 +ompi/mca/rte/orte/ompi-submit.1 orte/tools/orte-top/orte-top orte/tools/orte-top/orte-top.1 orte/tools/orted/orted diff --git a/config/orte_config_files.m4 b/config/orte_config_files.m4 index a0e87b174b..dc87637d34 100644 --- a/config/orte_config_files.m4 +++ b/config/orte_config_files.m4 @@ -5,7 +5,8 @@ # University Research and Technology # Corporation. All rights reserved. # Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights -# reserved. +# reserved. +# Copyright (c) 2015 Intel, Inc. 
All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -32,5 +33,7 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile orte/tools/orte-server/Makefile + orte/tools/orte-submit/Makefile + orte/tools/orte-dvm/Makefile ]) ]) diff --git a/ompi/mca/rte/orte/Makefile.am b/ompi/mca/rte/orte/Makefile.am index a5f57b1e48..f5c1f210a5 100644 --- a/ompi/mca/rte/orte/Makefile.am +++ b/ompi/mca/rte/orte/Makefile.am @@ -27,7 +27,7 @@ libmca_rte_orte_la_SOURCES =$(sources) $(headers) libmca_rte_orte_la_LDFLAGS = -module -avoid-version libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la -man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 +man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 ompi-submit.1 if WANT_FT man_pages += ompi-checkpoint.1 ompi-restart.1 @@ -43,6 +43,8 @@ install-exec-hook: (cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT)) + (cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT)) + (cd $(DESTDIR)$(bindir); rm -f ompi-submit$(EXEEXT); $(LN_S) orte-submit$(EXEEXT) ompi-submit$(EXEEXT)) if WANT_FT (cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT)) @@ -55,7 +57,9 @@ uninstall-local: $(DESTDIR)$(bindir)/ompi-ps$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \ - $(DESTDIR)$(bindir)/ompi-server$(EXEEXT) + $(DESTDIR)$(bindir)/ompi-server$(EXEEXT) \ + $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) \ + 
$(DESTDIR)$(bindir)/ompi-submit$(EXEEXT) if WANT_FT rm -f $(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-restart$(EXEEXT) \ @@ -115,5 +119,11 @@ $(top_builddir)/orte/tools/orte-server/orte-server.1: ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1 cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1 +ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 + cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1 + +ompi-submit.1: $(top_builddir)/orte/tools/orte-submit/orte-submit.1 + cp -f $(top_builddir)/orte/tools/orte-submit/orte-submit.1 ompi-submit.1 + clean-local: rm -f $(man_pages) diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index a203f13699..571b54ee5d 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Hochschule Esslingen. All rights reserved. 
* * $COPYRIGHT$ @@ -36,19 +36,22 @@ #include "opal/mca/event/event.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_cr.h" +#include "opal/runtime/opal_progress_threads.h" #include "opal/util/arch.h" #include "opal/util/proc.h" #include "orte/mca/oob/base/base.h" +#include "orte/mca/plm/base/base.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/routed/base/base.h" -#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" #include "orte/mca/iof/base/base.h" #include "orte/mca/state/base/base.h" #if OPAL_ENABLE_FT_CR == 1 #include "orte/mca/snapc/base/base.h" #include "orte/mca/sstore/base/base.h" #endif +#include "orte/mca/schizo/base/base.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" @@ -59,6 +62,8 @@ #include "orte/mca/ess/base/base.h" +static bool progress_thread_running = false; + int orte_ess_base_tool_setup(void) { int ret; @@ -79,6 +84,9 @@ int orte_ess_base_tool_setup(void) * so it will do the right things. 
*/ orte_process_info.proc_type |= ORTE_PROC_NON_MPI; + /* get a separate orte event base */ + orte_event_base = opal_start_progress_thread("orte", true); + progress_thread_running = true; } /* open and setup the state machine */ @@ -93,6 +101,18 @@ int orte_ess_base_tool_setup(void) goto error; } + /* open and setup the error manager */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr_base_select"; + goto error; + } + /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -177,6 +197,15 @@ int orte_ess_base_tool_setup(void) error = "orte_iof_base_select"; goto error; } + /* if we were given an HNP, then also setup the PLM in case this + * tool wants to request that we spawn something for it */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_plm_base_open"; + goto error; + } + /* we don't select the plm framework as we only want the + * base proxy functions */ } #if OPAL_ENABLE_FT_CR == 1 @@ -208,7 +237,19 @@ int orte_ess_base_tool_setup(void) /* Tools do not need all the OPAL CR stuff */ opal_cr_set_enabled(false); #endif - + + /* setup schizo in case we are parsing cmd lines */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_schizo_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_schizo_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_schizo_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_schizo_base_select"; + goto error; + } + return ORTE_SUCCESS; error: @@ -237,6 +278,13 @@ int orte_ess_base_tool_finalize(void) } (void) mca_base_framework_close(&orte_routed_base_framework); (void) 
mca_base_framework_close(&orte_rml_base_framework); + (void) mca_base_framework_close(&orte_schizo_base_framework); + (void) mca_base_framework_close(&orte_errmgr_base_framework); + /* release the event base */ + if (progress_thread_running) { + opal_stop_progress_thread("orte", true); + progress_thread_running = false; + } return ORTE_SUCCESS; } diff --git a/orte/mca/ess/env/ess_env_component.c b/orte/mca/ess/env/ess_env_component.c index 547dfb2d3d..3dc05c6b36 100644 --- a/orte/mca/ess/env/ess_env_component.c +++ b/orte/mca/ess/env/ess_env_component.c @@ -66,21 +66,11 @@ orte_ess_env_component_open(void) int orte_ess_env_component_query(mca_base_module_t **module, int *priority) { - /* we are the env module, so set the priority to - * be higher than the tool component so that a - * tool launched as a distributed set of procs - * (i.e., a "tool with name") will select this - * module, but low enough that any other environment - * will override us - */ - - /* if we don't have a path back to the HNP, then we - * were not launched by mpirun, so don't pick us as - * it would be impossible for the correct env vars - * to have been set! 
- */ - if (NULL != orte_process_info.my_hnp_uri) { - *priority = 20; + /* we are the env module, only used by daemons that are + * launched by ssh so allow any enviro-specific modules + * to override us */ + if (ORTE_PROC_IS_DAEMON) { + *priority = 1; *module = (mca_base_module_t *)&orte_ess_env_module; return ORTE_SUCCESS; } diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index c0fdd96f04..534ae40fdb 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -112,64 +112,22 @@ static int rte_init(void) /* if I am a daemon, complete my setup using the * default procedure */ - if (ORTE_PROC_IS_DAEMON) { - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { - error = "orte_regex_extract_node_names"; - goto error; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_orted_setup"; + if (NULL != orte_node_regex) { + /* extract the nodes */ + if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { + error = "orte_regex_extract_node_names"; goto error; } - opal_argv_free(hosts); - return ORTE_SUCCESS; } - - if (ORTE_PROC_IS_TOOL) { - /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_tool_setup"; - goto error; - } - return ORTE_SUCCESS; - - } - - /* use the default procedure to finish my setup */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(true))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; + error = "orte_ess_base_orted_setup"; goto error; } - - /* setup process binding */ - if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { - error = "proc_binding"; - goto error; - } - - /* if we are an ORTE app - and not an
MPI app - then - * we need to exchange our connection info here. - * MPI_Init has its own modex, so we don't need to do - * two of them. However, if we don't do a modex at all, - * then processes have no way to communicate - * - * NOTE: only do this when the process originally launches. - * Cannot do this on a restart as the rest of the processes - * in the job won't be executing this step, so we would hang - */ - if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { - opal_pmix.fence(NULL, 0); - } - + opal_argv_free(hosts); return ORTE_SUCCESS; -error: + error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", @@ -183,29 +141,10 @@ static int rte_finalize(void) { int ret; - /* if I am a daemon, finalize using the default procedure */ - if (ORTE_PROC_IS_DAEMON) { - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { - ORTE_ERROR_LOG(ret); - } - return ret; - } else if (ORTE_PROC_IS_TOOL) { - /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { - ORTE_ERROR_LOG(ret); - } - /* as a tool, I didn't create a nidmap - so just return now */ - return ret; - } - - /* otherwise, I must be an application process - * use the default procedure to finish - */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { ORTE_ERROR_LOG(ret); } - - return ORTE_SUCCESS; + return ret; } static int env_set_name(void) diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index 5949e18e76..035f0b8ee0 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -19,6 +20,7 @@ #include "orte_config.h" #include "orte/constants.h" +#include "opal/hash_string.h" #include #include @@ -57,28 +59,50 @@ static int rte_init(void) { int ret; char *error = NULL; - + orte_jobid_t jobid; + orte_vpid_t vpid; + /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } - /* If we are a tool with no name, then responsibility for - * defining the name falls to the PLM component for our - * respective environment. - * Just call the base function for this. - * - * NOTE: Tools with names - i.e., tools consisting of a - * distributed set of processes - will select and use - * the appropriate enviro-specific module and -not- this one! - */ - if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) { - ORTE_ERROR_LOG(ret); - error = "orte_plm_base_set_hnp_name"; - goto error; + + if (NULL != orte_ess_base_jobid && + NULL != orte_ess_base_vpid) { + opal_output_verbose(2, orte_ess_base_framework.framework_output, + "ess:tool:obtaining name from environment"); + if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) { + return(ret); + } + ORTE_PROC_MY_NAME->jobid = jobid; + if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, orte_ess_base_vpid))) { + return(ret); + } + ORTE_PROC_MY_NAME->vpid = vpid; + } else { + /* If we are a tool with no name, then define it here */ + uint16_t jobfam; + uint32_t hash32; + uint32_t bias; + + opal_output_verbose(2, orte_ess_base_framework.framework_output, + "ess:tool:computing name"); + /* hash the nodename */ + OPAL_HASH_STR(orte_process_info.nodename, hash32); + bias = (uint32_t)orte_process_info.pid; + /* fold in the bias */ + hash32 = hash32 ^ bias; + + /* now compress to 16-bits */ + jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32)); + + /* set the name */ + ORTE_PROC_MY_NAME->jobid = 
0xffff0000 & ((uint32_t)jobfam << 16); + ORTE_PROC_MY_NAME->vpid = 0; } - + /* do the rest of the standard tool init */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); @@ -88,7 +112,7 @@ static int rte_init(void) return ORTE_SUCCESS; -error: + error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index 8e70ee452c..64ac6749be 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -164,28 +164,26 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender, jdata->originator.vpid = sender->vpid; /* get the parent's job object */ - if (NULL == (parent = orte_get_job_data_object(sender->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - goto ANSWER_LAUNCH; + if (NULL != (parent = orte_get_job_data_object(sender->jobid))) { + /* if the prefix was set in the parent's job, we need to transfer + * that prefix to the child's app_context so any further launch of + * orteds can find the correct binary. There always has to be at + * least one app_context in both parent and child, so we don't + * need to check that here. However, be sure not to overwrite + * the prefix if the user already provided it! 
+ */ + app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0); + child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); + prefix_dir = NULL; + if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) && + !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) { + orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING); + } + if (NULL != prefix_dir) { + free(prefix_dir); + } } - /* if the prefix was set in the parent's job, we need to transfer - * that prefix to the child's app_context so any further launch of - * orteds can find the correct binary. There always has to be at - * least one app_context in both parent and child, so we don't - * need to check that here. However, be sure not to overwrite - * the prefix if the user already provided it! - */ - app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0); - child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); - prefix_dir = NULL; - if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) && - !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) { - orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING); - } - if (NULL != prefix_dir) { - free(prefix_dir); - } - + /* if the user asked to forward any envars, cycle through the app contexts * in the comm_spawn request and add them */ @@ -210,18 +208,20 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender, goto ANSWER_LAUNCH; } - if( NULL == parent->bookmark ) { - /* find the sender's node in the job map */ - if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) { - /* set the bookmark so the child starts from that place - this means - * that the first child process could be co-located with the 
proc - * that called comm_spawn, assuming slots remain on that node. Otherwise, - * the procs will start on the next available node - */ - jdata->bookmark = proc->node; + if (NULL != parent) { + if (NULL == parent->bookmark) { + /* find the sender's node in the job map */ + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) { + /* set the bookmark so the child starts from that place - this means + * that the first child process could be co-located with the proc + * that called comm_spawn, assuming slots remain on that node. Otherwise, + * the procs will start on the next available node + */ + jdata->bookmark = proc->node; + } + } else { + jdata->bookmark = parent->bookmark; } - } else { - jdata->bookmark = parent->bookmark; } /* launch it */ diff --git a/orte/mca/rml/oob/rml_oob_send.c b/orte/mca/rml/oob/rml_oob_send.c index 8addff084d..3331856b70 100644 --- a/orte/mca/rml/oob/rml_oob_send.c +++ b/orte/mca/rml/oob/rml_oob_send.c @@ -270,7 +270,7 @@ int orte_rml_oob_send_buffer_nb(orte_process_name_t* peer, return ORTE_ERR_BAD_PARAM; } - if( NULL == peer || + if (NULL == peer || OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { /* cannot send to an invalid peer */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index 97dc58b739..639263678f 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -71,6 +71,7 @@ orte_routed_module_t orte_routed_direct_module = { #endif }; +static orte_process_name_t mylifeline; static orte_process_name_t *lifeline = NULL; static opal_list_t my_children; @@ -509,7 +510,12 @@ static bool route_is_defined(const orte_process_name_t *target) static int set_lifeline(orte_process_name_t *proc) { - /* there is no lifeline */ + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, + "%s routed:direct: set lifeline to %s", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + mylifeline = *proc; + lifeline = &mylifeline; return ORTE_SUCCESS; } diff --git a/orte/mca/state/dvm/Makefile.am b/orte/mca/state/dvm/Makefile.am new file mode 100644 index 0000000000..e741d817fa --- /dev/null +++ b/orte/mca/state/dvm/Makefile.am @@ -0,0 +1,34 @@ +# +# Copyright (c) 2015 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + state_dvm.h \ + state_dvm_component.c \ + state_dvm.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_state_dvm_DSO +component_noinst = +component_install = mca_state_dvm.la +else +component_noinst = libmca_state_dvm.la +component_install = +endif + +mcacomponentdir = $(ortelibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_state_dvm_la_SOURCES = $(sources) +mca_state_dvm_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_state_dvm_la_SOURCES =$(sources) +libmca_state_dvm_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c new file mode 100644 index 0000000000..b09859084d --- /dev/null +++ b/orte/mca/state/dvm/state_dvm.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2015 Intel, Inc. 
All rights reserved + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/filem/filem.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/ras/base/base.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/routed/routed.h" +#include "orte/util/session_dir.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" +#include "state_dvm.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +/* local functions */ +static void vm_ready(int fd, short args, void *cbata); +void check_complete(int fd, short args, void *cbdata); + +/****************** + * DVM module - used when mpirun is persistent + ******************/ +orte_state_base_module_t orte_state_dvm_module = { + init, + finalize, + orte_state_base_activate_job_state, + orte_state_base_add_job_state, + orte_state_base_set_job_state_callback, + orte_state_base_set_job_state_priority, + orte_state_base_remove_job_state, + orte_state_base_activate_proc_state, + orte_state_base_add_proc_state, + orte_state_base_set_proc_state_callback, + orte_state_base_set_proc_state_priority, + orte_state_base_remove_proc_state +}; + +/* defined default state machine sequence - individual + * plm's must add a state for launching daemons + */ +static orte_job_state_t launch_states[] = { + ORTE_JOB_STATE_INIT, + ORTE_JOB_STATE_INIT_COMPLETE, + ORTE_JOB_STATE_ALLOCATE, + ORTE_JOB_STATE_ALLOCATION_COMPLETE, + ORTE_JOB_STATE_DAEMONS_LAUNCHED, + ORTE_JOB_STATE_DAEMONS_REPORTED, + ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, + ORTE_JOB_STATE_SYSTEM_PREP, + 
ORTE_JOB_STATE_LAUNCH_APPS, + ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, + ORTE_JOB_STATE_RUNNING, + ORTE_JOB_STATE_REGISTERED, + /* termination states */ + ORTE_JOB_STATE_TERMINATED, + ORTE_JOB_STATE_NOTIFY_COMPLETED, + ORTE_JOB_STATE_ALL_JOBS_COMPLETE +}; +static orte_state_cbfunc_t launch_callbacks[] = { + orte_plm_base_setup_job, + orte_plm_base_setup_job_complete, + orte_ras_base_allocate, + orte_plm_base_allocation_complete, + orte_plm_base_daemons_launched, + orte_plm_base_daemons_reported, + vm_ready, + orte_rmaps_base_map_job, + orte_plm_base_mapping_complete, + orte_plm_base_complete_setup, + orte_plm_base_launch_apps, + orte_state_base_local_launch_complete, + orte_plm_base_post_launch, + orte_plm_base_registered, + check_complete, + orte_state_base_cleanup_job, + orte_quit +}; + +static orte_proc_state_t proc_states[] = { + ORTE_PROC_STATE_RUNNING, + ORTE_PROC_STATE_REGISTERED, + ORTE_PROC_STATE_IOF_COMPLETE, + ORTE_PROC_STATE_WAITPID_FIRED, + ORTE_PROC_STATE_TERMINATED +}; +static orte_state_cbfunc_t proc_callbacks[] = { + orte_state_base_track_procs, + orte_state_base_track_procs, + orte_state_base_track_procs, + orte_state_base_track_procs, + orte_state_base_track_procs +}; + +static void force_quit(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* give us a chance to stop the orteds */ + orte_plm.terminate_orteds(); + OBJ_RELEASE(caddy); +} + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + int i, rc; + int num_states; + + /* setup the state machines */ + OBJ_CONSTRUCT(&orte_job_states, opal_list_t); + OBJ_CONSTRUCT(&orte_proc_states, opal_list_t); + + /* setup the job state machine */ + num_states = sizeof(launch_states) / sizeof(orte_job_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i], + launch_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + /* add the termination 
response */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, + orte_quit, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + /* add a default error response */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT, + force_quit, ORTE_ERROR_PRI))) { + ORTE_ERROR_LOG(rc); + } + /* add callback to report progress, if requested */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS, + orte_state_base_report_progress, ORTE_ERROR_PRI))) { + ORTE_ERROR_LOG(rc); + } + if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) { + orte_state_base_print_job_state_machine(); + } + + /* populate the proc state machine to allow us to + * track proc lifecycle changes + */ + num_states = sizeof(proc_states) / sizeof(orte_proc_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i], + proc_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + if (5 < opal_output_get_verbosity(orte_state_base_framework.framework_output)) { + orte_state_base_print_proc_state_machine(); + } + + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + opal_list_item_t *item; + + /* cleanup the proc state machine */ + while (NULL != (item = opal_list_remove_first(&orte_proc_states))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_proc_states); + + return ORTE_SUCCESS; +} + +static void files_ready(int status, void *cbdata) +{ + orte_job_t *jdata = (orte_job_t*)cbdata; + + if (ORTE_SUCCESS != status) { + ORTE_FORCED_TERMINATE(status); + } else { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } +} + +static void vm_ready(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* if this is my job, then we are done */ + if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) { + /* notify that the vm is ready */ + opal_output(0, "DVM ready"); + OBJ_RELEASE(caddy); + return; + 
} + + /* progress the job */ + caddy->jdata->state = ORTE_JOB_STATE_VM_READY; + + /* position any required files */ + if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) { + ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + + /* cleanup */ + OBJ_RELEASE(caddy); +} + +void check_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; + + orte_proc_t *proc; + int i; + orte_std_cntr_t j; + orte_job_t *job; + orte_node_t *node; + orte_job_map_t *map; + orte_std_cntr_t index; + bool one_still_alive; + orte_vpid_t lowest=0; + int32_t i32, *i32ptr; + + opal_output_verbose(2, orte_state_base_framework.framework_output, + "%s state:base:check_job_complete on job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)); + + if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + /* just check to see if the daemons are complete */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_complete - received NULL job, checking daemons", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CHECK_DAEMONS; + } else { + /* mark the job as terminated, but don't override any + * abnormal termination flags + */ + if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) { + jdata->state = ORTE_JOB_STATE_TERMINATED; + } + } + + /* tell the IOF that the job is complete */ + if (NULL != orte_iof.complete) { + orte_iof.complete(jdata); + } + + i32ptr = &i32; + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) { + if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { + /* update the exit code */ + ORTE_UPDATE_EXIT_STATUS(lowest); + } + + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "While %s job %s 
terminated normally, %d %s. Further examination may be required.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + i32, (1 == i32) ? "process returned\na non-zero exit code." : + "processes returned\nnon-zero exit codes."); + } + + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jdata->state))); + + /* if this job is a continuously operating one, then don't do + * anything further - just return here + */ + if (NULL != jdata && + (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) || + ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) { + goto CHECK_ALIVE; + } + + /* if the job that is being checked is the HNP, then we are + * trying to terminate the orteds. In that situation, we + * do -not- check all jobs - we simply notify the HNP + * that the orteds are complete. Also check special case + * if jdata is NULL - we want + * to definitely declare the job done if the orteds + * have completed, no matter what else may be happening. + * This can happen if a ctrl-c hits in the "wrong" place + * while launching + */ + CHECK_DAEMONS: + if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + if (0 == orte_routed.num_routes()) { + /* orteds are done! 
*/ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (NULL == jdata) { + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + } + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); + OBJ_RELEASE(caddy); + return; + } + OBJ_RELEASE(caddy); + return; + } + + /* Release the resources used by this job. Since some errmgrs may want + * to continue using resources allocated to the job as part of their + * fault recovery procedure, we only do this once the job is "complete". + * Note that an aborted/killed job -is- flagged as complete and will + * therefore have its resources released. We need to do this after + * we call the errmgr so that any attempt to restart the job will + * avoid doing so in the exact same place as the current job + */ + if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { + map = jdata->map; + for (index = 0; index < map->nodes->size; index++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { + continue; + } + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s releasing procs from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); + for (i = 0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + /* skip procs from another job */ + continue; + } + node->slots_inuse--; + node->num_procs--; + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s releasing proc %s from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), node->name)); + /* set the entry in the node array to NULL */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* release the proc once for the map entry */ + OBJ_RELEASE(proc); + } + /* set the node location to NULL */ + 
opal_pointer_array_set_item(map->nodes, index, NULL); + /* maintain accounting */ + OBJ_RELEASE(node); + /* flag that the node is no longer in a map */ + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + OBJ_RELEASE(map); + jdata->map = NULL; + } + + CHECK_ALIVE: + /* now check to see if all jobs are done - trigger notification of this jdata + * object when we find it + */ + one_still_alive = false; + for (j=1; j < orte_job_data->size; j++) { + if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { + /* since we are releasing jdata objects as we + * go, we can no longer assume that the job_data + * array is left justified + */ + continue; + } + /* if this is the job we are checking AND it normally terminated, + * then activate the "notify_completed" state - this will release + * the job state, but is provided so that the HNP main code can + * take alternative actions if desired. If the state is killed_by_cmd, + * then go ahead and release it. We cannot release it if it + * abnormally terminated as mpirun needs the info so it can + * report appropriately to the user + * + * NOTE: do not release the primary job (j=1) so we + * can pretty-print completion message + */ + if (NULL != jdata && job->jobid == jdata->jobid) { + if (jdata->state == ORTE_JOB_STATE_TERMINATED) { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed state is terminated - activating notify", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED); + one_still_alive = true; + } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD || + jdata->state == ORTE_JOB_STATE_NOTIFIED) { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed state is killed or notified - cleaning up", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! 
+ */ + if (1 < j) { + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { + /* this was a debugger daemon. notify that a debugger has detached */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); + } + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + } + } + continue; + } + /* if the job is flagged to not be monitored, skip it */ + if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) { + continue; + } + /* when checking for job termination, we must be sure to NOT check + * our own job as it - rather obviously - has NOT terminated! + */ + if (job->num_terminated < job->num_procs) { + /* we have at least one job that is not done yet - we cannot + * just return, though, as we need to ensure we cleanout the + * job data for the job that just completed + */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed job %s is not terminated (%d:%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs)); + one_still_alive = true; + } + else { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs, + (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); + } + } + /* if a job is still alive, we just return */ + if (one_still_alive) { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed at least one job is not terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + OBJ_RELEASE(caddy); + return; + } + /* if we get here, then all jobs are done, so terminate */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, + "%s state:base:check_job_completed all jobs terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* stop the job timeout event, if set */ + if (NULL != orte_mpiexec_timeout) { + OBJ_RELEASE(orte_mpiexec_timeout); + orte_mpiexec_timeout = NULL; + } + + /* set the exit status to 0 - this will only happen if it + * wasn't already set by an error condition + */ + ORTE_UPDATE_EXIT_STATUS(0); + + /* order daemon termination - this tells us to cleanup + * our local procs as well as telling remote daemons + * to die + */ + orte_plm.terminate_orteds(); + + OBJ_RELEASE(caddy); +} diff --git a/orte/mca/state/dvm/state_dvm.h b/orte/mca/state/dvm/state_dvm.h new file mode 100644 index 0000000000..78c798ce7c --- /dev/null +++ b/orte/mca/state/dvm/state_dvm.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Declares the symbols exported by the dvm state component and module. + */ + +#ifndef MCA_STATE_DVM_EXPORT_H +#define MCA_STATE_DVM_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/state/state.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_dvm_component; + +ORTE_DECLSPEC extern orte_state_base_module_t orte_state_dvm_module; + +END_C_DECLS + +#endif /* MCA_STATE_DVM_EXPORT_H */ diff --git a/orte/mca/state/dvm/state_dvm_component.c b/orte/mca/state/dvm/state_dvm_component.c new file mode 100644 index 0000000000..07909fc22e --- /dev/null +++ b/orte/mca/state/dvm/state_dvm_component.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2015 Intel, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "state_dvm.h" + +/* + * Public string for version number + */ +const char *orte_state_dvm_component_version_string = + "ORTE STATE dvm MCA component version " ORTE_VERSION; + +/* + * Local functionality: open and close have nothing to do; query offers the dvm module + */ +static int state_dvm_open(void); +static int state_dvm_close(void); +static int state_dvm_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_state_base_component_t mca_state_dvm_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component + */ + { + ORTE_STATE_BASE_VERSION_1_0_0, + /* Component name and version */ + "dvm", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + state_dvm_open, + state_dvm_close, + state_dvm_component_query + }, + { + /* The component is checkpoint ready */ +
MCA_BASE_METADATA_PARAM_CHECKPOINT + }, +}; + +static int state_dvm_open(void) +{ + return ORTE_SUCCESS; +} + +static int state_dvm_close(void) +{ + return ORTE_SUCCESS; +} + +static int state_dvm_component_query(mca_base_module_t **module, int *priority) +{ + /* we are only used when an envar is set directing it (orte-dvm sets + * OMPI_MCA_state=dvm before orte_init), so set our priority very low */ + *priority = 0; + *module = (mca_base_module_t *)&orte_state_dvm_module; + return ORTE_SUCCESS; +} diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 8ddebc8387..6ac7656410 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -13,7 +13,7 @@ # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -35,7 +35,9 @@ SUBDIRS += \ tools/orte-top \ tools/orte-info \ tools/orte-migrate \ - tools/orte-server + tools/orte-server \ + tools/orte-submit \ + tools/orte-dvm DIST_SUBDIRS += \ tools/orte-checkpoint \ @@ -48,5 +50,7 @@ DIST_SUBDIRS += \ tools/orte-top \ tools/orte-info \ tools/orte-migrate \ - tools/orte-server + tools/orte-server \ + tools/orte-submit \ + tools/orte-dvm diff --git a/orte/tools/orte-dvm/Makefile.am b/orte/tools/orte-dvm/Makefile.am new file mode 100644 index 0000000000..ba1b0a3382 --- /dev/null +++ b/orte/tools/orte-dvm/Makefile.am @@ -0,0 +1,57 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2015 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is not quite in the Automake spirit, but we have to do it. +# Since the totalview portion of the library must be built with -g, we +# must eliminate the CFLAGS that are passed in here by default (which +# may already have debugging and/or optimization flags). We use +# post-processed forms of the CFLAGS in the library targets down +# below. + +CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS) + +include $(top_srcdir)/Makefile.ompi-rules + +man_pages = orte-dvm.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if OPAL_INSTALL_BINARIES + +bin_PROGRAMS = orte-dvm + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +endif # OPAL_INSTALL_BINARIES + +orte_dvm_SOURCES = \ + orte-dvm.c + +orte_dvm_LDADD = \ + $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + +distclean-local: + rm -f $(man_pages) diff --git a/orte/tools/orte-dvm/orte-dvm.1in b/orte/tools/orte-dvm/orte-dvm.1in new file mode 100644 index 0000000000..d4d74df913 --- /dev/null +++ b/orte/tools/orte-dvm/orte-dvm.1in @@ -0,0 +1,193 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. +.\" Copyright (c) 2015 Intel, Inc.
All rights reserved +.\" $COPYRIGHT$ +.\" +.\" Man page for ORTE's orte-dvm command +.\" +.\" .TH name section center-footer left-footer center-header +.TH ORTE-DVM 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +orte-dvm, ompi-dvm \- Establish a Distributed Virtual Machine (DVM). + +.B Note: +\fIorte-dvm\fP and \fIompi-dvm\fP are synonyms for each +other. Using either of the names will produce the same behavior. +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +. +.PP +.B orte-dvm +[ options ] +.P + +Invoking \fIorte-dvm\fP via an absolute path +name is equivalent to specifying the \fI--prefix\fP option with a +\fI<dir>\fR value equivalent to the directory where \fIorte-dvm\fR +resides, minus its last subdirectory. For example: + + \fB%\fP /usr/local/bin/orte-dvm ... + +is equivalent to + + \fB%\fP orte-dvm --prefix /usr/local + +. +.\" ************************** +.\" Quick Summary Section +.\" ************************** +.SH QUICK SUMMARY +. +\fIorte-dvm\fP will establish a DVM that can be used to execute subsequent +applications. Use of \fIorte-dvm\fP can be advantageous, for example, when you want to +execute a number of short-lived tasks. In such cases, the time required to start +the ORTE DVM can be a significant fraction of the time to execute the +overall application. Thus, creating a persistent DVM can speed the overall +execution. In addition, a persistent DVM will support executing multiple parallel +applications while maintaining separation between their respective cores. +.\" ************************** +.\" Options Section +.\" ************************** +.SH OPTIONS +. +.\" +.\" Start options listing +.\" Indent 10 characters from start of first column to start of second column +. +.TP +.B -h\fR,\fP --help +Display help for this command +. +. +.TP +.B -V\fR,\fP --version +Print version number.
If no other arguments are given, this will also +cause orte-dvm to exit. +. +. +.P +Use one of the following options to specify which hosts (nodes) of the cluster to use +for the DVM. +. +. +.TP +.B -H\fR,\fP -host\fR,\fP --host \fR<host1,host2,...,hostN>\fP +List of hosts for the DVM. +. +. +.TP +.B +-hostfile\fR,\fP --hostfile \fR<hostfile>\fP +Provide a hostfile to use. +. +. +.TP +.B -machinefile\fR,\fP --machinefile \fR<machinefile>\fP +Synonym for \fI-hostfile\fP. +. +. +.TP +.B --prefix \fR<dir>\fP +Prefix directory that will be used to set the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote node before invoking the ORTE daemon. +. +. +.P +Setting MCA parameters: +. +. +.TP +.B -gmca\fR,\fP --gmca \fR<key> <value>\fP +Pass global MCA parameters that are applicable to all contexts. \fI<key>\fP is +the parameter name; \fI<value>\fP is the parameter value. +. +. +.TP +.B -mca\fR,\fP --mca <key> <value> +Send arguments to various MCA modules. See the "MCA" section, below. +. +. +. +. +.TP +.B -report-uri\fR,\fP --report-uri <channel> +Print out orte-dvm's URI during startup. The channel must be either a '-' to indicate that +the URI is to be output to stdout, a '+' to indicate that the URI is to be output to stderr, +or a filename to which the URI is to be written. +. +. +.P +The following options are useful for developers; they are not generally +useful to most ORTE and/or MPI users: +. +.TP +.B -d\fR,\fP --debug-devel +Enable debugging of the ORTE layer. +. +. +.TP +.B --debug-daemons-file +Enable debugging of the ORTE daemons in the DVM, storing +output in files. +. +. +.P +There may be other options listed with \fIorte-dvm --help\fP. +. +. +.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +. +\fIorte-dvm\fP starts a Distributed Virtual Machine (DVM) by launching +a daemon on each node of the allocation, as modified or specified by +the \fI-host\fP and \fI-hostfile\fP options. Applications can subsequently +be executed using the \fIorte-submit\fP command. +.
+The DVM remains in operation until receiving the \fIorte-submit -terminate\fP +command. +. +. +. +.SS Specifying Host Nodes +. +Host nodes can be identified on the \fIorte-dvm\fP command line with the \fI-host\fP +option or in a hostfile. +. +.PP +For example, +. +.TP 4 +orte-dvm -H aa,aa,bb ./a.out +launches two processes on node aa and one on bb. +. +.PP +Or, consider the hostfile +. + + \fB%\fP cat myhostfile + aa slots=2 + bb slots=2 + cc slots=2 + +. +.PP +Here, we list not only the host names (aa, bb, and cc) but also how many "slots" +there are for each. Slots indicate how many processes can potentially execute +on a node. For best performance, the number of slots may be chosen to be the +number of cores on the node or the number of processor sockets. If the hostfile +does not provide slots information, a default of 1 is assumed. +When running under resource managers (e.g., SLURM, Torque, etc.), +Open MPI will obtain both the hostnames and the number of slots directly +from the resource manager. +. +. diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c new file mode 100644 index 0000000000..72e59020a8 --- /dev/null +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -0,0 +1,446 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights + * reserved.
+ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include <string.h> +#endif +#include <stdio.h> +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif /* HAVE_STDLIB_H */ +#ifdef HAVE_STRINGS_H +#include <strings.h> +#endif /* HAVE_STRINGS_H */ +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif +#ifdef HAVE_SYS_PARAM_H +#include <sys/param.h> +#endif +#include <errno.h> +#include <signal.h> +#include <ctype.h> +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif /* HAVE_SYS_WAIT_H */ +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif /* HAVE_SYS_TIME_H */ +#include <fcntl.h> +#ifdef HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif + +#include "opal/mca/event/event.h" +#include "opal/mca/installdirs/installdirs.h" +#include "opal/mca/base/base.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/basename.h" +#include "opal/util/cmd_line.h" +#include "opal/util/opal_environ.h" +#include "opal/util/opal_getcwd.h" +#include "opal/util/show_help.h" +#include "opal/util/fd.h" + +#include "opal/version.h" +#include "opal/runtime/opal.h" +#include "opal/util/os_path.h" +#include "opal/util/path.h" +#include "opal/class/opal_pointer_array.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/state/state.h" + +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" + +#include "orte/orted/orted.h" + +/* + * Build-time default: when set, the install prefix is used even if --prefix was not given + */ +static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; + +/* + * Command-line option values; the fields are bound to options in cmd_line_init below + */ +static struct { + bool help; + bool version; + char *report_uri; + char *basename; + char *prefix; +} myglobals; + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, 'h', NULL, "help", 0, + &myglobals.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + {
NULL, 'V', NULL, "version", 0, + &myglobals.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + + { NULL, '\0', "report-uri", "report-uri", 1, + &myglobals.report_uri, OPAL_CMD_LINE_TYPE_STRING, + "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + + { NULL, '\0', "prefix", "prefix", 1, + &myglobals.prefix, OPAL_CMD_LINE_TYPE_STRING, + "Prefix to be used to look for ORTE executables" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +static void notify_requestor(int sd, short args, void *cbdata); + +/* + * Start a persistent Distributed Virtual Machine: parse the command line, + * initialize ORTE as the HNP with the dvm state machine selected, report our + * contact URI, record the prefix/hostfile/host choices on the daemon app + * context, and then run the event loop until orte_event_base_active is + * cleared. Exits with orte_exit_status, or with an error code if setup fails. + */ +int main(int argc, char *argv[]) +{ + int rc, i, j; + opal_cmd_line_t cmd_line; + char *param, *value; + orte_job_t *jdata=NULL; + orte_app_context_t *app; + char *uri; + + /* Setup and parse the command line */ + memset(&myglobals, 0, sizeof(myglobals)); + /* find our basename (the name of the executable) so that we can + use it in pretty-print error messages */ + myglobals.basename = opal_basename(argv[0]); + + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, + argc, argv)) ) { + if (OPAL_ERR_SILENT != rc) { + fprintf(stderr, "%s: command line error (%s)\n", argv[0], + opal_strerror(rc)); + } + return rc; + } + + /* + * Since this process can now handle MCA/GMCA parameters, make sure to + * process them. + * NOTE: It is "safe" to call mca_base_cmd_line_process_args() before + * opal_init_util() since mca_base_cmd_line_process_args() does *not* + * depend upon opal_init_util() functionality. + */ + if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { + exit(1); + } + + /* Need to initialize OPAL so that install_dirs are filled in */ + if (OPAL_SUCCESS != opal_init(&argc, &argv)) { + exit(1); + } + + /* Check for some "global" command line params */ + /* print version if requested.
Do this before check for help so + that --version --help works as one might expect. */ + if (myglobals.version) { + char *str; + char *project_name = NULL; + if (0 == strcmp(myglobals.basename, "ompi-dvm")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + str = opal_show_help_string("help-orterun.txt", "orterun:version", + false, + myglobals.basename, project_name, OPAL_VERSION, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + exit(0); + } + + /* Check for help request - use the same project-name rule as --version + * (ompi-dvm is the Open MPI name of this tool) */ + if (myglobals.help) { + char *str, *args = NULL; + char *project_name = NULL; + if (0 == strcmp(myglobals.basename, "ompi-dvm")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + args = opal_cmd_line_get_usage_msg(&cmd_line); + str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, + myglobals.basename, project_name, OPAL_VERSION, + myglobals.basename, args, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + free(args); + + /* If someone asks for help, that should be all we do */ + exit(0); + } + + /* flag that I am the HNP */ + orte_process_info.proc_type = ORTE_PROC_HNP; + + /* Setup MCA params */ + orte_register_params(); + + /* specify the DVM state machine */ + putenv("OMPI_MCA_state=dvm"); + + /* Initialize our Open RTE environment */ + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) { + /* cannot call ORTE_ERROR_LOG as it could be the errmgr + * never got loaded! + */ + return rc; + } + /* finalize OPAL. As it was opened again from orte_init->opal_init + * we continue to have a reference count on it. So we have to finalize it twice...
+ */ + opal_finalize(); + + /* report our contact URI: on stdout by default, or wherever --report-uri directs */ + uri = orte_rml.get_contact_info(); + if (NULL != myglobals.report_uri) { + FILE *fp; + if (0 == strcmp(myglobals.report_uri, "-")) { + /* if '-', then output to stdout */ + printf("VMURI: %s\n", uri); + } else if (0 == strcmp(myglobals.report_uri, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "VMURI: %s\n", uri); + } else { + fp = fopen(myglobals.report_uri, "w"); + if (NULL == fp) { + /* the item we could not write is the uri; this is a failure exit */ + orte_show_help("help-orterun.txt", "orterun:write_file", false, + myglobals.basename, "uri", myglobals.report_uri); + exit(1); + } + fprintf(fp, "%s\n", uri); + fclose(fp); + } + } else { + printf("VMURI: %s\n", uri); + } + /* release the contact string on every path, not only when --report-uri was given */ + free(uri); + + /* get the daemon job object - was created by ess/hnp component */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + orte_show_help("help-orterun.txt", "bad-job-object", true, + myglobals.basename); + exit(1); + } + /* also should have created a daemon "app" */ + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) { + orte_show_help("help-orterun.txt", "bad-app-object", true, + myglobals.basename); + exit(1); + } + + /* Did the user specify a prefix, or want prefix by default?
*/ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { + size_t param_len; + /* if both the prefix was given and we have a prefix + * given above, check to see if they match + */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && + NULL != myglobals.prefix) { + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + /* ensure we strip any trailing '/' */ + if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { + param[strlen(param)-1] = '\0'; + } + value = strdup(myglobals.prefix); + if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) { + value[strlen(value)-1] = '\0'; + } + if (0 != strcmp(param, value)) { + orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict", + true, myglobals.basename, value, param); + /* let the global-level prefix take precedence since we + * know that one is being used + */ + free(param); + param = strdup(myglobals.prefix); + } + free(value); + } else if (NULL != myglobals.prefix) { + param = myglobals.prefix; + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to orterun */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, myglobals.basename, myglobals.basename); + return ORTE_ERR_FATAL; + } + } + orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING); + free(param); + } + } + + /* Did the user specify a hostfile. Need to check for both + * hostfile and machine file.
+ * We can only deal with one hostfile per app context, otherwise give an error. + */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { + if(1 < j) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, myglobals.basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { + if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, myglobals.basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + + /* Did the user specify any hosts? */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { + char **targ=NULL, *tval; + for (i = 0; i < j; ++i) { + value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); + opal_argv_append_nosize(&targ, value); + } + tval = opal_argv_join(targ, ','); + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING); + opal_argv_free(targ); + free(tval); + } + OBJ_DESTRUCT(&cmd_line); + + /* setup to listen for commands sent specifically to me, even though I would probably + * be the one sending them!
Unfortunately, since I am a participating daemon, + * there are times I need to send a command to "all daemons", and that means *I* have + * to receive it too + */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, + ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); + + /* override the notify_completed state so we can send a message + * back to anyone who submits a job to us telling them the job + * completed */ + if (ORTE_SUCCESS != (rc = orte_state.set_job_state_callback(ORTE_JOB_STATE_NOTIFY_COMPLETED, notify_requestor))) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(rc); + exit(orte_exit_status); + } + + /* spawn the DVM - we skip the initial steps as this + * isn't a user-level application */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE); + + /* loop the event lib until an exit event is detected */ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + + /* cleanup and leave */ + orte_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); +} + +/* + * RML send-completion callback for the reply queued by notify_requestor(). + * Releases the reply buffer, then removes the finished job (passed as + * cbdata) from the orte_job_data array and releases it. + */ +static void send_callback(int status, orte_process_name_t *peer, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) + +{ + orte_job_t *jdata = (orte_job_t*)cbdata; + + OBJ_RELEASE(buffer); + /* cleanup the job object */ + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); + OBJ_RELEASE(jdata); +} + +/* + * State callback registered in main() for ORTE_JOB_STATE_NOTIFY_COMPLETED. + * Sends the job's result to its originator: the exit code of the proc + * recorded under ORTE_JOB_ABORTED_PROC if there is one, otherwise 0. + * cbdata is the orte_state_caddy_t for the event and is released here; the + * job object is released later, in send_callback(). + */ +static void notify_requestor(int sd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; + orte_proc_t *pptr; + int ret; + opal_buffer_t *reply; + + opal_output(0, "%s dvm: job %s has completed", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ?
"NULL" : ORTE_JOBID_PRINT(jdata->jobid)); + + /* the message above tolerates a missing job object, so the rest must too: + * without a job there is no originator to notify and nothing to clean up */ + if (NULL == jdata) { + OBJ_RELEASE(caddy); + return; + } + + /* notify the requestor */ + reply = OBJ_NEW(opal_buffer_t); + /* see if there was any problem */ + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) { + ret = pptr->exit_code; + } else { + ret = 0; + } + /* NOTE(review): the results of pack and send_buffer_nb are not checked; + * presumably send_callback never runs if the send is rejected, which would + * leave reply and jdata unreleased - confirm against the RML contract */ + opal_dss.pack(reply, &ret, 1, OPAL_INT); + orte_rml.send_buffer_nb(&jdata->originator, reply, ORTE_RML_TAG_TOOL, send_callback, jdata); + + /* we cannot cleanup the job object as we might + * hit an error during transmission, so clean it + * up in the send callback */ + OBJ_RELEASE(caddy); +} + diff --git a/orte/tools/orte-submit/Makefile.am b/orte/tools/orte-submit/Makefile.am new file mode 100644 index 0000000000..e69634cb27 --- /dev/null +++ b/orte/tools/orte-submit/Makefile.am @@ -0,0 +1,57 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2015 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is not quite in the Automake spirit, but we have to do it. +# Since the totalview portion of the library must be built with -g, we +# must eliminate the CFLAGS that are passed in here by default (which +# may already have debugging and/or optimization flags). We use +# post-processed forms of the CFLAGS in the library targets down +# below.
+ +CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS) + +include $(top_srcdir)/Makefile.ompi-rules + +man_pages = orte-submit.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if OPAL_INSTALL_BINARIES + +bin_PROGRAMS = orte-submit + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +endif # OPAL_INSTALL_BINARIES + +orte_submit_SOURCES = \ + orte-submit.c + +orte_submit_LDADD = \ + $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + +distclean-local: + rm -f $(man_pages) diff --git a/orte/tools/orte-submit/orte-submit.1in b/orte/tools/orte-submit/orte-submit.1in new file mode 100644 index 0000000000..605d893c0d --- /dev/null +++ b/orte/tools/orte-submit/orte-submit.1in @@ -0,0 +1,1430 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. +.\" Copyright (c) 2015 Intel, Inc. All rights reserved. +.\" $COPYRIGHT$ +.\" +.\" Man page for ORTE's orte-submit command +.\" +.\" .TH name section center-footer left-footer center-header +.TH ORTE-SUBMIT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +orte-submit, ompi-submit \- Execute serial and parallel jobs in Open MPI using a DVM. + +.B Note: +\fIompi-submit\fP and \fIorte-submit\fP are synonyms for each +other. Using either of the names will produce the same behavior. +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +.
+.PP +Single Process Multiple Data (SPMD) Model: + +.B ompi-submit +[ options ] +.B +[ ] +.P + +Multiple Instruction Multiple Data (MIMD) Model: + +.B ompi-submit +[ global_options ] + [ local_options1 ] +.B +[ ] : + [ local_options2 ] +.B +[ ] : + ... : + [ local_optionsN ] +.B +[ ] +.P + +Note that in both models, invoking \fIompi-submit\fP via an absolute path +name is equivalent to specifying the \fI--prefix\fP option with a +\fI\fR value equivalent to the directory where \fIompi-submit\fR +resides, minus its last subdirectory. For example: + + \fB%\fP /usr/local/bin/ompi-submit ... + +is equivalent to + + \fB%\fP ompi-submit --prefix /usr/local + +. +.\" ************************** +.\" Quick Summary Section +.\" ************************** +.SH QUICK SUMMARY +. +.B +Use of \fIorte-submit\fP requires that you first start the Distributed Virtual +Machine (DVM) using \fIorte-dvm\fP. +.P +If you are simply looking for how to run an MPI application, you +probably want to use a command line of the following form: + + \fB%\fP ompi-submit [ -np X ] [ --hostfile ] + +This will run X copies of \fI\fR in your current run-time +environment (if running under a supported resource manager, Open MPI's +\fIompi-submit\fR will usually automatically use the corresponding resource manager +process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR, +which require the use of a hostfile, or will default to running all X +copies on the localhost), scheduling (by default) in a round-robin fashion by +CPU slot. See the rest of this page for more details. +.P +Please note that ompi-submit automatically binds processes as of the start of the +v1.8 series. Two binding patterns are used in the absence of any further directives: +.TP 18 +.B Bind to core: +when the number of processes is <= 2 +. +. +.TP +.B Bind to socket: +when the number of processes is > 2 +. +. 
+.P +If your application uses threads, then you probably want to ensure that you are +either not bound at all (by specifying --bind-to none), or bound to multiple cores +using an appropriate binding level or specific number of processing elements per +application process. +. +.\" ************************** +.\" Options Section +.\" ************************** +.SH OPTIONS +. +.I ompi-submit +will send the name of the directory where it was invoked on the local +node to each of the remote nodes, and attempt to change to that +directory. See the "Current Working Directory" section below for further +details. +.\" +.\" Start options listing +.\" Indent 10 characters from start of first column to start of second column +.TP 10 +.B +The program executable. This is identified as the first non-recognized argument +to ompi-submit. +. +. +.TP +.B +Pass these run-time arguments to every new process. These must always +be the last arguments to \fIompi-submit\fP. If an app context file is used, +\fI\fP will be ignored. +. +. +.TP +.B -h\fR,\fP --help +Display help for this command +. +. +.TP +.B -q\fR,\fP --quiet +Suppress informative messages from orte-submit during application execution. +. +. +.TP +.B -v\fR,\fP --verbose +Be verbose +. +. +.TP +.B -V\fR,\fP --version +Print version number. If no other arguments are given, this will also +cause orte-submit to exit. +. +. +. +. +.P +Use one of the following options to specify which hosts (nodes) of the DVM to run on. +Specifying hosts outside the DVM will result in an error. +. +. +.TP +.B -H\fR,\fP -host\fR,\fP --host \fR\fP +List of hosts on which to invoke processes. +. +. +.TP +.B +-hostfile\fR,\fP --hostfile \fR\fP +Provide a hostfile to use. +.\" JJH - Should have man page for how to format a hostfile properly. +. +. +.TP +.B -machinefile\fR,\fP --machinefile \fR\fP +Synonym for \fI-hostfile\fP. +. +. +. +. +.P +The following options specify the number of processes to launch. 
Note that none +of the options imply a particular binding policy - e.g., requesting N processes +for each socket does not imply that the processes will be bound to the socket. +. +. +.TP +.B -c\fR,\fP -n\fR,\fP --n\fR,\fP -np \fR<#>\fP +Run this many copies of the program on the given nodes. This option +indicates that the specified file is an executable program and not an +application context. If no value is provided for the number of copies to +execute (i.e., neither the "-np" nor its synonyms are provided on the command +line), Open MPI will automatically execute a copy of the program on +each process slot (see below for description of a "process slot"). This +feature, however, can only be used in the SPMD model and will return an +error (without beginning execution of the application) otherwise. +. +. +.TP +.B --map-by ppr:N:<object> +Launch N times the number of objects of the specified type on each node. +. +. +.TP +.B -npersocket\fR,\fP --npersocket <#persocket> +On each node, launch this many processes times the number of processor +sockets on the node. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option. +(deprecated in favor of --map-by ppr:n:socket) +. +. +.TP +.B -npernode\fR,\fP --npernode <#pernode> +On each node, launch this many processes. +(deprecated in favor of --map-by ppr:n:node) +. +. +.TP +.B -pernode\fR,\fP --pernode +On each node, launch one process -- equivalent to \fI-npernode\fP 1. +(deprecated in favor of --map-by ppr:1:node) +. +. +. +. +.P +To map processes: +. +. +.TP +.B --map-by +Map to the specified object, defaults to \fIsocket\fP. Supported options +include slot, hwthread, core, L1cache, L2cache, L3cache, socket, numa, +board, node, sequential, distance, and ppr.
Any object can include +modifiers by adding a \fR:\fP and any combination of PE=n (bind n +processing elements to each proc), SPAN (load +balance the processes across the allocation), OVERSUBSCRIBE (allow +more processes on a node than processing elements), and NOOVERSUBSCRIBE. +This includes PPR, where the pattern would be terminated by another colon +to separate it from the modifiers. +. +.TP +.B -bycore\fR,\fP --bycore +Map processes by core (deprecated in favor of --map-by core) +. +.TP +.B -bysocket\fR,\fP --bysocket +Map processes by socket (deprecated in favor of --map-by socket) +. +.TP +.B -nolocal\fR,\fP --nolocal +Do not run any copies of the launched application on the same node as +orte-submit is running. This option will override listing the localhost +with \fB--host\fR or any other host-specifying mechanism. +. +.TP +.B -nooversubscribe\fR,\fP --nooversubscribe +Do not oversubscribe any nodes; error (without starting any processes) +if the requested number of processes would cause oversubscription. +This option implicitly sets "max_slots" equal to the "slots" value for +each node. +. +.TP +.B -bynode\fR,\fP --bynode +Launch processes one per node, cycling by node in a round-robin +fashion. This spreads processes evenly among nodes and assigns +MPI_COMM_WORLD ranks in a round-robin, "by node" manner. +. +. +. +. +.P +To order processes' ranks in MPI_COMM_WORLD: +. +. +.TP +.B --rank-by +Rank in round-robin fashion according to the specified object, +defaults to \fIslot\fP. Supported options +include slot, hwthread, core, L1cache, L2cache, L3cache, +socket, numa, board, and node. +. +. +. +. +.P +For process binding: +. +.TP +.B --bind-to +Bind processes to the specified object, defaults to \fIcore\fP. Supported options +include slot, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board, and none. +. +.TP +.B -cpus-per-proc\fR,\fP --cpus-per-proc <#perproc> +Bind each process to the specified number of cpus. 
+(deprecated in favor of --map-by :PE=n) +. +.TP +.B -cpus-per-rank\fR,\fP --cpus-per-rank <#perrank> +Alias for \fI-cpus-per-proc\fP. +(deprecated in favor of --map-by :PE=n) +. +.TP +.B -bind-to-core\fR,\fP --bind-to-core +Bind processes to cores (deprecated in favor of --bind-to core) +. +.TP +.B -bind-to-socket\fR,\fP --bind-to-socket +Bind processes to processor sockets (deprecated in favor of --bind-to socket) +. +.TP +.B -bind-to-none\fR,\fP --bind-to-none +Do not bind processes (deprecated in favor of --bind-to none) +. +.TP +.B -report-bindings\fR,\fP --report-bindings +Report any bindings for launched processes. +. +.TP +.B -slot-list\fR,\fP --slot-list +List of processor IDs to be used for binding MPI processes. The specified bindings will +be applied to all MPI processes. See explanation below for syntax. +. +. +. +. +.P +For rankfiles: +. +. +.TP +.B -rf\fR,\fP --rankfile +Provide a rankfile file. +. +. +. +. +.P +To manage standard I/O: +. +. +.TP +.B -output-filename\fR,\fP --output-filename \fR\fP +Redirect the stdout, stderr, and stddiag of all processes to a process-unique version of +the specified filename. Any directories in the filename will automatically be created. +Each output file will consist of filename.id, where the id will be the +processes' rank in MPI_COMM_WORLD, left-filled with +zero's for correct ordering in listings. +. +. +.TP +.B -stdin\fR,\fP --stdin +The MPI_COMM_WORLD rank of the process that is to receive stdin. The +default is to forward stdin to MPI_COMM_WORLD rank 0, but this option +can be used to forward stdin to any process. It is also acceptable to +specify \fInone\fP, indicating that no processes are to receive stdin. +. +. +.TP +.B -tag-output\fR,\fP --tag-output +Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, MCW_rank]\fP indicating the process jobid +and MPI_COMM_WORLD rank of the process that generated the output, and the channel which generated it. +. +. 
+.TP +.B -timestamp-output\fR,\fP --timestamp-output +Timestamp each line of output to stdout, stderr, and stddiag. +. +. +.TP +.B -xml\fR,\fP --xml +Provide all output to stdout, stderr, and stddiag in an xml format. +. +. +.TP +.B -xterm\fR,\fP --xterm \fR\fP +Display the output from the processes identified by their +MPI_COMM_WORLD ranks in separate xterm windows. The ranks are specified +as a comma-separated list of ranges, with a -1 indicating all. A separate +window will be created for each specified process. +.B Note: +xterm will normally terminate the window upon termination of the process running +within it. However, by adding a "!" to the end of the list of specified ranks, +the proper options will be provided to ensure that xterm keeps the window open +\fIafter\fP the process terminates, thus allowing you to see the process' output. +Each xterm window will subsequently need to be manually closed. +.B Note: +In some environments, xterm may require that the executable be in the user's +path, or be specified in absolute or relative terms. Thus, it may be necessary +to specify a local executable as "./foo" instead of just "foo". If xterm fails to +find the executable, ompi-submit will hang, but still respond correctly to a ctrl-c. +If this happens, please check that the executable is being specified correctly +and try again. +. +. +. +. +.P +To manage files and runtime environment: +. +. +.TP +.B -path\fR,\fP --path \fR\fP + that will be used when attempting to locate the requested +executables. This is used prior to using the local PATH setting. +. +. +.TP +.B --prefix \fR\fP +Prefix directory that will be used to set the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or +the target process. See the "Remote Execution" section, below. +. +. +.TP +.B --preload-binary +Copy the specified executable(s) to remote machines prior to starting remote processes. 
The +executables will be copied to the Open MPI session directory and will be deleted upon +completion of the job. +. +. +.TP +.B --preload-files +Preload the comma separated list of files to the current working directory of the remote +machines where processes will be launched prior to starting those processes. +. +. +.TP +.B --preload-files-dest-dir +The destination directory to be used for preload-files, if other than the current working +directory. By default, the absolute and relative paths provided by --preload-files are used. +. +. +.TP +.B -wd \fR\fP +Synonym for \fI-wdir\fP. +. +. +.TP +.B -wdir \fR\fP +Change to the directory before the user's program executes. +See the "Current Working Directory" section for notes on relative paths. +.B Note: +If the \fI-wdir\fP option appears both on the command line and in an +application context, the context will take precedence over the command +line. Thus, if the path to the desired wdir is different +on the backend nodes, then it must be specified as an absolute path that +is correct for the backend node. +. +. +.TP +.B -x \fR\fP +Export the specified environment variables to the remote nodes before +executing the program. Only one environment variable can be specified +per \fI-x\fP option. Existing environment variables can be specified +or new variable names specified with corresponding values. For +example: + \fB%\fP ompi-submit -x DISPLAY -x OFILE=/tmp/out ... + +The parser for the \fI-x\fP option is not very sophisticated; it does +not even understand quoted values. Users are advised to set variables +in the environment, and then use \fI-x\fP to export (not define) them. +. +. +. +. +.P +Setting MCA parameters: +. +. +.TP +.B -gmca\fR,\fP --gmca \fR \fP +Pass global MCA parameters that are applicable to all contexts. \fI\fP is +the parameter name; \fI\fP is the parameter value. +. +. +.TP +.B -mca\fR,\fP --mca +Send arguments to various MCA modules. See the "MCA" section, below. +. +. +. +. 
+.P +For debugging: +. +. +.TP +.B -debug\fR,\fP --debug +Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP +MCA parameter. +. +. +.TP +.B -debugger\fR,\fP --debugger +Sequence of debuggers to search for when \fI--debug\fP is used (i.e. +a synonym for \fIorte_base_user_debugger\fP MCA parameter). +. +. +.TP +.B -tv\fR,\fP --tv +Launch processes under the TotalView debugger. +Deprecated backwards compatibility flag. Synonym for \fI--debug\fP. +. +. +. +. +.P +There are also other options: +. +. +.TP +.B --allow-run-as-root +Allow +.I ompi-submit +to run when executed by the root user +.RI ( ompi-submit +defaults to aborting when launched as the root user). +. +. +.TP +.B -aborted\fR,\fP --aborted \fR<#>\fP +Set the maximum number of aborted processes to display. +. +. +.TP +.B --app \fR\fP +Provide an appfile, ignoring all other command line options. +. +. +.TP +.B -cf\fR,\fP --cartofile \fR\fP +Provide a cartography file. +. +. +.TP +.B --hetero +Indicates that multiple app_contexts are being provided that are a mix of 32/64-bit binaries. +. +. +.TP +.B -ompi-server\fR,\fP --ompi-server +Specify the URI of the Open MPI server (or the ompi-submit to be used as the server) +, the name +of the file (specified as file:filename) that +contains that info, or the PID (specified as pid:#) of the ompi-submit to be used as + the server. +The Open MPI server is used to support multi-application data exchange via +the MPI-2 MPI_Publish_name and MPI_Lookup_name functions. +. +. +. +. +.P +The following options are useful for developers; they are not generally +useful to most ORTE and/or MPI users: +. +.TP +.B -d\fR,\fP --debug-devel +Enable debugging of the OmpiRTE (the run-time layer in Open MPI). +This is not generally useful for most users. +. +. +. +.P +There may be other options listed with \fIompi-submit --help\fP. +. +. +.SS Environment Variables +. 
+.TP +.B MPIEXEC_TIMEOUT +The maximum number of seconds that +.I ompi-submit +.RI ( mpiexec ) +will run. After this many seconds, +.I ompi-submit +will abort the launched job and exit. +. +. +.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +. +One invocation of \fIompi-submit\fP starts an MPI application running under Open +MPI. If the application is single process multiple data (SPMD), the application +can be specified on the \fIompi-submit\fP command line. + +If the application is multiple instruction multiple data (MIMD), comprising of +multiple programs, the set of programs and argument can be specified in one of +two ways: Extended Command Line Arguments, and Application Context. +.PP +An application context describes the MIMD program set including all arguments +in a separate file. +.\" See appcontext(5) for a description of the application context syntax. +This file essentially contains multiple \fIompi-submit\fP command lines, less the +command name itself. The ability to specify different options for different +instantiations of a program is another reason to use an application context. +.PP +Extended command line arguments allow for the description of the application +layout on the command line using colons (\fI:\fP) to separate the specification +of programs and arguments. Some options are globally set across all specified +programs (e.g. --hostfile), while others are specific to a single program +(e.g. -np). +. +. +. +.SS Specifying Host Nodes +. +Host nodes can be identified on the \fIompi-submit\fP command line with the \fI-host\fP +option or in a hostfile. +. +.PP +For example, +. +.TP 4 +ompi-submit -H aa,aa,bb ./a.out +launches two processes on node aa and one on bb. +. +.PP +Or, consider the hostfile +. + + \fB%\fP cat myhostfile + aa slots=2 + bb slots=2 + cc slots=2 + +. +.PP +Since the DVM was started with \fIorte-dvm\fP, \fIorte-submit\fP +will ignore any slots arguments in the hostfile. 
Values provided +via hostfile to \fIorte-dvm\fP will control the behavior. +. +.PP +. +.TP 4 +ompi-submit -hostfile myhostfile ./a.out +will launch two processes on each of the three nodes. +. +.TP 4 +ompi-submit -hostfile myhostfile -host aa ./a.out +will launch two processes, both on node aa. +. +.TP 4 +ompi-submit -hostfile myhostfile -host dd ./a.out +will find no hosts to run on and abort with an error. +That is, the specified host dd is not in the specified hostfile. +. +.SS Specifying Number of Processes +. +As we have just seen, the number of processes to run can be set using the +hostfile. Other mechanisms exist. +. +.PP +The number of processes launched can be specified as a multiple of the +number of nodes or processor sockets available. For example, +. +.TP 4 +ompi-submit -H aa,bb -npersocket 2 ./a.out +launches processes 0-3 on node aa and processes 4-7 on node bb, +where aa and bb are both dual-socket nodes. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option, +which is discussed in a later section. +. +.TP 4 +ompi-submit -H aa,bb -npernode 2 ./a.out +launches processes 0-1 on node aa and processes 2-3 on node bb. +. +.TP 4 +ompi-submit -H aa,bb -npernode 1 ./a.out +launches one process per host node. +. +.TP 4 +ompi-submit -H aa,bb -pernode ./a.out +is the same as \fI-npernode\fP 1. +. +. +.PP +Another alternative is to specify the number of processes with the +\fI-np\fP option. Consider now the hostfile +. + + \fB%\fP cat myhostfile + aa slots=4 + bb slots=4 + cc slots=4 + +. +.PP +Now, +. +.TP 4 +ompi-submit -hostfile myhostfile -np 6 ./a.out +will launch processes 0-3 on node aa and processes 4-5 on node bb. The remaining +slots in the hostfile will not be used since the \fI-np\fP option indicated +that only 6 processes should be launched. +. +.SS Mapping Processes to Nodes: Using Policies +. +The examples above illustrate the default mapping of processes +to nodes.
This mapping can also be controlled with various +\fIompi-submit\fP options that describe mapping policies. +. +. +.PP +Consider the same hostfile as above, again with \fI-np\fP 6: +. + + node aa node bb node cc + + ompi-submit 0 1 2 3 4 5 + + ompi-submit --map-by node 0 3 1 4 2 5 + + ompi-submit -nolocal 0 1 2 3 4 5 +. +.PP +The \fI--map-by node\fP option will load balance the processes across +the available nodes, numbering each process in a round-robin fashion. +. +.PP +The \fI-nolocal\fP option prevents any processes from being mapped onto the +local host (in this case node aa). While \fIompi-submit\fP typically consumes +few system resources, \fI-nolocal\fP can be helpful for launching very +large jobs where \fIompi-submit\fP may actually need to use noticeable amounts +of memory and/or processing time. +. +.PP +Just as \fI-np\fP can specify fewer processes than there are slots, it can +also oversubscribe the slots. For example, with the same hostfile: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 ./a.out +will launch processes 0-3 on node aa, 4-7 on bb, and 8-11 on cc. It will +then add the remaining two processes to whichever nodes it chooses. +. +.PP +One can also specify limits to oversubscription. For example, with the same +hostfile: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 -nooversubscribe ./a.out +will produce an error since \fI-nooversubscribe\fP prevents oversubscription. +. +.PP +Limits to oversubscription can also be specified in the hostfile itself: +. + % cat myhostfile + aa slots=4 max_slots=4 + bb max_slots=4 + cc slots=4 +. +.PP +The \fImax_slots\fP field specifies such a limit. When it does, the +\fIslots\fP value defaults to the limit. Now: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 ./a.out +causes the first 12 processes to be launched as before, but the remaining +two processes will be forced onto node cc. The other two nodes are +protected by the hostfile against oversubscription by this job. +. 
+.PP +Using the \fI--nooversubscribe\fR option can be helpful since Open MPI +currently does not get "max_slots" values from the resource manager. +. +.PP +Of course, \fI-np\fP can also be used with the \fI-H\fP or \fI-host\fP +option. For example, +. +.TP 4 +ompi-submit -H aa,bb -np 8 ./a.out +launches 8 processes. Since only two hosts are specified, after the first +two processes are mapped, one to aa and one to bb, the remaining processes +oversubscribe the specified hosts. +. +.PP +And here is a MIMD example: +. +.TP 4 +ompi-submit -H aa -np 1 hostname : -H bb,cc -np 2 uptime +will launch process 0 running \fIhostname\fP on node aa and processes 1 and 2 +each running \fIuptime\fP on nodes bb and cc, respectively. +. +.SS Mapping, Ranking, and Binding: Oh My! +. +Open MPI employs a three-phase procedure for assigning process locations and +ranks: +. +.TP 10 +\fBmapping\fP +Assigns a default location to each process +. +.TP 10 +\fBranking\fP +Assigns an MPI_COMM_WORLD rank value to each process +. +.TP 10 +\fBbinding\fP +Constrains each process to run on specific processors +. +.PP +The \fImapping\fP step is used to assign a default location to each process +based on the mapper being employed. Mapping by slot, node, and sequentially results +in the assignment of the processes to the node level. In contrast, mapping by object allows +the mapper to assign the process to an actual object on each node. +. +.PP +\fBNote:\fP the location assigned to the process is independent of where it will be bound - the +assignment is used solely as input to the binding algorithm. +. +.PP +The mapping of processes to nodes can be defined not just +with general policies but also, if necessary, using arbitrary mappings +that cannot be described by a simple policy. One can use the "sequential +mapper," which reads the hostfile line by line, assigning processes +to nodes in whatever order the hostfile specifies. Use the +\fI-mca rmaps seq\fP option.
For example, using the same hostfile +as before: +. +.PP +ompi-submit -hostfile myhostfile -mca rmaps seq ./a.out +. +.PP +will launch three processes, one on each of nodes aa, bb, and cc, respectively. +The slot counts don't matter; one process is launched per line on +whatever node is listed on the line. +. +.PP +Another way to specify arbitrary mappings is with a rankfile, which +gives you detailed control over process binding as well. Rankfiles +are discussed below. +. +.PP +The second phase focuses on the \fIranking\fP of the process within +the job's MPI_COMM_WORLD. Open MPI +separates this from the mapping procedure to allow more flexibility in the +relative placement of MPI processes. This is best illustrated by considering the +following three cases where we used the --map-by ppr:2:socket option: +. +.PP + node aa node bb + + rank-by core 0 1 ! 2 3 4 5 ! 6 7 + + rank-by socket 0 2 ! 1 3 4 6 ! 5 7 + + rank-by socket:span 0 4 ! 1 5 2 6 ! 3 7 +. +.PP +Ranking by core and by slot provide the identical result - a simple +progression of MPI_COMM_WORLD ranks across each node. Ranking by +socket does a round-robin ranking within each node until all processes +have been assigned an MCW rank, and then progresses to the next +node. Adding the \fIspan\fP modifier to the ranking directive causes +the ranking algorithm to treat the entire allocation as a single +entity - thus, the MCW ranks are assigned across all sockets before +circling back around to the beginning. +. +.PP +The \fIbinding\fP phase actually binds each process to a given set of processors. This can +improve performance if the operating system is placing processes +suboptimally. For example, it might oversubscribe some multi-core +processor sockets, leaving other sockets idle; this can lead +processes to contend unnecessarily for common resources. Or, it +might spread processes out too widely; this can be suboptimal if +application performance is sensitive to interprocess communication +costs.
Binding can also keep the operating system from migrating +processes excessively, regardless of how optimally those processes +were placed to begin with. +. +.PP +The processors to be used for binding can be identified in terms of +topological groupings - e.g., binding to an l3cache will bind each +process to all processors within the scope of a single L3 cache within +their assigned location. Thus, if a process is assigned by the mapper +to a certain socket, then a \fI--bind-to l3cache\fP directive will +cause the process to be bound to the processors that share a single L3 +cache within that socket. +. +.PP +To help balance loads, the binding directive uses a round-robin method when binding to +levels lower than used in the mapper. For example, consider the case where a job is +mapped to the socket level, and then bound to core. Each socket will have multiple cores, +so if multiple processes are mapped to a given socket, the binding algorithm will assign +each process located to a socket to a unique core in a round-robin manner. +. +.PP +Alternatively, processes mapped by l2cache and then bound to socket will simply be bound +to all the processors in the socket where they are located. In this manner, users can +exert detailed control over relative MCW rank location and binding. +. +.PP +Finally, \fI--report-bindings\fP can be used to report bindings. +. +.PP +As an example, consider a node with two processor sockets, each comprising +four cores. We run \fIompi-submit\fP with \fI-np 4 --report-bindings\fP and +the following additional options: +. + + % ompi-submit ... --map-by core --bind-to core + [...] ... binding child [...,0] to cpus 0001 + [...] ... binding child [...,1] to cpus 0002 + [...] ... binding child [...,2] to cpus 0004 + [...] ... binding child [...,3] to cpus 0008 + + % ompi-submit ... --map-by socket --bind-to socket + [...] ... binding child [...,0] to socket 0 cpus 000f + [...] ... binding child [...,1] to socket 1 cpus 00f0 + [...] ...
binding child [...,2] to socket 0 cpus 000f + [...] ... binding child [...,3] to socket 1 cpus 00f0 + + % ompi-submit ... --map-by core:PE=2 -bind-to core + [...] ... binding child [...,0] to cpus 0003 + [...] ... binding child [...,1] to cpus 000c + [...] ... binding child [...,2] to cpus 0030 + [...] ... binding child [...,3] to cpus 00c0 + + % ompi-submit ... --bind-to none +. +.PP +Here, \fI--report-bindings\fP shows the binding of each process as a mask. +In the first case, the processes bind to successive cores as indicated by +the masks 0001, 0002, 0004, and 0008. In the second case, processes bind +to all cores on successive sockets as indicated by the masks 000f and 00f0. +The processes cycle through the processor sockets in a round-robin fashion +as many times as are needed. In the third case, the masks show us that +2 cores have been bound per process. In the fourth case, binding is +turned off and no bindings are reported. +. +.PP +Open MPI's support for process binding depends on the underlying +operating system. Therefore, certain process binding options may not be available +on every system. +. +.PP +Process binding can also be set with MCA parameters. +Their usage is less convenient than that of \fIompi-submit\fP options. +On the other hand, MCA parameters can be set not only on the \fIompi-submit\fP +command line, but alternatively in a system or user mca-params.conf file +or as environment variables, as described in the MCA section below. +Some examples include: +. +.PP + ompi-submit option MCA parameter key value + + --map-by core rmaps_base_mapping_policy core + --map-by socket rmaps_base_mapping_policy socket + --rank-by core rmaps_base_ranking_policy core + --bind-to core hwloc_base_binding_policy core + --bind-to socket hwloc_base_binding_policy socket + --bind-to none hwloc_base_binding_policy none +. +. +.SS Rankfiles +. 
+Rankfiles are text files that specify detailed information about how +individual processes should be mapped to nodes, and to which +processor(s) they should be bound. Each line of a rankfile specifies +the location of one process (for MPI jobs, the process' "rank" refers +to its rank in MPI_COMM_WORLD). The general form of each line in the +rankfile is: +. + + rank = slot= +. +.PP +For example: +. + + $ cat myrankfile + rank 0=aa slot=1:0-2 + rank 1=bb slot=0:0,1 + rank 2=cc slot=1-2 + $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out +. +.PP +Means that +. + + Rank 0 runs on node aa, bound to logical socket 1, cores 0-2. + Rank 1 runs on node bb, bound to logical socket 0, cores 0 and 1. + Rank 2 runs on node cc, bound to logical cores 1 and 2. +. +.PP +Rankfiles can alternatively be used to specify \fIphysical\fP processor +locations. In this case, the syntax is somewhat different. Sockets are +no longer recognized, and the slot number given must be the number of +the physical PU as most OS's do not assign a unique physical identifier +to each core in the node. Thus, a proper physical rankfile looks something +like the following: +. + + $ cat myphysicalrankfile + rank 0=aa slot=1 + rank 1=bb slot=8 + rank 2=cc slot=6 +. +.PP +This means that +. + + Rank 0 will run on node aa, bound to the core that contains physical PU 1 + Rank 1 will run on node bb, bound to the core that contains physical PU 8 + Rank 2 will run on node cc, bound to the core that contains physical PU 6 +. +.PP +Rankfiles are treated as \fIlogical\fP by default, and the MCA parameter +rmaps_rank_file_physical must be set to 1 to indicate that the rankfile +is to be considered as \fIphysical\fP. +. +.PP +The hostnames listed above are "absolute," meaning that actual +resolveable hostnames are specified. 
However, hostnames can also be +specified as "relative," meaning that they are specified in relation +to an externally-specified list of hostnames (e.g., by ompi-submit's --host +argument, a hostfile, or a job scheduler). +. +.PP +The "relative" specification is of the form "+n<X>", where X is an +integer specifying the Xth hostname in the set of all available +hostnames, indexed from 0. For example: +. + + $ cat myrankfile + rank 0=+n0 slot=1:0-2 + rank 1=+n1 slot=0:0,1 + rank 2=+n2 slot=1-2 + $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out +. +.PP +Starting with Open MPI v1.7, all socket/core slot locations are +specified as +.I logical +indexes (the Open MPI v1.6 series used +.I physical +indexes). You can use tools such as HWLOC's "lstopo" to find the +logical indexes of sockets and cores. +. +. +.SS Application Context or Executable Program? +. +To distinguish the two different forms, \fIompi-submit\fP +looks on the command line for the \fI--app\fP option. If +it is specified, then the file named on the command line is +assumed to be an application context. If it is not +specified, then the file is assumed to be an executable program. +. +. +. +.SS Locating Files +. +If no relative or absolute path is specified for a file, Open +MPI will first look for files by searching the directories specified +by the \fI--path\fP option. If there is no \fI--path\fP option set or +if the file is not found at the \fI--path\fP location, then Open MPI +will search the user's PATH environment variable as defined on the +source node(s). +.PP +If a relative directory is specified, it must be relative to the initial +working directory determined by the specific starter used. For example when +using the rsh or ssh starters, the initial directory is $HOME by default. Other +starters may set the initial directory to the current working directory from +the invocation of \fIompi-submit\fP. +. +. +. +.SS Current Working Directory +.
+The \fI\-wdir\fP ompi-submit option (and its synonym, \fI\-wd\fP) allows +the user to change to an arbitrary directory before the program is +invoked. It can also be used in application context files to specify +working directories on specific nodes and/or for specific +applications. +.PP +If the \fI\-wdir\fP option appears both in a context file and on the +command line, the context file directory will override the command +line value. +.PP +If the \fI-wdir\fP option is specified, Open MPI will attempt to +change to the specified directory on all of the remote nodes. If this +fails, \fIompi-submit\fP will abort. +.PP +If the \fI-wdir\fP option is \fBnot\fP specified, Open MPI will send +the directory name where \fIompi-submit\fP was invoked to each of the +remote nodes. The remote nodes will try to change to that +directory. If they are unable (e.g., if the directory does not exist on +that node), then Open MPI will use the default directory determined by +the starter. +.PP +All directory changing occurs before the user's program is invoked; it +does not wait until \fIMPI_INIT\fP is called. +. +. +. +.SS Standard I/O +. +Open MPI directs UNIX standard input to /dev/null on all processes +except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process +inherits standard input from \fIompi-submit\fP. +.B Note: +The node that invoked \fIompi-submit\fP need not be the same as the node where the +MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of +\fIompi-submit\fP's standard input to the rank 0 process. +.PP +Open MPI directs UNIX standard output and error from remote nodes to the node +that invoked \fIompi-submit\fP and prints it on the standard output/error of +\fIompi-submit\fP. +Local processes inherit the standard output/error of \fIompi-submit\fP and transfer +to it directly. +.PP +Thus it is possible to redirect standard I/O for Open MPI applications by +using the typical shell redirection procedure on \fIompi-submit\fP. 
+ + \fB%\fP ompi-submit -np 2 my_app < my_input > my_output + +Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will +receive the stream from \fImy_input\fP on stdin. The stdin on all the other +nodes will be tied to /dev/null. However, the stdout from all nodes will +be collected into the \fImy_output\fP file. +. +. +. +.SS Signal Propagation +. +When orte-submit receives a SIGTERM or SIGINT, it will attempt to kill +the entire job by sending all processes in the job a SIGTERM, waiting +a small number of seconds, then sending all processes in the job a +SIGKILL. +. +.PP +SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to +all processes in the job. +. +.PP +One can turn on forwarding of SIGSTOP and SIGCONT to the program executed +by ompi-submit by setting the MCA parameter orte_forward_job_control to 1. +A SIGTSTP signal to ompi-submit will then cause a SIGSTOP signal to be sent +to all of the programs started by ompi-submit and likewise a SIGCONT signal +to ompi-submit will cause a SIGCONT to be sent. +. +.PP +Other signals are not currently propagated +by orte-submit. +. +. +.SS Process Termination / Signal Handling +. +During the run of an MPI application, if any process dies abnormally +(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a +signal), \fIompi-submit\fP will print out an error message and kill the rest of the +MPI application. +.PP +User signal handlers should probably avoid trying to clean up MPI state +(Open MPI is currently not async-signal-safe; see MPI_Init_thread(3) +for details about +.I MPI_THREAD_MULTIPLE +and thread safety). For example, if a segmentation fault occurs in +\fIMPI_SEND\fP (perhaps because a bad buffer was passed in) and a user +signal handler is invoked, if this user handler attempts to invoke +\fIMPI_FINALIZE\fP, Bad Things could happen since Open MPI was already +"in" MPI when the error occurred.
Since \fIompi-submit\fP will notice that +the process died due to a signal, it is probably not necessary for the +user to clean up MPI state; it is safest to clean up only non-MPI state. +. +. +. +.SS Process Environment +. +Processes in the MPI application inherit their environment from the +Open RTE daemon upon the node on which they are running. The +environment is typically inherited from the user's shell. On remote +nodes, the exact environment is determined by the boot MCA module +used. The \fIrsh\fR launch module, for example, uses either +\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and +typically executes one or more of the user's shell-setup files before +launching the Open RTE daemon. When running dynamically linked +applications which require the \fILD_LIBRARY_PATH\fR environment +variable to be set, care must be taken to ensure that it is correctly +set when booting Open MPI. +.PP +See the "Remote Execution" section for more details. +. +. +.SS Remote Execution +. +Open MPI requires that the \fIPATH\fR environment variable be set to +find executables on remote nodes (this is typically only necessary in +\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled +environments typically copy the current environment to the execution +of remote jobs, so if the current environment has \fIPATH\fR and/or +\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it +set properly). If Open MPI was compiled with shared library support, +it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment +variable set on remote nodes as well (especially to find the shared +libraries required to run user MPI applications). +.PP +However, it is not always desirable or possible to edit shell +startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The +\fI--prefix\fR option is provided for some simple configurations where +this is not possible.
+.PP +The \fI--prefix\fR option takes a single argument: the base directory +on the remote node where Open MPI is installed. Open MPI will use +this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR +before executing any Open MPI or user applications. This allows +running Open MPI jobs without having pre-configured the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote nodes. +.PP +Open MPI adds the basename of the current +node's "bindir" (the directory where Open MPI's executables are +installed) to the prefix and uses that to set the \fIPATH\fR on the +remote node. Similarly, Open MPI adds the basename of the current +node's "libdir" (the directory where Open MPI's libraries are +installed) to the prefix and uses that to set the +\fILD_LIBRARY_PATH\fR on the remote node. For example: +.TP 15 +Local bindir: +/local/node/directory/bin +.TP +Local libdir: +/local/node/directory/lib64 +.PP +If the following command line is used: + + \fB%\fP ompi-submit --prefix /remote/node/directory + +Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR +and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the +remote node before attempting to execute anything. +.PP +The \fI--prefix\fR option is not sufficient if the installation paths +on the remote node are different from those on the local node (e.g., if "/lib" +is used on the local node, but "/lib64" is used on the remote node), +or if the installation paths are something other than a subdirectory +under a common prefix. +.PP +Note that executing \fIompi-submit\fR via an absolute pathname is +equivalent to specifying \fI--prefix\fR without the last subdirectory +in the absolute pathname to \fIompi-submit\fR. For example: + + \fB%\fP /usr/local/bin/ompi-submit ... + +is equivalent to + + \fB%\fP ompi-submit --prefix /usr/local +. +. +. +.SS Exported Environment Variables +.
+All environment variables that are named in the form OMPI_* will automatically +be exported to new processes on the local and remote nodes. Environmental +parameters can also be set/forwarded to the new processes using the MCA +parameter \fImca_base_env_list\fP. The \fI\-x\fP option to \fIompi-submit\fP has +been deprecated, but the syntax of the MCA param follows that prior +example. While the syntax of the \fI\-x\fP option and MCA param +allows the definition of new variables, note that the parser +for these options is currently not very sophisticated - it does not even +understand quoted values. Users are advised to set variables in the +environment and use the option to export them; not to define them. +. +. +. +.SS Setting MCA Parameters +. +The \fI-mca\fP switch allows the passing of parameters to various MCA +(Modular Component Architecture) modules. +.\" Open MPI's MCA modules are described in detail in ompimca(7). +MCA modules have direct impact on MPI programs because they allow tunable +parameters to be set at run time (such as which BTL communication device driver +to use, what parameters to pass to that BTL, etc.). +.PP +The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP. +The \fI<key>\fP argument generally specifies which MCA module will receive the value. +For example, the \fI<key>\fP "btl" is used to select which BTL to be used for +transporting MPI messages. The \fI<value>\fP argument is the value that is +passed. +For example: +. +.TP 4 +ompi-submit -mca btl tcp,self -np 1 foo +Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of +"foo" on an allocated node. +. +.TP +ompi-submit -mca btl self -np 1 foo +Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an +allocated node. +.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7). +.PP +The \fI-mca\fP switch can be used multiple times to specify different +\fI<key>\fP and/or \fI<value>\fP arguments.
If the same \fI<key>\fP is +specified more than once, the \fI<value>\fPs are concatenated with a comma +(",") separating them. +.PP +Note that the \fI-mca\fP switch is simply a shortcut for setting environment variables. +The same effect may be accomplished by setting corresponding environment +variables before running \fIompi-submit\fP. +The form of the environment variables that Open MPI sets is: + + OMPI_MCA_<key>=<value> +.PP +Thus, the \fI-mca\fP switch overrides any previously set environment +variables. The \fI-mca\fP settings similarly override MCA parameters set +in the +$OPAL_PREFIX/etc/openmpi-mca-params.conf or $HOME/.openmpi/mca-params.conf +file. +. +.PP +Unknown \fI<key>\fP arguments are still set as +environment variables -- they are not checked (by \fIompi-submit\fP) for correctness. +Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it +depends on the specific MCA module. +.PP +To find the available component types under the MCA architecture, or to find the +available parameters for a specific component, use the \fIompi_info\fP command. +See the \fIompi_info(1)\fP man page for detailed information on the command. +. +.SS Running as root +. +The Open MPI team strongly advises against executing +.I ompi-submit +as the root user. MPI applications should be run as regular +(non-root) users. +. +.PP +Reflecting this advice, ompi-submit will refuse to run as root by default. +To override this default, you can add the +.I --allow-run-as-root +option to the +.I ompi-submit +command line. +. +.SS Exit status +. +There is no standard definition for what \fIompi-submit\fP should return as an exit +status. After considerable discussion, we settled on the following method for +assigning the \fIompi-submit\fP exit status (note: in the following description, +the "primary" job is the initial application started by ompi-submit - all jobs that +are spawned by that job are designated "secondary" jobs): +.
+.IP \[bu] 2 +if all processes in the primary job normally terminate with exit status 0, we return 0 +.IP \[bu] +if one or more processes in the primary job normally terminate with non-zero exit status, +we return the exit status of the process with the lowest MPI_COMM_WORLD rank to have a non-zero status +.IP \[bu] +if all processes in the primary job normally terminate with exit status 0, and one or more +processes in a secondary job normally terminate with non-zero exit status, we (a) return +the exit status of the process with the lowest MPI_COMM_WORLD rank in the lowest jobid to have a non-zero status, and (b) +output a message summarizing the exit status of the primary and all secondary jobs. +.IP \[bu] +if the cmd line option --report-child-jobs-separately is set, we will return -only- the +exit status of the primary job. Any non-zero exit status in secondary jobs will be +reported solely in a summary print statement. +. +.PP +By default, OMPI records and notes that MPI processes exited with non-zero termination status. +This is generally not considered an "abnormal termination" - i.e., OMPI will not abort an MPI +job if one or more processes return a non-zero status. Instead, the default behavior simply +reports the number of processes terminating with non-zero status upon completion of the job. +.PP +However, in some cases it can be desirable to have the job abort when any process terminates +with non-zero status. For example, a non-MPI job might detect a bad result from a calculation +and want to abort, but doesn't want to generate a core file. Or an MPI job might continue past +a call to MPI_Finalize, but indicate that all processes should abort due to some post-MPI result. +.PP +It is not anticipated that this situation will occur frequently. However, in the interest of +serving the broader community, OMPI now has a means for allowing users to direct that jobs be +aborted upon any process exiting with non-zero status. 
Setting the MCA parameter +"orte_abort_on_non_zero_status" to 1 will cause OMPI to abort all processes once any process + exits with non-zero status. +.PP +Terminations caused in this manner will be reported on the console as an "abnormal termination", +with the first process to so exit identified along with its exit status. +.PP +. +.\" ************************** +.\" Examples Section +.\" ************************** +.SH EXAMPLES +Be sure also to see the examples throughout the sections above. +. +.TP 4 +ompi-submit -np 4 -mca btl ib,tcp,self prog1 +Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the +transport of MPI messages. +. +. +.TP 4 +ompi-submit -np 4 -mca btl tcp,sm,self +.br +--mca btl_tcp_if_include eth0 prog1 +.br +Run 4 copies of prog1 using the "tcp", "sm" and "self" BTLs for the +transport of MPI messages, with TCP using only the eth0 interface to +communicate. Note that other BTLs have similar if_include MCA +parameters. +. +.\" ************************** +.\" Diagnostics Section +.\" ************************** +. +.\" .SH DIAGNOSTICS +.\" .TP 4 +.\" Error Msg: +.\" Description +. +.\" ************************** +.\" Return Value Section +.\" ************************** +. +.SH RETURN VALUE +. +\fIompi-submit\fP returns 0 if all processes started by \fIompi-submit\fP exit after calling +MPI_FINALIZE. A non-zero value is returned if an internal error occurred in +ompi-submit, or one or more processes exited before calling MPI_FINALIZE. If an +internal error occurred in ompi-submit, the corresponding error code is returned. +In the event that one or more processes exit before calling MPI_FINALIZE, the +return value of the MPI_COMM_WORLD rank of the process that \fIompi-submit\fP first notices died +before calling MPI_FINALIZE will be returned. Note that, in general, this will +be the first process that died but is not guaranteed to be so. +. +.\" ************************** +.\" See Also Section +.\" ************************** +. 
+.SH SEE ALSO +MPI_Init_thread(3) diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c new file mode 100644 index 0000000000..5f4f055e60 --- /dev/null +++ b/orte/tools/orte-submit/orte-submit.c @@ -0,0 +1,1468 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include +#ifdef HAVE_STDLIB_H +#include +#endif /* HAVE_STDLIB_H */ +#ifdef HAVE_STRINGS_H +#include +#endif /* HAVE_STRINGS_H */ +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#include +#include +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif /* HAVE_SYS_TYPES_H */ +#ifdef HAVE_SYS_WAIT_H +#include +#endif /* HAVE_SYS_WAIT_H */ +#ifdef HAVE_SYS_TIME_H +#include +#endif /* HAVE_SYS_TIME_H */ +#include +#ifdef HAVE_SYS_STAT_H +#include +#endif + +#include "opal/dss/dss.h" +#include "opal/mca/event/event.h" +#include "opal/mca/installdirs/installdirs.h" +#include "opal/mca/hwloc/base/base.h" +#include "opal/mca/base/base.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/basename.h" +#include "opal/util/cmd_line.h" +#include "opal/util/opal_environ.h" +#include "opal/util/opal_getcwd.h" +#include "opal/util/show_help.h" +#include "opal/util/fd.h" +#include "opal/sys/atomic.h" +#if OPAL_ENABLE_FT_CR == 1 +#include "opal/runtime/opal_cr.h" +#endif + +#include "opal/version.h" +#include "opal/runtime/opal.h" +#include "opal/util/os_path.h" +#include "opal/util/path.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/dss/dss.h" + +#include "orte/mca/odls/odls_types.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/schizo/schizo.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/routed/routed.h" + +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_quit.h" +#include "orte/util/show_help.h" + +/* local functions */ +static void orte_timeout_wakeup(int sd, short args, void *cbdata); +static void local_recv(int status, orte_process_name_t* 
sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); + +/* + * Globals + */ +static char **global_mca_env = NULL; +static orte_std_cntr_t total_num_apps = 0; +static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; + +/* + * Values of the command-line options. cmd_line_init[] below stores parsed + * option values directly into most of these fields; basename is set by + * main() from argv[0]. + */ +static struct { + bool help; + bool version; + char *report_pid; + char *stdin_target; + bool index_argv; + bool preload_binaries; + char *preload_files; + char *appfile; + int num_procs; + char *hnp; + char *wdir; + bool set_cwd_to_session_dir; + char *path; + bool enable_recovery; + char *personality; + char *basename; + char *prefix; + bool terminate; +} myglobals; + +/* + * Table of the command-line options this tool accepts, ended by an all-NULL + * entry; main() hands it to opal_cmd_line_create(). Entries either store + * into a myglobals field or carry a name string in the first field (these + * look like MCA parameter names - TODO confirm) with no storage variable; + * a few (-x, --host) have neither. + */ +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, 'h', NULL, "help", 0, + &myglobals.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + { NULL, 'V', NULL, "version", 0, + &myglobals.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + + { NULL, '\0', "report-pid", "report-pid", 1, + &myglobals.report_pid, OPAL_CMD_LINE_TYPE_STRING, + "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, + + /* select stdin option */ + { NULL, '\0', "stdin", "stdin", 1, + &myglobals.stdin_target, OPAL_CMD_LINE_TYPE_STRING, + "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, + + /* request that argv[0] be indexed */ + { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, + &myglobals.index_argv, OPAL_CMD_LINE_TYPE_BOOL, + "Uniquely index argv[0] for each process using its rank" }, + + /* Preload the binary on the remote machine */ + { NULL, 's', NULL, "preload-binary", 0, + &myglobals.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, + "Preload the binary on the remote machine before starting the remote process."
}, + + /* Preload files on the remote machine */ + { NULL, '\0', NULL, "preload-files", 1, + &myglobals.preload_files, OPAL_CMD_LINE_TYPE_STRING, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, + + /* Use an appfile */ + { NULL, '\0', NULL, "app", 1, + &myglobals.appfile, OPAL_CMD_LINE_TYPE_STRING, + "Provide an appfile; ignore all other command line options" }, + + /* Number of processes; -c, -n, --n, -np, and --np are all + synonyms */ + { NULL, 'c', "np", "np", 1, + &myglobals.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + { NULL, '\0', "n", "n", 1, + &myglobals.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* uri of Open MPI HNP, or at least where to get it */ + { NULL, '\0', "hnp", "hnp", 1, + &myglobals.hnp, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info" }, + + /* shut down the DVM instead of submitting a job: main() sends the HNP a + halt command when this flag is set */ + { NULL, '\0', "terminate", "terminate", 0, + &myglobals.terminate, OPAL_CMD_LINE_TYPE_BOOL, + "Terminate the DVM" }, + + + /* Export environment variables; potentially used multiple times, + so it does not make sense to set into a variable */ + { NULL, 'x', NULL, NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_NULL, + "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, + + /* Mapping controls */ + { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the process map just before launch"}, + { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed process map
(mostly intended for developers) just before launch"}, + { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the topology as part of the process map (mostly intended for developers) just before launch"}, + { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a diffable process map (mostly intended for developers) just before launch"}, + { NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not run any MPI applications on the local node" }, + { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each process [default=1]" }, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Synonym for cpus-per-proc" }, + + /* Nperxxx options that do not require topology and are always + * available - included for backwards compatibility + */ + { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Launch one process per available node" }, + { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes" }, + { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes
per node on all allocated nodes (synonym for npernode)" }, + +#if OPAL_HAVE_HWLOC + /* declare hardware threads as independent cpus */ + { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use hardware threads as independent cpus" }, + + /* include npersocket for backwards compatibility */ + { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per socket on all allocated nodes" }, + + /* Mapping options */ + { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, + + /* Ranking options */ + { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, + + /* Binding options */ + { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2).
Allowed qualifiers: overload-allowed, if-supported" }, + + { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + + /* slot list option */ + { "hwloc_base_slot_list", '\0', "slot-list", "slot-list", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of processor IDs to bind processes to [default=NULL]"}, + +#else + /* Mapping options */ + { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Mapping Policy [slot (default) | node]" }, + + /* Ranking options */ + { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Ranking Policy [slot (default) | node]" }, +#endif + + /* mpiexec-like arguments */ + { NULL, '\0', "wdir", "wdir", 1, + &myglobals.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Set the working directory of the started processes" }, + { NULL, '\0', "wd", "wd", 1, + &myglobals.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Synonym for --wdir" }, + { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, + &myglobals.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, + "Set the working directory of the started processes to their session directory" }, + { NULL, '\0', "path", "path", 1, + &myglobals.path, OPAL_CMD_LINE_TYPE_STRING, + "PATH to be used to look for executables to start processes" }, + + { NULL, '\0', "enable-recovery", "enable-recovery", 0, + &myglobals.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery (resets all recovery options to on)" }, + + { NULL, '\0', "personality", "personality", 1, + &myglobals.personality, OPAL_CMD_LINE_TYPE_STRING, + "Programming model/language being used (default=\"ompi\")" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +/* + * Local functions + */ +static int create_app(int argc, char* argv[], + orte_job_t *jdata, + orte_app_context_t **app, + bool *made_app, char ***app_env);
+static int init_globals(void); +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line); +static int parse_locals(orte_job_t *jdata, int argc, char* argv[]); +static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile); +static int parse_appfile(orte_job_t *jdata, char *filename, char ***env); + + +int main(int argc, char *argv[]) +{ + int rc; + opal_cmd_line_t cmd_line; + char *param; + orte_job_t *jdata=NULL; + char *hnpenv; + + /* Setup and parse the command line */ + memset(&myglobals, 0, sizeof(myglobals)); + /* find our basename (the name of the executable) so that we can + use it in pretty-print error messages */ + myglobals.basename = opal_basename(argv[0]); + + + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, + argc, argv)) ) { + if (OPAL_ERR_SILENT != rc) { + fprintf(stderr, "%s: command line error (%s)\n", argv[0], + opal_strerror(rc)); + } + return rc; + } + + /* + * Since this process can now handle MCA/GMCA parameters, make sure to + * process them. + */ + if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { + exit(1); + } + + /* Ensure that enough of OPAL is setup for us to be able to run */ + /* + * NOTE: (JJH) + * We need to allow 'mca_base_cmd_line_process_args()' to process command + * line arguments *before* calling opal_init_util() since the command + * line could contain MCA parameters that affect the way opal_init_util() + * functions. AMCA parameters are one such option normally received on the + * command line that affect the way opal_init_util() behaves. + * It is "safe" to call mca_base_cmd_line_process_args() before + * opal_init_util() since mca_base_cmd_line_process_args() does *not* + * depend upon opal_init_util() functionality. 
+ */ + /* Need to initialize OPAL so that install_dirs are filled in */ + if (OPAL_SUCCESS != opal_init(&argc, &argv)) { + exit(1); + } + + /* Check for some "global" command line params */ + parse_globals(argc, argv, &cmd_line); + + /* if they didn't point us at an HNP, that's an error */ + if (NULL == myglobals.hnp) { + fprintf(stderr, "orte-submit: required option --hnp not provided\n"); + exit(1); + } + OBJ_DESTRUCT(&cmd_line); + + if (0 == strncmp(myglobals.hnp, "file", strlen("file")) || + 0 == strncmp(myglobals.hnp, "FILE", strlen("FILE"))) { + char input[1024], *filename; + FILE *fp; + + /* it is a file - get the filename */ + filename = strchr(myglobals.hnp, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", myglobals.hnp); + exit(1); + } + ++filename; /* space past the : */ + + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", myglobals.hnp); + exit(1); + } + + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! 
*/ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, myglobals.hnp); + exit(1); + } + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, myglobals.hnp); + exit(1); + } + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + /* construct the target hnp info */ + asprintf(&hnpenv, "OMPI_MCA_orte_hnp_uri=%s", input); + } else { + /* should just be the uri itself - construct the target hnp info */ + asprintf(&hnpenv, "OMPI_MCA_orte_hnp_uri=%s", myglobals.hnp); + } + putenv(hnpenv); // must not free + + /* Setup MCA params */ + orte_register_params(); + + /* flag that I am a TOOL */ + orte_process_info.proc_type = ORTE_PROC_TOOL; + + /* Intialize our Open RTE environment + * Set the flag telling orte_init that I am NOT a + * singleton, but am "infrastructure" - prevents setting + * up incorrect infrastructure that only a singleton would + * require + */ + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { + /* cannot call ORTE_ERROR_LOG as it could be the errmgr + * never got loaded! + */ + return rc; + } + /* finalize OPAL. As it was opened again from orte_init->opal_init + * we continue to have a reference count on it. So we have to finalize it twice... 
+ */ + opal_finalize(); + + /* set the info in our contact table */ + orte_rml.set_contact_info(orte_process_info.my_hnp_uri); + /* extract the name */ + if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + exit(1); + } + /* set the route to be direct */ + if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + orte_finalize(); + exit(1); + } + + /* set the target hnp as our lifeline so we will terminate if it exits */ + orte_routed.set_lifeline(ORTE_PROC_MY_HNP); + + /* setup to listen for HNP response to my commands */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL, + ORTE_RML_PERSISTENT, local_recv, NULL); + + /* set a timeout event in case the HNP doesn't answer */ + + /* if this is the terminate command, just send it */ + if (myglobals.terminate) { + opal_buffer_t *buf; + orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_HALT_VM_CMD; + buf = OBJ_NEW(opal_buffer_t); + opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD_T); + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_DAEMON, + orte_rml_send_callback, NULL); + goto waiting; + } + + /* default our personality to OMPI */ + if (NULL == myglobals.personality) { + myglobals.personality = strdup("ompi"); + } + + /* create a new job object to hold the info for this one - the + * jobid field will be filled in by the PLM when the job is + * launched + */ + jdata = OBJ_NEW(orte_job_t); + if (NULL == jdata) { + /* cannot call ORTE_ERROR_LOG as the errmgr + * hasn't been loaded yet! 
+ */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + jdata->personality = strdup(myglobals.personality); + + /* check what user wants us to do with stdin */ + if (NULL != myglobals.stdin_target) { + if (0 == strcmp(myglobals.stdin_target, "all")) { + jdata->stdin_target = ORTE_VPID_WILDCARD; + } else if (0 == strcmp(myglobals.stdin_target, "none")) { + jdata->stdin_target = ORTE_VPID_INVALID; + } else { + jdata->stdin_target = strtoul(myglobals.stdin_target, NULL, 10); + } + } + + /* if we want the argv's indexed, indicate that */ + if (myglobals.index_argv) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } + + /* Parse each app, adding it to the job object */ + parse_locals(jdata, argc, argv); + + if (0 == jdata->num_apps) { + /* This should never happen -- this case should be caught in + create_app(), but let's just double check... */ + orte_show_help("help-orterun.txt", "orterun:nothing-to-do", + true, myglobals.basename); + exit(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + + /* check for a job timeout specification, to be provided in seconds + * as that is what MPICH used + */ + if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { + if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE); + goto DONE; + } + orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10); + orte_mpiexec_timeout->tv.tv_usec = 0; + opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev, + orte_timeout_wakeup, jdata); + opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI); + opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv); + } + + /* if recovery was disabled on the cmd line, do so */ + if (myglobals.enable_recovery) { + ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE); + } + + /* ask the HNP to spawn the job for us */ + rc = orte_plm.spawn(jdata); + + waiting: + /* loop the event lib until an exit 
event is detected */ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + + DONE: + /* cleanup and leave */ + orte_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); +} + +static int init_globals(void) +{ + /* Reset the other fields every time */ + myglobals.help = false; + myglobals.version = false; + myglobals.num_procs = 0; + if (NULL != myglobals.appfile) { + free(myglobals.appfile); + } + myglobals.appfile = NULL; + if (NULL != myglobals.wdir) { + free(myglobals.wdir); + } + myglobals.set_cwd_to_session_dir = false; + myglobals.wdir = NULL; + if (NULL != myglobals.path) { + free(myglobals.path); + } + myglobals.path = NULL; + + myglobals.preload_binaries = false; + myglobals.preload_files = NULL; + + /* All done */ + return ORTE_SUCCESS; +} + + +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) +{ + /* print version if requested. Do this before check for help so + that --version --help works as one might expect. 
*/ + if (myglobals.version) { + char *str, *project_name = NULL; + if (0 == strcmp(myglobals.basename, "ompi-submit")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + str = opal_show_help_string("help-orterun.txt", "orterun:version", + false, + myglobals.basename, project_name, OPAL_VERSION, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + exit(0); + } + + /* Check for help request */ + if (myglobals.help) { + char *str, *args = NULL; + char *project_name = NULL; + if (0 == strcmp(myglobals.basename, "ompi-submit")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + args = opal_cmd_line_get_usage_msg(cmd_line); + str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, + myglobals.basename, project_name, OPAL_VERSION, + myglobals.basename, args, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + free(args); + + /* If someone asks for help, that should be all we do */ + exit(0); + } + + /* check for request to report pid */ + if (NULL != myglobals.report_pid) { + FILE *fp; + if (0 == strcmp(myglobals.report_pid, "-")) { + /* if '-', then output to stdout */ + printf("%d\n", (int)getpid()); + } else if (0 == strcmp(myglobals.report_pid, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%d\n", (int)getpid()); + } else { + fp = fopen(myglobals.report_pid, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + myglobals.basename, "pid", myglobals.report_pid); + exit(0); + } + fprintf(fp, "%d\n", (int)getpid()); + fclose(fp); + } + } + + return ORTE_SUCCESS; +} + + +static int parse_locals(orte_job_t *jdata, int argc, char* argv[]) +{ + int i, rc, app_num; + int temp_argc; + char **temp_argv, **env; + orte_app_context_t *app; + bool made_app; + orte_std_cntr_t j, size1; + + /* Make the apps */ + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + + /* 
NOTE: This bogus env variable is necessary in the calls to + create_app(), below. See comment immediately before the + create_app() function for an explanation. */ + + env = NULL; + for (app_num = 0, i = 1; i < argc; ++i) { + if (0 == strcmp(argv[i], ":")) { + /* Make an app with this argv */ + if (opal_argv_count(temp_argv) > 1) { + if (NULL != env) { + opal_argv_free(env); + env = NULL; + } + app = NULL; + rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); + /** keep track of the number of apps - point this app_context to that index */ + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just + exit */ + exit(1); + } + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + } + + /* Reset the temps */ + + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + } + } else { + opal_argv_append(&temp_argc, &temp_argv, argv[i]); + } + } + + if (opal_argv_count(temp_argv) > 1) { + app = NULL; + rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been printed; + no need to cleanup -- we can just exit */ + exit(1); + } + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + } + } + if (NULL != env) { + opal_argv_free(env); + } + opal_argv_free(temp_argv); + + /* Once we've created all the apps, add the global MCA params to + each app's environment (checking for duplicates, of + course -- yay opal_environ_merge()). 
*/ + + if (NULL != global_mca_env) { + size1 = (size_t)opal_pointer_array_get_size(jdata->apps); + /* Iterate through all the apps */ + for (j = 0; j < size1; ++j) { + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, j); + if (NULL != app) { + /* Use handy utility function */ + env = opal_environ_merge(global_mca_env, app->env); + opal_argv_free(app->env); + app->env = env; + } + } + } + + /* Now take a subset of the MCA params and set them as MCA + overrides here in orterun (so that when we orte_init() later, + all the components see these MCA params). Here's how we decide + which subset of the MCA params we set here in orterun: + + 1. If any global MCA params were set, use those + 2. If no global MCA params were set and there was only one app, + then use its app MCA params + 3. Otherwise, don't set any + */ + + env = NULL; + if (NULL != global_mca_env) { + env = global_mca_env; + } else { + if (opal_pointer_array_get_size(jdata->apps) >= 1) { + /* Remember that pointer_array's can be padded with NULL + entries; so only use the app's env if there is exactly + 1 non-NULL entry */ + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, 0); + if (NULL != app) { + env = app->env; + for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) { + if (NULL != opal_pointer_array_get_item(jdata->apps, j)) { + env = NULL; + break; + } + } + } + } + } + + if (NULL != env) { + size1 = opal_argv_count(env); + for (j = 0; j < size1; ++j) { + /* Use-after-Free error possible here. putenv does not copy + * the string passed to it, and instead stores only the pointer. + * env[j] may be freed later, in which case the pointer + * in environ will now be left dangling into a deallocated + * region. + * So we make a copy of the variable. 
+ */ + char *s = strdup(env[j]); + + if (NULL == s) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + putenv(s); + } + } + + /* All done */ + + return ORTE_SUCCESS; +} + + +/* + * This function takes a "char ***app_env" parameter to handle the + * specific case: + * + * orterun --mca foo bar -app appfile + * + * That is, we'll need to keep foo=bar, but the presence of the app + * file will cause an invocation of parse_appfile(), which will cause + * one or more recursive calls back to create_app(). Since the + * foo=bar value applies globally to all apps in the appfile, we need + * to pass in the "base" environment (that contains the foo=bar value) + * when we parse each line in the appfile. + * + * This is really just a special case -- when we have a simple case like: + * + * orterun --mca foo bar -np 4 hostname + * + * Then the upper-level function (parse_locals()) calls create_app() + * with a NULL value for app_env, meaning that there is no "base" + * environment that the app needs to be created from. + */ +static int create_app(int argc, char* argv[], + orte_job_t *jdata, + orte_app_context_t **app_ptr, + bool *made_app, char ***app_env) +{ + opal_cmd_line_t cmd_line; + char cwd[OPAL_PATH_MAX]; + int i, j, count, rc; + char *param, *value; + orte_app_context_t *app = NULL; + bool cmd_line_made = false; + bool found = false; + char *appname; + + *made_app = false; + + /* Pre-process the command line if we are going to parse an appfile later. + * save any mca command line args so they can be passed + * separately to the daemons. + * Use Case: + * $ cat launch.appfile + * -np 1 -mca aaa bbb ./my-app -mca ccc ddd + * -np 1 -mca aaa bbb ./my-app -mca eee fff + * $ mpirun -np 2 -mca foo bar --app launch.appfile + * Only pick up '-mca foo bar' on this pass. + */ + if (NULL != myglobals.appfile) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(myglobals.personality, argc, 0, argv))) { + goto cleanup; + } + } + + /* Parse application command line options. 
*/ + + init_globals(); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + cmd_line_made = true; + rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); + if (ORTE_SUCCESS != rc) { + goto cleanup; + } + mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); + + /* Is there an appfile in here? */ + + if (NULL != myglobals.appfile) { + OBJ_DESTRUCT(&cmd_line); + return parse_appfile(jdata, strdup(myglobals.appfile), app_env); + } + + /* Setup application context */ + + app = OBJ_NEW(orte_app_context_t); + opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); + + /* See if we have anything left */ + + if (0 == count) { + orte_show_help("help-orterun.txt", "orterun:executable-not-specified", + true, myglobals.basename, myglobals.basename); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* + * Get mca parameters so we can pass them to the daemons. + * Use the count determined above to make sure we do not go past + * the executable name. Example: + * mpirun -np 2 -mca foo bar ./my-app -mca bip bop + * We want to pick up '-mca foo bar' but not '-mca bip bop' + */ + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(myglobals.personality, + argc, count, argv))) { + goto cleanup; + } + + /* Grab all OMPI_* environment variables */ + + app->env = opal_argv_copy(*app_env); + if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(myglobals.personality, + myglobals.path, + &cmd_line, NULL, + environ, &app->env))) { + goto cleanup; + } + + + /* Did the user request a specific wdir? 
*/ + + if (NULL != myglobals.wdir) { + /* if this is a relative path, convert it to an absolute path */ + if (opal_path_is_absolute(myglobals.wdir)) { + app->cwd = strdup(myglobals.wdir); + } else { + /* get the cwd */ + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + /* construct the absolute path */ + app->cwd = opal_os_path(false, cwd, myglobals.wdir, NULL); + } + orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else if (myglobals.set_cwd_to_session_dir) { + orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else { + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + app->cwd = strdup(cwd); + } + + /* if this is the first app_context, check for prefix directions. + * We only do this for the first app_context because the launchers + * only look at the first one when setting the prefix - we do NOT + * support per-app_context prefix settings! + */ + if (0 == total_num_apps) { + /* Check to see if the user explicitly wanted to disable automatic + --prefix behavior */ + + if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { + want_prefix_by_default = false; + } + + /* Did the user specify a prefix, or want prefix by default? 
*/ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { + size_t param_len; + /* if both the prefix was given and we have a prefix + * given above, check to see if they match + */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && + NULL != myglobals.prefix) { + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + /* ensure we strip any trailing '/' */ + if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { + param[strlen(param)-1] = '\0'; + } + value = strdup(myglobals.prefix); + if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) { + value[strlen(value)-1] = '\0'; + } + if (0 != strcmp(param, value)) { + orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict", + true, myglobals.basename, value, param); + /* let the global-level prefix take precedence since we + * know that one is being used + */ + free(param); + param = strdup(myglobals.prefix); + } + free(value); + } else if (NULL != myglobals.prefix) { + param = myglobals.prefix; + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to orterun */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, myglobals.basename, myglobals.basename); + return ORTE_ERR_FATAL; + } + } + orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING); + free(param); + } + } + } + + /* Did the user specify a hostfile. Need to check for both + * hostfile and machine file. 
+ * We can only deal with one hostfile per app context, otherwise give an error. + */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { + if(1 < j) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, myglobals.basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { + if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, myglobals.basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + + /* Did the user specify any hosts? */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { + char **targ=NULL, *tval; + for (i = 0; i < j; ++i) { + value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); + opal_argv_append_nosize(&targ, value); + } + tval = opal_argv_join(targ, ','); + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING); + opal_argv_free(targ); + free(tval); + } + + /* check for bozo error */ + if (0 > myglobals.num_procs) { + orte_show_help("help-orterun.txt", "orterun:negative-nprocs", + true, myglobals.basename, app->argv[0], + myglobals.num_procs, NULL); + return ORTE_ERR_FATAL; + } + + app->num_procs = (orte_std_cntr_t)myglobals.num_procs; + total_num_apps++; + + /* Capture any preload flags */ + if (myglobals.preload_binaries) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + } + /* if we were told to cwd to the session dir and the app was given in + * relative syntax, then we need to preload the binary to + * 
find the app - don't do this for java apps, however, as we + * can't easily find the class on the cmd line. Java apps have to + * preload their binary via the preload_files option + */ + if (!opal_path_is_absolute(app->argv[0]) && + NULL == strstr(app->argv[0], "java")) { + if (myglobals.preload_binaries) { + orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + } + } + if (NULL != myglobals.preload_files) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_LOCAL, + myglobals.preload_files, OPAL_STRING); + } + + /* Do not try to find argv[0] here -- the starter is responsible + for that because it may not be relevant to try to find it on + the node where orterun is executing. So just strdup() argv[0] + into app. */ + + app->app = strdup(app->argv[0]); + if (NULL == app->app) { + orte_show_help("help-orterun.txt", "orterun:call-failed", + true, myglobals.basename, "library", "strdup returned NULL", errno); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* if this is a Java application, we have a bit more work to do. Such + * applications actually need to be run under the Java virtual machine + * and the "java" command will start the "executable". So we need to ensure + * that all the proper java-specific paths are provided + */ + appname = opal_basename(app->app); + if (0 == strcmp(appname, "java")) { + /* see if we were given a library path */ + found = false; + for (i=1; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "java.library.path")) { + /* yep - but does it include the path to the mpi libs? 
*/ + found = true; + if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) { + /* doesn't appear to - add it to be safe */ + if (':' == app->argv[i][strlen(app->argv[i]-1)]) { + asprintf(&value, "-Djava.library.path=%s%s", app->argv[i], opal_install_dirs.libdir); + } else { + asprintf(&value, "-Djava.library.path=%s:%s", app->argv[i], opal_install_dirs.libdir); + } + free(app->argv[i]); + app->argv[i] = value; + } + break; + } + } + if (!found) { + /* need to add it right after the java command */ + asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir); + opal_argv_insert_element(&app->argv, 1, value); + free(value); + } + + /* see if we were given a class path */ + found = false; + for (i=1; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "cp") || + NULL != strstr(app->argv[i], "classpath")) { + /* yep - but does it include the path to the mpi libs? */ + found = true; + /* check if mpi.jar exists - if so, add it */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, i+1, "mpi.jar"); + } + free(value); + /* check for oshmem support */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, i+1, "shmem.jar"); + } + free(value); + /* always add the local directory */ + asprintf(&value, "%s:%s", app->cwd, app->argv[i+1]); + free(app->argv[i+1]); + app->argv[i+1] = value; + break; + } + } + if (!found) { + /* check to see if CLASSPATH is in the environment */ + found = false; // just to be pedantic + for (i=0; NULL != environ[i]; i++) { + if (0 == strncmp(environ[i], "CLASSPATH", strlen("CLASSPATH"))) { + value = strchr(environ[i], '='); + ++value; /* step over the = */ + opal_argv_insert_element(&app->argv, 1, value); + /* check for mpi.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + 
set_classpath_jar_file(app, 1, "mpi.jar"); + } + free(value); + /* check for shmem.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, 1, "shmem.jar"); + } + free(value); + /* always add the local directory */ + (void)asprintf(&value, "%s:%s", app->cwd, app->argv[1]); + free(app->argv[1]); + app->argv[1] = value; + opal_argv_insert_element(&app->argv, 1, "-cp"); + found = true; + break; + } + } + if (!found) { + /* need to add it right after the java command - have + * to include the working directory and trust that + * the user set cwd if necessary + */ + char *str, *str2; + /* always start with the working directory */ + str = strdup(app->cwd); + /* check for mpi.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + (void)asprintf(&str2, "%s:%s", str, value); + free(str); + str = str2; + } + free(value); + /* check for shmem.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + asprintf(&str2, "%s:%s", str, value); + free(str); + str = str2; + } + free(value); + opal_argv_insert_element(&app->argv, 1, str); + free(str); + opal_argv_insert_element(&app->argv, 1, "-cp"); + } + } + /* try to find the actual command - may not be perfect */ + for (i=1; i < opal_argv_count(app->argv); i++) { + if (NULL != strstr(app->argv[i], "java.library.path")) { + continue; + } else if (NULL != strstr(app->argv[i], "cp") || + NULL != strstr(app->argv[i], "classpath")) { + /* skip the next field */ + i++; + continue; + } + /* declare this the winner */ + opal_setenv("OMPI_COMMAND", app->argv[i], true, &app->env); + /* collect everything else as the cmd line */ + if ((i+1) < opal_argv_count(app->argv)) { + value = opal_argv_join(&app->argv[i+1], ' '); + opal_setenv("OMPI_ARGV", value, true, &app->env); + free(value); + } + break; + } + } else { + /* add the 
cmd to the environment for MPI_Info to pickup */ + opal_setenv("OMPI_COMMAND", appname, true, &app->env); + if (1 < opal_argv_count(app->argv)) { + value = opal_argv_join(&app->argv[1], ' '); + opal_setenv("OMPI_ARGV", value, true, &app->env); + free(value); + } + } + free(appname); + + *app_ptr = app; + app = NULL; + *made_app = true; + + /* All done */ + + cleanup: + if (NULL != app) { + OBJ_RELEASE(app); + } + if (cmd_line_made) { + OBJ_DESTRUCT(&cmd_line); + } + return rc; +} + +static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile) +{ + if (NULL == strstr(app->argv[index], jarfile)) { + /* nope - need to add it */ + char *fmt = ':' == app->argv[index][strlen(app->argv[index]-1)] + ? "%s%s/%s" : "%s:%s/%s"; + char *str; + asprintf(&str, fmt, app->argv[index], opal_install_dirs.libdir, jarfile); + free(app->argv[index]); + app->argv[index] = str; + } +} + +static int parse_appfile(orte_job_t *jdata, char *filename, char ***env) +{ + size_t i, len; + FILE *fp; + char line[BUFSIZ]; + int rc, argc, app_num; + char **argv; + orte_app_context_t *app; + bool blank, made_app; + char bogus[] = "bogus "; + char **tmp_env; + + /* + * Make sure to clear out this variable so we don't do anything odd in + * app_create() + */ + if (NULL != myglobals.appfile) { + free(myglobals.appfile); + myglobals.appfile = NULL; + } + + /* Try to open the file */ + + fp = fopen(filename, "r"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true, + filename); + return ORTE_ERR_NOT_FOUND; + } + + /* Read in line by line */ + + line[sizeof(line) - 1] = '\0'; + app_num = 0; + do { + + /* We need a bogus argv[0] (because when argv comes in from + the command line, argv[0] is "orterun", so the parsing + logic ignores it). So create one here rather than making + an argv and then pre-pending a new argv[0] (which would be + rather inefficient). 
*/ + + line[0] = '\0'; + strcat(line, bogus); + + if (NULL == fgets(line + sizeof(bogus) - 1, + sizeof(line) - sizeof(bogus) - 1, fp)) { + break; + } + + /* Remove a trailing newline */ + + len = strlen(line); + if (len > 0 && '\n' == line[len - 1]) { + line[len - 1] = '\0'; + if (len > 0) { + --len; + } + } + + /* Remove comments */ + + for (i = 0; i < len; ++i) { + if ('#' == line[i]) { + line[i] = '\0'; + break; + } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) { + line[i] = '\0'; + break; + } + } + + /* Is this a blank line? */ + + len = strlen(line); + for (blank = true, i = sizeof(bogus); i < len; ++i) { + if (!isspace(line[i])) { + blank = false; + break; + } + } + if (blank) { + continue; + } + + /* We got a line with *something* on it. So process it */ + + argv = opal_argv_split(line, ' '); + argc = opal_argv_count(argv); + if (argc > 0) { + + /* Create a temporary env to use in the recursive call -- + that is: don't disturb the original env so that we can + have a consistent global env. This allows for the + case: + + orterun --mca foo bar --appfile file + + where the "file" contains multiple apps. In this case, + each app in "file" will get *only* foo=bar as the base + environment from which its specific environment is + constructed. 
*/ + + if (NULL != *env) { + tmp_env = opal_argv_copy(*env); + if (NULL == tmp_env) { + return ORTE_ERR_OUT_OF_RESOURCE; + } + } else { + tmp_env = NULL; + } + + rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just exit */ + exit(1); + } + if (NULL != tmp_env) { + opal_argv_free(tmp_env); + } + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + } + } + } while (!feof(fp)); + fclose(fp); + + /* All done */ + + free(filename); + return ORTE_SUCCESS; +} + +void orte_timeout_wakeup(int sd, short args, void *cbdata) +{ + char *tm; + + /* this function gets called when the job execution time + * has hit a prescribed limit - so just abort + */ + tm = getenv("MPIEXEC_TIMEOUT"); + orte_show_help("help-orterun.txt", "orterun:timeout", + true, (NULL == tm) ? "NULL" : tm); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_event_base_active = false; +} + +static void local_recv(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata) +{ + int rc, ret; + int32_t cnt; + + /* unpack the completion status of the job */ + cnt = 1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT))) { + ORTE_UPDATE_EXIT_STATUS(rc); + } + /* update our exit status to match */ + ORTE_UPDATE_EXIT_STATUS(ret); + + /* eject us from the event loop - we are done */ +} + diff --git a/orte/tools/orterun/Makefile.am b/orte/tools/orterun/Makefile.am index d95e27dd2b..d2d49e6ca7 100644 --- a/orte/tools/orterun/Makefile.am +++ b/orte/tools/orterun/Makefile.am @@ -11,6 +11,7 @@ # All rights reserved. # Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2015 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 1a802526fe..26bdff0be2 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -549,6 +549,10 @@ static opal_cmd_line_init_t cmd_line_init[] = { &orterun_globals.personality, OPAL_CMD_LINE_TYPE_STRING, "Programming model/language being used (default=\"ompi\")" }, + { NULL, '\0', "dvm", "dvm", 0, + &orterun_globals.dvm, OPAL_CMD_LINE_TYPE_BOOL, + "Programming model/language being used (default=\"ompi\")" }, + /* End of list */ { NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } @@ -1131,6 +1135,7 @@ static int init_globals(void) orterun_globals.index_argv = false; orterun_globals.run_as_root = false; orterun_globals.personality = NULL; + orterun_globals.dvm = false; } /* Reset the other fields every time */ diff --git a/orte/tools/orterun/orterun.h b/orte/tools/orterun/orterun.h index 5f1f0fbab7..2ad00ccca4 100644 --- a/orte/tools/orterun/orterun.h +++ b/orte/tools/orterun/orterun.h @@ -65,6 +65,7 @@ struct orterun_globals_t { bool index_argv; bool run_as_root; char *personality; + bool dvm; }; /**