diff --git a/config/orte_config_files.m4 b/config/orte_config_files.m4 index 54e90a06c5..564ce0ca80 100644 --- a/config/orte_config_files.m4 +++ b/config/orte_config_files.m4 @@ -6,7 +6,7 @@ # Corporation. All rights reserved. # Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved +# Copyright (c) 2015-2016 Intel, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -33,7 +33,6 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile orte/tools/orte-server/Makefile - orte/tools/orte-submit/Makefile orte/tools/orte-dvm/Makefile ]) ]) diff --git a/ompi/mca/rte/orte/Makefile.am b/ompi/mca/rte/orte/Makefile.am index 5458e41293..804d66adb5 100644 --- a/ompi/mca/rte/orte/Makefile.am +++ b/ompi/mca/rte/orte/Makefile.am @@ -1,7 +1,8 @@ # # Copyright (c) 2012 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2016 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -27,7 +28,7 @@ libmca_rte_orte_la_SOURCES =$(sources) $(headers) libmca_rte_orte_la_LDFLAGS = -module -avoid-version libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la -man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 ompi-submit.1 +man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 if WANT_FT man_pages += ompi-checkpoint.1 ompi-restart.1 @@ -44,7 +45,6 @@ install-exec-hook: (cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT)) - (cd $(DESTDIR)$(bindir); rm -f ompi-submit$(EXEEXT); $(LN_S) orte-submit$(EXEEXT) ompi-submit$(EXEEXT)) if WANT_FT (cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT)) @@ -58,8 +58,7 @@ uninstall-local: $(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-server$(EXEEXT) \ - $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) \ - $(DESTDIR)$(bindir)/ompi-submit$(EXEEXT) + $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) if WANT_FT rm -f $(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-restart$(EXEEXT) \ @@ -122,8 +121,5 @@ ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1 -ompi-submit.1: $(top_builddir)/orte/tools/orte-submit/orte-submit.1 - cp -f $(top_builddir)/orte/tools/orte-submit/orte-submit.1 ompi-submit.1 - clean-local: rm -f $(man_pages) diff --git a/opal/mca/base/base.h b/opal/mca/base/base.h index 1fdcbd899d..7d31a0277b 100644 --- a/opal/mca/base/base.h +++ b/opal/mca/base/base.h @@ -156,7 +156,7 @@ OPAL_DECLSPEC int mca_base_is_component_required(opal_list_t *components_availab /* mca_base_cmd_line.c */ OPAL_DECLSPEC int mca_base_cmd_line_setup(opal_cmd_line_t *cmd); -OPAL_DECLSPEC int mca_base_cmd_line_process_args(opal_cmd_line_t *cmd, +OPAL_DECLSPEC int 
mca_base_cmd_line_process_args(char **argv, char ***app_env, char ***global_env); OPAL_DECLSPEC void mca_base_cmd_line_wrap_args(char **args); diff --git a/opal/mca/base/mca_base_cmd_line.c b/opal/mca/base/mca_base_cmd_line.c index d831916701..ded9b22e7c 100644 --- a/opal/mca/base/mca_base_cmd_line.c +++ b/opal/mca/base/mca_base_cmd_line.c @@ -94,29 +94,25 @@ int mca_base_cmd_line_setup(opal_cmd_line_t *cmd) /* * Look for and handle any -mca options on the command line */ -int mca_base_cmd_line_process_args(opal_cmd_line_t *cmd, +int mca_base_cmd_line_process_args(char **argv, char ***context_env, char ***global_env) { - int i, num_insts, rc; + int i, rc; char **params; char **values; - /* If no relevant parameters were given, just return */ - - if (!opal_cmd_line_is_taken(cmd, OPAL_MCA_CMD_LINE_ID) && - !opal_cmd_line_is_taken(cmd, "g"OPAL_MCA_CMD_LINE_ID)) { - return OPAL_SUCCESS; - } - - /* Handle app context-specific parameters */ - - num_insts = opal_cmd_line_get_ninsts(cmd, OPAL_MCA_CMD_LINE_ID); params = values = NULL; - for (i = 0; i < num_insts; ++i) { - if (OPAL_SUCCESS != (rc = process_arg(opal_cmd_line_get_param(cmd, OPAL_MCA_CMD_LINE_ID, i, 0), - opal_cmd_line_get_param(cmd, OPAL_MCA_CMD_LINE_ID, i, 1), - ¶ms, &values))) { - return rc; + for (i = 0; NULL != argv[i]; ++i) { + if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID, argv[i]) || + 0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i])) { + if (NULL == argv[i+1] || NULL == argv[i+2]) { + return OPAL_ERR_BAD_PARAM; + } + if (OPAL_SUCCESS != (rc = process_arg(argv[i+1], argv[i+2], + ¶ms, &values))) { + return rc; + } + i += 2; } } if (NULL != params) { @@ -125,15 +121,19 @@ int mca_base_cmd_line_process_args(opal_cmd_line_t *cmd, opal_argv_free(values); } - /* Handle global parameters */ - num_insts = opal_cmd_line_get_ninsts(cmd, "g"OPAL_MCA_CMD_LINE_ID); params = values = NULL; - for (i = 0; i < num_insts; ++i) { - if (OPAL_SUCCESS != (rc = process_arg(opal_cmd_line_get_param(cmd, "g"OPAL_MCA_CMD_LINE_ID, i, 0), - opal_cmd_line_get_param(cmd, "g"OPAL_MCA_CMD_LINE_ID, i, 1), - ¶ms, &values))) { - return rc; + for (i = 0; NULL != argv[i]; ++i) { + if (0 == strcmp("-g"OPAL_MCA_CMD_LINE_ID, argv[i]) || + 0 == strcmp("--g"OPAL_MCA_CMD_LINE_ID, argv[i])) { + if (NULL == argv[i+1] || NULL == argv[i+2]) { + return OPAL_ERR_BAD_PARAM; + } + if (OPAL_SUCCESS != (rc = process_arg(argv[i+1], argv[i+2], + ¶ms, &values))) { + return rc; + } + i += 2; } } if (NULL != params) { @@ -190,7 +190,6 @@ static int process_arg(const char *param, const char *value, /* If we didn't already have an value for the same param, save this one away */ - opal_argv_append_nosize(params, param); opal_argv_append_nosize(values, p1); free(p1); diff --git a/opal/runtime/opal_info_support.c b/opal/runtime/opal_info_support.c index 3f4694ce27..0db8789321 100644 --- a/opal/runtime/opal_info_support.c +++ b/opal/runtime/opal_info_support.c @@ -207,7 +207,7 @@ int opal_info_init(int argc, char **argv, exit(cmd_error ? 
1 : 0); } - mca_base_cmd_line_process_args(opal_info_cmd_line, &app_env, &global_env); + mca_base_cmd_line_process_args(argv, &app_env, &global_env); /* set the flags */ diff --git a/opal/util/cmd_line.c b/opal/util/cmd_line.c index 3aa8564092..059bc38fa9 100644 --- a/opal/util/cmd_line.c +++ b/opal/util/cmd_line.c @@ -156,7 +156,9 @@ int opal_cmd_line_create(opal_cmd_line_t *cmd, } OBJ_CONSTRUCT(cmd, opal_cmd_line_t); - ret = opal_cmd_line_add(cmd, table); + if (NULL != table) { + ret = opal_cmd_line_add(cmd, table); + } return ret; } diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 7649b155e0..db490b15f8 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -76,6 +76,7 @@ #include "orte/mca/state/base/base.h" #include "orte/mca/state/state.h" +#include "orte/orted/orted_submit.h" #include "orte/orted/pmix/pmix_server.h" #include "orte/util/show_help.h" @@ -713,6 +714,14 @@ static int rte_init(void) goto error; } + /* setup to support debugging */ + orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, + orte_debugger_init_after_spawn, + ORTE_SYS_PRI); + orte_state.add_job_state(ORTE_JOB_STATE_DEBUGGER_DETACH, + orte_debugger_detached, + ORTE_SYS_PRI); + /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ diff --git a/orte/mca/grpcomm/base/grpcomm_base_stubs.c b/orte/mca/grpcomm/base/grpcomm_base_stubs.c index 621b645da2..ef5874067a 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_stubs.c +++ b/orte/mca/grpcomm/base/grpcomm_base_stubs.c @@ -324,7 +324,7 @@ static int create_dmns(orte_grpcomm_signature_t *sig, *dmns = NULL; return ORTE_ERR_NOT_FOUND; } - if (NULL == jdata->map) { + if (0 == jdata->map->num_nodes) { /* we haven't generated a job map yet - if we are the HNP, * then we should only involve ourselves. Otherwise, we have * no choice but to abort to avoid hangs */ @@ -340,12 +340,6 @@ static int create_dmns(orte_grpcomm_signature_t *sig, *dmns = NULL; return ORTE_ERR_NOT_FOUND; } - /* get the array */ - if (0 == jdata->map->num_nodes) { - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_SILENT; - } dns = (orte_vpid_t*)malloc(jdata->map->num_nodes * sizeof(vpid)); nds = 0; for (i=0; i < jdata->map->nodes->size && (int)nds < jdata->map->num_nodes; i++) { diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index e048cd64df..e52347a65e 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1537,6 +1537,9 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } + if (NULL == daemons->map) { + daemons->map = OBJ_NEW(orte_job_map_t); + } map = daemons->map; /* if this job is being launched against a fixed DVM, then there is @@ -1552,8 +1555,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) * the virtual machine unless specifically requested to do so */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { - OBJ_CONSTRUCT(&nodes, opal_list_t); - if (NULL == daemons->map) { + if (0 == map->num_nodes) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setup_vm creating map", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -1562,16 +1564,15 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) * are obviously already here! 
The ess will already * have assigned our node to us. */ - daemons->map = OBJ_NEW(orte_job_map_t); node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - opal_pointer_array_add(daemons->map->nodes, (void*)node); - ++(daemons->map->num_nodes); + opal_pointer_array_add(map->nodes, (void*)node); + ++(map->num_nodes); /* maintain accounting */ OBJ_RETAIN(node); /* mark that this is from a singleton */ singleton = true; } - map = daemons->map; + OBJ_CONSTRUCT(&nodes, opal_list_t); for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; @@ -1618,16 +1619,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) */ if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL)) { OBJ_CONSTRUCT(&nodes, opal_list_t); - if (NULL == daemons->map) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s plm:base:setup_vm creating map", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* this is the first time thru, so the vm is just getting - * defined - create a map for it - */ - daemons->map = OBJ_NEW(orte_job_map_t); - } - map = daemons->map; /* loop across all nodes and include those that have * num_procs > 0 && no daemon already on them */ @@ -1685,23 +1676,21 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) goto process; } - if (NULL == daemons->map) { + if (0 == map->num_nodes) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setup_vm creating map", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* this is the first time thru, so the vm is just getting - * defined - create a map for it and put us in as we + * defined - put us in as we * are obviously already here! The ess will already * have assigned our node to us. */ - daemons->map = OBJ_NEW(orte_job_map_t); node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - opal_pointer_array_add(daemons->map->nodes, (void*)node); - ++(daemons->map->num_nodes); + opal_pointer_array_add(map->nodes, (void*)node); + ++(map->num_nodes); /* maintain accounting */ OBJ_RETAIN(node); } - map = daemons->map; /* zero-out the number of new daemons as we will compute this * each time we are called diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 226ee68e86..7bb6af1f6f 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -110,8 +110,10 @@ static int ppr_mapper(orte_job_t *jdata) ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* not for us */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: job %s not using ppr mapper", - ORTE_JOBID_PRINT(jdata->jobid)); + "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->ppr) ? "NULL" : jdata->map->ppr, + (ORTE_MAPPING_PPR == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) ? 
"PPRSET" : "PPR NOTSET"); return ORTE_ERR_TAKE_NEXT_OPTION; } diff --git a/orte/mca/schizo/base/base.h b/orte/mca/schizo/base/base.h index 1cb72d702c..fd33fb10c9 100644 --- a/orte/mca/schizo/base/base.h +++ b/orte/mca/schizo/base/base.h @@ -41,6 +41,7 @@ ORTE_DECLSPEC int orte_schizo_base_select(void); typedef struct { /* list of active modules */ opal_list_t active_modules; + char **personalities; } orte_schizo_base_t; /** @@ -61,15 +62,13 @@ OBJ_CLASS_DECLARATION(orte_schizo_base_active_module_t); /* the base stub functions */ ORTE_DECLSPEC const char* orte_schizo_base_print_env(orte_schizo_launch_environ_t env); -ORTE_DECLSPEC int orte_schizo_base_parse_cli(char **personality, - int argc, int start, char **argv); -ORTE_DECLSPEC int orte_schizo_base_parse_env(char **personality, - char *path, +ORTE_DECLSPEC int orte_schizo_base_define_cli(opal_cmd_line_t *cli); +ORTE_DECLSPEC int orte_schizo_base_parse_cli(int argc, int start, char **argv); +ORTE_DECLSPEC int orte_schizo_base_parse_env(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); -ORTE_DECLSPEC int orte_schizo_base_setup_app(char **personality, - orte_app_context_t *app); +ORTE_DECLSPEC int orte_schizo_base_setup_app(orte_app_context_t *app); ORTE_DECLSPEC int orte_schizo_base_setup_fork(orte_job_t *jdata, orte_app_context_t *context); ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat, diff --git a/orte/mca/schizo/base/schizo_base_frame.c b/orte/mca/schizo/base/schizo_base_frame.c index c9fb70650f..0665aff9d4 100644 --- a/orte/mca/schizo/base/schizo_base_frame.c +++ b/orte/mca/schizo/base/schizo_base_frame.c @@ -37,6 +37,7 @@ */ orte_schizo_base_t orte_schizo_base = {{{0}}}; orte_schizo_base_module_t orte_schizo = { + .define_cli = orte_schizo_base_define_cli, .parse_cli = orte_schizo_base_parse_cli, .parse_env = orte_schizo_base_parse_env, .setup_app = orte_schizo_base_setup_app, @@ -46,10 +47,28 @@ orte_schizo_base_module_t orte_schizo = { .finalize = orte_schizo_base_finalize }; +static char *personalities = NULL; + +static int orte_schizo_base_register(mca_base_register_flag_t flags) +{ + /* pickup any defined personalities */ + personalities = NULL; + mca_base_var_register("orte", "schizo", "base", "personalities", + "Comma-separated list of personalities", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &personalities); + return ORTE_SUCCESS; +} + static int orte_schizo_base_close(void) { /* cleanup globals */ OPAL_LIST_DESTRUCT(&orte_schizo_base.active_modules); + if (NULL != orte_schizo_base.personalities) { + opal_argv_free(orte_schizo_base.personalities); + } return mca_base_framework_components_close(&orte_schizo_base_framework, NULL); } @@ -64,6 +83,10 @@ static int orte_schizo_base_open(mca_base_open_flag_t flags) /* init the globals */ OBJ_CONSTRUCT(&orte_schizo_base.active_modules, opal_list_t); + orte_schizo_base.personalities = NULL; + if (NULL != personalities) { + orte_schizo_base.personalities = opal_argv_split(personalities, ','); + } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_schizo_base_framework, flags); @@ -73,7 +96,8 @@ static int orte_schizo_base_open(mca_base_open_flag_t flags) } MCA_BASE_FRAMEWORK_DECLARE(orte, schizo, "ORTE Schizo Subsystem", - NULL, orte_schizo_base_open, orte_schizo_base_close, + orte_schizo_base_register, + orte_schizo_base_open, orte_schizo_base_close, mca_schizo_base_static_components, 0); OBJ_CLASS_INSTANCE(orte_schizo_base_active_module_t, diff --git 
a/orte/mca/schizo/base/schizo_base_select.c b/orte/mca/schizo/base/schizo_base_select.c index 00fc0b0da9..d37a661aba 100644 --- a/orte/mca/schizo/base/schizo_base_select.c +++ b/orte/mca/schizo/base/schizo_base_select.c @@ -28,8 +28,6 @@ * available. */ -static bool selected = false; - int orte_schizo_base_select(void) { mca_base_component_list_item_t *cli = NULL; @@ -40,11 +38,10 @@ int orte_schizo_base_select(void) int rc, priority; bool inserted; - if (selected) { + if (0 < opal_list_get_size(&orte_schizo_base.active_modules)) { /* ensure we don't do this twice */ return ORTE_SUCCESS; } - selected = true; /* Query all available components and ask if they have a module */ OPAL_LIST_FOREACH(cli, &orte_schizo_base_framework.framework_components, mca_base_component_list_item_t) { diff --git a/orte/mca/schizo/base/schizo_base_stubs.c b/orte/mca/schizo/base/schizo_base_stubs.c index 21794c3d59..cd50b7a820 100644 --- a/orte/mca/schizo/base/schizo_base_stubs.c +++ b/orte/mca/schizo/base/schizo_base_stubs.c @@ -37,19 +37,14 @@ } } -int orte_schizo_base_parse_cli(char **personality, - int argc, int start, char **argv) +int orte_schizo_base_define_cli(opal_cmd_line_t *cli) { int rc; orte_schizo_base_active_module_t *mod; - if (NULL == personality) { - return ORTE_ERR_NOT_SUPPORTED; - } - OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { - if (NULL != mod->module->parse_cli) { - rc = mod->module->parse_cli(personality, argc, start, argv); + if (NULL != mod->module->define_cli) { + rc = mod->module->define_cli(cli); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -59,8 +54,24 @@ int orte_schizo_base_parse_cli(char **personality, return ORTE_SUCCESS; } -int orte_schizo_base_parse_env(char **personality, - char *path, +int orte_schizo_base_parse_cli(int argc, int start, char **argv) +{ + int rc; + orte_schizo_base_active_module_t *mod; + + OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { + if (NULL != mod->module->parse_cli) { + rc = mod->module->parse_cli(argc, start, argv); + if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + return ORTE_SUCCESS; +} + +int orte_schizo_base_parse_env(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv) @@ -70,7 +81,7 @@ int orte_schizo_base_parse_env(char **personality, OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (NULL != mod->module->parse_env) { - rc = mod->module->parse_env(personality, path, cmd_line, srcenv, dstenv); + rc = mod->module->parse_env(path, cmd_line, srcenv, dstenv); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -80,15 +91,14 @@ int orte_schizo_base_parse_env(char **personality, return ORTE_SUCCESS; } -int orte_schizo_base_setup_app(char **personality, - orte_app_context_t *app) +int orte_schizo_base_setup_app(orte_app_context_t *app) { int rc; orte_schizo_base_active_module_t *mod; OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (NULL != mod->module->setup_app) { - rc = mod->module->setup_app(personality, app); + rc = mod->module->setup_app(app); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 9eba49c2bd..786d166bd6 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c 
+++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -42,17 +42,17 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/base/base.h" #include "orte/mca/rmaps/rmaps_types.h" +#include "orte/orted/orted_submit.h" #include "orte/util/name_fns.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" -#include "orte/mca/schizo/schizo.h" +#include "orte/mca/schizo/base/base.h" -static int parse_cli(char **personality, - int argc, int start, char **argv); -static int parse_env(char **personality, - char *path, +static int define_cli(opal_cmd_line_t *cli); +static int parse_cli(int argc, int start, char **argv); +static int parse_env(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); @@ -63,14 +63,426 @@ static int setup_child(orte_job_t *jobdat, orte_app_context_t *app); orte_schizo_base_module_t orte_schizo_ompi_module = { + .define_cli = define_cli, .parse_cli = parse_cli, .parse_env = parse_env, .setup_fork = setup_fork, .setup_child = setup_child }; -static int parse_cli(char **personality, - int argc, int start, char **argv) + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, 'h', NULL, "help", 0, + &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + { NULL, 'V', NULL, "version", 0, + &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + { NULL, 'v', NULL, "verbose", 0, + &orte_cmd_line.verbose, OPAL_CMD_LINE_TYPE_BOOL, + "Be verbose" }, + { "orte_execute_quiet", 'q', NULL, "quiet", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Suppress helpful messages" }, + { NULL, '\0', "report-pid", "report-pid", 1, + &orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING, + "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, + { NULL, '\0', "report-uri", "report-uri", 1, + &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, + "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + + /* exit status reporting */ + { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Return the exit status of the primary job only" }, + + /* uri of the dvm, or at least where to get it */ + { NULL, '\0', "hnp", "hnp", 1, + &orte_cmd_line.hnp, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the HNP, or the name of the file (specified as file:filename) that contains that info" }, + + /* hetero apps */ + { "orte_hetero_apps", '\0', NULL, "hetero-apps", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries" }, + + /* select XML output */ + { "orte_xml_output", '\0', "xml", "xml", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Provide all output in XML format" }, + { "orte_xml_file", '\0', "xml-file", "xml-file", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide all output in XML format to the specified file" }, + + /* tag output */ + { "orte_tag_output", '\0', "tag-output", "tag-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Tag all output with [job,rank]" }, + 
{ "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Timestamp all application process output" }, + { "orte_output_filename", '\0', "output-filename", "output-filename", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, + { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, + &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, + "Merge stderr to stdout for each process"}, + { "orte_xterm", '\0', "xterm", "xterm", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Create a new xterm window and display output from the specified ranks there" }, + + /* select stdin option */ + { NULL, '\0', "stdin", "stdin", 1, + &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, + "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, + + /* request that argv[0] be indexed */ + { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, + &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, + "Uniquely index argv[0] for each process using its rank" }, + + /* Specify the launch agent to be used */ + { "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Command used to start processes on remote nodes (default: orted)" }, + + /* Preload the binary on the remote machine */ + { NULL, 's', NULL, "preload-binary", 0, + &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, + "Preload the binary on the remote machine before starting the remote process." }, + + /* Preload files on the remote machine */ + { NULL, '\0', NULL, "preload-files", 1, + &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, + +#if OPAL_ENABLE_FT_CR == 1 + /* Tell SStore to preload a snapshot before launch */ + { NULL, '\0', NULL, "sstore-load", 1, + &orte_cmd_line.sstore_load, OPAL_CMD_LINE_TYPE_STRING, + "Internal Use Only! Tell SStore to preload a snapshot before launch." 
}, +#endif + + /* Use an appfile */ + { NULL, '\0', NULL, "app", 1, + &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, + "Provide an appfile; ignore all other command line options" }, + + /* Number of processes; -c, -n, --n, -np, and --np are all + synonyms */ + { NULL, 'c', "np", "np", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + { NULL, '\0', "n", "n", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* maximum size of VM - typically used to subdivide an allocation */ + { "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* Set a hostfile */ + { NULL, '\0', "hostfile", "hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { NULL, '\0', "machinefile", "machinefile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a default hostfile" }, + { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not attempt to resolve interfaces" }, + + /* uri of PMIx publish/lookup server, or at least where to get it */ + { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, + + { "carto_file_path", '\0', "cf", "cartofile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a cartography file" }, + + { "orte_rankfile", '\0', "rf", "rankfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a rankfile file" }, + + /* Export environment variables; potentially used multiple times, + so it does not make sense to set into a variable */ + { NULL, 'x', NULL, NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_NULL, + "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, + + /* Mapping controls */ + { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the process map just before launch"}, + { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed process map (mostly intended for developers) just before launch"}, + { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the topology as part of the process map (mostly intended for developers) just before launch"}, + { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a diffable process map (mostly intended for developers) just before launch"}, + { NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not run any MPI applications on the local node" }, + { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { 
"rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each process [default=1]" }, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Synonym for cpus-per-proc" }, + + /* backward compatiblity */ + { "rmaps_base_bycore", '\0', "bycore", "bycore", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by core" }, + { "rmaps_base_bynode", '\0', "bynode", "bynode", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by node" }, + { "rmaps_base_byslot", '\0', "byslot", "byslot", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by slot" }, + + /* Nperxxx options that do not require topology and are always + * available - included for backwards compatibility + */ + { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Launch one process per available node" }, + { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes" }, + { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes (synonym for npernode)" }, + + /* declare hardware threads as independent cpus */ + { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use hardware threads as independent cpus" }, + + /* include npersocket for backwards compatibility */ + { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per socket on all allocated nodes" }, + + /* Mapping options */ + { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, + + /* Ranking options */ + { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, + + /* Binding options */ + { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). 
Allowed qualifiers: overload-allowed, if-supported" }, + + /* backward compatiblity */ + { "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to cores" }, + { "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to sockets" }, + + { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + + /* slot list option */ + { "hwloc_base_slot_list", '\0', "slot-list", "slot-list", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of processor IDs to bind processes to [default=NULL]"}, + + /* generalized pattern mapping option */ + { "rmaps_ppr_pattern", '\0', NULL, "ppr", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of number of processes on a given resource type [default: none]" }, + + /* Allocation options */ + { "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the allocation being used by this job"}, + { "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, + { "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, + + /* mpiexec-like arguments */ + { NULL, '\0', "wdir", "wdir", 1, + &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Set the working directory of the started processes" }, + { NULL, '\0', "wd", "wd", 1, + &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Synonym for --wdir" }, + { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, + &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, + "Set the working directory of the started processes to their session directory" }, + { NULL, '\0', "path", "path", 1, + &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, + "PATH to be used to look for executables to start processes" }, + + /* User-level debugger arguments */ + { NULL, '\0', "tv", "tv", 0, + &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, + "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, + { NULL, '\0', "debug", "debug", 0, + &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, + "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, + { "orte_base_user_debugger", '\0', "debugger", "debugger", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Sequence of debuggers to search for when \"--debug\" is used" }, + { "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Output the debugger proctable after launch" }, + + /* OpenRTE arguments */ + { "orte_debug", 'd', "debug-devel", "debug-devel", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + { "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Enable debugging of any OpenRTE daemons used by this application" }, + + { "orte_debug_daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, + + { "orte_leave_session_attached", '\0', "leave-session-attached", 
"leave-session-attached", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + { "orte_do_not_launch", '\0', "do-not-launch", "do-not-launch", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, + + { NULL, '\0', NULL, "prefix", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Prefix where Open MPI is installed on remote nodes" }, + { NULL, '\0', NULL, "noprefix", 0, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Disable automatic --prefix behavior" }, + + { "orte_report_launch_progress", '\0', "show-progress", "show-progress", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Output a brief periodic report on launch progress" }, + + { "orte_use_regexp", '\0', "use-regexp", "use-regexp", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use regular expressions for launch" }, + + { "orte_report_events", '\0', "report-events", "report-events", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Report events to a tool listening at the specified URI" }, + + { "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery from process failure [Default = disabled]" }, + + { "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Max number of times to restart a failed process" }, + + { "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, + +#if OPAL_ENABLE_CRDEBUG == 1 + { "opal_cr_enable_crdebug", '\0', "crdebug", "crdebug", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable C/R Debugging" }, +#endif + + { NULL, '\0', "disable-recovery", "disable-recovery", 0, + &orte_cmd_line.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, + "Disable recovery (resets all recovery options to off)" }, + + { "state_novm_select", '\0', "novm", "novm", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs)" }, + + { NULL, '\0', "staged", "staged", 0, + &orte_cmd_line.staged_exec, OPAL_CMD_LINE_TYPE_BOOL, + "Used staged execution if inadequate resources are present (cannot support MPI jobs)" }, + + { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, + &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, + "Allow execution as root (STRONGLY DISCOURAGED)" }, + + { NULL, '\0', "personality", "personality", 1, + &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, + + { NULL, '\0', "dvm", "dvm", 0, + &orte_cmd_line.create_dvm, OPAL_CMD_LINE_TYPE_BOOL, + "Create a persistent distributed virtual machine (DVM)" }, + + /* tell the dvm to terminate */ + { NULL, '\0', "terminate", "terminate", 0, + &orte_cmd_line.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL, + "Terminate the DVM" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +static int define_cli(opal_cmd_line_t *cli) +{ + int i, rc; + bool takeus = false; + + opal_output_verbose(1, orte_schizo_base_framework.framework_output, + "%s schizo:ompi: define_cli", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + /* protect against bozo error */ + if (NULL == cli) { + return ORTE_ERR_BAD_PARAM; + } + + if (NULL != orte_schizo_base.personalities) { + /* if we aren't included, then ignore us */ + for (i=0; NULL != 
orte_schizo_base.personalities[i]; i++) { + if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { + takeus = true; + break; + } + } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + } + + /* just add ours to the end */ + rc = opal_cmd_line_add(cli, cmd_line_init); + return rc; +} + +static int parse_cli(int argc, int start, char **argv) { int i, j, k; bool ignore; @@ -83,15 +495,25 @@ static int parse_cli(char **personality, }; bool takeus = false; - /* see if we are included */ - for (i=0; NULL != personality[i]; i++) { - if (0 == strcmp(personality[i], "ompi")) { - takeus = true; - break; + opal_output_verbose(1, orte_schizo_base_framework.framework_output, + "%s schizo:ompi: parse_cli", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + /* if they gave us a list of personalities, + * see if we are included */ + if (NULL != orte_schizo_base.personalities) { + for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { + if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { + takeus = true; + break; + } } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } + } else { + /* attempt to auto-detect CLI options that + * we recognize */ } for (i = 0; i < (argc-start); ++i) { @@ -162,8 +584,7 @@ static int parse_cli(char **personality, return ORTE_SUCCESS; } -static int parse_env(char **personality, - char *path, +static int parse_env(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv) @@ -175,15 +596,21 @@ static int parse_env(char **personality, char **vars; bool takeus = false; - /* see if we are included */ - for (i=0; NULL != personality[i]; i++) { - if (0 == strcmp(personality[i], "ompi")) { - takeus = true; - break; + opal_output_verbose(1, orte_schizo_base_framework.framework_output, + "%s schizo:ompi: parse_env", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + if (NULL != orte_schizo_base.personalities) { + /* see if we are included */ + for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { + if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { + takeus = true; + break; + } + } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; } for (i = 0; NULL != srcenv[i]; ++i) { @@ -307,15 +734,21 @@ static int setup_fork(orte_job_t *jdata, char *num_app_ctx; bool takeus = false; + opal_output_verbose(1, orte_schizo_base_framework.framework_output, + "%s schizo:ompi: setup_fork", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + if (NULL != orte_schizo_base.personalities) { /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "ompi")) { - takeus = true; - break; + for (i=0; NULL != jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "ompi")) { + takeus = true; + break; + } + } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; } /* see if the mapper thinks we are oversubscribed */ @@ -539,15 +972,21 @@ static int setup_child(orte_job_t *jdata, int32_t nrestarts=0, *nrptr; bool takeus = false; - /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "ompi")) { - takeus = true; - break; + opal_output_verbose(1, orte_schizo_base_framework.framework_output, + "%s schizo:ompi: setup_child", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + if (NULL != orte_schizo_base.personalities) { + /* see if we are included */ + for (i=0; NULL != 
jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "ompi")) { + takeus = true; + break; + } + } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; } /* setup the jobid */ diff --git a/orte/mca/schizo/schizo.h b/orte/mca/schizo/schizo.h index 5906a1e22f..932b6dfc47 100644 --- a/orte/mca/schizo/schizo.h +++ b/orte/mca/schizo/schizo.h @@ -44,32 +44,40 @@ BEGIN_C_DECLS * things it requires */ typedef int (*orte_schizo_base_module_init_fn_t)(void); -/* given an argv-array of personalities, parse a tool command line +/* provide an opportunity for components to add personality and/or + * environment-specific command line options. The OPAL cli tools + * will add provided options to the CLI definition, and so the + * resulting CLI array will include the _union_ of options provided + * by the various components. Where there is overlap (i.e., an option + * is added that was also defined earlier in the stack), then the + * first definition is used. This reflects the higher priority of + * the original definition - note that this only impacts the help + * message that will be displayed */ +typedef int (*orte_schizo_base_module_define_cli_fn_t)(opal_cmd_line_t *cli); + +/* parse a tool command line * starting from the given location according to the cmd line options * known to this module's personality. First, of course, check that - * this module is included in the specified array of personalities! - * Only one command-line parser is allowed to operate - i.e., if */ -typedef int (*orte_schizo_base_module_parse_cli_fn_t)(char **personality, - int argc, int start, + * this module is included in the base array of personalities, or is + * automatically recognizable! */ +typedef int (*orte_schizo_base_module_parse_cli_fn_t)(int argc, int start, char **argv); -/* given an argv-array of personalities, parse the environment of the +/* parse the environment of the * tool to extract any personality-specific envars that need to be * forward to the app's environment upon execution */ -typedef int (*orte_schizo_base_module_parse_env_fn_t)(char **personality, - char *path, +typedef int (*orte_schizo_base_module_parse_env_fn_t)(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); -/* given an argv-array of personalities, do whatever preparation work +/* do whatever preparation work * is required to setup the app for execution. 
This is intended to be * used by orterun and other launcher tools to, for example, change * an executable's relative-path to an absolute-path, or add a command * required for starting a particular kind of application (e.g., adding * "java" to start a Java application) */ -typedef int (*orte_schizo_base_module_setup_app_fn_t)(char **personality, - orte_app_context_t *app); +typedef int (*orte_schizo_base_module_setup_app_fn_t)(orte_app_context_t *app); /* add any personality-specific envars required at the job level prior * to beginning to execute local procs */ @@ -107,6 +115,7 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void); */ typedef struct { orte_schizo_base_module_init_fn_t init; + orte_schizo_base_module_define_cli_fn_t define_cli; orte_schizo_base_module_parse_cli_fn_t parse_cli; orte_schizo_base_module_parse_env_fn_t parse_env; orte_schizo_base_module_setup_app_fn_t setup_app; diff --git a/orte/mca/schizo/singularity/schizo_singularity.c b/orte/mca/schizo/singularity/schizo_singularity.c index 059347bf09..bc70b56787 100644 --- a/orte/mca/schizo/singularity/schizo_singularity.c +++ b/orte/mca/schizo/singularity/schizo_singularity.c @@ -29,8 +29,7 @@ #include "schizo_singularity.h" -static int setup_app(char **personality, - orte_app_context_t *context); +static int setup_app(orte_app_context_t *context); static int setup_fork(orte_job_t *jdata, orte_app_context_t *context); @@ -39,18 +38,19 @@ orte_schizo_base_module_t orte_schizo_singularity_module = { .setup_fork = setup_fork }; -static int setup_app(char **personality, - orte_app_context_t *app) +static int setup_app(orte_app_context_t *app) { int i; char *newenv, *pth, *t2; bool takeus = false; - /* see if we are included */ - for (i=0; NULL != personality[i]; i++) { - if (0 == strcmp(personality[i], "singularity")) { - takeus = true; - break; + if (NULL != orte_schizo_base.personalities) { + /* see if we are included */ + for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { + if (0 == strcmp(orte_schizo_base.personalities[i], "singularity")) { + takeus = true; + break; + } } } if (!takeus) { @@ -113,11 +113,13 @@ static int setup_fork(orte_job_t *jdata, char *p, *t2; char dir[MAXPATHLEN]; - /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "singularity")) { - takeus = true; - break; + if (NULL != orte_schizo_base.personalities) { + /* see if we are included */ + for (i=0; NULL != jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "singularity")) { + takeus = true; + break; + } } } if (!takeus) { diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index eef05637d7..2739318f7f 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -263,7 +263,7 @@ int orte_daemon(int argc, char *argv[]) * Since this process can now handle MCA/GMCA parameters, make sure to * process them. 
*/ - mca_base_cmd_line_process_args(cmd_line, &environ, &environ); + mca_base_cmd_line_process_args(argv, &environ, &environ); /* Ensure that enough of OPAL is setup for us to be able to run */ /* diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 9d52f1fb0d..281e3071a5 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -87,16 +87,20 @@ #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/base/base.h" -#include "orte/mca/schizo/schizo.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/routed/routed.h" +#include "orte/mca/schizo/schizo.h" +#include "orte/mca/state/state.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_quit.h" +#include "orte/util/cmd_line.h" +#include "orte/util/pre_condition_transports.h" #include "orte/util/show_help.h" #include "orted_submit.h" @@ -112,187 +116,9 @@ static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; static opal_pointer_array_t tool_jobs; static opal_cmd_line_t *cmd_line=NULL; static bool mycmdline = false; - -static opal_cmd_line_init_t cmd_line_init[] = { - /* Various "obvious" options */ - { NULL, 'h', NULL, "help", 0, - &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - { NULL, 'V', NULL, "version", 0, - &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, - "Print version and exit" }, - - /* tag output */ - { NULL, '\0', "tag-output", "tag-output", 0, - &orte_cmd_line.tag_output, OPAL_CMD_LINE_TYPE_BOOL, - "Tag all output with [job,rank]" }, - { NULL, '\0', "timestamp-output", "timestamp-output", 0, - &orte_cmd_line.timestamp_output, OPAL_CMD_LINE_TYPE_BOOL, - "Timestamp all application process output" }, - { NULL, '\0', "output-filename", "output-filename", 1, - &orte_cmd_line.output_filename, OPAL_CMD_LINE_TYPE_STRING, - "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, - { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, - &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, - "Merge stderr to stdout for each process"}, - - /* select stdin option */ - { NULL, '\0', "stdin", "stdin", 1, - &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, - "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, - - /* request that argv[0] be indexed */ - { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, - &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, - "Uniquely index argv[0] for each process using its rank" }, - - /* Preload the binary on the remote machine */ - { NULL, 's', NULL, "preload-binary", 0, - &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, - "Preload the binary on the remote machine before starting the remote process." }, - - /* Preload files on the remote machine */ - { NULL, '\0', NULL, "preload-files", 1, - &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, - "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." 
}, - - /* Use an appfile */ - { NULL, '\0', NULL, "app", 1, - &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide an appfile; ignore all other command line options" }, - - /* Number of processes; -c, -n, --n, -np, and --np are all - synonyms */ - { NULL, 'c', "np", "np", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - { NULL, '\0', "n", "n", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - - /* uri of the dvm, or at least where to get it */ - { NULL, '\0', "hnp", "hnp", 1, - &orte_cmd_line.hnp, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info" }, - - /* Set a hostfile */ - { NULL, '\0', "hostfile", "hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { NULL, '\0', "machinefile", "machinefile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a default hostfile" }, - { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not attempt to resolve interfaces" }, - - /* Export environment variables; potentially used multiple times, - so it does not make sense to set into a variable */ - { NULL, 'x', NULL, NULL, 1, - NULL, OPAL_CMD_LINE_TYPE_NULL, - "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, - - /* Mapping controls */ - { NULL, 'H', "host", "host", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of hosts to invoke processes on" }, - { NULL, '\0', "nolocal", "nolocal", 0, - &orte_cmd_line.nolocal, OPAL_CMD_LINE_TYPE_BOOL, - "Do not run any MPI applications on the local node" }, - { NULL, '\0', "nooversubscribe", "nooversubscribe", 0, - &orte_cmd_line.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are not to be oversubscribed, even if the system supports such operation"}, - { NULL, '\0', "oversubscribe", "oversubscribe", 0, - &orte_cmd_line.oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, - { NULL, '\0', "cpus-per-proc", "cpus-per-proc", 1, - &orte_cmd_line.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT, - "Number of cpus to use for each process [default=1]" }, - - /* Nperxxx options that do not require topology and are always - * available - included for backwards compatibility - */ - { NULL, '\0', "pernode", "pernode", 0, - &orte_cmd_line.pernode, OPAL_CMD_LINE_TYPE_BOOL, - "Launch one process per available node" }, - { NULL, '\0', "npernode", "npernode", 1, - &orte_cmd_line.npernode, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes" }, - { NULL, '\0', "N", NULL, 1, - &orte_cmd_line.npernode, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes (synonym for npernode)" }, - - /* declare hardware threads as independent cpus */ - { NULL, '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, - &orte_cmd_line.use_hwthreads_as_cpus, OPAL_CMD_LINE_TYPE_BOOL, - "Use hardware threads as independent cpus" }, - - /* include npersocket for backwards compatibility */ - { NULL, '\0', "npersocket", "npersocket", 1, - 
&orte_cmd_line.npersocket, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per socket on all allocated nodes" }, - - /* Mapping options */ - { NULL, '\0', NULL, "map-by", 1, - &orte_cmd_line.mapping_policy, OPAL_CMD_LINE_TYPE_STRING, - "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, - - /* Ranking options */ - { NULL, '\0', NULL, "rank-by", 1, - &orte_cmd_line.ranking_policy, OPAL_CMD_LINE_TYPE_STRING, - "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, - - /* Binding options */ - { NULL, '\0', NULL, "bind-to", 1, - &orte_cmd_line.binding_policy, OPAL_CMD_LINE_TYPE_STRING, - "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported" }, - - { NULL, '\0', "report-bindings", "report-bindings", 0, - &orte_cmd_line.report_bindings, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to report process bindings to stderr" }, - - /* slot list option */ - { NULL, '\0', "slot-list", "slot-list", 1, - &orte_cmd_line.slot_list, OPAL_CMD_LINE_TYPE_STRING, - "List of processor IDs to bind processes to [default=NULL]"}, - - /* mpiexec-like arguments */ - { NULL, '\0', "wdir", "wdir", 1, - &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Set the working directory of the started processes" }, - { NULL, '\0', "wd", "wd", 1, - &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Synonym for --wdir" }, - { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, - &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, - "Set the working directory of the started processes to their session directory" }, - { NULL, '\0', "path", "path", 1, - &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, - "PATH to be used to look for executables to start processes" }, - - { NULL, '\0', "enable-recovery", "enable-recovery", 0, - &orte_cmd_line.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL, - "Enable recovery (resets all recovery options to on)" }, - - { NULL, '\0', "personality", "personality", 1, - &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, - - { NULL, 'd', "debug-devel", "debug-devel", 0, - &orte_cmd_line.debug, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, - - { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, - &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, - "Allow execution as root (STRONGLY DISCOURAGED)" }, - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } -}; +int orte_debugger_attach_fd = -1; +bool orte_debugger_fifo_active=false; +opal_event_t *orte_debugger_attach=NULL; /* * Local functions @@ -313,6 +139,38 @@ static void launch_recv(int status, orte_process_name_t* sender, static void complete_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); +static void attach_debugger(int fd, short event, void *arg); +static void build_debugger_args(orte_app_context_t *debugger); +static void open_fifo (void); +static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, + int argc, char *argv[], int num_procs); + +/* instance the standard MPIR interfaces */ +#define MPIR_MAX_PATH_LENGTH 512 +#define MPIR_MAX_ARG_LENGTH 1024 +struct MPIR_PROCDESC *MPIR_proctable = NULL; +int 
MPIR_proctable_size = 0; +volatile int MPIR_being_debugged = 0; +volatile int MPIR_debug_state = 0; +int MPIR_i_am_starter = 0; +int MPIR_partial_attach_ok = 1; +char MPIR_executable_path[MPIR_MAX_PATH_LENGTH] = {0}; +char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH] = {0}; +volatile int MPIR_forward_output = 0; +volatile int MPIR_forward_comm = 0; +char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH] = {0}; +int MPIR_force_to_main = 0; +static void orte_debugger_init_before_spawn(orte_job_t *jdata); + +ORTE_DECLSPEC void* MPIR_Breakpoint(void); + +/* + * Breakpoint function for parallel debuggers + */ +void* MPIR_Breakpoint(void) +{ + return NULL; +} /* local objects */ typedef struct { @@ -346,6 +204,8 @@ int orte_submit_init(int argc, char *argv[], opal_cmd_line_t *opts) { int rc; + bool version, help; + char *param; OBJ_CONSTRUCT(&tool_jobs, opal_pointer_array_t); opal_pointer_array_init(&tool_jobs, 256, INT_MAX, 128); @@ -356,8 +216,8 @@ int orte_submit_init(int argc, char *argv[], /* setup the cmd line only once */ if (NULL != opts) { - /* just add ours to the end */ - if (OPAL_SUCCESS != (rc = opal_cmd_line_add(opts, cmd_line_init))) { + /* just add the component-defined ones to the end */ + if (OPAL_SUCCESS != (rc = orte_schizo.define_cli(opts))) { return rc; } cmd_line = opts; @@ -365,55 +225,46 @@ int orte_submit_init(int argc, char *argv[], } else { /* create our own */ cmd_line = OBJ_NEW(opal_cmd_line_t); - opal_cmd_line_create(cmd_line, cmd_line_init); - mca_base_cmd_line_setup(cmd_line); + rc = orte_cmd_line_create(cmd_line, argc, argv, + &environ, &environ, + &version, &help); + if (ORTE_SUCCESS != rc) { + OBJ_RELEASE(cmd_line); + return rc; + } + /* print version if requested. Do this before check for help so + that --version --help works as one might expect. */ + if (version) { + char *str, *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + str = opal_info_make_version_str("all", + OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION, + OPAL_GREEK_VERSION, + OPAL_REPO_REV); + if (NULL != str) { + fprintf(stdout, "%s (%s) %s\n\nReport bugs to %s\n", + orte_basename, project_name, str, PACKAGE_BUGREPORT); + free(str); + } + exit(0); + } mycmdline = true; } - /* parse the cmd line - we do this here to get the initial - * MCA parameters that might impact our own init */ - if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(cmd_line, true, - argc, argv)) ) { - if (OPAL_ERR_SILENT != rc) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(rc)); - } - return rc; - } - - /* print version if requested. Do this before check for help so - that --version --help works as one might expect. 
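Since this rework turns orte-submit into an embeddable API, a hypothetical usage sketch may help: the orte_submit_init() and orte_submit_job() signatures below come from this patch (see orted_submit.h later in the diff), while the callback prototype, the callback bodies, and the trivial application argv are assumptions made purely for illustration.

#include <stdio.h>
#include "orte/orted/orted_submit.h"

/* assumed callback shape; check orte_submit_cbfunc_t in orted_submit.h */
static void my_launch_cb(int index, orte_job_t *jdata, int ret, void *cbdata)
{
    fprintf(stderr, "job at tracker index %d launched (ret=%d)\n", index, ret);
}

static void my_complete_cb(int index, orte_job_t *jdata, int ret, void *cbdata)
{
    fprintf(stderr, "job at tracker index %d completed (ret=%d)\n", index, ret);
}

static int run_one_job(int argc, char *argv[])
{
    int rc, index;
    char *app_argv[] = { "hostname", NULL };    /* application to launch */

    /* parse orterun-style options; this also decides whether we act as an
     * HNP or as a tool talking to an existing HNP/DVM */
    if (ORTE_SUCCESS != (rc = orte_submit_init(argc, argv, NULL))) {
        return rc;
    }
    /* submit the job; the callbacks fire later from the event loop */
    return orte_submit_job(app_argv, &index,
                           my_launch_cb, NULL,
                           my_complete_cb, NULL);
}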
*/ - if (orte_cmd_line.version) { - char *str; - str = opal_info_make_version_str("all", - OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION, - OPAL_GREEK_VERSION, - OPAL_REPO_REV); - if (NULL != str) { - fprintf(stdout, "%s %s\n\nReport bugs to %s\n", - orte_basename, str, PACKAGE_BUGREPORT); - free(str); - } - return ORTE_ERR_SILENT; - } - - /* process MCA/GMCA parameters */ - if (OPAL_SUCCESS != (rc = mca_base_cmd_line_process_args(cmd_line, &environ, &environ))) { - return rc; - } - /* Need to initialize OPAL so that install_dirs are filled in */ if (OPAL_SUCCESS != (rc = opal_init(&argc, &argv))) { - OBJ_DESTRUCT(&cmd_line); return rc; } /* Check for help request */ - if (orte_cmd_line.help) { + if (help) { char *str, *args = NULL; char *project_name = NULL; - if (0 == strcmp(orte_basename, "mpirun")) { project_name = "Open MPI"; } else { @@ -429,67 +280,158 @@ int orte_submit_init(int argc, char *argv[], free(str); } free(args); - /* If someone asks for help, that should be all we do */ exit(0); } - /* if they didn't point us at an HNP, that's an error */ + /* set the flags - if they gave us a -hnp option, then + * we are a tool. If not, then we are an HNP */ if (NULL == orte_cmd_line.hnp) { - fprintf(stderr, "%s submit: required option --hnp not provided\n", orte_basename); - return ORTE_ERROR; + orte_process_info.proc_type = ORTE_PROC_HNP; + } else { + orte_process_info.proc_type = ORTE_PROC_TOOL; } - if (0 == strncasecmp(orte_cmd_line.hnp, "file", strlen("file"))) { - char input[1024], *filename; - FILE *fp; + if (ORTE_PROC_IS_TOOL) { + if (0 == strncasecmp(orte_cmd_line.hnp, "file", strlen("file"))) { + char input[1024], *filename; + FILE *fp; - /* it is a file - get the filename */ - filename = strchr(orte_cmd_line.hnp, ':'); - if (NULL == filename) { - /* filename is not correctly formatted */ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); - exit(1); - } - ++filename; /* space past the : */ + /* it is a file - get the filename */ + filename = strchr(orte_cmd_line.hnp, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); + exit(1); + } + ++filename; /* space past the : */ - if (0 >= strlen(filename)) { - /* they forgot to give us the name! */ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); - exit(1); - } + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); + exit(1); + } - /* open the file and extract the uri */ - fp = fopen(filename, "r"); - if (NULL == fp) { /* can't find or read file! */ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, orte_cmd_line.hnp); - exit(1); - } - /* initialize the input to NULLs to ensure any input - * string is NULL-terminated */ - memset(input, 0, 1024); - if (NULL == fgets(input, 1024, fp)) { - /* something malformed about file */ + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! 
*/ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, orte_cmd_line.hnp); + exit(1); + } + /* initialize the input to NULLs to ensure any input + * string is NULL-terminated */ + memset(input, 0, 1024); + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, orte_cmd_line.hnp); + exit(1); + } fclose(fp); - orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, orte_cmd_line.hnp); - exit(1); + input[strlen(input)-1] = '\0'; /* remove newline */ + /* construct the target hnp info */ + opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", input, true, &environ); + } else { + /* should just be the uri itself - construct the target hnp info */ + opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", orte_cmd_line.hnp, true, &environ); } - fclose(fp); - input[strlen(input)-1] = '\0'; /* remove newline */ - /* construct the target hnp info */ - opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", input, true, &environ); + /* we are never allowed to operate as a distributed tool, + * so insist on the ess/tool component */ + opal_setenv(OPAL_MCA_PREFIX"ess", "tool", true, &environ); } else { - /* should just be the uri itself - construct the target hnp info */ - opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", orte_cmd_line.hnp, true, &environ); + /* may look strange, but the way we handle prefix is a little weird + * and probably needs to be addressed more fully at some future point. + * For now, we have a conflict between app_files and cmd line usage. + * Since app_files are used by the C/R system, we will make an + * adjustment here to avoid perturbing that system. + * + * We cannot just have the cmd line parser place any found value + * in the global struct as the app_file parser would replace it. + * So handle this specific cmd line option manually. 
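A side note on the prefix logic that follows: the rule being implemented is that an absolute argv[0] of the form <prefix>/bin/mpirun implies --prefix <prefix>. Below is a small standalone sketch of that rule using libgen.h; the real code uses opal_dirname() and opal_basename(), and the example path is hypothetical.

#include <libgen.h>
#include <stdlib.h>
#include <string.h>

/* prefix_from_argv0("/opt/openmpi/bin/mpirun") returns "/opt/openmpi";
 * returns NULL when argv[0] is not absolute or does not end in /bin/<exe> */
char *prefix_from_argv0(const char *argv0)
{
    char *copy, *dir, *prefix = NULL;

    if ('/' != argv0[0]) {
        return NULL;
    }
    copy = strdup(argv0);
    dir = dirname(copy);                        /* e.g. /opt/openmpi/bin */
    if (0 == strcmp("bin", basename(dir))) {
        char *dircopy = strdup(dir);
        prefix = strdup(dirname(dircopy));      /* e.g. /opt/openmpi */
        free(dircopy);
    }
    free(copy);
    return prefix;
}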
+ */ + orte_cmd_line.prefix = NULL; + orte_cmd_line.path_to_mpirun = NULL; + if (opal_cmd_line_is_taken(cmd_line, "prefix") || + '/' == argv[0][0] || want_prefix_by_default) { + size_t param_len; + if ('/' == argv[0][0]) { + char* tmp_basename = NULL; + /* If they specified an absolute path, strip off the + /bin/" and leave just the prefix */ + orte_cmd_line.path_to_mpirun = opal_dirname(argv[0]); + /* Quick sanity check to ensure we got + something/bin/ and that the installation + tree is at least more or less what we expect it to + be */ + tmp_basename = opal_basename(orte_cmd_line.path_to_mpirun); + if (0 == strcmp("bin", tmp_basename)) { + char* tmp = orte_cmd_line.path_to_mpirun; + orte_cmd_line.path_to_mpirun = opal_dirname(tmp); + free(tmp); + } else { + free(orte_cmd_line.path_to_mpirun); + orte_cmd_line.path_to_mpirun = NULL; + } + free(tmp_basename); + } + /* if both are given, check to see if they match */ + if (opal_cmd_line_is_taken(cmd_line, "prefix") && NULL != orte_cmd_line.path_to_mpirun) { + char *tmp_basename; + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(cmd_line, "prefix", 0, 0)); + /* ensure we strip any trailing '/' */ + if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { + param[strlen(param)-1] = '\0'; + } + tmp_basename = strdup(orte_cmd_line.path_to_mpirun); + if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) { + tmp_basename[strlen(tmp_basename)-1] = '\0'; + } + if (0 != strcmp(param, tmp_basename)) { + orte_show_help("help-orterun.txt", "orterun:double-prefix", + true, orte_basename, orte_basename, + param, tmp_basename, orte_basename); + /* use the prefix over the path-to-mpirun so that + * people can specify the backend prefix as different + * from the local one + */ + free(orte_cmd_line.path_to_mpirun); + orte_cmd_line.path_to_mpirun = NULL; + } + free(tmp_basename); + } else if (NULL != orte_cmd_line.path_to_mpirun) { + param = strdup(orte_cmd_line.path_to_mpirun); + } else if (opal_cmd_line_is_taken(cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to orterun */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, orte_basename, orte_basename); + free(param); + return ORTE_ERR_FATAL; + } + } + + orte_cmd_line.prefix = param; + } + want_prefix_by_default = true; + } } /* Setup MCA params */ orte_register_params(); - /* we are never allowed to operate as a distributed tool, - * so insist on the ess/tool component */ - opal_setenv(OPAL_MCA_PREFIX"ess", "tool", true, &environ); - if (orte_cmd_line.debug) { orte_devel_level_output = true; } @@ -500,7 +442,8 @@ int orte_submit_init(int argc, char *argv[], * up incorrect infrastructure that only a singleton would * require */ - if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, + orte_process_info.proc_type))) { /* cannot call ORTE_ERROR_LOG as it could be the errmgr * never got loaded! 
*/ @@ -511,32 +454,43 @@ int orte_submit_init(int argc, char *argv[], */ opal_finalize(); - /* clear the ess param from the environment so our children - * don't pick it up */ + /* clear params from the environment so our children + * don't pick them up */ opal_unsetenv(OPAL_MCA_PREFIX"ess", &environ); + opal_unsetenv(OPAL_MCA_PREFIX"pmix", &environ); - /* set the info in our contact table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - /* extract the name */ - if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { - orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); - exit(1); + if (ORTE_PROC_IS_TOOL) { + /* set the info in our contact table */ + orte_rml.set_contact_info(orte_process_info.my_hnp_uri); + /* extract the name */ + if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + exit(1); + } + /* set the route to be direct */ + if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + orte_finalize(); + exit(1); + } + + /* set the target hnp as our lifeline so we will terminate if it exits */ + orte_routed.set_lifeline(ORTE_PROC_MY_HNP); + + /* setup to listen for HNP response to my commands */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE, + ORTE_RML_PERSISTENT, complete_recv, NULL); + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, + ORTE_RML_PERSISTENT, launch_recv, NULL); + } else { + /* save the environment for launch purposes. 
This MUST be + * done so that we can pass it to any local procs we + * spawn - otherwise, those local procs won't see any + * non-MCA envars were set in the enviro prior to calling + * orterun + */ + orte_launch_environ = opal_argv_copy(environ); } - /* set the route to be direct */ - if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { - orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); - orte_finalize(); - exit(1); - } - - /* set the target hnp as our lifeline so we will terminate if it exits */ - orte_routed.set_lifeline(ORTE_PROC_MY_HNP); - - /* setup to listen for HNP response to my commands */ - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE, - ORTE_RML_PERSISTENT, complete_recv, NULL); - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, - ORTE_RML_PERSISTENT, launch_recv, NULL); return ORTE_SUCCESS; } @@ -633,10 +587,19 @@ int orte_submit_job(char *argv[], int *index, int rc; orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_SPAWN_JOB_CMD; char *param; - orte_job_t *jdata = NULL; + orte_job_t *jdata = NULL, *daemons; + orte_app_context_t *app, *dapp; trackr_t *trk; int argc; + /* bozo check - we don't allow recursive calls of submit */ + if (NULL != getenv("OMPI_UNIVERSE_SIZE")) { + fprintf(stderr, "\n\n**********************************************************\n\n"); + fprintf(stderr, "%s does not support recursive calls\n", orte_basename); + fprintf(stderr, "\n**********************************************************\n"); + return ORTE_ERR_FATAL; + } + /* reset the globals every time thru as the argv * will modify them */ memset(&orte_cmd_line, 0, sizeof(orte_cmd_line)); @@ -656,13 +619,6 @@ int orte_submit_job(char *argv[], int *index, /* Check for some "global" command line params */ parse_globals(argc, argv, cmd_line); - /* default our personality to OMPI */ - if (NULL == orte_cmd_line.personality) { - opal_argv_append_nosize(&orte_cmd_line.personalities, "ompi"); - } else { - orte_cmd_line.personalities = opal_argv_split(orte_cmd_line.personality, ','); - } - /* create a new job object to hold the info for this one - the * jobid field will be filled in by the PLM when the job is * launched @@ -674,7 +630,6 @@ int orte_submit_job(char *argv[], int *index, */ return ORTE_ERR_OUT_OF_RESOURCE; } - jdata->personality = opal_argv_copy(orte_cmd_line.personalities); trk = OBJ_NEW(trackr_t); trk->jdata = jdata; trk->launch_cb = launch_cb; @@ -686,12 +641,6 @@ int orte_submit_job(char *argv[], int *index, /* pass our tracker ID */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, ORTE_ATTR_GLOBAL, &trk->index, OPAL_INT); - /* flag that we are using the DVM */ - orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - /* flag that the allocation is static - i.e., the DVM is not allowed - * to be adjusted once started, and all unused nodes are to be - * removed from the node pool */ - orte_set_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); /* check for stdout/err directives */ /* if we were asked to tag output, mark it so */ @@ -731,6 +680,15 @@ int orte_submit_job(char *argv[], int *index, /* Parse each app, adding it to the job object */ parse_locals(jdata, argc, argv); + if (0 == jdata->num_apps) { + /* This should never happen -- this case should be caught in + create_app(), but let's just double check... 
*/ + orte_show_help("help-orterun.txt", "orterun:nothing-to-do", + true, orte_basename); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return ORTE_ERR_FATAL; + } + /* create the map object to communicate policies */ jdata->map = OBJ_NEW(orte_job_map_t); @@ -783,12 +741,17 @@ int orte_submit_job(char *argv[], int *index, orte_set_attribute(&jdata->attributes, ORTE_JOB_SLOT_LIST, ORTE_ATTR_GLOBAL, orte_cmd_line.slot_list, OPAL_STRING); } - if (0 == jdata->num_apps) { - /* This should never happen -- this case should be caught in - create_app(), but let's just double check... */ - orte_show_help("help-orterun.txt", "orterun:nothing-to-do", - true, orte_basename); - return ORTE_ERROR_DEFAULT_EXIT_CODE; + /* if recovery was disabled on the cmd line, do so */ + if (orte_cmd_line.enable_recovery) { + ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE); + } + + /* check for suicide test directives */ + if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || + NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { + /* don't forward IO from this process so we can + * see any debug after daemon termination */ + ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT); } /* check for a job timeout specification, to be provided in seconds @@ -808,31 +771,137 @@ int orte_submit_job(char *argv[], int *index, opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv); } - /* if recovery was disabled on the cmd line, do so */ - if (orte_cmd_line.enable_recovery) { - ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE); - } + if (ORTE_PROC_IS_HNP) { + /* get the daemon job object */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - // pack the ORTE_DAEMON_SPAWN_JOB_CMD command and job object and send to HNP at tag ORTE_RML_TAG_DAEMON - req = OBJ_NEW(opal_buffer_t); - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &jdata, 1, ORTE_JOB))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->index, 1, OPAL_INT))) { - ORTE_ERROR_LOG(rc); - return rc; - } + /* check for request to report uri */ + if (NULL != orte_cmd_line.report_uri) { + FILE *fp; + char *rml_uri; + rml_uri = orte_rml.get_contact_info(); + if (0 == strcmp(orte_cmd_line.report_uri, "-")) { + /* if '-', then output to stdout */ + printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + } else if (0 == strcmp(orte_cmd_line.report_uri, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + } else { + fp = fopen(orte_cmd_line.report_uri, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + orte_basename, "uri", orte_cmd_line.report_uri); + exit(0); + } + fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + fclose(fp); + } + if (NULL != rml_uri) { + free(rml_uri); + } + } + /* If we have a prefix, then modify the PATH and + LD_LIBRARY_PATH environment variables in our copy. This + will ensure that any locally-spawned children will + have our executables and libraries in their path - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL); + For now, default to the prefix_dir provided in the first app_context. + Since there always MUST be at least one app_context, we are safe in + doing this. 
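The hunk that follows prepends <prefix>/bin to PATH and <prefix>/lib to LD_LIBRARY_PATH so that locally spawned children inherit the right executables and libraries. Here is the same prepend pattern as a minimal libc-only sketch; the real code builds the directory with opal_os_path() and stores the result with opal_setenv() into orte_launch_environ.

#define _GNU_SOURCE          /* for asprintf() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int prepend_to_path(const char *newdir)
{
    char *oldpath = getenv("PATH");
    char *newpath;

    if (NULL != oldpath) {
        /* the new directory goes first so it wins the lookup */
        if (0 > asprintf(&newpath, "%s:%s", newdir, oldpath)) {
            return -1;
        }
    } else {
        newpath = strdup(newdir);
    }
    setenv("PATH", newpath, 1);   /* 1 = overwrite any existing value */
    free(newpath);
    return 0;
}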
+ */ + param = NULL; + if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) && + orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)¶m, OPAL_STRING)) { + char *oldenv, *newenv, *lib_base, *bin_base; - /* Inform the caller of the tracker index if they passed a index pointer */ - if (NULL != index) - *index = trk->index; + /* copy the prefix into the daemon job so that any launcher + * can find the orteds when we launch the virtual machine + */ + if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) { + /* that's an error in the ess */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + orte_set_attribute(&dapp->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_LOCAL, param, OPAL_STRING); + + lib_base = opal_basename(opal_install_dirs.libdir); + bin_base = opal_basename(opal_install_dirs.bindir); + + /* Reset PATH */ + newenv = opal_os_path( false, param, bin_base, NULL ); + oldenv = getenv("PATH"); + if (NULL != oldenv) { + char *temp; + asprintf(&temp, "%s:%s", newenv, oldenv ); + free( newenv ); + newenv = temp; + } + opal_setenv("PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); + } + free(newenv); + free(bin_base); + + /* Reset LD_LIBRARY_PATH */ + newenv = opal_os_path( false, param, lib_base, NULL ); + oldenv = getenv("LD_LIBRARY_PATH"); + if (NULL != oldenv) { + char* temp; + asprintf(&temp, "%s:%s", newenv, oldenv); + free(newenv); + newenv = temp; + } + opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", + orte_basename, newenv); + } + free(newenv); + free(lib_base); + free(param); + } + + /* pre-condition any network transports that require it */ + if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { + ORTE_ERROR_LOG(rc); + orte_show_help("help-orterun.txt", "orterun:precondition", false, + orte_basename, NULL, NULL, rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return rc; + } + /* setup for debugging */ + orte_debugger_init_before_spawn(jdata); + + rc = orte_plm.spawn(jdata); + } else { + /* flag that we are using the DVM */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + /* flag that the allocation is static - i.e., the DVM is not allowed + * to be adjusted once started, and all unused nodes are to be + * removed from the node pool */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + // pack the ORTE_DAEMON_SPAWN_JOB_CMD command and job object and send to HNP at tag ORTE_RML_TAG_DAEMON + req = OBJ_NEW(opal_buffer_t); + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &jdata, 1, ORTE_JOB))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->index, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL); + + /* Inform the caller of the tracker index if they passed a index pointer */ + if (NULL != index) { + *index = trk->index; + } + } return ORTE_SUCCESS; @@ -890,6 +959,12 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) } } + /* Do we want a user-level debugger? 
*/ + + if (orte_cmd_line.debugger) { + run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_line.num_procs); + } + return ORTE_SUCCESS; } @@ -1071,7 +1146,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, bool *made_app, char ***app_env) { - opal_cmd_line_t cmd_line; + opal_cmd_line_t app_cmd_line; char cwd[OPAL_PATH_MAX]; int i, j, count, rc; char *param, *value; @@ -1093,31 +1168,28 @@ static int create_app(int argc, char* argv[], * Only pick up '-mca foo bar' on this pass. */ if (NULL != orte_cmd_line.appfile) { - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, 0, argv))) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, 0, argv))) { goto cleanup; } } /* Parse application command line options. */ init_globals(); - opal_cmd_line_create(&cmd_line, cmd_line_init); - mca_base_cmd_line_setup(&cmd_line); + OBJ_CONSTRUCT(&app_cmd_line, opal_cmd_line_t); + rc = orte_cmd_line_create(&app_cmd_line, argc, argv, + app_env, &global_mca_env, + NULL, NULL); cmd_line_made = true; - rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); /* Is there an appfile in here? */ if (NULL != orte_cmd_line.appfile) { - OBJ_DESTRUCT(&cmd_line); + OBJ_DESTRUCT(&app_cmd_line); return parse_appfile(jdata, strdup(orte_cmd_line.appfile), app_env); } /* Setup application context */ app = OBJ_NEW(orte_app_context_t); - opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); + opal_cmd_line_get_tail(&app_cmd_line, &count, &app->argv); /* See if we have anything left */ if (0 == count) { @@ -1134,16 +1206,15 @@ static int create_app(int argc, char* argv[], * mpirun -np 2 -mca foo bar ./my-app -mca bip bop * We want to pick up '-mca foo bar' but not '-mca bip bop' */ - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, count, argv))) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, count, argv))) { goto cleanup; } /* Grab all MCA environment variables */ app->env = opal_argv_copy(*app_env); - if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.personalities, - orte_cmd_line.path, - &cmd_line, + if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.path, + &app_cmd_line, environ, &app->env))) { goto cleanup; } @@ -1187,20 +1258,20 @@ static int create_app(int argc, char* argv[], /* Check to see if the user explicitly wanted to disable automatic --prefix behavior */ - if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { + if (opal_cmd_line_is_taken(&app_cmd_line, "noprefix")) { want_prefix_by_default = false; } /* Did the user specify a prefix, or want prefix by default? 
*/ - if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { + if (opal_cmd_line_is_taken(&app_cmd_line, "prefix") || want_prefix_by_default) { size_t param_len; /* if both the prefix was given and we have a prefix * given above, check to see if they match */ - if (opal_cmd_line_is_taken(&cmd_line, "prefix") && + if (opal_cmd_line_is_taken(&app_cmd_line, "prefix") && NULL != orte_cmd_line.prefix) { /* if they don't match, then that merits a warning */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + param = strdup(opal_cmd_line_get_param(&app_cmd_line, "prefix", 0, 0)); /* ensure we strip any trailing '/' */ if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { param[strlen(param)-1] = '\0'; @@ -1221,9 +1292,9 @@ static int create_app(int argc, char* argv[], free(value); } else if (NULL != orte_cmd_line.prefix) { param = strdup(orte_cmd_line.prefix); - } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + } else if (opal_cmd_line_is_taken(&app_cmd_line, "prefix")){ /* must be --prefix alone */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + param = strdup(opal_cmd_line_get_param(&app_cmd_line, "prefix", 0, 0)); } else { /* --enable-orterun-prefix-default was given to orterun */ param = strdup(opal_install_dirs.prefix); @@ -1252,32 +1323,32 @@ static int create_app(int argc, char* argv[], * hostfile and machine file. * We can only deal with one hostfile per app context, otherwise give an error. */ - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "hostfile"))) { if(1 < j) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { - value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); + value = opal_cmd_line_get_param(&app_cmd_line, "hostfile", 0, 0); orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING); } } - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "machinefile"))) { if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { - value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); + value = opal_cmd_line_get_param(&app_cmd_line, "machinefile", 0, 0); orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING); } } /* Did the user specify any hosts? 
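For clarity on what the host handling below produces: every --host occurrence is appended to a list, and the list is then joined into one comma-separated string. A self-contained toy version of that join follows; the real code uses opal_argv_append_nosize() and opal_argv_join().

#include <stdlib.h>
#include <string.h>

/* join_hosts((char *[]){ "n1", "n2", "n3" }, 3) returns "n1,n2,n3" */
char *join_hosts(char **hosts, int nhosts)
{
    size_t len = 0;
    char *joined;
    int i;

    if (nhosts <= 0) {
        return strdup("");
    }
    for (i = 0; i < nhosts; i++) {
        len += strlen(hosts[i]) + 1;   /* room for ',' or the final '\0' */
    }
    if (NULL == (joined = malloc(len))) {
        return NULL;
    }
    joined[0] = '\0';
    for (i = 0; i < nhosts; i++) {
        strcat(joined, hosts[i]);
        if (i + 1 < nhosts) {
            strcat(joined, ",");
        }
    }
    return joined;
}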
*/ - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "host"))) { char **targ=NULL, *tval; for (i = 0; i < j; ++i) { - value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); + value = opal_cmd_line_get_param(&app_cmd_line, "host", i, 0); opal_argv_append_nosize(&targ, value); } tval = opal_argv_join(targ, ','); @@ -1495,7 +1566,7 @@ static int create_app(int argc, char* argv[], OBJ_RELEASE(app); } if (cmd_line_made) { - OBJ_DESTRUCT(&cmd_line); + OBJ_DESTRUCT(&app_cmd_line); } return rc; } @@ -1654,20 +1725,6 @@ static int parse_appfile(orte_job_t *jdata, char *filename, char ***env) return ORTE_SUCCESS; } -void orte_timeout_wakeup(int sd, short args, void *cbdata) -{ - char *tm; - - /* this function gets called when the job execution time - * has hit a prescribed limit - so just abort - */ - tm = getenv("MPIEXEC_TIMEOUT"); - orte_show_help("help-orterun.txt", "orterun:timeout", - true, (NULL == tm) ? "NULL" : tm); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - exit(orte_exit_status); -} - static void launch_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) @@ -1808,3 +1865,880 @@ static void complete_recv(int status, orte_process_name_t* sender, opal_pointer_array_set_item(&tool_jobs, tool_job_index, NULL); OBJ_RELEASE(trk); } + + +/**** DEBUGGER CODE ****/ +/* + * Debugger support for orterun + * + * We interpret the MPICH debugger interface as follows: + * + * a) The launcher + * - spawns the other processes, + * - fills in the table MPIR_proctable, and sets MPIR_proctable_size + * - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1) + * - calls MPIR_Breakpoint() which the debugger will have a + * breakpoint on. + * + * b) Applications start and then spin until MPIR_debug_gate is set + * non-zero by the debugger. + * + * This file implements (a). + * + ************************************************************************** + * + * Note that we have presently tested both TotalView and DDT parallel + * debuggers. They both nominally subscribe to the Etnus attaching + * interface, but there are differences between the two. + * + * TotalView: user launches "totalview mpirun -a ......". + * TV launches mpirun. mpirun launches the application and then calls + * MPIR_Breakpoint(). This is the signal to TV that it's a parallel + * MPI job. TV then reads the proctable in mpirun and attaches itself + * to all the processes (it takes care of launching itself on the + * remote nodes). Upon attaching to all the MPI processes, the + * variable MPIR_being_debugged is set to 1. When it has finished + * attaching itself to all the MPI processes that it wants to, + * MPIR_Breakpoint() returns. + * + * DDT: user launches "ddt bin -np X ". DDT fork/exec's + * mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np + * X ddt-debugger" (not the lack of other arguments -- we can't pass + * anything to mpirun). This app will eventually fork/exec the MPI + * app. DDT does not current set MPIR_being_debugged in the MPI app. + * + ************************************************************************** + * + * We support two ways of waiting for attaching debuggers. The + * implementation spans this file and ompi/debuggers/ompi_debuggers.c. + * + * 1. If using orterun: MPI processes will have the + * orte_in_parallel_debugger MCA param set to true (because not all + * debuggers consistently set MPIR_being_debugged in both the launcher + * and in the MPI procs). 
The HNP will call MPIR_Breakpoint() and + * then RML send a message to VPID 0 (MCW rank 0) when it returns + * (MPIR_Breakpoint() doesn't return until the debugger has attached + * to all relevant processes). Meanwhile, VPID 0 blocks waiting for + * the RML message. All other VPIDs immediately call the grpcomm + * barrier (and therefore block until the debugger attaches). Once + * VPID 0 receives the RML message, we know that the debugger has + * attached to all processes that it cares about, and VPID 0 then + * joins the grpcomm barrier, allowing the job to continue. This + * scheme has the side effect of nicely supporting partial attaches by + * parallel debuggers (i.e., attaching to only some of the MPI + * processes; not necessarily all of them). + * + * 2. If not using orterun: in this case, we know that there will not be an RML message + * sent to VPID 0. So we have to look for a magic environment + * variable from the launcher to know if the jobs will be attached by + * a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on + * MPIR_debug_gate. These environment variable names must be + * hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c). + */ + +/* local globals and functions */ +#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); +#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) + +struct MPIR_PROCDESC { + char *host_name; /* something that can be passed to inet_addr */ + char *executable_name; /* name of binary */ + int pid; /* process pid */ +}; + + +/** + * Initialization of data structures for running under a debugger + * using the MPICH/TotalView parallel debugger interface. Before the + * spawn we need to check if we are being run under a TotalView-like + * debugger; if so then inform applications via an MCA parameter. + */ +static void orte_debugger_init_before_spawn(orte_job_t *jdata) +{ + char *env_name; + orte_app_context_t *app; + int i; + char *attach_fifo; + + if (!MPIR_being_debugged && !orte_in_parallel_debugger) { + /* if we were given a test debugger, then we still want to + * colaunch it + */ + if (NULL != orte_debugger_test_daemon) { + opal_output_verbose(2, orte_debug_output, + "%s No debugger test daemon specified", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + goto launchit; + } + /* if we were given an auto-detect rate, then we want to setup + * an event so we periodically do the check + */ + if (0 < orte_debugger_check_rate) { + opal_output_verbose(2, orte_debug_output, + "%s Setting debugger attach check rate for %d seconds", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_debugger_check_rate); + ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI); + } else if (orte_create_session_dirs) { + /* create the attachment FIFO and setup readevent - cannot be + * done if no session dirs exist! 
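One note on how the FIFO created below is used from the other side: the path is exported to debuggers through MPIR_attach_fifo, and an attaching debugger wakes the HNP by writing a single byte with value 1, which fires the attach_debugger() read event registered in open_fifo(). A hypothetical debugger-side sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int signal_attach(const char *fifo_path /* the value read from MPIR_attach_fifo */)
{
    unsigned char cmd = 1;               /* attach_debugger() ignores other values */
    int fd = open(fifo_path, O_WRONLY);

    if (fd < 0) {
        perror("open attach fifo");
        return -1;
    }
    if (1 != write(fd, &cmd, 1)) {
        close(fd);
        return -1;
    }
    close(fd);
    return 0;
}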
+ */ + attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL); + if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) { + opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno); + free(attach_fifo); + return; + } + strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1); + free(attach_fifo); + open_fifo(); + } + return; + } + + launchit: + opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger"); + + /* tell the procs they are being debugged */ + (void) mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); + + for (i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + opal_setenv(env_name, "1", true, &app->env); + } + free(env_name); +} + +static bool mpir_breakpoint_fired = false; + +static void orte_debugger_dump(void) +{ + int i; + + DUMP_INT(MPIR_being_debugged); + DUMP_INT(MPIR_debug_state); + DUMP_INT(MPIR_partial_attach_ok); + DUMP_INT(MPIR_i_am_starter); + DUMP_INT(MPIR_forward_output); + DUMP_INT(MPIR_proctable_size); + fprintf(stderr, " MPIR_proctable:\n"); + for (i = 0; i < MPIR_proctable_size; i++) { + fprintf(stderr, + " (i, host, exe, pid) = (%d, %s, %s, %d)\n", + i, + MPIR_proctable[i].host_name, + MPIR_proctable[i].executable_name, + MPIR_proctable[i].pid); + } + fprintf(stderr, "MPIR_executable_path: %s\n", + ('\0' == MPIR_executable_path[0]) ? + "NULL" : (char*) MPIR_executable_path); + fprintf(stderr, "MPIR_server_arguments: %s\n", + ('\0' == MPIR_server_arguments[0]) ? + "NULL" : (char*) MPIR_server_arguments); +} + +static void setup_debugger_job(void) +{ + orte_job_t *debugger; + orte_app_context_t *app; + orte_proc_t *proc; + int i, rc; + orte_node_t *node; + orte_vpid_t vpid=0; + char cwd[OPAL_PATH_MAX]; + + /* setup debugger daemon job */ + debugger = OBJ_NEW(orte_job_t); + /* create a jobid for these daemons - this is done solely + * to avoid confusing the rest of the system's bookkeeping + */ + orte_plm_base_create_jobid(debugger); + /* set the personality to ORTE */ + opal_argv_append_nosize(&debugger->personality, "orte"); + /* flag the job as being debugger daemons */ + ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON); + /* unless directed, we do not forward output */ + if (!MPIR_forward_output) { + ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_FORWARD_OUTPUT); + } + /* dont push stdin */ + debugger->stdin_target = ORTE_VPID_INVALID; + /* add it to the global job pool */ + opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger); + /* create an app_context for the debugger daemon */ + app = OBJ_NEW(orte_app_context_t); + if (NULL != orte_debugger_test_daemon) { + app->app = strdup(orte_debugger_test_daemon); + } else { + app->app = strdup((char*)MPIR_executable_path); + } + /* don't currently have an option to pass the debugger + * cwd - probably should add one someday + */ + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + return; + } + app->cwd = strdup(cwd); + orte_remove_attribute(&app->attributes, ORTE_APP_USER_CWD); + opal_argv_append_nosize(&app->argv, app->app); + build_debugger_args(app); + opal_pointer_array_add(debugger->apps, app); + debugger->num_apps = 1; + /* create a job map */ + debugger->map = OBJ_NEW(orte_job_map_t); + /* in building the map, we want to launch one debugger daemon + * on each node that *already has an application process 
on it*. + * We cannot just launch one debugger daemon on EVERY node because + * the original job may not have placed procs on every node. So + * we construct the map here by cycling across all nodes, adding + * only those nodes where num_procs > 0. + */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if this node wasn't included in the vm, ignore it */ + if (NULL == node->daemon) { + continue; + } + /* if the node doesn't have any app procs on it, ignore it */ + if (node->num_procs < 1) { + continue; + } + /* this node has at least one proc, so add it to our map */ + OBJ_RETAIN(node); + opal_pointer_array_add(debugger->map->nodes, node); + debugger->map->num_nodes++; + /* add a debugger daemon to the node - note that the + * debugger daemon does NOT count against our subscribed slots + */ + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = debugger->jobid; + proc->name.vpid = vpid++; + /* set the local/node ranks - we don't actually care + * what these are, but the odls needs them + */ + proc->local_rank = 0; + proc->node_rank = 0; + proc->app_rank = proc->name.vpid; + /* flag the proc as ready for launch */ + proc->state = ORTE_PROC_STATE_INIT; + proc->app_idx = 0; + + OBJ_RETAIN(node); /* maintain accounting on object */ + proc->node = node; + /* add the proc to the job */ + opal_pointer_array_set_item(debugger->procs, proc->name.vpid, proc); + debugger->num_procs++; + + /* add the proc to the node's array */ + OBJ_RETAIN(proc); + opal_pointer_array_add(node->procs, (void*)proc); + node->num_procs++; + } + /* schedule it for launch */ + debugger->state = ORTE_JOB_STATE_INIT; + ORTE_ACTIVATE_JOB_STATE(debugger, ORTE_JOB_STATE_LAUNCH_APPS); +} + +/* + * Initialization of data structures for running under a debugger + * using the MPICH/TotalView parallel debugger interface. This stage + * of initialization must occur after spawn + * + * NOTE: We -always- perform this step to ensure that any debugger + * that attaches to us post-launch of the application can get a + * completed proctable + */ +void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; + orte_proc_t *proc; + orte_app_context_t *appctx; + orte_vpid_t i, j; + opal_buffer_t *buf; + int rc; + char **aliases, *aptr; + + /* if we couldn't get thru the mapper stage, we might + * enter here with no procs. 
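For context on what orte_debugger_init_after_spawn() publishes, here is the MPIR sequence reduced to its essentials for a fictitious two-process job. The struct and globals mirror the ones instanced earlier in this file; the host names, executable paths, and pids are made up, whereas the real code fills each entry from the job's orte_proc_t objects as shown below.

#include <stdlib.h>
#include <string.h>

struct MPIR_PROCDESC {
    char *host_name;          /* something that can be passed to inet_addr */
    char *executable_name;    /* name of binary */
    int pid;                  /* process pid */
};
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_debug_state = 0;

void *MPIR_Breakpoint(void) { return NULL; }   /* debugger plants a breakpoint here */

static void publish_proctable_example(void)
{
    MPIR_proctable_size = 2;
    MPIR_proctable = calloc(MPIR_proctable_size, sizeof(struct MPIR_PROCDESC));

    MPIR_proctable[0].host_name       = strdup("node01");
    MPIR_proctable[0].executable_name = strdup("/home/user/a.out");
    MPIR_proctable[0].pid             = 12345;
    MPIR_proctable[1].host_name       = strdup("node02");
    MPIR_proctable[1].executable_name = strdup("/home/user/a.out");
    MPIR_proctable[1].pid             = 12346;

    MPIR_debug_state = 1;     /* MPIR_DEBUG_SPAWNED */
    MPIR_Breakpoint();        /* returns once the debugger has attached */
}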
Avoid the "zero byte malloc" + * message by checking here + */ + if (MPIR_proctable || 0 == jdata->num_procs) { + /* already initialized */ + opal_output_verbose(5, orte_debug_output, + "%s: debugger already initialized or zero procs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + OBJ_RELEASE(caddy); + if (!mpir_breakpoint_fired) { + /* record that we have triggered the debugger */ + mpir_breakpoint_fired = true; + + /* trigger the debugger */ + MPIR_Breakpoint(); + + /* send a message to rank=0 to release it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || + ORTE_PROC_STATE_UNTERMINATED < proc->state ) { + /* proc is already dead */ + return; + } + buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ + if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, + ORTE_RML_TAG_DEBUGGER_RELEASE, + orte_rml_send_callback, NULL))) { + opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); + OBJ_RELEASE(buf); + } + } + return; + } + + /* fill in the proc table for the application processes */ + + opal_output_verbose(5, orte_debug_output, + "%s: Setting up debugger process table for applications", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + MPIR_debug_state = 1; + + /* set the total number of processes in the job */ + MPIR_proctable_size = jdata->num_procs; + + /* allocate MPIR_proctable */ + MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) * + MPIR_proctable_size); + if (MPIR_proctable == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(caddy); + return; + } + + if (orte_debugger_dump_proctable) { + opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); + } + + /* initialize MPIR_proctable */ + for (j=0; j < jdata->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { + continue; + } + /* store this data in the location whose index + * corresponds to the proc's rank + */ + i = proc->name.vpid; + if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { + continue; + } + + /* take the indicated alias as the hostname, if aliases exist */ + if (orte_retain_aliases) { + aliases = NULL; + aptr = NULL; + if (orte_get_attribute(&proc->node->attributes, ORTE_NODE_ALIAS, (void**)&aptr, OPAL_STRING)) { + aliases = opal_argv_split(aptr, ','); + free(aptr); + if (orte_use_hostname_alias <= opal_argv_count(aliases)) { + MPIR_proctable[i].host_name = strdup(aliases[orte_use_hostname_alias-1]); + } + opal_argv_free(aliases); + } + } else { + /* just use the default name */ + MPIR_proctable[i].host_name = strdup(proc->node->name); + } + + if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { + MPIR_proctable[i].executable_name = + opal_os_path( false, appctx->app, NULL ); + } else { + MPIR_proctable[i].executable_name = + opal_os_path( false, appctx->cwd, appctx->app, NULL ); + } + MPIR_proctable[i].pid = proc->pid; + if (orte_debugger_dump_proctable) { + opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", + ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, + MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); + } + } + + if (0 < opal_output_get_verbosity(orte_debug_output)) { + orte_debugger_dump(); + } + + /* if we are being launched under a debugger, then we must wait + * for it to be ready to go and do some things to start the job + */ + if (MPIR_being_debugged || NULL != orte_debugger_test_daemon || + NULL != 
getenv("ORTE_TEST_DEBUGGER_ATTACH")) { + /* if we are not launching debugger daemons, then trigger + * the debugger - otherwise, we need to wait for the debugger + * daemons to be started + */ + if ('\0' == MPIR_executable_path[0] && NULL == orte_debugger_test_daemon) { + /* record that we have triggered the debugger */ + mpir_breakpoint_fired = true; + + /* trigger the debugger */ + MPIR_Breakpoint(); + + /* send a message to rank=0 to release it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || + ORTE_PROC_STATE_UNTERMINATED < proc->state) { + /* proc is already dead or never registered with us (so we don't have + * contact info for him) + */ + return; + } + opal_output_verbose(2, orte_debug_output, + "%s sending debugger release to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name)); + buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ + if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, + ORTE_RML_TAG_DEBUGGER_RELEASE, + orte_rml_send_callback, NULL))) { + opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); + OBJ_RELEASE(buf); + } + } else { + /* if I am launching debugger daemons, then I need to do so now + * that the job has been started and I know which nodes have + * apps on them + */ + opal_output_verbose(2, orte_debug_output, + "%s Cospawning debugger daemons %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? + MPIR_executable_path : orte_debugger_test_daemon); + setup_debugger_job(); + } + /* we don't have anything else to do */ + OBJ_RELEASE(caddy); + return; + } + + /* if we are not being debugged, then just cleanup and depart */ + OBJ_RELEASE(caddy); +} + +/* + * Process one line from the orte_base_user_debugger MCA param and + * look for that debugger in the path. If we find it, fill in + * new_argv. + */ +static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line, + int argc, char **argv, char ***new_argv, int num_procs) +{ + int ret = ORTE_SUCCESS; + int i, j, count; + char *line = NULL, *tmp = NULL, *full_line = strdup(orig_line); + char **orterun_argv = NULL, **executable_argv = NULL, **line_argv = NULL; + char cwd[OPAL_PATH_MAX]; + bool used_num_procs = false; + bool single_app = false; + bool fail_needed_executable = false; + + line = full_line; + if (NULL == line) { + ret = ORTE_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* Trim off whitespace at the beginning and ending of line */ + + for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) { + continue; + } + for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) { + line[i] = '\0'; + } + if (strlen(line) <= 0) { + ret = ORTE_ERROR; + goto out; + } + + /* Get the tail of the command line (i.e., the user executable / + argv) */ + + opal_cmd_line_get_tail(cmd_line, &i, &executable_argv); + + /* Make a new copy of the orterun command line args, without the + orterun token itself, and without the --debug, --debugger, and + -tv flags. 
*/ + + orterun_argv = opal_argv_copy(argv); + count = opal_argv_count(orterun_argv); + opal_argv_delete(&count, &orterun_argv, 0, 1); + for (i = 0; NULL != orterun_argv[i]; ++i) { + count = opal_argv_count(orterun_argv); + if (0 == strcmp(orterun_argv[i], "-debug") || + 0 == strcmp(orterun_argv[i], "--debug")) { + opal_argv_delete(&count, &orterun_argv, i, 1); + } else if (0 == strcmp(orterun_argv[i], "-tv") || + 0 == strcmp(orterun_argv[i], "--tv")) { + opal_argv_delete(&count, &orterun_argv, i, 1); + } else if (0 == strcmp(orterun_argv[i], "--debugger") || + 0 == strcmp(orterun_argv[i], "-debugger")) { + opal_argv_delete(&count, &orterun_argv, i, 2); + } + } + + /* Replace @@ tokens - line should never realistically be bigger + than MAX_INT, so just cast to int to remove compiler warning */ + + *new_argv = NULL; + line_argv = opal_argv_split(line, ' '); + if (NULL == line_argv) { + ret = ORTE_ERR_NOT_FOUND; + goto out; + } + for (i = 0; NULL != line_argv[i]; ++i) { + if (0 == strcmp(line_argv[i], "@mpirun@") || + 0 == strcmp(line_argv[i], "@orterun@")) { + opal_argv_append_nosize(new_argv, argv[0]); + } else if (0 == strcmp(line_argv[i], "@mpirun_args@") || + 0 == strcmp(line_argv[i], "@orterun_args@")) { + for (j = 0; NULL != orterun_argv && NULL != orterun_argv[j]; ++j) { + opal_argv_append_nosize(new_argv, orterun_argv[j]); + } + } else if (0 == strcmp(line_argv[i], "@np@")) { + used_num_procs = true; + asprintf(&tmp, "%d", num_procs); + opal_argv_append_nosize(new_argv, tmp); + free(tmp); + } else if (0 == strcmp(line_argv[i], "@single_app@")) { + /* This token is only a flag; it is not replaced with any + alternate text */ + single_app = true; + } else if (0 == strcmp(line_argv[i], "@executable@")) { + /* If we found the executable, paste it in. Otherwise, + this is a possible error. */ + if (NULL != executable_argv) { + opal_argv_append_nosize(new_argv, executable_argv[0]); + } else { + fail_needed_executable = true; + } + } else if (0 == strcmp(line_argv[i], "@executable_argv@")) { + /* If we found the tail, paste in the argv. Otherwise, + this is a possible error. */ + if (NULL != executable_argv) { + for (j = 1; NULL != executable_argv[j]; ++j) { + opal_argv_append_nosize(new_argv, executable_argv[j]); + } + } else { + fail_needed_executable = true; + } + } else { + /* It wasn't a special token, so just copy it over */ + opal_argv_append_nosize(new_argv, line_argv[i]); + } + } + + /* Can we find argv[0] in the path? */ + + getcwd(cwd, OPAL_PATH_MAX); + tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd); + if (NULL != tmp) { + free(tmp); + + /* Ok, we found a good debugger. Check for some error + conditions. */ + tmp = opal_argv_join(argv, ' '); + + /* We do not support launching a debugger that requires the + -np value if the user did not specify -np on the command + line. */ + if (used_num_procs && 0 == num_procs) { + free(tmp); + tmp = opal_argv_join(orterun_argv, ' '); + orte_show_help("help-orterun.txt", "debugger requires -np", + true, (*new_argv)[0], argv[0], tmp, + (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Some debuggers do not support launching MPMD */ + else if (single_app && NULL != strstr(tmp, " : ")) { + orte_show_help("help-orterun.txt", + "debugger only accepts single app", true, + (*new_argv)[0], (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Some debuggers do not use orterun/mpirun, and therefore + must have an executable to run (e.g., cannot use mpirun's + app context file feature). 
*/ + else if (fail_needed_executable) { + orte_show_help("help-orterun.txt", + "debugger requires executable", true, + (*new_argv)[0], argv[0], (*new_argv)[0], argv[0], + (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Otherwise, we succeeded. Return happiness. */ + else { + goto out; + } + } + + /* All done -- didn't find it */ + + opal_argv_free(*new_argv); + *new_argv = NULL; + ret = ORTE_ERR_NOT_FOUND; + + out: + if (NULL != orterun_argv) { + opal_argv_free(orterun_argv); + } + if (NULL != executable_argv) { + opal_argv_free(executable_argv); + } + if (NULL != line_argv) { + opal_argv_free(line_argv); + } + if (NULL != tmp) { + free(tmp); + } + if (NULL != full_line) { + free(full_line); + } + return ret; +} + +static void open_fifo (void) +{ + if (orte_debugger_attach_fd > 0) { + close(orte_debugger_attach_fd); + } + + orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0); + if (orte_debugger_attach_fd < 0) { + opal_output(0, "%s unable to open debugger attach fifo", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + + /* Set this fd to be close-on-exec so that children don't see it */ + if (opal_fd_set_cloexec(orte_debugger_attach_fd) != OPAL_SUCCESS) { + opal_output(0, "%s unable to set debugger attach fifo to CLOEXEC", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + close(orte_debugger_attach_fd); + orte_debugger_attach_fd = -1; + return; + } + + opal_output_verbose(2, orte_debug_output, + "%s Monitoring debugger attach fifo %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + MPIR_attach_fifo); + orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t)); + opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd, + OPAL_EV_READ, attach_debugger, orte_debugger_attach); + + orte_debugger_fifo_active = true; + opal_event_add(orte_debugger_attach, 0); +} + +static void attach_debugger(int fd, short event, void *arg) +{ + unsigned char fifo_cmd; + int rc; + orte_timer_t *tm; + + if (orte_debugger_fifo_active) { + orte_debugger_attach = (opal_event_t*)arg; + orte_debugger_fifo_active = false; + + rc = read(orte_debugger_attach_fd, &fifo_cmd, sizeof(fifo_cmd)); + if (!rc) { + /* release the current event */ + opal_event_free(orte_debugger_attach); + /* reopen device to clear hangup */ + open_fifo(); + return; + } + if (1 != fifo_cmd) { + /* ignore the cmd */ + orte_debugger_fifo_active = true; + opal_event_add(orte_debugger_attach, 0); + return; + } + } + + if (!MPIR_being_debugged && !orte_debugger_test_attach) { + /* false alarm - reset the read or timer event */ + if (0 == orte_debugger_check_rate) { + orte_debugger_fifo_active = true; + opal_event_add(orte_debugger_attach, 0); + } else if (!MPIR_being_debugged) { + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); + } + return; + } + + opal_output_verbose(1, orte_debug_output, + "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); + + /* a debugger has attached! All the MPIR_Proctable + * data is already available, so we only need to + * check to see if we should spawn any daemons + */ + if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) { + opal_output_verbose(2, orte_debug_output, + "%s Spawning debugger daemons %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? 
+ MPIR_executable_path : orte_debugger_test_daemon); + setup_debugger_job(); + } + + /* reset the read or timer event */ + if (0 == orte_debugger_check_rate) { + orte_debugger_fifo_active = true; + opal_event_add(orte_debugger_attach, 0); + } else if (!MPIR_being_debugged) { + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); + } +} + +static void build_debugger_args(orte_app_context_t *debugger) +{ + int i, j; + char mpir_arg[MPIR_MAX_ARG_LENGTH]; + + if ('\0' != MPIR_server_arguments[0]) { + j=0; + memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); + for (i=0; i < MPIR_MAX_ARG_LENGTH; i++) { + if (MPIR_server_arguments[i] == '\0') { + if (0 < j) { + opal_argv_append_nosize(&debugger->argv, mpir_arg); + memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); + j=0; + } + } else { + mpir_arg[j] = MPIR_server_arguments[i]; + j++; + } + } + } +} + +/** + * Run a user-level debugger + */ +static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, + int argc, char *argv[], int num_procs) +{ + int i, id, ret; + char **new_argv = NULL; + const char **tmp; + char *value, **lines, *env_name; + + /* Get the orte_base_debug MCA parameter and search for a debugger + that can run */ + + id = mca_base_var_find("orte", "orte", NULL, "base_user_debugger"); + if (id < 0) { + orte_show_help("help-orterun.txt", "debugger-mca-param-not-found", + true); + exit(1); + } + + ret = mca_base_var_get_value (id, &tmp, NULL, NULL); + if (OPAL_SUCCESS != ret || NULL == tmp || NULL == tmp[0]) { + orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty", + true); + exit(1); + } + + /* Look through all the values in the MCA param */ + + lines = opal_argv_split(tmp[0], ':'); + for (i = 0; NULL != lines[i]; ++i) { + if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, + &new_argv, num_procs)) { + break; + } + } + + /* If we didn't find one, abort */ + + if (NULL == lines[i]) { + orte_show_help("help-orterun.txt", "debugger-not-found", true); + exit(1); + } + opal_argv_free(lines); + + /* We found one */ + + /* cleanup the MPIR arrays in case the debugger doesn't set them */ + memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH); + memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH); + + /* Set an MCA param so that everyone knows that they are being + launched under a debugger; not all debuggers are consistent + about setting MPIR_being_debugged in both the launcher and the + MPI processes */ + ret = mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); + if (OPAL_SUCCESS == ret && NULL != env_name) { + opal_setenv(env_name, "1", true, &environ); + free(env_name); + } + + /* Launch the debugger */ + execvp(new_argv[0], new_argv); + value = opal_argv_join(new_argv, ' '); + orte_show_help("help-orterun.txt", "debugger-exec-failed", + true, basename, value, new_argv[0]); + free(value); + opal_argv_free(new_argv); + exit(1); +} + +void orte_debugger_detached(int fd, short event, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + OBJ_RELEASE(caddy); + + /* need to ensure MPIR_Breakpoint is called again if another debugger attaches */ + mpir_breakpoint_fired = false; +} + +void orte_timeout_wakeup(int sd, short args, void *cbdata) +{ + char *tm; + + /* this function gets called when the job execution time + * has hit a prescribed limit - so just abort + */ + tm = getenv("MPIEXEC_TIMEOUT"); + orte_show_help("help-orterun.txt", "orterun:timeout", + true, (NULL == tm) ? 
"NULL" : tm); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + /* if we are testing HNP suicide, then just exit */ + if (ORTE_PROC_IS_HNP && + NULL != getenv("ORTE_TEST_HNP_SUICIDE")) { + opal_output(0, "HNP exiting w/o cleanup"); + exit(1); + } + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* set the global abnormal exit flag */ + orte_abnormal_term_ordered = true; +} diff --git a/orte/orted/orted_submit.h b/orte/orted/orted_submit.h index 23eead9794..ed6a8313d4 100644 --- a/orte/orted/orted_submit.h +++ b/orte/orted/orted_submit.h @@ -28,6 +28,13 @@ ORTE_DECLSPEC int orte_submit_job(char *cmd[], int *index, orte_submit_cbfunc_t launch_cb, void *launch_cbdata, orte_submit_cbfunc_t complete_cb, void *complete_cbdata); ORTE_DECLSPEC int orte_submit_halt(void); +ORTE_DECLSPEC void orte_debugger_init_after_spawn(int fd, short event, void *arg); +ORTE_DECLSPEC void orte_debugger_detached(int fd, short event, void *arg); + +extern int orte_debugger_attach_fd; +extern bool orte_debugger_fifo_active; +extern opal_event_t *orte_debugger_attach; +extern char MPIR_attach_fifo[]; /** * Global struct for catching orte command line options. @@ -82,6 +89,7 @@ struct orte_cmd_line_t { bool merge; bool enable_recovery; char *hnp; + bool staged_exec; }; typedef struct orte_cmd_line_t orte_cmd_line_t; ORTE_DECLSPEC extern orte_cmd_line_t orte_cmd_line; diff --git a/orte/orted/pmix/pmix_server_dyn.c b/orte/orted/pmix/pmix_server_dyn.c index 907208329d..9c800410ff 100644 --- a/orte/orted/pmix/pmix_server_dyn.c +++ b/orte/orted/pmix/pmix_server_dyn.c @@ -169,25 +169,17 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, /* create the job object */ jdata = OBJ_NEW(orte_job_t); + jdata->map = OBJ_NEW(orte_job_map_t); /* transfer the job info across */ OPAL_LIST_FOREACH(info, job_info, opal_value_t) { if (0 == strcmp(info->key, OPAL_PMIX_PERSONALITY)) { jdata->personality = opal_argv_split(info->data.string, ','); } else if (0 == strcmp(info->key, OPAL_PMIX_MAPPER)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } jdata->map->req_mapper = strdup(info->data.string); } else if (0 == strcmp(info->key, OPAL_PMIX_DISPLAY_MAP)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } jdata->map->display_map = true; } else if (0 == strcmp(info->key, OPAL_PMIX_PPR)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -198,9 +190,6 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_PPR); jdata->map->ppr = strdup(info->data.string); } else if (0 == strcmp(info->key, OPAL_PMIX_MAPBY)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -214,9 +203,6 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, return rc; } } else if (0 == strcmp(info->key, OPAL_PMIX_RANKBY)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } if (ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { /* not allowed to provide multiple ranking policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -231,9 +217,6 @@ int 
pmix_server_spawn_fn(opal_process_name_t *requestor, return rc; } } else if (0 == strcmp(info->key, OPAL_PMIX_BINDTO)) { - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - } if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-opal-hwloc-base.txt", "redefining-policy", true, diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 2651e373ac..51f60c290c 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -13,7 +13,7 @@ # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2016 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -36,7 +36,6 @@ SUBDIRS += \ tools/orte-info \ tools/orte-migrate \ tools/orte-server \ - tools/orte-submit \ tools/orte-dvm DIST_SUBDIRS += \ @@ -51,6 +50,5 @@ DIST_SUBDIRS += \ tools/orte-info \ tools/orte-migrate \ tools/orte-server \ - tools/orte-submit \ tools/orte-dvm diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c index b51d8f538c..b732f07634 100644 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ b/orte/tools/orte-checkpoint/orte-checkpoint.c @@ -417,7 +417,7 @@ static int parse_args(int argc, char *argv[]) { /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); + mca_base_cmd_line_process_args(argc, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index 479aed125b..2a05a685c8 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -234,7 +234,7 @@ int main(int argc, char *argv[]) * opal_init_util() since mca_base_cmd_line_process_args() does *not* * depend upon opal_init_util() functionality. */ - if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { + if (OPAL_SUCCESS != mca_base_cmd_line_process_args(argv, &environ, &environ)) { exit(1); } diff --git a/orte/tools/orte-info/orte-info.c b/orte/tools/orte-info/orte-info.c index 5b2230fbb5..8ff27811ee 100644 --- a/orte/tools/orte-info/orte-info.c +++ b/orte/tools/orte-info/orte-info.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) exit(cmd_error ? 1 : 0); } - mca_base_cmd_line_process_args(orte_info_cmd_line, &app_env, &global_env); + mca_base_cmd_line_process_args(argv, &app_env, &global_env); /* putenv() all the stuff that we got back from env (in case the * user specified some --mca params on the command line). 
This diff --git a/orte/tools/orte-migrate/orte-migrate.c b/orte/tools/orte-migrate/orte-migrate.c index 6b7f9827ad..1a8bb11d0a 100644 --- a/orte/tools/orte-migrate/orte-migrate.c +++ b/orte/tools/orte-migrate/orte-migrate.c @@ -304,7 +304,7 @@ static int parse_args(int argc, char *argv[]) { /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); + mca_base_cmd_line_process_args(argv, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c index 9a7974e8e6..5b94a10093 100644 --- a/orte/tools/orte-restart/orte-restart.c +++ b/orte/tools/orte-restart/orte-restart.c @@ -461,7 +461,7 @@ static int parse_args(int argc, char *argv[]) /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); + mca_base_cmd_line_process_args(argv, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-server/orte-server.c b/orte/tools/orte-server/orte-server.c index 46ef0b9046..89ac610970 100644 --- a/orte/tools/orte-server/orte-server.c +++ b/orte/tools/orte-server/orte-server.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) * Since this process can now handle MCA/GMCA parameters, make sure to * process them. */ - mca_base_cmd_line_process_args(cmd_line, &environ, &environ); + mca_base_cmd_line_process_args(argv, &environ, &environ); /* if debug is set, then set orte_debug_flag so that the data server * code will output diff --git a/orte/tools/orte-submit/Makefile.am b/orte/tools/orte-submit/Makefile.am deleted file mode 100644 index 93d7e1068e..0000000000 --- a/orte/tools/orte-submit/Makefile.am +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This is not quite in the Automake spirit, but we have to do it. -# Since the totalview portion of the library must be built with -g, we -# must eliminate the CFLAGS that are passed in here by default (which -# may already have debugging and/or optimization flags). We use -# post-processed forms of the CFLAGS in the library targets down -# below. 
- -CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS) - -include $(top_srcdir)/Makefile.ompi-rules - -man_pages = orte-submit.1 -EXTRA_DIST = $(man_pages:.1=.1in) - -if OPAL_INSTALL_BINARIES - -bin_PROGRAMS = orte-submit - -nodist_man_MANS = $(man_pages) - -# Ensure that the man pages are rebuilt if the opal_config.h file -# changes; a "good enough" way to know if configure was run again (and -# therefore the release date or version may have changed) -$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h - -endif # OPAL_INSTALL_BINARIES - -orte_submit_SOURCES = \ - orte-submit.c - -orte_submit_LDADD = \ - $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la - -distclean-local: - rm -f $(man_pages) diff --git a/orte/tools/orte-submit/orte-submit.1in b/orte/tools/orte-submit/orte-submit.1in deleted file mode 100644 index d37c48188b..0000000000 --- a/orte/tools/orte-submit/orte-submit.1in +++ /dev/null @@ -1,1428 +0,0 @@ -.\" -*- nroff -*- -.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. -.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. -.\” Copyright (c) 2015 Intel, Inc. All rights reserved. -.\" $COPYRIGHT$ -.\" -.\" Man page for ORTE's orte-submit command -.\" -.\" .TH name section center-footer left-footer center-header -.TH ORTE-SUBMIT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" -.\" ************************** -.\" Name Section -.\" ************************** -.SH NAME -. -orte-submit, ompi-submit \- Execute serial and parallel jobs in Open MPI using a DVM. - -.B Note: -\fIompi-submit\fP and \fIorte-submit\fP are synonyms for each -other. Using either of the names will produce the same behavior. -. -.\" ************************** -.\" Synopsis Section -.\" ************************** -.SH SYNOPSIS -. -.PP -Single Process Multiple Data (SPMD) Model: - -.B ompi-submit -[ options ] -.B -[ ] -.P - -Multiple Instruction Multiple Data (MIMD) Model: - -.B ompi-submit -[ global_options ] - [ local_options1 ] -.B -[ ] : - [ local_options2 ] -.B -[ ] : - ... : - [ local_optionsN ] -.B -[ ] -.P - -Note that in both models, invoking \fIompi-submit\fP via an absolute path -name is equivalent to specifying the \fI--prefix\fP option with a -\fI\fR value equivalent to the directory where \fIompi-submit\fR -resides, minus its last subdirectory. For example: - - \fB%\fP /usr/local/bin/ompi-submit ... - -is equivalent to - - \fB%\fP ompi-submit --prefix /usr/local - -. -.\" ************************** -.\" Quick Summary Section -.\" ************************** -.SH QUICK SUMMARY -. -.B -Use of \fIorte-submit\fP requires that you first start the Distributed Virtual -Machine (DVM) using \fIorte-dvm\fP. -.P -If you are simply looking for how to run an MPI application, you -probably want to use a command line of the following form: - - \fB%\fP ompi-submit [ -np X ] [ --hostfile ] - -This will run X copies of \fI\fR in your current run-time -environment (if running under a supported resource manager, Open MPI's -\fIompi-submit\fR will usually automatically use the corresponding resource manager -process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR, -which require the use of a hostfile, or will default to running all X -copies on the localhost), scheduling (by default) in a round-robin fashion by -CPU slot. See the rest of this page for more details. -.P -Please note that ompi-submit automatically binds processes as of the start of the -v1.8 series. 
Two binding patterns are used in the absence of any further directives: -.TP 18 -.B Bind to core: -when the number of processes is <= 2 -. -. -.TP -.B Bind to socket: -when the number of processes is > 2 -. -. -.P -If your application uses threads, then you probably want to ensure that you are -either not bound at all (by specifying --bind-to none), or bound to multiple cores -using an appropriate binding level or specific number of processing elements per -application process. -. -.\" ************************** -.\" Options Section -.\" ************************** -.SH OPTIONS -. -.I ompi-submit -will send the name of the directory where it was invoked on the local -node to each of the remote nodes, and attempt to change to that -directory. See the "Current Working Directory" section below for further -details. -.\" -.\" Start options listing -.\" Indent 10 characters from start of first column to start of second column -.TP 10 -.B -The program executable. This is identified as the first non-recognized argument -to ompi-submit. -. -. -.TP -.B -Pass these run-time arguments to every new process. These must always -be the last arguments to \fIompi-submit\fP. If an app context file is used, -\fI\fP will be ignored. -. -. -.TP -.B -h\fR,\fP --help -Display help for this command -. -. -.TP -.B -q\fR,\fP --quiet -Suppress informative messages from orte-submit during application execution. -. -. -.TP -.B -v\fR,\fP --verbose -Be verbose -. -. -.TP -.B -V\fR,\fP --version -Print version number. If no other arguments are given, this will also -cause orte-submit to exit. -. -. -. -. -.P -Use one of the following options to specify which hosts (nodes) of the DVM to run on. -Specifying hosts outside the DVM will result in an error. -. -. -.TP -.B -H\fR,\fP -host\fR,\fP --host \fR\fP -List of hosts on which to invoke processes. -. -. -.TP -.B --hostfile\fR,\fP --hostfile \fR\fP -Provide a hostfile to use. -.\" JJH - Should have man page for how to format a hostfile properly. -. -. -.TP -.B -machinefile\fR,\fP --machinefile \fR\fP -Synonym for \fI-hostfile\fP. -. -. -. -. -.P -The following options specify the number of processes to launch. Note that none -of the options imply a particular binding policy - e.g., requesting N processes -for each socket does not imply that the processes will be bound to the socket. -. -. -.TP -.B -c\fR,\fP -n\fR,\fP --n\fR,\fP -np \fR<#>\fP -Run this many copies of the program on the given nodes. This option -indicates that the specified file is an executable program and not an -application context. If no value is provided for the number of copies to -execute (i.e., neither the "-np" nor its synonyms are provided on the command -line), Open MPI will automatically execute a copy of the program on -each process slot (see below for description of a "process slot"). This -feature, however, can only be used in the SPMD model and will return an -error (without beginning execution of the application) otherwise. -. -. -.TP -.B —map-by ppr:N: -Launch N times the number of objects of the specified type on each node. -. -. -.TP -.B -npersocket\fR,\fP --npersocket <#persocket> -On each node, launch this many processes times the number of processor -sockets on the node. -The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option. -(deprecated in favor of --map-by ppr:n:socket) -. -. -.TP -.B -npernode\fR,\fP --npernode <#pernode> -On each node, launch this many processes. -(deprecated in favor of --map-by ppr:n:node) -. -. 
-.TP -.B -pernode\fR,\fP --pernode -On each node, launch one process -- equivalent to \fI-npernode\fP 1. -(deprecated in favor of --map-by ppr:1:node) -. -. -. -. -.P -To map processes: -. -. -.TP -.B --map-by -Map to the specified object, defaults to \fIsocket\fP. Supported options -include slot, hwthread, core, L1cache, L2cache, L3cache, socket, numa, -board, node, sequential, distance, and ppr. Any object can include -modifiers by adding a \fR:\fP and any combination of PE=n (bind n -processing elements to each proc), SPAN (load -balance the processes across the allocation), OVERSUBSCRIBE (allow -more processes on a node than processing elements), and NOOVERSUBSCRIBE. -This includes PPR, where the pattern would be terminated by another colon -to separate it from the modifiers. -. -.TP -.B -bycore\fR,\fP --bycore -Map processes by core (deprecated in favor of --map-by core) -. -.TP -.B -bysocket\fR,\fP --bysocket -Map processes by socket (deprecated in favor of --map-by socket) -. -.TP -.B -nolocal\fR,\fP --nolocal -Do not run any copies of the launched application on the same node as -orte-submit is running. This option will override listing the localhost -with \fB--host\fR or any other host-specifying mechanism. -. -.TP -.B -nooversubscribe\fR,\fP --nooversubscribe -Do not oversubscribe any nodes; error (without starting any processes) -if the requested number of processes would cause oversubscription. -This option implicitly sets "max_slots" equal to the "slots" value for -each node. -. -.TP -.B -bynode\fR,\fP --bynode -Launch processes one per node, cycling by node in a round-robin -fashion. This spreads processes evenly among nodes and assigns -MPI_COMM_WORLD ranks in a round-robin, "by node" manner. -. -. -. -. -.P -To order processes' ranks in MPI_COMM_WORLD: -. -. -.TP -.B --rank-by -Rank in round-robin fashion according to the specified object, -defaults to \fIslot\fP. Supported options -include slot, hwthread, core, L1cache, L2cache, L3cache, -socket, numa, board, and node. -. -. -. -. -.P -For process binding: -. -.TP -.B --bind-to -Bind processes to the specified object, defaults to \fIcore\fP. Supported options -include slot, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board, and none. -. -.TP -.B -cpus-per-proc\fR,\fP --cpus-per-proc <#perproc> -Bind each process to the specified number of cpus. -(deprecated in favor of --map-by :PE=n) -. -.TP -.B -cpus-per-rank\fR,\fP --cpus-per-rank <#perrank> -Alias for \fI-cpus-per-proc\fP. -(deprecated in favor of --map-by :PE=n) -. -.TP -.B -bind-to-core\fR,\fP --bind-to-core -Bind processes to cores (deprecated in favor of --bind-to core) -. -.TP -.B -bind-to-socket\fR,\fP --bind-to-socket -Bind processes to processor sockets (deprecated in favor of --bind-to socket) -. -.TP -.B -bind-to-none\fR,\fP --bind-to-none -Do not bind processes (deprecated in favor of --bind-to none) -. -.TP -.B -report-bindings\fR,\fP --report-bindings -Report any bindings for launched processes. -. -.TP -.B -slot-list\fR,\fP --slot-list -List of processor IDs to be used for binding MPI processes. The specified bindings will -be applied to all MPI processes. See explanation below for syntax. -. -. -. -. -.P -For rankfiles: -. -. -.TP -.B -rf\fR,\fP --rankfile -Provide a rankfile file. -. -. -. -. -.P -To manage standard I/O: -. -. -.TP -.B -output-filename\fR,\fP --output-filename \fR\fP -Redirect the stdout, stderr, and stddiag of all processes to a process-unique version of -the specified filename. 
Any directories in the filename will automatically be created. -Each output file will consist of filename.id, where the id will be the -processes' rank in MPI_COMM_WORLD, left-filled with -zero's for correct ordering in listings. -. -. -.TP -.B -stdin\fR,\fP --stdin -The MPI_COMM_WORLD rank of the process that is to receive stdin. The -default is to forward stdin to MPI_COMM_WORLD rank 0, but this option -can be used to forward stdin to any process. It is also acceptable to -specify \fInone\fP, indicating that no processes are to receive stdin. -. -. -.TP -.B -tag-output\fR,\fP --tag-output -Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, MCW_rank]\fP indicating the process jobid -and MPI_COMM_WORLD rank of the process that generated the output, and the channel which generated it. -. -. -.TP -.B -timestamp-output\fR,\fP --timestamp-output -Timestamp each line of output to stdout, stderr, and stddiag. -. -. -.TP -.B -xml\fR,\fP --xml -Provide all output to stdout, stderr, and stddiag in an xml format. -. -. -.TP -.B -xterm\fR,\fP --xterm \fR\fP -Display the output from the processes identified by their -MPI_COMM_WORLD ranks in separate xterm windows. The ranks are specified -as a comma-separated list of ranges, with a -1 indicating all. A separate -window will be created for each specified process. -.B Note: -xterm will normally terminate the window upon termination of the process running -within it. However, by adding a "!" to the end of the list of specified ranks, -the proper options will be provided to ensure that xterm keeps the window open -\fIafter\fP the process terminates, thus allowing you to see the process' output. -Each xterm window will subsequently need to be manually closed. -.B Note: -In some environments, xterm may require that the executable be in the user's -path, or be specified in absolute or relative terms. Thus, it may be necessary -to specify a local executable as "./foo" instead of just "foo". If xterm fails to -find the executable, ompi-submit will hang, but still respond correctly to a ctrl-c. -If this happens, please check that the executable is being specified correctly -and try again. -. -. -. -. -.P -To manage files and runtime environment: -. -. -.TP -.B -path\fR,\fP --path \fR\fP - that will be used when attempting to locate the requested -executables. This is used prior to using the local PATH setting. -. -. -.TP -.B --prefix \fR\fP -Prefix directory that will be used to set the \fIPATH\fR and -\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or -the target process. See the "Remote Execution" section, below. -. -. -.TP -.B --preload-binary -Copy the specified executable(s) to remote machines prior to starting remote processes. The -executables will be copied to the Open MPI session directory and will be deleted upon -completion of the job. -. -. -.TP -.B --preload-files -Preload the comma separated list of files to the current working directory of the remote -machines where processes will be launched prior to starting those processes. -. -. -.TP -.B --preload-files-dest-dir -The destination directory to be used for preload-files, if other than the current working -directory. By default, the absolute and relative paths provided by --preload-files are used. -. -. -.TP -.B -wd \fR\fP -Synonym for \fI-wdir\fP. -. -. -.TP -.B -wdir \fR\fP -Change to the directory before the user's program executes. -See the "Current Working Directory" section for notes on relative paths. 
-.B Note: -If the \fI-wdir\fP option appears both on the command line and in an -application context, the context will take precedence over the command -line. Thus, if the path to the desired wdir is different -on the backend nodes, then it must be specified as an absolute path that -is correct for the backend node. -. -. -.TP -.B -x \fR\fP -Export the specified environment variables to the remote nodes before -executing the program. Only one environment variable can be specified -per \fI-x\fP option. Existing environment variables can be specified -or new variable names specified with corresponding values. For -example: - \fB%\fP ompi-submit -x DISPLAY -x OFILE=/tmp/out ... - -The parser for the \fI-x\fP option is not very sophisticated; it does -not even understand quoted values. Users are advised to set variables -in the environment, and then use \fI-x\fP to export (not define) them. -. -. -. -. -.P -Setting MCA parameters: -. -. -.TP -.B -gmca\fR,\fP --gmca \fR \fP -Pass global MCA parameters that are applicable to all contexts. \fI\fP is -the parameter name; \fI\fP is the parameter value. -. -. -.TP -.B -mca\fR,\fP --mca -Send arguments to various MCA modules. See the "MCA" section, below. -. -. -. -. -.P -For debugging: -. -. -.TP -.B -debug\fR,\fP --debug -Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP -MCA parameter. -. -. -.TP -.B -debugger\fR,\fP --debugger -Sequence of debuggers to search for when \fI--debug\fP is used (i.e. -a synonym for \fIorte_base_user_debugger\fP MCA parameter). -. -. -.TP -.B -tv\fR,\fP --tv -Launch processes under the TotalView debugger. -Deprecated backwards compatibility flag. Synonym for \fI--debug\fP. -. -. -. -. -.P -There are also other options: -. -. -.TP -.B --allow-run-as-root -Allow -.I ompi-submit -to run when executed by the root user -.RI ( ompi-submit -defaults to aborting when launched as the root user). -. -. -.TP -.B -aborted\fR,\fP --aborted \fR<#>\fP -Set the maximum number of aborted processes to display. -. -. -.TP -.B --app \fR\fP -Provide an appfile, ignoring all other command line options. -. -. -.TP -.B -cf\fR,\fP --cartofile \fR\fP -Provide a cartography file. -. -. -.TP -.B --hetero -Indicates that multiple app_contexts are being provided that are a mix of 32/64-bit binaries. -. -. -.TP -.B -ompi-server\fR,\fP --ompi-server -Specify the URI of the Open MPI server (or the ompi-submit to be used as the server) -, the name -of the file (specified as file:filename) that -contains that info, or the PID (specified as pid:#) of the ompi-submit to be used as - the server. -The Open MPI server is used to support multi-application data exchange via -the MPI-2 MPI_Publish_name and MPI_Lookup_name functions. -. -. -. -. -.P -The following options are useful for developers; they are not generally -useful to most ORTE and/or MPI users: -. -.TP -.B -d\fR,\fP --debug-devel -Enable debugging of the OmpiRTE (the run-time layer in Open MPI). -This is not generally useful for most users. -. -. -. -.P -There may be other options listed with \fIompi-submit --help\fP. -. -. -.SS Environment Variables -. -.TP -.B MPIEXEC_TIMEOUT -The maximum number of seconds that -.I ompi-submit -.RI ( mpiexec ) -will run. After this many seconds, -.I ompi-submit -will abort the launched job and exit. -. -. -.\" ************************** -.\" Description Section -.\" ************************** -.SH DESCRIPTION -. -One invocation of \fIompi-submit\fP starts an MPI application running under Open -MPI. 
If the application is single process multiple data (SPMD), the application -can be specified on the \fIompi-submit\fP command line. - -If the application is multiple instruction multiple data (MIMD), comprising of -multiple programs, the set of programs and argument can be specified in one of -two ways: Extended Command Line Arguments, and Application Context. -.PP -An application context describes the MIMD program set including all arguments -in a separate file. -.\" See appcontext(5) for a description of the application context syntax. -This file essentially contains multiple \fIompi-submit\fP command lines, less the -command name itself. The ability to specify different options for different -instantiations of a program is another reason to use an application context. -.PP -Extended command line arguments allow for the description of the application -layout on the command line using colons (\fI:\fP) to separate the specification -of programs and arguments. Some options are globally set across all specified -programs (e.g. --hostfile), while others are specific to a single program -(e.g. -np). -. -. -. -.SS Specifying Host Nodes -. -Host nodes can be identified on the \fIompi-submit\fP command line with the \fI-host\fP -option or in a hostfile. -. -.PP -For example, -. -.TP 4 -ompi-submit -H aa,aa,bb ./a.out -launches two processes on node aa and one on bb. -. -.PP -Or, consider the hostfile -. - - \fB%\fP cat myhostfile - aa slots=2 - bb slots=2 - cc slots=2 - -. -.PP -Since the DVM was started with \fIorte-dvm\fP, \fIorte-submit\fP -will ignore any slots arguments in the hostfile. Values provided -via hostfile to \fIorte-dvm\fP will control the behavior. -. -.PP -. -.TP 4 -ompi-submit -hostfile myhostfile ./a.out -will launch two processes on each of the three nodes. -. -.TP 4 -ompi-submit -hostfile myhostfile -host aa ./a.out -will launch two processes, both on node aa. -. -.TP 4 -ompi-submit -hostfile myhostfile -host dd ./a.out -will find no hosts to run on and abort with an error. -That is, the specified host dd is not in the specified hostfile. -. -.SS Specifying Number of Processes -. -As we have just seen, the number of processes to run can be set using the -hostfile. Other mechanisms exist. -. -.PP -The number of processes launched can be specified as a multiple of the -number of nodes or processor sockets available. For example, -. -.TP 4 -ompi-submit -H aa,bb -npersocket 2 ./a.out -launches processes 0-3 on node aa and process 4-7 on node bb, -where aa and bb are both dual-socket nodes. -The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option, -which is discussed in a later section. -. -.TP 4 -ompi-submit -H aa,bb -npernode 2 ./a.out -launches processes 0-1 on node aa and processes 2-3 on node bb. -. -.TP 4 -ompi-submit -H aa,bb -npernode 1 ./a.out -launches one process per host node. -. -.TP 4 -ompi-submit -H aa,bb -pernode ./a.out -is the same as \fI-npernode\fP 1. -. -. -.PP -Another alternative is to specify the number of processes with the -\fI-np\fP option. Consider now the hostfile -. - - \fB%\fP cat myhostfile - aa slots=4 - bb slots=4 - cc slots=4 - -. -.PP -Now, -. -.TP 4 -ompi-submit -hostfile myhostfile -np 6 ./a.out -will launch processes 0-3 on node aa and processes 4-5 on node bb. The remaining -slots in the hostfile will not be used since the \fI-np\fP option indicated -that only 6 processes should be launched. -. -.SS Mapping Processes to Nodes: Using Policies -. 
-The examples above illustrate the default mapping of process processes -to nodes. This mapping can also be controlled with various -\fIompi-submit\fP options that describe mapping policies. -. -. -.PP -Consider the same hostfile as above, again with \fI-np\fP 6: -. - - node aa node bb node cc - - ompi-submit 0 1 2 3 4 5 - - ompi-submit --map-by node 0 3 1 4 2 5 - - ompi-submit -nolocal 0 1 2 3 4 5 -. -.PP -The \fI--map-by node\fP option will load balance the processes across -the available nodes, numbering each process in a round-robin fashion. -. -.PP -The \fI-nolocal\fP option prevents any processes from being mapped onto the -local host (in this case node aa). While \fIompi-submit\fP typically consumes -few system resources, \fI-nolocal\fP can be helpful for launching very -large jobs where \fIompi-submit\fP may actually need to use noticeable amounts -of memory and/or processing time. -. -.PP -Just as \fI-np\fP can specify fewer processes than there are slots, it can -also oversubscribe the slots. For example, with the same hostfile: -. -.TP 4 -ompi-submit -hostfile myhostfile -np 14 ./a.out -will launch processes 0-3 on node aa, 4-7 on bb, and 8-11 on cc. It will -then add the remaining two processes to whichever nodes it chooses. -. -.PP -One can also specify limits to oversubscription. For example, with the same -hostfile: -. -.TP 4 -ompi-submit -hostfile myhostfile -np 14 -nooversubscribe ./a.out -will produce an error since \fI-nooversubscribe\fP prevents oversubscription. -. -.PP -Limits to oversubscription can also be specified in the hostfile itself: -. - % cat myhostfile - aa slots=4 max_slots=4 - bb max_slots=4 - cc slots=4 -. -.PP -The \fImax_slots\fP field specifies such a limit. When it does, the -\fIslots\fP value defaults to the limit. Now: -. -.TP 4 -ompi-submit -hostfile myhostfile -np 14 ./a.out -causes the first 12 processes to be launched as before, but the remaining -two processes will be forced onto node cc. The other two nodes are -protected by the hostfile against oversubscription by this job. -. -.PP -Using the \fI--nooversubscribe\fR option can be helpful since Open MPI -currently does not get "max_slots" values from the resource manager. -. -.PP -Of course, \fI-np\fP can also be used with the \fI-H\fP or \fI-host\fP -option. For example, -. -.TP 4 -ompi-submit -H aa,bb -np 8 ./a.out -launches 8 processes. Since only two hosts are specified, after the first -two processes are mapped, one to aa and one to bb, the remaining processes -oversubscribe the specified hosts. -. -.PP -And here is a MIMD example: -. -.TP 4 -ompi-submit -H aa -np 1 hostname : -H bb,cc -np 2 uptime -will launch process 0 running \fIhostname\fP on node aa and processes 1 and 2 -each running \fIuptime\fP on nodes bb and cc, respectively. -. -.SS Mapping, Ranking, and Binding: Oh My! -. -Open MPI employs a three-phase procedure for assigning process locations and -ranks: -. -.TP 10 -\fBmapping\fP -Assigns a default location to each process -. -.TP 10 -\fBranking\fP -Assigns an MPI_COMM_WORLD rank value to each process -. -.TP 10 -\fBbinding\fP -Constrains each process to run on specific processors -. -.PP -The \fImapping\fP step is used to assign a default location to each process -based on the mapper being employed. Mapping by slot, node, and sequentially results -in the assignment of the processes to the node level. In contrast, mapping by object, allows -the mapper to assign the process to an actual object on each node. -. 
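(Aside, not part of the original man page: the round-robin numbering used by the "--map-by node" policy, shown in the placement table above, can be pictured with a minimal stand-alone C sketch. The node list and process count below are just the aa/bb/cc example from this section; nothing here is Open MPI code.)

/* Illustration only: rank r goes to node (r % num_nodes), which reproduces
 * the "--map-by node" row above: ranks 0,3 on aa, 1,4 on bb, 2,5 on cc. */
#include <stdio.h>

int main(void)
{
    const char *nodes[] = { "aa", "bb", "cc" };
    const int num_nodes = 3;
    const int np = 6;

    for (int r = 0; r < np; ++r) {
        printf("rank %d -> node %s\n", r, nodes[r % num_nodes]);
    }
    return 0;
}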
-.PP -\fBNote:\fP the location assigned to the process is independent of where it will be bound - the -assignment is used solely as input to the binding algorithm. -. -.PP -The mapping of process processes to nodes can be defined not just -with general policies but also, if necessary, using arbitrary mappings -that cannot be described by a simple policy. One can use the "sequential -mapper," which reads the hostfile line by line, assigning processes -to nodes in whatever order the hostfile specifies. Use the -\fI-mca rmaps seq\fP option. For example, using the same hostfile -as before: -. -.PP -ompi-submit -hostfile myhostfile -mca rmaps seq ./a.out -. -.PP -will launch three processes, one on each of nodes aa, bb, and cc, respectively. -The slot counts don't matter; one process is launched per line on -whatever node is listed on the line. -. -.PP -Another way to specify arbitrary mappings is with a rankfile, which -gives you detailed control over process binding as well. Rankfiles -are discussed below. -. -.PP -The second phase focuses on the \fIranking\fP of the process within -the job's MPI_COMM_WORLD. Open MPI -separates this from the mapping procedure to allow more flexibility in the -relative placement of MPI processes. This is best illustrated by considering the -following two cases where we used the —map-by ppr:2:socket option: -. -.PP - node aa node bb - - rank-by core 0 1 ! 2 3 4 5 ! 6 7 - - rank-by socket 0 2 ! 1 3 4 6 ! 5 7 - - rank-by socket:span 0 4 ! 1 5 2 6 ! 3 7 -. -.PP -Ranking by core and by slot provide the identical result - a simple -progression of MPI_COMM_WORLD ranks across each node. Ranking by -socket does a round-robin ranking within each node until all processes -have been assigned an MCW rank, and then progresses to the next -node. Adding the \fIspan\fP modifier to the ranking directive causes -the ranking algorithm to treat the entire allocation as a single -entity - thus, the MCW ranks are assigned across all sockets before -circling back around to the beginning. -. -.PP -The \fIbinding\fP phase actually binds each process to a given set of processors. This can -improve performance if the operating system is placing processes -suboptimally. For example, it might oversubscribe some multi-core -processor sockets, leaving other sockets idle; this can lead -processes to contend unnecessarily for common resources. Or, it -might spread processes out too widely; this can be suboptimal if -application performance is sensitive to interprocess communication -costs. Binding can also keep the operating system from migrating -processes excessively, regardless of how optimally those processes -were placed to begin with. -. -.PP -The processors to be used for binding can be identified in terms of -topological groupings - e.g., binding to an l3cache will bind each -process to all processors within the scope of a single L3 cache within -their assigned location. Thus, if a process is assigned by the mapper -to a certain socket, then a \fI—bind-to l3cache\fP directive will -cause the process to be bound to the processors that share a single L3 -cache within that socket. -. -.PP -To help balance loads, the binding directive uses a round-robin method when binding to -levels lower than used in the mapper. For example, consider the case where a job is -mapped to the socket level, and then bound to core. 
Each socket will have multiple cores, -so if multiple processes are mapped to a given socket, the binding algorithm will assign -each process located to a socket to a unique core in a round-robin manner. -. -.PP -Alternatively, processes mapped by l2cache and then bound to socket will simply be bound -to all the processors in the socket where they are located. In this manner, users can -exert detailed control over relative MCW rank location and binding. -. -.PP -Finally, \fI--report-bindings\fP can be used to report bindings. -. -.PP -As an example, consider a node with two processor sockets, each comprising -four cores. We run \fIompi-submit\fP with \fI-np 4 --report-bindings\fP and -the following additional options: -. - - % ompi-submit ... --map-by core --bind-to core - [...] ... binding child [...,0] to cpus 0001 - [...] ... binding child [...,1] to cpus 0002 - [...] ... binding child [...,2] to cpus 0004 - [...] ... binding child [...,3] to cpus 0008 - - % ompi-submit ... --map-by socket --bind-to socket - [...] ... binding child [...,0] to socket 0 cpus 000f - [...] ... binding child [...,1] to socket 1 cpus 00f0 - [...] ... binding child [...,2] to socket 0 cpus 000f - [...] ... binding child [...,3] to socket 1 cpus 00f0 - - % ompi-submit ... --map-by core:PE=2 --bind-to core - [...] ... binding child [...,0] to cpus 0003 - [...] ... binding child [...,1] to cpus 000c - [...] ... binding child [...,2] to cpus 0030 - [...] ... binding child [...,3] to cpus 00c0 - - % ompi-submit ... --bind-to none -. -.PP -Here, \fI--report-bindings\fP shows the binding of each process as a mask. -In the first case, the processes bind to successive cores as indicated by -the masks 0001, 0002, 0004, and 0008. In the second case, processes bind -to all cores on successive sockets as indicated by the masks 000f and 00f0. -The processes cycle through the processor sockets in a round-robin fashion -as many times as are needed. In the third case, the masks show us that -2 cores have been bound per process. In the fourth case, binding is -turned off and no bindings are reported. -. -.PP -Open MPI's support for process binding depends on the underlying -operating system. Therefore, certain process binding options may not be available -on every system. -. -.PP -Process binding can also be set with MCA parameters. -Their usage is less convenient than that of \fIompi-submit\fP options. -On the other hand, MCA parameters can be set not only on the \fIompi-submit\fP -command line, but alternatively in a system or user mca-params.conf file -or as environment variables, as described in the MCA section below. -Some examples include: -. -.PP - ompi-submit option MCA parameter key value - - --map-by core rmaps_base_mapping_policy core - --map-by socket rmaps_base_mapping_policy socket - --rank-by core rmaps_base_ranking_policy core - --bind-to core hwloc_base_binding_policy core - --bind-to socket hwloc_base_binding_policy socket - --bind-to none hwloc_base_binding_policy none -. -. -.SS Rankfiles -. -Rankfiles are text files that specify detailed information about how -individual processes should be mapped to nodes, and to which -processor(s) they should be bound. Each line of a rankfile specifies -the location of one process (for MPI jobs, the process' "rank" refers -to its rank in MPI_COMM_WORLD). The general form of each line in the -rankfile is: -. - - rank = slot= -. -.PP -For example: -. 
- - $ cat myrankfile - rank 0=aa slot=1:0-2 - rank 1=bb slot=0:0,1 - rank 2=cc slot=1-2 - $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out -. -.PP -Means that -. - - Rank 0 runs on node aa, bound to logical socket 1, cores 0-2. - Rank 1 runs on node bb, bound to logical socket 0, cores 0 and 1. - Rank 2 runs on node cc, bound to logical cores 1 and 2. -. -.PP -Rankfiles can alternatively be used to specify \fIphysical\fP processor -locations. In this case, the syntax is somewhat different. Sockets are -no longer recognized, and the slot number given must be the number of -the physical PU as most OS's do not assign a unique physical identifier -to each core in the node. Thus, a proper physical rankfile looks something -like the following: -. - - $ cat myphysicalrankfile - rank 0=aa slot=1 - rank 1=bb slot=8 - rank 2=cc slot=6 -. -.PP -This means that -. - - Rank 0 will run on node aa, bound to the core that contains physical PU 1 - Rank 1 will run on node bb, bound to the core that contains physical PU 8 - Rank 2 will run on node cc, bound to the core that contains physical PU 6 -. -.PP -Rankfiles are treated as \fIlogical\fP by default, and the MCA parameter -rmaps_rank_file_physical must be set to 1 to indicate that the rankfile -is to be considered as \fIphysical\fP. -. -.PP -The hostnames listed above are "absolute," meaning that actual -resolveable hostnames are specified. However, hostnames can also be -specified as "relative," meaning that they are specified in relation -to an externally-specified list of hostnames (e.g., by ompi-submit's --host -argument, a hostfile, or a job scheduler). -. -.PP -The "relative" specification is of the form "+n", where X is an -integer specifying the Xth hostname in the set of all available -hostnames, indexed from 0. For example: -. - - $ cat myrankfile - rank 0=+n0 slot=1:0-2 - rank 1=+n1 slot=0:0,1 - rank 2=+n2 slot=1-2 - $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out -. -.PP -Starting with Open MPI v1.7, all socket/core slot locations are be -specified as -.I logical -indexes (the Open MPI v1.6 series used -.I physical -indexes). You can use tools such as HWLOC's "lstopo" to find the -logical indexes of socket and cores. -. -. -.SS Application Context or Executable Program? -. -To distinguish the two different forms, \fIompi-submit\fP -looks on the command line for \fI--app\fP option. If -it is specified, then the file named on the command line is -assumed to be an application context. If it is not -specified, then the file is assumed to be an executable program. -. -. -. -.SS Locating Files -. -If no relative or absolute path is specified for a file, Open -MPI will first look for files by searching the directories specified -by the \fI--path\fP option. If there is no \fI--path\fP option set or -if the file is not found at the \fI--path\fP location, then Open MPI -will search the user's PATH environment variable as defined on the -source node(s). -.PP -If a relative directory is specified, it must be relative to the initial -working directory determined by the specific starter used. For example when -using the rsh or ssh starters, the initial directory is $HOME by default. Other -starters may set the initial directory to the current working directory from -the invocation of \fIompi-submit\fP. -. -. -. -.SS Current Working Directory -. -The \fI\-wdir\fP ompi-submit option (and its synonym, \fI\-wd\fP) allows -the user to change to an arbitrary directory before the program is -invoked. 
It can also be used in application context files to specify -working directories on specific nodes and/or for specific -applications. -.PP -If the \fI\-wdir\fP option appears both in a context file and on the -command line, the context file directory will override the command -line value. -.PP -If the \fI-wdir\fP option is specified, Open MPI will attempt to -change to the specified directory on all of the remote nodes. If this -fails, \fIompi-submit\fP will abort. -.PP -If the \fI-wdir\fP option is \fBnot\fP specified, Open MPI will send -the directory name where \fIompi-submit\fP was invoked to each of the -remote nodes. The remote nodes will try to change to that -directory. If they are unable (e.g., if the directory does not exist on -that node), then Open MPI will use the default directory determined by -the starter. -.PP -All directory changing occurs before the user's program is invoked; it -does not wait until \fIMPI_INIT\fP is called. -. -. -. -.SS Standard I/O -. -Open MPI directs UNIX standard input to /dev/null on all processes -except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process -inherits standard input from \fIompi-submit\fP. -.B Note: -The node that invoked \fIompi-submit\fP need not be the same as the node where the -MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of -\fIompi-submit\fP's standard input to the rank 0 process. -.PP -Open MPI directs UNIX standard output and error from remote nodes to the node -that invoked \fIompi-submit\fP and prints it on the standard output/error of -\fIompi-submit\fP. -Local processes inherit the standard output/error of \fIompi-submit\fP and transfer -to it directly. -.PP -Thus it is possible to redirect standard I/O for Open MPI applications by -using the typical shell redirection procedure on \fIompi-submit\fP. - - \fB%\fP ompi-submit -np 2 my_app < my_input > my_output - -Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will -receive the stream from \fImy_input\fP on stdin. The stdin on all the other -nodes will be tied to /dev/null. However, the stdout from all nodes will -be collected into the \fImy_output\fP file. -. -. -. -.SS Signal Propagation -. -When orte-submit receives a SIGTERM and SIGINT, it will attempt to kill -the entire job by sending all processes in the job a SIGTERM, waiting -a small number of seconds, then sending all processes in the job a -SIGKILL. -. -.PP -SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to -all processes in the job. -. -.PP -A SIGTSTOP signal to ompi-submit will cause a SIGSTOP signal to be sent -to all of the programs started by ompi-submit and likewise a SIGCONT signal -to ompi-submit will cause a SIGCONT sent. -. -.PP -Other signals are not currently propagated -by orte-submit. -. -. -.SS Process Termination / Signal Handling -. -During the run of an MPI application, if any process dies abnormally -(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a -signal), \fIompi-submit\fP will print out an error message and kill the rest of the -MPI application. -.PP -User signal handlers should probably avoid trying to cleanup MPI state -(Open MPI is currently not async-signal-safe; see MPI_Init_thread(3) -for details about -.I MPI_THREAD_MULTIPLE -and thread safety). 
For example, if a segmentation fault occurs in -\fIMPI_SEND\fP (perhaps because a bad buffer was passed in) and a user -signal handler is invoked, if this user handler attempts to invoke -\fIMPI_FINALIZE\fP, Bad Things could happen since Open MPI was already -"in" MPI when the error occurred. Since \fIompi-submit\fP will notice that -the process died due to a signal, it is probably not necessary (and -safest) for the user to only clean up non-MPI state. -. -. -. -.SS Process Environment -. -Processes in the MPI application inherit their environment from the -Open RTE daemon upon the node on which they are running. The -environment is typically inherited from the user's shell. On remote -nodes, the exact environment is determined by the boot MCA module -used. The \fIrsh\fR launch module, for example, uses either -\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and -typically executes one or more of the user's shell-setup files before -launching the Open RTE daemon. When running dynamically linked -applications which require the \fILD_LIBRARY_PATH\fR environment -variable to be set, care must be taken to ensure that it is correctly -set when booting Open MPI. -.PP -See the "Remote Execution" section for more details. -. -. -.SS Remote Execution -. -Open MPI requires that the \fIPATH\fR environment variable be set to -find executables on remote nodes (this is typically only necessary in -\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled -environments typically copy the current environment to the execution -of remote jobs, so if the current environment has \fIPATH\fR and/or -\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it -set properly). If Open MPI was compiled with shared library support, -it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment -variable set on remote nodes as well (especially to find the shared -libraries required to run user MPI applications). -.PP -However, it is not always desirable or possible to edit shell -startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The -\fI--prefix\fR option is provided for some simple configurations where -this is not possible. -.PP -The \fI--prefix\fR option takes a single argument: the base directory -on the remote node where Open MPI is installed. Open MPI will use -this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR -before executing any Open MPI or user applications. This allows -running Open MPI jobs without having pre-configured the \fIPATH\fR and -\fILD_LIBRARY_PATH\fR on the remote nodes. -.PP -Open MPI adds the basename of the current -node's "bindir" (the directory where Open MPI's executables are -installed) to the prefix and uses that to set the \fIPATH\fR on the -remote node. Similarly, Open MPI adds the basename of the current -node's "libdir" (the directory where Open MPI's libraries are -installed) to the prefix and uses that to set the -\fILD_LIBRARY_PATH\fR on the remote node. For example: -.TP 15 -Local bindir: -/local/node/directory/bin -.TP -Local libdir: -/local/node/directory/lib64 -.PP -If the following command line is used: - - \fB%\fP ompi-submit --prefix /remote/node/directory - -Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR -and "/remote/node/directory/lib64" to the \fLD_LIBRARY_PATH\fR on the -remote node before attempting to execute anything. 
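(Aside, not part of the original man page: the way --prefix is combined with the basename of the local bindir/libdir, as described above, can be sketched as follows. This is a rough illustration under the example paths from this section, not the actual Open MPI routine.)

/* Hedged sketch: compose the remote PATH / LD_LIBRARY_PATH prefixes from
 * --prefix plus the basename of the local bindir and libdir. */
#include <libgen.h>
#include <stdio.h>

int main(void)
{
    const char *prefix = "/remote/node/directory";     /* value given to --prefix */
    char local_bindir[] = "/local/node/directory/bin";     /* basename() may modify */
    char local_libdir[] = "/local/node/directory/lib64";   /* its argument, so use  */
    char remote_path[256], remote_libpath[256];             /* writable copies      */

    snprintf(remote_path, sizeof(remote_path), "%s/%s", prefix, basename(local_bindir));
    snprintf(remote_libpath, sizeof(remote_libpath), "%s/%s", prefix, basename(local_libdir));

    printf("remote PATH prefix:            %s\n", remote_path);      /* .../bin   */
    printf("remote LD_LIBRARY_PATH prefix: %s\n", remote_libpath);   /* .../lib64 */
    return 0;
}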
-.PP -The \fI--prefix\fR option is not sufficient if the installation paths -on the remote node are different than the local node (e.g., if "/lib" -is used on the local node, but "/lib64" is used on the remote node), -or if the installation paths are something other than a subdirectory -under a common prefix. -.PP -Note that executing \fIompi-submit\fR via an absolute pathname is -equivalent to specifying \fI--prefix\fR without the last subdirectory -in the absolute pathname to \fIompi-submit\fR. For example: - - \fB%\fP /usr/local/bin/ompi-submit ... - -is equivalent to - - \fB%\fP ompi-submit --prefix /usr/local -. -. -. -.SS Exported Environment Variables -. -All environment variables that are named in the form OMPI_* will automatically -be exported to new processes on the local and remote nodes. Environmental -parameters can also be set/forwarded to the new processes using the MCA -parameter \fImca_base_env_list\fP. The \fI\-x\fP option to \fIompi-submit\fP has -been deprecated, but the syntax of the MCA param follows that prior -example. While the syntax of the \fI\-x\fP option and MCA param -allows the definition of new variables, note that the parser -for these options are currently not very sophisticated - it does not even -understand quoted values. Users are advised to set variables in the -environment and use the option to export them; not to define them. -. -. -. -.SS Setting MCA Parameters -. -The \fI-mca\fP switch allows the passing of parameters to various MCA -(Modular Component Architecture) modules. -.\" Open MPI's MCA modules are described in detail in ompimca(7). -MCA modules have direct impact on MPI programs because they allow tunable -parameters to be set at run time (such as which BTL communication device driver -to use, what parameters to pass to that BTL, etc.). -.PP -The \fI-mca\fP switch takes two arguments: \fI\fP and \fI\fP. -The \fI\fP argument generally specifies which MCA module will receive the value. -For example, the \fI\fP "btl" is used to select which BTL to be used for -transporting MPI messages. The \fI\fP argument is the value that is -passed. -For example: -. -.TP 4 -ompi-submit -mca btl tcp,self -np 1 foo -Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of -"foo" an allocated node. -. -.TP -ompi-submit -mca btl self -np 1 foo -Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" an -allocated node. -.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7). -.PP -The \fI-mca\fP switch can be used multiple times to specify different -\fI\fP and/or \fI\fP arguments. If the same \fI\fP is -specified more than once, the \fI\fPs are concatenated with a comma -(",") separating them. -.PP -Note that the \fI-mca\fP switch is simply a shortcut for setting environment variables. -The same effect may be accomplished by setting corresponding environment -variables before running \fIompi-submit\fP. -The form of the environment variables that Open MPI sets is: - - OMPI_MCA_= -.PP -Thus, the \fI-mca\fP switch overrides any previously set environment -variables. The \fI-mca\fP settings similarly override MCA parameters set -in the -$OPAL_PREFIX/etc/openmpi-mca-params.conf or $HOME/.openmpi/mca-params.conf -file. -. -.PP -Unknown \fI\fP arguments are still set as -environment variable -- they are not checked (by \fIompi-submit\fP) for correctness. -Illegal or incorrect \fI\fP arguments may or may not be reported -- it -depends on the specific MCA module. 
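(Aside, not part of the original man page: the text above notes that "-mca <key> <value>" is simply shorthand for exporting OMPI_MCA_<key>=<value> before the launch. A minimal C sketch of that equivalence follows; set_mca_param is a hypothetical helper, not an Open MPI API.)

/* Illustration only: "-mca btl tcp,self" is equivalent to exporting
 * OMPI_MCA_btl=tcp,self in the environment before launching. */
#include <stdio.h>
#include <stdlib.h>

static void set_mca_param(const char *key, const char *value)
{
    char name[128];
    snprintf(name, sizeof(name), "OMPI_MCA_%s", key);
    setenv(name, value, 1);   /* overwrite any earlier setting, as -mca does */
}

int main(void)
{
    /* equivalent to: ompi-submit -mca btl tcp,self ... */
    set_mca_param("btl", "tcp,self");
    printf("OMPI_MCA_btl=%s\n", getenv("OMPI_MCA_btl"));
    return 0;
}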
-.PP -To find the available component types under the MCA architecture, or to find the -available parameters for a specific component, use the \fIompi_info\fP command. -See the \fIompi_info(1)\fP man page for detailed information on the command. -. -.SS Running as root -. -The Open MPI team strongly advises against executing -.I ompi-submit -as the root user. MPI applications should be run as regular -(non-root) users. -. -.PP -Reflecting this advice, ompi-submit will refuse to run as root by default. -To override this default, you can add the -.I --allow-run-as-root -option to the -.I ompi-submit -command line. -. -.SS Exit status -. -There is no standard definition for what \fIompi-submit\fP should return as an exit -status. After considerable discussion, we settled on the following method for -assigning the \fIompi-submit\fP exit status (note: in the following description, -the "primary" job is the initial application started by ompi-submit - all jobs that -are spawned by that job are designated "secondary" jobs): -. -.IP \[bu] 2 -if all processes in the primary job normally terminate with exit status 0, we return 0 -.IP \[bu] -if one or more processes in the primary job normally terminate with non-zero exit status, -we return the exit status of the process with the lowest MPI_COMM_WORLD rank to have a non-zero status -.IP \[bu] -if all processes in the primary job normally terminate with exit status 0, and one or more -processes in a secondary job normally terminate with non-zero exit status, we (a) return -the exit status of the process with the lowest MPI_COMM_WORLD rank in the lowest jobid to have a non-zero status, and (b) -output a message summarizing the exit status of the primary and all secondary jobs. -.IP \[bu] -if the cmd line option --report-child-jobs-separately is set, we will return -only- the -exit status of the primary job. Any non-zero exit status in secondary jobs will be -reported solely in a summary print statement. -. -.PP -By default, OMPI records and notes that MPI processes exited with non-zero termination status. -This is generally not considered an "abnormal termination" - i.e., OMPI will not abort an MPI -job if one or more processes return a non-zero status. Instead, the default behavior simply -reports the number of processes terminating with non-zero status upon completion of the job. -.PP -However, in some cases it can be desirable to have the job abort when any process terminates -with non-zero status. For example, a non-MPI job might detect a bad result from a calculation -and want to abort, but doesn't want to generate a core file. Or an MPI job might continue past -a call to MPI_Finalize, but indicate that all processes should abort due to some post-MPI result. -.PP -It is not anticipated that this situation will occur frequently. However, in the interest of -serving the broader community, OMPI now has a means for allowing users to direct that jobs be -aborted upon any process exiting with non-zero status. Setting the MCA parameter -"orte_abort_on_non_zero_status" to 1 will cause OMPI to abort all processes once any process - exits with non-zero status. -.PP -Terminations caused in this manner will be reported on the console as an "abnormal termination", -with the first process to so exit identified along with its exit status. -.PP -. -.\" ************************** -.\" Examples Section -.\" ************************** -.SH EXAMPLES -Be sure also to see the examples throughout the sections above. -. 
-.TP 4 -ompi-submit -np 4 -mca btl ib,tcp,self prog1 -Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the -transport of MPI messages. -. -. -.TP 4 -ompi-submit -np 4 -mca btl tcp,sm,self -.br ---mca btl_tcp_if_include eth0 prog1 -.br -Run 4 copies of prog1 using the "tcp", "sm" and "self" BTLs for the -transport of MPI messages, with TCP using only the eth0 interface to -communicate. Note that other BTLs have similar if_include MCA -parameters. -. -.\" ************************** -.\" Diagnostics Section -.\" ************************** -. -.\" .SH DIAGNOSTICS -.\" .TP 4 -.\" Error Msg: -.\" Description -. -.\" ************************** -.\" Return Value Section -.\" ************************** -. -.SH RETURN VALUE -. -\fIompi-submit\fP returns 0 if all processes started by \fIompi-submit\fP exit after calling -MPI_FINALIZE. A non-zero value is returned if an internal error occurred in -ompi-submit, or one or more processes exited before calling MPI_FINALIZE. If an -internal error occurred in ompi-submit, the corresponding error code is returned. -In the event that one or more processes exit before calling MPI_FINALIZE, the -return value of the MPI_COMM_WORLD rank of the process that \fIompi-submit\fP first notices died -before calling MPI_FINALIZE will be returned. Note that, in general, this will -be the first process that died but is not guaranteed to be so. -. -.\" ************************** -.\" See Also Section -.\" ************************** -. -.SH SEE ALSO -MPI_Init_thread(3) diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c deleted file mode 100644 index db11bef7b1..0000000000 --- a/orte/tools/orte-submit/orte-submit.c +++ /dev/null @@ -1,182 +0,0 @@ -/* -*- C -*- - * - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2008 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include -#include - -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/orted/orted_submit.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/show_help.h" - -/* - * Globals - */ -typedef struct { - int status; - volatile bool active; - orte_job_t *jdata; -} orte_submit_status_t; - -static void launched(int index, orte_job_t *jdata, int ret, void *cbdata); -static void completed(int index, orte_job_t *jdata, int ret, void *cbdata); - - -static opal_cmd_line_init_t cmd_line_init[] = { - { "orte_execute_quiet", 'q', NULL, "quiet", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Suppress helpful messages" }, - - { NULL, '\0', "report-pid", "report-pid", 1, - &orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING, - "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, - { NULL, '\0', "report-uri", "report-uri", 1, - &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, - "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, - - /* exit status reporting */ - { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Return the exit status of the primary job only" }, - - /* select XML output */ - { "orte_xml_output", '\0', "xml", "xml", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Provide all output in XML format" }, - { "orte_xml_file", '\0', "xml-file", "xml-file", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide all output in XML format to the specified file" }, - - { "orte_xterm", '\0', "xterm", "xterm", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Create a new xterm window and display output from the specified ranks there" }, - - /* tell the dvm to terminate */ - { NULL, '\0', "terminate", "terminate", 0, - &orte_cmd_line.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Terminate the DVM" }, - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } -}; - -int main(int argc, char *argv[]) -{ - int rc; - orte_submit_status_t launchst, completest; - opal_cmd_line_t cmd_line; - - memset(&orte_cmd_line, 0, sizeof(orte_cmd_line)); - /* setup our cmd line */ - opal_cmd_line_create(&cmd_line, cmd_line_init); - mca_base_cmd_line_setup(&cmd_line); - - /* initialize the RTE */ - if (ORTE_SUCCESS != (rc = orte_submit_init(argc, argv, &cmd_line))) { - fprintf(stderr, "Init failed due to duplicate command options\n"); - exit(rc); - } - - /* if this is the terminate command, just send it */ - if (orte_cmd_line.terminate_dvm) { - rc = orte_submit_halt(); - /* just loop the event library - the errmgr - * will exit us when the connection to our - * HNP closes */ - while (1) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); - } - } - - /* launch whatever job we were given */ - memset(&launchst, 0, sizeof(launchst)); - memset(&completest, 0, sizeof(completest)); - launchst.active = true; - completest.active = true; - if (ORTE_SUCCESS != (rc = orte_submit_job(argv, NULL, - launched, &launchst, - completed, &completest))) { - if (ORTE_ERR_OP_IN_PROGRESS == rc) { - /* terminate command was given */ - goto waiting; - } - opal_output(0, "JOB FAILED TO LAUNCH WITH ERROR %d:%s", - rc, ORTE_ERROR_NAME(rc)); - goto DONE; - } - - // wait for response and unpack the status, jobid - while (launchst.active) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); - 
} - if (orte_debug_flag) { - opal_output(0, "Job %s has launched", ORTE_JOBID_PRINT(launchst.jdata->jobid)); - } - if (ORTE_SUCCESS != launchst.status) { - goto DONE; - } - - waiting: - while (completest.active) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); - } - - DONE: - /* cleanup and leave */ - orte_submit_finalize(); - - if (orte_debug_flag) { - fprintf(stderr, "exiting with status %d\n", orte_exit_status); - } - exit(orte_exit_status); -} - -static void launched(int index, orte_job_t *jdata, int ret, void *cbdata) -{ - orte_submit_status_t *launchst = (orte_submit_status_t*)cbdata; - launchst->status = ret; - ORTE_UPDATE_EXIT_STATUS(ret); - OBJ_RETAIN(jdata); - launchst->jdata = jdata; - launchst->active = false; -} -static void completed(int index, orte_job_t *jdata, int ret, void *cbdata) -{ - orte_submit_status_t *completest = (orte_submit_status_t*)cbdata; - completest->status = ret; - ORTE_UPDATE_EXIT_STATUS(ret); - OBJ_RETAIN(jdata); - completest->jdata = jdata; - completest->active = false; -} diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index f663864082..8d5333f8b4 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -81,26 +81,13 @@ #include "opal/class/opal_pointer_array.h" #include "opal/dss/dss.h" -#include "orte/util/proc_info.h" -#include "orte/util/pre_condition_transports.h" -#include "orte/util/session_dir.h" -#include "orte/util/hnp_contact.h" -#include "orte/util/show_help.h" - #include "orte/mca/dfs/dfs.h" #include "orte/mca/odls/odls.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/ras/ras.h" -#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/rml/base/rml_contact.h" -#include "orte/mca/schizo/schizo.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/state/state.h" +#include "orte/util/cmd_line.h" +#include "orte/util/proc_info.h" +#include "orte/util/show_help.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" @@ -113,439 +100,9 @@ #include "orte/orted/orted_submit.h" #include "orterun.h" -/* instance the standard MPIR interfaces */ -#define MPIR_MAX_PATH_LENGTH 512 -#define MPIR_MAX_ARG_LENGTH 1024 -struct MPIR_PROCDESC *MPIR_proctable = NULL; -int MPIR_proctable_size = 0; -volatile int MPIR_being_debugged = 0; -volatile int MPIR_debug_state = 0; -int MPIR_i_am_starter = 0; -int MPIR_partial_attach_ok = 1; -char MPIR_executable_path[MPIR_MAX_PATH_LENGTH] = {0}; -char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH] = {0}; -volatile int MPIR_forward_output = 0; -volatile int MPIR_forward_comm = 0; -char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH] = {0}; -int MPIR_force_to_main = 0; -static void orte_debugger_dump(void); -static void orte_debugger_init_before_spawn(orte_job_t *jdata); -static void orte_debugger_init_after_spawn(int fd, short event, void *arg); -static void orte_debugger_detached(int fd, short event, void *arg); -static void attach_debugger(int fd, short event, void *arg); -static void build_debugger_args(orte_app_context_t *debugger); -static void open_fifo (void); -static int attach_fd = -1; -static bool fifo_active=false; -static opal_event_t *attach=NULL; - -ORTE_DECLSPEC void* MPIR_Breakpoint(void); - -static void orte_timeout_wakeup(int sd, short args, void *cbdata); - -/* - * Breakpoint function for parallel debuggers - */ -void* 
MPIR_Breakpoint(void) -{ - return NULL; -} - -/* - * Globals - */ -static char **global_mca_env = NULL; -static orte_std_cntr_t total_num_apps = 0; -static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; -static bool globals_init = false; - -static opal_cmd_line_init_t cmd_line_init[] = { - /* Various "obvious" options */ - { NULL, 'h', NULL, "help", 0, - &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - { NULL, 'V', NULL, "version", 0, - &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, - "Print version and exit" }, - { NULL, 'v', NULL, "verbose", 0, - &orte_cmd_line.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be verbose" }, - { "orte_execute_quiet", 'q', NULL, "quiet", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Suppress helpful messages" }, - { NULL, '\0', "report-pid", "report-pid", 1, - &orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING, - "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, - { NULL, '\0', "report-uri", "report-uri", 1, - &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, - "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, - - /* exit status reporting */ - { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Return the exit status of the primary job only" }, - - /* hetero apps */ - { "orte_hetero_apps", '\0', NULL, "hetero-apps", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries" }, - - /* select XML output */ - { "orte_xml_output", '\0', "xml", "xml", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Provide all output in XML format" }, - { "orte_xml_file", '\0', "xml-file", "xml-file", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide all output in XML format to the specified file" }, - - /* tag output */ - { "orte_tag_output", '\0', "tag-output", "tag-output", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Tag all output with [job,rank]" }, - { "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Timestamp all application process output" }, - { "orte_output_filename", '\0', "output-filename", "output-filename", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, - { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, - &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, - "Merge stderr to stdout for each process"}, - { "orte_xterm", '\0', "xterm", "xterm", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Create a new xterm window and display output from the specified ranks there" }, - - /* select stdin option */ - { NULL, '\0', "stdin", "stdin", 1, - &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, - "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, - - /* request that argv[0] be indexed */ - { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, - &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, - "Uniquely index argv[0] for each process using its rank" }, - - /* Specify the launch agent to be used */ - { "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Command used to start processes on remote nodes (default: orted)" }, - - /* Preload the binary on the remote machine */ - { NULL, 's', NULL, "preload-binary", 0, - &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, - "Preload the binary on 
the remote machine before starting the remote process." }, - - /* Preload files on the remote machine */ - { NULL, '\0', NULL, "preload-files", 1, - &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, - "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, - -#if OPAL_ENABLE_FT_CR == 1 - /* Tell SStore to preload a snapshot before launch */ - { NULL, '\0', NULL, "sstore-load", 1, - &orte_cmd_line.sstore_load, OPAL_CMD_LINE_TYPE_STRING, - "Internal Use Only! Tell SStore to preload a snapshot before launch." }, -#endif - - /* Use an appfile */ - { NULL, '\0', NULL, "app", 1, - &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide an appfile; ignore all other command line options" }, - - /* Number of processes; -c, -n, --n, -np, and --np are all - synonyms */ - { NULL, 'c', "np", "np", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - { NULL, '\0', "n", "n", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - - /* maximum size of VM - typically used to subdivide an allocation */ - { "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - - /* Set a hostfile */ - { NULL, '\0', "hostfile", "hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { NULL, '\0', "machinefile", "machinefile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a default hostfile" }, - { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not attempt to resolve interfaces" }, - - /* uri of PMIx publish/lookup server, or at least where to get it */ - { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, - - { "carto_file_path", '\0', "cf", "cartofile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a cartography file" }, - - { "orte_rankfile", '\0', "rf", "rankfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a rankfile file" }, - - /* Export environment variables; potentially used multiple times, - so it does not make sense to set into a variable */ - { NULL, 'x', NULL, NULL, 1, - NULL, OPAL_CMD_LINE_TYPE_NULL, - "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, - - /* Mapping controls */ - { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the process map just before launch"}, - { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed process map (mostly intended for developers) just before launch"}, - { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the topology as part of the process map (mostly intended for developers) just before launch"}, - { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, - NULL, 
OPAL_CMD_LINE_TYPE_BOOL, - "Display a diffable process map (mostly intended for developers) just before launch"}, - { NULL, 'H', "host", "host", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of hosts to invoke processes on" }, - { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not run any MPI applications on the local node" }, - { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are not to be oversubscribed, even if the system supports such operation"}, - { "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, - { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Number of cpus to use for each process [default=1]" }, - { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Synonym for cpus-per-proc" }, - - /* backward compatiblity */ - { "rmaps_base_bycore", '\0', "bycore", "bycore", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by core" }, - { "rmaps_base_bynode", '\0', "bynode", "bynode", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by node" }, - { "rmaps_base_byslot", '\0', "byslot", "byslot", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by slot" }, - - /* Nperxxx options that do not require topology and are always - * available - included for backwards compatibility - */ - { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Launch one process per available node" }, - { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes" }, - { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes (synonym for npernode)" }, - - /* declare hardware threads as independent cpus */ - { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use hardware threads as independent cpus" }, - - /* include npersocket for backwards compatibility */ - { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per socket on all allocated nodes" }, - - /* Mapping options */ - { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, - - /* Ranking options */ - { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, - - /* Binding options */ - { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). 
Allowed qualifiers: overload-allowed, if-supported" }, - - /* backward compatiblity */ - { "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to cores" }, - { "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to sockets" }, - - { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to report process bindings to stderr" }, - - /* slot list option */ - { "hwloc_base_slot_list", '\0', "slot-list", "slot-list", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of processor IDs to bind processes to [default=NULL]"}, - - /* generalized pattern mapping option */ - { "rmaps_ppr_pattern", '\0', NULL, "ppr", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of number of processes on a given resource type [default: none]" }, - - /* Allocation options */ - { "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the allocation being used by this job"}, - { "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, - { "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, - - /* mpiexec-like arguments */ - { NULL, '\0', "wdir", "wdir", 1, - &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Set the working directory of the started processes" }, - { NULL, '\0', "wd", "wd", 1, - &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Synonym for --wdir" }, - { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, - &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, - "Set the working directory of the started processes to their session directory" }, - { NULL, '\0', "path", "path", 1, - &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, - "PATH to be used to look for executables to start processes" }, - - /* User-level debugger arguments */ - { NULL, '\0', "tv", "tv", 0, - &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, - { NULL, '\0', "debug", "debug", 0, - &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, - { "orte_base_user_debugger", '\0', "debugger", "debugger", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Sequence of debuggers to search for when \"--debug\" is used" }, - { "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output the debugger proctable after launch" }, - - /* OpenRTE arguments */ - { "orte_debug", 'd', "debug-devel", "debug-devel", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, - - { "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Enable debugging of any OpenRTE daemons used by this application" }, - - { "orte_debug_daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, - - { "orte_leave_session_attached", '\0', "leave-session-attached", 
"leave-session-attached", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, - - { "orte_do_not_launch", '\0', "do-not-launch", "do-not-launch", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, - - { NULL, '\0', NULL, "prefix", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Prefix where Open MPI is installed on remote nodes" }, - { NULL, '\0', NULL, "noprefix", 0, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Disable automatic --prefix behavior" }, - - { "orte_report_launch_progress", '\0', "show-progress", "show-progress", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output a brief periodic report on launch progress" }, - - { "orte_use_regexp", '\0', "use-regexp", "use-regexp", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use regular expressions for launch" }, - - { "orte_report_events", '\0', "report-events", "report-events", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Report events to a tool listening at the specified URI" }, - - { "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable recovery from process failure [Default = disabled]" }, - - { "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Max number of times to restart a failed process" }, - - { "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, - -#if OPAL_ENABLE_CRDEBUG == 1 - { "opal_cr_enable_crdebug", '\0', "crdebug", "crdebug", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable C/R Debugging" }, -#endif - - { NULL, '\0', "disable-recovery", "disable-recovery", 0, - &orte_cmd_line.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, - "Disable recovery (resets all recovery options to off)" }, - - { "state_novm_select", '\0', "novm", "novm", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs)" }, - - { "orte_staged_execution", '\0', "staged", "staged", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Used staged execution if inadequate resources are present (cannot support MPI jobs)" }, - - { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, - &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, - "Allow execution as root (STRONGLY DISCOURAGED)" }, - - { NULL, '\0', "personality", "personality", 1, - &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, - - { NULL, '\0', "dvm", "dvm", 0, - &orte_cmd_line.create_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Create a persistent distributed virtual machine (DVM)" }, - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } -}; - /* local data */ static opal_list_t job_stack; -/* - * Local functions - */ -static int create_app(int argc, char* argv[], - orte_job_t *jdata, - orte_app_context_t **app, - bool *made_app, char ***app_env); -static int init_globals(void); -static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line); -static int parse_locals(orte_job_t *jdata, int argc, char* argv[]); -static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile); -static int parse_appfile(orte_job_t *jdata, char *filename, char ***env); -static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, - 
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__; - static void spawn_next_job(opal_buffer_t *bptr, void *cbdata) { orte_job_t *jdata = (orte_job_t*)cbdata; @@ -589,63 +146,11 @@ static void run_next_job(int fd, short args, void *cbdata) int orterun(int argc, char *argv[]) { - int rc; - opal_cmd_line_t cmd_line; - char *param; - orte_job_t *daemons; - orte_app_context_t *app, *dapp; - orte_job_t *jdata=NULL, *jptr; -#if OPAL_ENABLE_FT_CR == 1 - char *tmp_env_var = NULL; -#endif - /* find our basename (the name of the executable) so that we can - use it in pretty-print error messages */ - orte_basename = opal_basename(argv[0]); - - /* bozo check - we don't allow recursive calls of orterun */ - if (NULL != getenv("OMPI_UNIVERSE_SIZE")) { - fprintf(stderr, "\n\n**********************************************************\n\n"); - fprintf(stderr, "Open MPI does not support recursive calls of %s\n", orte_basename); - fprintf(stderr, "\n**********************************************************\n"); + if (ORTE_SUCCESS != orte_submit_init(argc, argv, NULL)) { exit(1); } - /* Setup and parse the command line */ - init_globals(); - opal_cmd_line_create(&cmd_line, cmd_line_init); - mca_base_cmd_line_setup(&cmd_line); - if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, - argc, argv)) ) { - if (OPAL_ERR_SILENT != rc) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(rc)); - } - return rc; - } - - /* print version if requested. Do this before check for help so - that --version --help works as one might expect. */ - if (orte_cmd_line.version) { - char *str, *project_name = NULL; - if (0 == strcmp(orte_basename, "mpirun")) { - project_name = "Open MPI"; - } else { - project_name = "OpenRTE"; - } - str = opal_info_make_version_str("all", - OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION, - OPAL_GREEK_VERSION, - OPAL_REPO_REV); - if (NULL != str) { - fprintf(stdout, "%s (%s) %s\n\nReport bugs to %s\n", - orte_basename, project_name, str, PACKAGE_BUGREPORT); - free(str); - } - exit(0); - } - /* check if we are running as root - if we are, then only allow * us to proceed if the allow-run-as-root flag was given. Otherwise, * exit with a giant warning flag @@ -668,354 +173,6 @@ int orterun(int argc, char *argv[]) exit(1); } - /* - * Since this process can now handle MCA/GMCA parameters, make sure to - * process them - we can do this step WITHOUT first calling opal_init - */ - if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { - exit(1); - } - - /* Ensure that enough of OPAL is setup for us to be able to run */ - /* - * NOTE: (JJH) - * We need to allow 'mca_base_cmd_line_process_args()' to process command - * line arguments *before* calling opal_init_util() since the command - * line could contain MCA parameters that affect the way opal_init_util() - * functions. AMCA parameters are one such option normally received on the - * command line that affect the way opal_init_util() behaves. - * It is "safe" to call mca_base_cmd_line_process_args() before - * opal_init_util() since mca_base_cmd_line_process_args() does *not* - * depend upon opal_init_util() functionality. 
- */ - /* Need to initialize OPAL so that install_dirs are filled in */ - if (OPAL_SUCCESS != opal_init(&argc, &argv)) { - exit(1); - } - - /* Check for help request */ - if (orte_cmd_line.help) { - char *str, *args = NULL; - char *project_name = NULL; - if (0 == strcmp(orte_basename, "mpirun")) { - project_name = "Open MPI"; - } else { - project_name = "OpenRTE"; - } - args = opal_cmd_line_get_usage_msg(&cmd_line); - str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, - orte_basename, project_name, OPAL_VERSION, - orte_basename, args, - PACKAGE_BUGREPORT); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); - - /* If someone asks for help, that should be all we do */ - opal_finalize(); - exit(0); - } - - /* may look strange, but the way we handle prefix is a little weird - * and probably needs to be addressed more fully at some future point. - * For now, we have a conflict between app_files and cmd line usage. - * Since app_files are used by the C/R system, we will make an - * adjustment here to avoid perturbing that system. - * - * We cannot just have the cmd line parser place any found value - * in the global struct as the app_file parser would replace it. - * So handle this specific cmd line option manually. - */ - orte_cmd_line.prefix = NULL; - orte_cmd_line.path_to_mpirun = NULL; - if (opal_cmd_line_is_taken(&cmd_line, "prefix") || - '/' == argv[0][0] || want_prefix_by_default) { - size_t param_len; - if ('/' == argv[0][0]) { - char* tmp_basename = NULL; - /* If they specified an absolute path, strip off the - /bin/" and leave just the prefix */ - orte_cmd_line.path_to_mpirun = opal_dirname(argv[0]); - /* Quick sanity check to ensure we got - something/bin/ and that the installation - tree is at least more or less what we expect it to - be */ - tmp_basename = opal_basename(orte_cmd_line.path_to_mpirun); - if (0 == strcmp("bin", tmp_basename)) { - char* tmp = orte_cmd_line.path_to_mpirun; - orte_cmd_line.path_to_mpirun = opal_dirname(tmp); - free(tmp); - } else { - free(orte_cmd_line.path_to_mpirun); - orte_cmd_line.path_to_mpirun = NULL; - } - free(tmp_basename); - } - /* if both are given, check to see if they match */ - if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != orte_cmd_line.path_to_mpirun) { - char *tmp_basename; - /* if they don't match, then that merits a warning */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); - /* ensure we strip any trailing '/' */ - if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { - param[strlen(param)-1] = '\0'; - } - tmp_basename = strdup(orte_cmd_line.path_to_mpirun); - if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) { - tmp_basename[strlen(tmp_basename)-1] = '\0'; - } - if (0 != strcmp(param, tmp_basename)) { - orte_show_help("help-orterun.txt", "orterun:double-prefix", - true, orte_basename, orte_basename, - param, tmp_basename, orte_basename); - /* use the prefix over the path-to-mpirun so that - * people can specify the backend prefix as different - * from the local one - */ - free(orte_cmd_line.path_to_mpirun); - orte_cmd_line.path_to_mpirun = NULL; - } - free(tmp_basename); - } else if (NULL != orte_cmd_line.path_to_mpirun) { - param = strdup(orte_cmd_line.path_to_mpirun); - } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ - /* must be --prefix alone */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); - } else { - /* --enable-orterun-prefix-default was given to orterun */ - param = 
strdup(opal_install_dirs.prefix); - } - - if (NULL != param) { - /* "Parse" the param, aka remove superfluous path_sep. */ - param_len = strlen(param); - while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { - param[param_len-1] = '\0'; - param_len--; - if (0 == param_len) { - orte_show_help("help-orterun.txt", "orterun:empty-prefix", - true, orte_basename, orte_basename); - free(param); - return ORTE_ERR_FATAL; - } - } - - orte_cmd_line.prefix = param; - } - want_prefix_by_default = true; - } - - /* flag that I am the HNP - needs to be done prior to - * registering params - */ - orte_process_info.proc_type = ORTE_PROC_HNP; - - /* Setup MCA params */ - orte_register_params(); - - /* save the environment for launch purposes. This MUST be - * done so that we can pass it to any local procs we - * spawn - otherwise, those local procs won't see any - * non-MCA envars were set in the enviro prior to calling - * orterun - */ - orte_launch_environ = opal_argv_copy(environ); - opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ); - opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ); - - /* Intialize our Open RTE environment - * Set the flag telling orte_init that I am NOT a - * singleton, but am "infrastructure" - prevents setting - * up incorrect infrastructure that only a singleton would - * require - */ - if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) { - /* cannot call ORTE_ERROR_LOG as it could be the errmgr - * never got loaded! - */ - return rc; - } - /* finalize OPAL. As it was opened again from orte_init->opal_init - * we continue to have a reference count on it. So we have to finalize it twice... - */ - opal_finalize(); - - /* default our personality to OMPI */ - if (NULL == orte_cmd_line.personality) { - opal_argv_append_nosize(&orte_cmd_line.personalities, "ompi"); - } else { - orte_cmd_line.personalities = opal_argv_split(orte_cmd_line.personality, ','); - } - /* Check for some "global" command line params */ - parse_globals(argc, argv, &cmd_line); - OBJ_DESTRUCT(&cmd_line); - - /* create a new job object to hold the info for this one - the - * jobid field will be filled in by the PLM when the job is - * launched - */ - jdata = OBJ_NEW(orte_job_t); - if (NULL == jdata) { - /* cannot call ORTE_ERROR_LOG as the errmgr - * hasn't been loaded yet! - */ - return ORTE_ERR_OUT_OF_RESOURCE; - } - jdata->personality = opal_argv_copy(orte_cmd_line.personalities); - - /* check what user wants us to do with stdin */ - if (0 == strcmp(orte_cmd_line.stdin_target, "all")) { - jdata->stdin_target = ORTE_VPID_WILDCARD; - } else if (0 == strcmp(orte_cmd_line.stdin_target, "none")) { - jdata->stdin_target = ORTE_VPID_INVALID; - } else { - jdata->stdin_target = strtoul(orte_cmd_line.stdin_target, NULL, 10); - } - - /* if we want the argv's indexed, indicate that */ - if (orte_cmd_line.index_argv) { - orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } - - /* Parse each app, adding it to the job object */ - parse_locals(jdata, argc, argv); - - if (0 == jdata->num_apps) { - /* This should never happen -- this case should be caught in - create_app(), but let's just double check... 
*/ - orte_show_help("help-orterun.txt", "orterun:nothing-to-do", - true, orte_basename); - exit(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - -#if OPAL_ENABLE_FT_CR == 1 - /* Disable OPAL CR notifications for this tool */ - opal_cr_set_enabled(false); - (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); - opal_setenv(tmp_env_var, - "1", - true, &environ); - free(tmp_env_var); -#endif - - /* get the daemon job object */ - daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - - /* check for request to report uri */ - if (NULL != orte_cmd_line.report_uri) { - FILE *fp; - char *rml_uri; - rml_uri = orte_rml.get_contact_info(); - if (0 == strcmp(orte_cmd_line.report_uri, "-")) { - /* if '-', then output to stdout */ - printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); - } else if (0 == strcmp(orte_cmd_line.report_uri, "+")) { - /* if '+', output to stderr */ - fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); - } else { - fp = fopen(orte_cmd_line.report_uri, "w"); - if (NULL == fp) { - orte_show_help("help-orterun.txt", "orterun:write_file", false, - orte_basename, "uri", orte_cmd_line.report_uri); - exit(0); - } - fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); - fclose(fp); - } - if (NULL != rml_uri) { - free(rml_uri); - } - } - - /* If we have a prefix, then modify the PATH and - LD_LIBRARY_PATH environment variables in our copy. This - will ensure that any locally-spawned children will - have our executables and libraries in their path - - For now, default to the prefix_dir provided in the first app_context. - Since there always MUST be at least one app_context, we are safe in - doing this. - */ - param = NULL; - if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) && - orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)¶m, OPAL_STRING)) { - char *oldenv, *newenv, *lib_base, *bin_base; - - /* copy the prefix into the daemon job so that any launcher - * can find the orteds when we launch the virtual machine - */ - if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) { - /* that's an error in the ess */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - orte_set_attribute(&dapp->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_LOCAL, param, OPAL_STRING); - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - - /* Reset PATH */ - newenv = opal_os_path( false, param, bin_base, NULL ); - oldenv = getenv("PATH"); - if (NULL != oldenv) { - char *temp; - asprintf(&temp, "%s:%s", newenv, oldenv ); - free( newenv ); - newenv = temp; - } - opal_setenv("PATH", newenv, true, &orte_launch_environ); - if (orte_debug_flag) { - opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); - } - free(newenv); - free(bin_base); - - /* Reset LD_LIBRARY_PATH */ - newenv = opal_os_path( false, param, lib_base, NULL ); - oldenv = getenv("LD_LIBRARY_PATH"); - if (NULL != oldenv) { - char* temp; - asprintf(&temp, "%s:%s", newenv, oldenv); - free(newenv); - newenv = temp; - } - opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); - if (orte_debug_flag) { - opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", - orte_basename, newenv); - } - free(newenv); - free(lib_base); - free(param); - } - - /* pre-condition any network transports that require it */ - if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { - ORTE_ERROR_LOG(rc); - orte_show_help("help-orterun.txt", "orterun:precondition", false, - 
orte_basename, NULL, NULL, rc); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto DONE; - } - - /* if we were asked to tag output, mark it so */ - if (orte_tag_output) { - orte_set_attribute(&jdata->attributes, ORTE_JOB_TAG_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } - /* if we were asked to timestamp output, mark it so */ - if (orte_timestamp_output) { - orte_set_attribute(&jdata->attributes, ORTE_JOB_TIMESTAMP_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } - /* if we were asked to output to files, pass it along */ - if (NULL != orte_output_filename) { - orte_set_attribute(&jdata->attributes, ORTE_JOB_OUTPUT_TO_FILE, ORTE_ATTR_GLOBAL, orte_output_filename, OPAL_STRING); - } - /* if we were asked to merge stderr to stdout, mark it so */ - if (orte_cmd_line.merge) { - orte_set_attribute(&jdata->attributes, ORTE_JOB_MERGE_STDERR_STDOUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } /* setup to listen for commands sent specifically to me, even though I would probably * be the one sending them! Unfortunately, since I am a participating daemon, * there are times I need to send a command to "all daemons", and that means *I* have @@ -1024,15 +181,16 @@ int orterun(int argc, char *argv[]) orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); - /* setup for debugging */ - orte_debugger_init_before_spawn(jdata); - orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, - orte_debugger_init_after_spawn, - ORTE_SYS_PRI); - orte_state.add_job_state(ORTE_JOB_STATE_DEBUGGER_DETACH, - orte_debugger_detached, - ORTE_SYS_PRI); + /* spawn the job and its daemons */ + if (ORTE_SUCCESS != orte_submit_job(argv, NULL, + NULL, NULL, + NULL, NULL)) { + ORTE_UPDATE_EXIT_STATUS(1); + goto DONE; + } + +#if 0 if (orte_staged_execution) { /* staged execution is requested - each app_context * is treated as a separate job and executed in @@ -1065,34 +223,7 @@ int orterun(int argc, char *argv[]) goto DONE; } } - - /* check for suicide test directives */ - if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || - NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { - /* don't forward IO from this process so we can - * see any debug after daemon termination */ - ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT); - } - - /* check for a job timeout specification, to be provided in seconds - * as that is what MPICH used - */ - if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { - if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE); - goto DONE; - } - orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10); - orte_mpiexec_timeout->tv.tv_usec = 0; - opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev, - orte_timeout_wakeup, jdata); - opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI); - opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv); - } - - /* spawn the job and its daemons */ - rc = orte_plm.spawn(jdata); +#endif /* loop the event lib until an exit event is detected */ while (orte_event_base_active) { @@ -1104,12 +235,12 @@ int orterun(int argc, char *argv[]) DONE: /* if it was created, remove the debugger attach fifo */ - if (0 <= attach_fd) { - if (fifo_active) { - opal_event_del(attach); - free(attach); + if (0 <= orte_debugger_attach_fd) { + if (orte_debugger_fifo_active) { + opal_event_del(orte_debugger_attach); + free(orte_debugger_attach); } - close(attach_fd); + close(orte_debugger_attach_fd); 
unlink(MPIR_attach_fifo); } @@ -1124,1746 +255,3 @@ int orterun(int argc, char *argv[]) } exit(orte_exit_status); } - -static int init_globals(void) -{ - /* Only CONSTRUCT things once */ - if (!globals_init) { - orte_cmd_line.env_val = NULL; - orte_cmd_line.appfile = NULL; - orte_cmd_line.wdir = NULL; - orte_cmd_line.path = NULL; - orte_cmd_line.stdin_target = "0"; - orte_cmd_line.report_pid = NULL; - orte_cmd_line.report_uri = NULL; - orte_cmd_line.disable_recovery = false; - orte_cmd_line.index_argv = false; - orte_cmd_line.run_as_root = false; - orte_cmd_line.personality = NULL; - orte_cmd_line.personalities = NULL; - orte_cmd_line.create_dvm = false; - } - - /* Reset the other fields every time */ - - orte_cmd_line.help = false; - orte_cmd_line.version = false; - orte_cmd_line.verbose = false; - orte_cmd_line.debugger = false; - orte_cmd_line.num_procs = 0; - if( NULL != orte_cmd_line.env_val ) - free( orte_cmd_line.env_val ); - orte_cmd_line.env_val = NULL; - if( NULL != orte_cmd_line.appfile ) - free( orte_cmd_line.appfile ); - orte_cmd_line.appfile = NULL; - if( NULL != orte_cmd_line.wdir ) - free( orte_cmd_line.wdir ); - orte_cmd_line.set_cwd_to_session_dir = false; - orte_cmd_line.wdir = NULL; - if( NULL != orte_cmd_line.path ) - free( orte_cmd_line.path ); - orte_cmd_line.path = NULL; - - orte_cmd_line.preload_binaries = false; - orte_cmd_line.preload_files = NULL; - -#if OPAL_ENABLE_FT_CR == 1 - orte_cmd_line.sstore_load = NULL; -#endif - - /* All done */ - globals_init = true; - return ORTE_SUCCESS; -} - - -static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) -{ - /* check for request to report pid */ - if (NULL != orte_cmd_line.report_pid) { - FILE *fp; - if (0 == strcmp(orte_cmd_line.report_pid, "-")) { - /* if '-', then output to stdout */ - printf("%d\n", (int)getpid()); - } else if (0 == strcmp(orte_cmd_line.report_pid, "+")) { - /* if '+', output to stderr */ - fprintf(stderr, "%d\n", (int)getpid()); - } else { - fp = fopen(orte_cmd_line.report_pid, "w"); - if (NULL == fp) { - orte_show_help("help-orterun.txt", "orterun:write_file", false, - orte_basename, "pid", orte_cmd_line.report_pid); - exit(0); - } - fprintf(fp, "%d\n", (int)getpid()); - fclose(fp); - } - } - - /* Do we want a user-level debugger? */ - - if (orte_cmd_line.debugger) { - run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_line.num_procs); - } - - /* if recovery was disabled on the cmd line, do so */ - if (orte_cmd_line.disable_recovery) { - orte_enable_recovery = false; - orte_max_restarts = 0; - } - - return ORTE_SUCCESS; -} - - -static int parse_locals(orte_job_t *jdata, int argc, char* argv[]) -{ - int i, rc, app_num; - int temp_argc; - char **temp_argv, **env; - orte_app_context_t *app; - bool made_app; - orte_std_cntr_t j, size1; - - /* Make the apps */ - temp_argc = 0; - temp_argv = NULL; - opal_argv_append(&temp_argc, &temp_argv, argv[0]); - - /* NOTE: This bogus env variable is necessary in the calls to - create_app(), below. See comment immediately before the - create_app() function for an explanation. 
*/ - - env = NULL; - for (app_num = 0, i = 1; i < argc; ++i) { - if (0 == strcmp(argv[i], ":")) { - /* Make an app with this argv */ - if (opal_argv_count(temp_argv) > 1) { - if (NULL != env) { - opal_argv_free(env); - env = NULL; - } - app = NULL; - rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); - /** keep track of the number of apps - point this app_context to that index */ - if (ORTE_SUCCESS != rc) { - /* Assume that the error message has already been - printed; no need to cleanup -- we can just - exit */ - exit(1); - } - if (made_app) { - app->idx = app_num; - ++app_num; - opal_pointer_array_add(jdata->apps, app); - ++jdata->num_apps; - if (ORTE_SUCCESS != (rc = orte_schizo.setup_app(jdata->personality, app))) { - return rc; - } - } - - /* Reset the temps */ - - temp_argc = 0; - temp_argv = NULL; - opal_argv_append(&temp_argc, &temp_argv, argv[0]); - } - } else { - opal_argv_append(&temp_argc, &temp_argv, argv[i]); - } - } - - if (opal_argv_count(temp_argv) > 1) { - app = NULL; - rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); - if (ORTE_SUCCESS != rc) { - /* Assume that the error message has already been printed; - no need to cleanup -- we can just exit */ - exit(1); - } - if (made_app) { - app->idx = app_num; - ++app_num; - opal_pointer_array_add(jdata->apps, app); - ++jdata->num_apps; - if (ORTE_SUCCESS != (rc = orte_schizo.setup_app(jdata->personality, app))) { - return rc; - } - } - } - if (NULL != env) { - opal_argv_free(env); - } - opal_argv_free(temp_argv); - - /* Once we've created all the apps, add the global MCA params to - each app's environment (checking for duplicates, of - course -- yay opal_environ_merge()). */ - - if (NULL != global_mca_env) { - size1 = (size_t)opal_pointer_array_get_size(jdata->apps); - /* Iterate through all the apps */ - for (j = 0; j < size1; ++j) { - app = (orte_app_context_t *) - opal_pointer_array_get_item(jdata->apps, j); - if (NULL != app) { - /* Use handy utility function */ - env = opal_environ_merge(global_mca_env, app->env); - opal_argv_free(app->env); - app->env = env; - } - } - } - - /* Now take a subset of the MCA params and set them as MCA - overrides here in orterun (so that when we orte_init() later, - all the components see these MCA params). Here's how we decide - which subset of the MCA params we set here in orterun: - - 1. If any global MCA params were set, use those - 2. If no global MCA params were set and there was only one app, - then use its app MCA params - 3. Otherwise, don't set any - */ - - env = NULL; - if (NULL != global_mca_env) { - env = global_mca_env; - } else { - if (opal_pointer_array_get_size(jdata->apps) >= 1) { - /* Remember that pointer_array's can be padded with NULL - entries; so only use the app's env if there is exactly - 1 non-NULL entry */ - app = (orte_app_context_t *) - opal_pointer_array_get_item(jdata->apps, 0); - if (NULL != app) { - env = app->env; - for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) { - if (NULL != opal_pointer_array_get_item(jdata->apps, j)) { - env = NULL; - break; - } - } - } - } - } - - if (NULL != env) { - size1 = opal_argv_count(env); - for (j = 0; j < size1; ++j) { - /* Use-after-Free error possible here. putenv does not copy - * the string passed to it, and instead stores only the pointer. - * env[j] may be freed later, in which case the pointer - * in environ will now be left dangling into a deallocated - * region. - * So we make a copy of the variable. 
- */ - char *s = strdup(env[j]); - - if (NULL == s) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - putenv(s); - } - } - - /* All done */ - - return ORTE_SUCCESS; -} - - -/* - * This function takes a "char ***app_env" parameter to handle the - * specific case: - * - * orterun --mca foo bar -app appfile - * - * That is, we'll need to keep foo=bar, but the presence of the app - * file will cause an invocation of parse_appfile(), which will cause - * one or more recursive calls back to create_app(). Since the - * foo=bar value applies globally to all apps in the appfile, we need - * to pass in the "base" environment (that contains the foo=bar value) - * when we parse each line in the appfile. - * - * This is really just a special case -- when we have a simple case like: - * - * orterun --mca foo bar -np 4 hostname - * - * Then the upper-level function (parse_locals()) calls create_app() - * with a NULL value for app_env, meaning that there is no "base" - * environment that the app needs to be created from. - */ -static int create_app(int argc, char* argv[], - orte_job_t *jdata, - orte_app_context_t **app_ptr, - bool *made_app, char ***app_env) -{ - opal_cmd_line_t cmd_line; - char cwd[OPAL_PATH_MAX]; - int i, j, count, rc; - char *param, *value; - orte_app_context_t *app = NULL; - bool cmd_line_made = false; - bool found = false; - char *appname; - - *made_app = false; - - /* Pre-process the command line if we are going to parse an appfile later. - * save any mca command line args so they can be passed - * separately to the daemons. - * Use Case: - * $ cat launch.appfile - * -np 1 -mca aaa bbb ./my-app -mca ccc ddd - * -np 1 -mca aaa bbb ./my-app -mca eee fff - * $ mpirun -np 2 -mca foo bar --app launch.appfile - * Only pick up '-mca foo bar' on this pass. - */ - if (NULL != orte_cmd_line.appfile) { - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, 0, argv))) { - goto cleanup; - } - } - - /* Parse application command line options. */ - - init_globals(); - opal_cmd_line_create(&cmd_line, cmd_line_init); - mca_base_cmd_line_setup(&cmd_line); - cmd_line_made = true; - rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); - if (ORTE_SUCCESS != rc) { - goto cleanup; - } - mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); - - /* Is there an appfile in here? */ - - if (NULL != orte_cmd_line.appfile) { - OBJ_DESTRUCT(&cmd_line); - return parse_appfile(jdata, strdup(orte_cmd_line.appfile), app_env); - } - - /* Setup application context */ - - app = OBJ_NEW(orte_app_context_t); - opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); - - /* See if we have anything left */ - - if (0 == count) { - orte_show_help("help-orterun.txt", "orterun:executable-not-specified", - true, orte_basename, orte_basename); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - - /* - * Get mca parameters so we can pass them to the daemons. - * Use the count determined above to make sure we do not go past - * the executable name. Example: - * mpirun -np 2 -mca foo bar ./my-app -mca bip bop - * We want to pick up '-mca foo bar' but not '-mca bip bop' - */ - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, - argc, count, argv))) { - goto cleanup; - } - - /* Grab all OMPI_* environment variables */ - - app->env = opal_argv_copy(*app_env); - if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.personalities, - orte_cmd_line.path, - &cmd_line, - environ, &app->env))) { - goto cleanup; - } - - - /* Did the user request a specific wdir? 
*/ - - if (NULL != orte_cmd_line.wdir) { - /* if this is a relative path, convert it to an absolute path */ - if (opal_path_is_absolute(orte_cmd_line.wdir)) { - app->cwd = strdup(orte_cmd_line.wdir); - } else { - /* get the cwd */ - if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { - orte_show_help("help-orterun.txt", "orterun:init-failure", - true, "get the cwd", rc); - goto cleanup; - } - /* construct the absolute path */ - app->cwd = opal_os_path(false, cwd, orte_cmd_line.wdir, NULL); - } - orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } else if (orte_cmd_line.set_cwd_to_session_dir) { - orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } else { - if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { - orte_show_help("help-orterun.txt", "orterun:init-failure", - true, "get the cwd", rc); - goto cleanup; - } - app->cwd = strdup(cwd); - } - - /* if this is the first app_context, check for prefix directions. - * We only do this for the first app_context because the launchers - * only look at the first one when setting the prefix - we do NOT - * support per-app_context prefix settings! - */ - if (0 == total_num_apps) { - /* Check to see if the user explicitly wanted to disable automatic - --prefix behavior */ - - if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { - want_prefix_by_default = false; - } - - /* Did the user specify a prefix, or want prefix by default? */ - if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { - size_t param_len; - /* if both the prefix was given and we have a prefix - * given above, check to see if they match - */ - if (opal_cmd_line_is_taken(&cmd_line, "prefix") && - NULL != orte_cmd_line.prefix) { - /* if they don't match, then that merits a warning */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); - /* ensure we strip any trailing '/' */ - if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { - param[strlen(param)-1] = '\0'; - } - value = strdup(orte_cmd_line.prefix); - if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) { - value[strlen(value)-1] = '\0'; - } - if (0 != strcmp(param, value)) { - orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict", - true, orte_basename, value, param); - /* let the global-level prefix take precedence since we - * know that one is being used - */ - free(param); - param = strdup(orte_cmd_line.prefix); - } - free(value); - } else if (NULL != orte_cmd_line.prefix) { - param = strdup(orte_cmd_line.prefix); - } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ - /* must be --prefix alone */ - param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); - } else { - /* --enable-orterun-prefix-default was given to orterun */ - param = strdup(opal_install_dirs.prefix); - } - - if (NULL != param) { - /* "Parse" the param, aka remove superfluous path_sep. */ - param_len = strlen(param); - while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { - param[param_len-1] = '\0'; - param_len--; - if (0 == param_len) { - orte_show_help("help-orterun.txt", "orterun:empty-prefix", - true, orte_basename, orte_basename); - free(param); - return ORTE_ERR_FATAL; - } - } - orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING); - free(param); - } - } - } - - /* Did the user specify a hostfile. 
Need to check for both - * hostfile and machine file. - * We can only deal with one hostfile per app context, otherwise give an error. - */ - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { - if(1 < j) { - orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", - true, orte_basename, NULL); - return ORTE_ERR_FATAL; - } else { - value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); - orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); - } - } - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { - if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { - orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", - true, orte_basename, NULL); - return ORTE_ERR_FATAL; - } else { - value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); - orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); - } - } - - /* Did the user specify any hosts? */ - if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { - char **targ=NULL, *tval; - for (i = 0; i < j; ++i) { - value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); - opal_argv_append_nosize(&targ, value); - } - tval = opal_argv_join(targ, ','); - orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING); - opal_argv_free(targ); - free(tval); - } else if (NULL != orte_default_dash_host) { - orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, - orte_default_dash_host, OPAL_STRING); - } - - /* check for bozo error */ - if (0 > orte_cmd_line.num_procs) { - orte_show_help("help-orterun.txt", "orterun:negative-nprocs", - true, orte_basename, app->argv[0], - orte_cmd_line.num_procs, NULL); - return ORTE_ERR_FATAL; - } - - app->num_procs = (orte_std_cntr_t)orte_cmd_line.num_procs; - total_num_apps++; - - /* Capture any preload flags */ - if (orte_cmd_line.preload_binaries) { - orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); - } - /* if we were told to cwd to the session dir and the app was given in - * relative syntax, then we need to preload the binary to - * find the app - don't do this for java apps, however, as we - * can't easily find the class on the cmd line. Java apps have to - * preload their binary via the preload_files option - */ - if (!opal_path_is_absolute(app->argv[0]) && - NULL == strstr(app->argv[0], "java")) { - if (orte_cmd_line.preload_binaries) { - orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) { - orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); - } - } - if (NULL != orte_cmd_line.preload_files) { - orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_LOCAL, - orte_cmd_line.preload_files, OPAL_STRING); - } - -#if OPAL_ENABLE_FT_CR == 1 - if(NULL != orte_cmd_line.sstore_load) { - orte_set_attribute(&app->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, - orte_cmd_line.sstore_load, OPAL_STRING); - } -#endif - - /* Do not try to find argv[0] here -- the starter is responsible - for that because it may not be relevant to try to find it on - the node where orterun is executing. So just strdup() argv[0] - into app. 
*/ - - app->app = strdup(app->argv[0]); - if (NULL == app->app) { - orte_show_help("help-orterun.txt", "orterun:call-failed", - true, orte_basename, "library", "strdup returned NULL", errno); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - - /* if this is a Java application, we have a bit more work to do. Such - * applications actually need to be run under the Java virtual machine - * and the "java" command will start the "executable". So we need to ensure - * that all the proper java-specific paths are provided - */ - appname = opal_basename(app->app); - if (0 == strcmp(appname, "java")) { - /* see if we were given a library path */ - found = false; - for (i=1; NULL != app->argv[i]; i++) { - if (NULL != strstr(app->argv[i], "java.library.path")) { - /* yep - but does it include the path to the mpi libs? */ - found = true; - if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) { - /* doesn't appear to - add it to be safe */ - if (':' == app->argv[i][strlen(app->argv[i]-1)]) { - asprintf(&value, "-Djava.library.path=%s%s", app->argv[i], opal_install_dirs.libdir); - } else { - asprintf(&value, "-Djava.library.path=%s:%s", app->argv[i], opal_install_dirs.libdir); - } - free(app->argv[i]); - app->argv[i] = value; - } - break; - } - } - if (!found) { - /* need to add it right after the java command */ - asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir); - opal_argv_insert_element(&app->argv, 1, value); - free(value); - } - - /* see if we were given a class path */ - found = false; - for (i=1; NULL != app->argv[i]; i++) { - if (NULL != strstr(app->argv[i], "cp") || - NULL != strstr(app->argv[i], "classpath")) { - /* yep - but does it include the path to the mpi libs? */ - found = true; - /* check if mpi.jar exists - if so, add it */ - value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); - if (access(value, F_OK ) != -1) { - set_classpath_jar_file(app, i+1, "mpi.jar"); - } - free(value); - /* check for oshmem support */ - value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); - if (access(value, F_OK ) != -1) { - set_classpath_jar_file(app, i+1, "shmem.jar"); - } - free(value); - /* always add the local directory */ - asprintf(&value, "%s:%s", app->cwd, app->argv[i+1]); - free(app->argv[i+1]); - app->argv[i+1] = value; - break; - } - } - if (!found) { - /* check to see if CLASSPATH is in the environment */ - found = false; // just to be pedantic - for (i=0; NULL != environ[i]; i++) { - if (0 == strncmp(environ[i], "CLASSPATH", strlen("CLASSPATH"))) { - value = strchr(environ[i], '='); - ++value; /* step over the = */ - opal_argv_insert_element(&app->argv, 1, value); - /* check for mpi.jar */ - value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); - if (access(value, F_OK ) != -1) { - set_classpath_jar_file(app, 1, "mpi.jar"); - } - free(value); - /* check for shmem.jar */ - value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); - if (access(value, F_OK ) != -1) { - set_classpath_jar_file(app, 1, "shmem.jar"); - } - free(value); - /* always add the local directory */ - (void)asprintf(&value, "%s:%s", app->cwd, app->argv[1]); - free(app->argv[1]); - app->argv[1] = value; - opal_argv_insert_element(&app->argv, 1, "-cp"); - found = true; - break; - } - } - if (!found) { - /* need to add it right after the java command - have - * to include the working directory and trust that - * the user set cwd if necessary - */ - char *str, *str2; - /* always start with the working directory */ - str = 
strdup(app->cwd); - /* check for mpi.jar */ - value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); - if (access(value, F_OK ) != -1) { - (void)asprintf(&str2, "%s:%s", str, value); - free(str); - str = str2; - } - free(value); - /* check for shmem.jar */ - value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); - if (access(value, F_OK ) != -1) { - asprintf(&str2, "%s:%s", str, value); - free(str); - str = str2; - } - free(value); - opal_argv_insert_element(&app->argv, 1, str); - free(str); - opal_argv_insert_element(&app->argv, 1, "-cp"); - } - } - /* try to find the actual command - may not be perfect */ - for (i=1; i < opal_argv_count(app->argv); i++) { - if (NULL != strstr(app->argv[i], "java.library.path")) { - continue; - } else if (NULL != strstr(app->argv[i], "cp") || - NULL != strstr(app->argv[i], "classpath")) { - /* skip the next field */ - i++; - continue; - } - /* declare this the winner */ - opal_setenv("OMPI_COMMAND", app->argv[i], true, &app->env); - /* collect everything else as the cmd line */ - if ((i+1) < opal_argv_count(app->argv)) { - value = opal_argv_join(&app->argv[i+1], ' '); - opal_setenv("OMPI_ARGV", value, true, &app->env); - free(value); - } - break; - } - } else { - /* add the cmd to the environment for MPI_Info to pickup */ - opal_setenv("OMPI_COMMAND", appname, true, &app->env); - if (1 < opal_argv_count(app->argv)) { - value = opal_argv_join(&app->argv[1], ' '); - opal_setenv("OMPI_ARGV", value, true, &app->env); - free(value); - } - } - free(appname); - - *app_ptr = app; - app = NULL; - *made_app = true; - - /* All done */ - - cleanup: - if (NULL != app) { - OBJ_RELEASE(app); - } - if (cmd_line_made) { - OBJ_DESTRUCT(&cmd_line); - } - return rc; -} - -static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile) -{ - if (NULL == strstr(app->argv[index], jarfile)) { - /* nope - need to add it */ - char *fmt = ':' == app->argv[index][strlen(app->argv[index]-1)] - ? "%s%s/%s" : "%s:%s/%s"; - char *str; - asprintf(&str, fmt, app->argv[index], opal_install_dirs.libdir, jarfile); - free(app->argv[index]); - app->argv[index] = str; - } -} - -static int parse_appfile(orte_job_t *jdata, char *filename, char ***env) -{ - size_t i, len; - FILE *fp; - char line[BUFSIZ]; - int rc, argc, app_num; - orte_app_context_t *app; - bool blank, made_app; - char bogus[] = "bogus "; - char **tmp_env; - - /* - * Make sure to clear out this variable so we don't do anything odd in - * app_create() - */ - if( NULL != orte_cmd_line.appfile ) { - free( orte_cmd_line.appfile ); - orte_cmd_line.appfile = NULL; - } - - /* Try to open the file */ - - fp = fopen(filename, "r"); - if (NULL == fp) { - orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true, - filename); - return ORTE_ERR_NOT_FOUND; - } - - /* Read in line by line */ - - line[sizeof(line) - 1] = '\0'; - app_num = 0; - do { - char **argv; - - /* We need a bogus argv[0] (because when argv comes in from - the command line, argv[0] is "orterun", so the parsing - logic ignores it). So create one here rather than making - an argv and then pre-pending a new argv[0] (which would be - rather inefficient). 
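The line-by-line cleanup that parse_appfile() applies to each appfile entry (strip the trailing newline, cut the line at a '#' or '//' comment, skip blank lines) is shown below as a small standalone sketch; trim_appfile_line() is a made-up name used only for illustration and is not part of this patch.

/* Hypothetical helper mirroring the appfile line cleanup performed below:
 * remove the trailing newline, cut the line at a comment marker, and
 * report whether anything non-blank is left. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool trim_appfile_line(char *line)
{
    size_t len = strlen(line);

    if (len > 0 && '\n' == line[len - 1]) {    /* strip trailing newline */
        line[--len] = '\0';
    }
    for (size_t i = 0; i < len; ++i) {         /* cut at '#' or "//" comments */
        if ('#' == line[i] ||
            (i + 1 < len && '/' == line[i] && '/' == line[i + 1])) {
            line[i] = '\0';
            len = i;
            break;
        }
    }
    for (size_t i = 0; i < len; ++i) {         /* anything non-blank left? */
        if (!isspace((unsigned char)line[i])) {
            return true;
        }
    }
    return false;                              /* blank or comment-only line */
}

int main(void)
{
    char line[] = "-np 1 ./my-app  # comment\n";
    printf("%s -> \"%s\"\n", trim_appfile_line(line) ? "keep" : "skip", line);
    return 0;
}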
*/ - - line[0] = '\0'; - strcat(line, bogus); - - if (NULL == fgets(line + sizeof(bogus) - 1, - sizeof(line) - sizeof(bogus) - 1, fp)) { - break; - } - - /* Remove a trailing newline */ - - len = strlen(line); - if (len > 0 && '\n' == line[len - 1]) { - line[len - 1] = '\0'; - if (len > 0) { - --len; - } - } - - /* Remove comments */ - - for (i = 0; i < len; ++i) { - if ('#' == line[i]) { - line[i] = '\0'; - break; - } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) { - line[i] = '\0'; - break; - } - } - - /* Is this a blank line? */ - - len = strlen(line); - for (blank = true, i = sizeof(bogus); i < len; ++i) { - if (!isspace(line[i])) { - blank = false; - break; - } - } - if (blank) { - continue; - } - - /* We got a line with *something* on it. So process it */ - - argv = opal_argv_split(line, ' '); - argc = opal_argv_count(argv); - if (argc > 0) { - - /* Create a temporary env to use in the recursive call -- - that is: don't disturb the original env so that we can - have a consistent global env. This allows for the - case: - - orterun --mca foo bar --appfile file - - where the "file" contains multiple apps. In this case, - each app in "file" will get *only* foo=bar as the base - environment from which its specific environment is - constructed. */ - - if (NULL != *env) { - tmp_env = opal_argv_copy(*env); - if (NULL == tmp_env) { - opal_argv_free(argv); - fclose(fp); - return ORTE_ERR_OUT_OF_RESOURCE; - } - } else { - tmp_env = NULL; - } - - rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env); - if (ORTE_SUCCESS != rc) { - /* Assume that the error message has already been - printed; no need to cleanup -- we can just exit */ - exit(1); - } - if (NULL != tmp_env) { - opal_argv_free(tmp_env); - } - opal_argv_free(argv); - if (made_app) { - app->idx = app_num; - ++app_num; - opal_pointer_array_add(jdata->apps, app); - ++jdata->num_apps; - } - } - } while (!feof(fp)); - fclose(fp); - - /* All done */ - - free(filename); - return ORTE_SUCCESS; -} -/* - * Process one line from the orte_base_user_debugger MCA param and - * look for that debugger in the path. If we find it, fill in - * new_argv. - */ -static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line, - int argc, char **argv, char ***new_argv, int num_procs) -{ - int ret = ORTE_SUCCESS; - int i, j, count; - char *line = NULL, *tmp = NULL, *full_line = strdup(orig_line); - char **orterun_argv = NULL, **executable_argv = NULL, **line_argv = NULL; - char cwd[OPAL_PATH_MAX]; - bool used_num_procs = false; - bool single_app = false; - bool fail_needed_executable = false; - - line = full_line; - if (NULL == line) { - ret = ORTE_ERR_OUT_OF_RESOURCE; - goto out; - } - - /* Trim off whitespace at the beginning and ending of line */ - - for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) { - continue; - } - for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) { - line[i] = '\0'; - } - if (strlen(line) <= 0) { - ret = ORTE_ERROR; - goto out; - } - - /* Get the tail of the command line (i.e., the user executable / - argv) */ - - opal_cmd_line_get_tail(cmd_line, &i, &executable_argv); - - /* Make a new copy of the orterun command line args, without the - orterun token itself, and without the --debug, --debugger, and - -tv flags. 
*/ - - orterun_argv = opal_argv_copy(argv); - count = opal_argv_count(orterun_argv); - opal_argv_delete(&count, &orterun_argv, 0, 1); - for (i = 0; NULL != orterun_argv[i]; ++i) { - count = opal_argv_count(orterun_argv); - if (0 == strcmp(orterun_argv[i], "-debug") || - 0 == strcmp(orterun_argv[i], "--debug")) { - opal_argv_delete(&count, &orterun_argv, i, 1); - } else if (0 == strcmp(orterun_argv[i], "-tv") || - 0 == strcmp(orterun_argv[i], "--tv")) { - opal_argv_delete(&count, &orterun_argv, i, 1); - } else if (0 == strcmp(orterun_argv[i], "--debugger") || - 0 == strcmp(orterun_argv[i], "-debugger")) { - opal_argv_delete(&count, &orterun_argv, i, 2); - } - } - - /* Replace @@ tokens - line should never realistically be bigger - than MAX_INT, so just cast to int to remove compiler warning */ - - *new_argv = NULL; - line_argv = opal_argv_split(line, ' '); - if (NULL == line_argv) { - ret = ORTE_ERR_NOT_FOUND; - goto out; - } - for (i = 0; NULL != line_argv[i]; ++i) { - if (0 == strcmp(line_argv[i], "@mpirun@") || - 0 == strcmp(line_argv[i], "@orterun@")) { - opal_argv_append_nosize(new_argv, argv[0]); - } else if (0 == strcmp(line_argv[i], "@mpirun_args@") || - 0 == strcmp(line_argv[i], "@orterun_args@")) { - for (j = 0; NULL != orterun_argv && NULL != orterun_argv[j]; ++j) { - opal_argv_append_nosize(new_argv, orterun_argv[j]); - } - } else if (0 == strcmp(line_argv[i], "@np@")) { - used_num_procs = true; - asprintf(&tmp, "%d", num_procs); - opal_argv_append_nosize(new_argv, tmp); - free(tmp); - } else if (0 == strcmp(line_argv[i], "@single_app@")) { - /* This token is only a flag; it is not replaced with any - alternate text */ - single_app = true; - } else if (0 == strcmp(line_argv[i], "@executable@")) { - /* If we found the executable, paste it in. Otherwise, - this is a possible error. */ - if (NULL != executable_argv) { - opal_argv_append_nosize(new_argv, executable_argv[0]); - } else { - fail_needed_executable = true; - } - } else if (0 == strcmp(line_argv[i], "@executable_argv@")) { - /* If we found the tail, paste in the argv. Otherwise, - this is a possible error. */ - if (NULL != executable_argv) { - for (j = 1; NULL != executable_argv[j]; ++j) { - opal_argv_append_nosize(new_argv, executable_argv[j]); - } - } else { - fail_needed_executable = true; - } - } else { - /* It wasn't a special token, so just copy it over */ - opal_argv_append_nosize(new_argv, line_argv[i]); - } - } - - /* Can we find argv[0] in the path? */ - - getcwd(cwd, OPAL_PATH_MAX); - tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd); - if (NULL != tmp) { - free(tmp); - - /* Ok, we found a good debugger. Check for some error - conditions. */ - tmp = opal_argv_join(argv, ' '); - - /* We do not support launching a debugger that requires the - -np value if the user did not specify -np on the command - line. */ - if (used_num_procs && 0 == num_procs) { - free(tmp); - tmp = opal_argv_join(orterun_argv, ' '); - orte_show_help("help-orterun.txt", "debugger requires -np", - true, (*new_argv)[0], argv[0], tmp, - (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Some debuggers do not support launching MPMD */ - else if (single_app && NULL != strstr(tmp, " : ")) { - orte_show_help("help-orterun.txt", - "debugger only accepts single app", true, - (*new_argv)[0], (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Some debuggers do not use orterun/mpirun, and therefore - must have an executable to run (e.g., cannot use mpirun's - app context file feature). 
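The @@ token substitution above is easier to follow with a tiny standalone sketch. The sample debugger line and the expand() helper below are hypothetical and use plain libc instead of the opal_argv routines, but the @mpirun@/@np@/@executable@ tokens mirror the ones handled in process().

/* Minimal sketch of the @token@ expansion performed by process().
 * The sample line and expand() helper are illustrative; the real code
 * splits with opal_argv_split() and appends to new_argv instead of printing. */
#include <stdio.h>
#include <string.h>

static const char *expand(const char *tok)
{
    /* substitutions analogous to the ones handled above */
    if (0 == strcmp(tok, "@mpirun@"))     return "mpirun";
    if (0 == strcmp(tok, "@np@"))         return "4";
    if (0 == strcmp(tok, "@executable@")) return "./a.out";
    return tok;                           /* ordinary text is copied through */
}

int main(void)
{
    char line[] = "totalview @mpirun@ -a -np @np@ @executable@";
    char *save = NULL;

    for (char *tok = strtok_r(line, " ", &save); NULL != tok;
         tok = strtok_r(NULL, " ", &save)) {
        printf("%s ", expand(tok));
    }
    putchar('\n');   /* prints: totalview mpirun -a -np 4 ./a.out */
    return 0;
}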
*/ - else if (fail_needed_executable) { - orte_show_help("help-orterun.txt", - "debugger requires executable", true, - (*new_argv)[0], argv[0], (*new_argv)[0], argv[0], - (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Otherwise, we succeeded. Return happiness. */ - else { - goto out; - } - } - - /* All done -- didn't find it */ - - opal_argv_free(*new_argv); - *new_argv = NULL; - ret = ORTE_ERR_NOT_FOUND; - - out: - if (NULL != orterun_argv) { - opal_argv_free(orterun_argv); - } - if (NULL != executable_argv) { - opal_argv_free(executable_argv); - } - if (NULL != line_argv) { - opal_argv_free(line_argv); - } - if (NULL != tmp) { - free(tmp); - } - if (NULL != full_line) { - free(full_line); - } - return ret; -} - -/** - * Run a user-level debugger - */ -static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, - int argc, char *argv[], int num_procs) -{ - int i, id, ret; - char **new_argv = NULL; - const char **tmp; - char *value, **lines, *env_name; - - /* Get the orte_base_debug MCA parameter and search for a debugger - that can run */ - - id = mca_base_var_find("orte", "orte", NULL, "base_user_debugger"); - if (id < 0) { - orte_show_help("help-orterun.txt", "debugger-mca-param-not-found", - true); - exit(1); - } - - ret = mca_base_var_get_value (id, &tmp, NULL, NULL); - if (OPAL_SUCCESS != ret || NULL == tmp || NULL == tmp[0]) { - orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty", - true); - exit(1); - } - - /* Look through all the values in the MCA param */ - - lines = opal_argv_split(tmp[0], ':'); - for (i = 0; NULL != lines[i]; ++i) { - if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, - &new_argv, num_procs)) { - break; - } - } - - /* If we didn't find one, abort */ - - if (NULL == lines[i]) { - orte_show_help("help-orterun.txt", "debugger-not-found", true); - exit(1); - } - opal_argv_free(lines); - - /* We found one */ - - /* cleanup the MPIR arrays in case the debugger doesn't set them */ - memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH); - memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH); - - /* Set an MCA param so that everyone knows that they are being - launched under a debugger; not all debuggers are consistent - about setting MPIR_being_debugged in both the launcher and the - MPI processes */ - ret = mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); - if (OPAL_SUCCESS == ret && NULL != env_name) { - opal_setenv(env_name, "1", true, &environ); - free(env_name); - } - - /* Launch the debugger */ - execvp(new_argv[0], new_argv); - value = opal_argv_join(new_argv, ' '); - orte_show_help("help-orterun.txt", "debugger-exec-failed", - true, basename, value, new_argv[0]); - free(value); - opal_argv_free(new_argv); - exit(1); -} - -/**** DEBUGGER CODE ****/ -/* - * Debugger support for orterun - * - * We interpret the MPICH debugger interface as follows: - * - * a) The launcher - * - spawns the other processes, - * - fills in the table MPIR_proctable, and sets MPIR_proctable_size - * - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1) - * - calls MPIR_Breakpoint() which the debugger will have a - * breakpoint on. - * - * b) Applications start and then spin until MPIR_debug_gate is set - * non-zero by the debugger. - * - * This file implements (a). - * - ************************************************************************** - * - * Note that we have presently tested both TotalView and DDT parallel - * debuggers. 
They both nominally subscribe to the Etnus attaching - * interface, but there are differences between the two. - * - * TotalView: user launches "totalview mpirun -a ......". - * TV launches mpirun. mpirun launches the application and then calls - * MPIR_Breakpoint(). This is the signal to TV that it's a parallel - * MPI job. TV then reads the proctable in mpirun and attaches itself - * to all the processes (it takes care of launching itself on the - * remote nodes). Upon attaching to all the MPI processes, the - * variable MPIR_being_debugged is set to 1. When it has finished - * attaching itself to all the MPI processes that it wants to, - * MPIR_Breakpoint() returns. - * - * DDT: user launches "ddt bin -np X ". DDT fork/exec's - * mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np - * X ddt-debugger" (not the lack of other arguments -- we can't pass - * anything to mpirun). This app will eventually fork/exec the MPI - * app. DDT does not current set MPIR_being_debugged in the MPI app. - * - ************************************************************************** - * - * We support two ways of waiting for attaching debuggers. The - * implementation spans this file and ompi/debuggers/ompi_debuggers.c. - * - * 1. If using orterun: MPI processes will have the - * orte_in_parallel_debugger MCA param set to true (because not all - * debuggers consistently set MPIR_being_debugged in both the launcher - * and in the MPI procs). The HNP will call MPIR_Breakpoint() and - * then RML send a message to VPID 0 (MCW rank 0) when it returns - * (MPIR_Breakpoint() doesn't return until the debugger has attached - * to all relevant processes). Meanwhile, VPID 0 blocks waiting for - * the RML message. All other VPIDs immediately call the grpcomm - * barrier (and therefore block until the debugger attaches). Once - * VPID 0 receives the RML message, we know that the debugger has - * attached to all processes that it cares about, and VPID 0 then - * joins the grpcomm barrier, allowing the job to continue. This - * scheme has the side effect of nicely supporting partial attaches by - * parallel debuggers (i.e., attaching to only some of the MPI - * processes; not necessarily all of them). - * - * 2. If not using orterun: in this case, we know that there will not be an RML message - * sent to VPID 0. So we have to look for a magic environment - * variable from the launcher to know if the jobs will be attached by - * a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on - * MPIR_debug_gate. These environment variable names must be - * hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c). 
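For reference, the application-side half of this handshake, item (b) above, amounts to spinning on MPIR_debug_gate. The sketch below only shows the shape of that wait; the real implementation lives in ompi/debuggers/ompi_debuggers.c, and the polling interval and the wait_for_debugger() name are illustrative.

/* Illustrative only -- shows the shape of the MPIR wait-for-debugger gate
 * described above, not the actual ompi_debuggers.c code. */
#include <unistd.h>

/* MPIR interface symbols: must be ordinary globals so an attaching
 * debugger can locate and modify them. */
volatile int MPIR_being_debugged = 0;
volatile int MPIR_debug_gate = 0;

static void wait_for_debugger(void)
{
    if (!MPIR_being_debugged) {
        return;             /* nothing attached, nothing to wait for */
    }
    while (0 == MPIR_debug_gate) {
        usleep(100000);     /* debugger sets MPIR_debug_gate != 0 when ready */
    }
}

int main(void)
{
    wait_for_debugger();
    /* ... normal startup would continue here ... */
    return 0;
}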
- */ - -/* local globals and functions */ -#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); -#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) - -struct MPIR_PROCDESC { - char *host_name; /* something that can be passed to inet_addr */ - char *executable_name; /* name of binary */ - int pid; /* process pid */ -}; - - -static void orte_debugger_dump(void) -{ - int i; - - DUMP_INT(MPIR_being_debugged); - DUMP_INT(MPIR_debug_state); - DUMP_INT(MPIR_partial_attach_ok); - DUMP_INT(MPIR_i_am_starter); - DUMP_INT(MPIR_forward_output); - DUMP_INT(MPIR_proctable_size); - fprintf(stderr, " MPIR_proctable:\n"); - for (i = 0; i < MPIR_proctable_size; i++) { - fprintf(stderr, - " (i, host, exe, pid) = (%d, %s, %s, %d)\n", - i, - MPIR_proctable[i].host_name, - MPIR_proctable[i].executable_name, - MPIR_proctable[i].pid); - } - fprintf(stderr, "MPIR_executable_path: %s\n", - ('\0' == MPIR_executable_path[0]) ? - "NULL" : (char*) MPIR_executable_path); - fprintf(stderr, "MPIR_server_arguments: %s\n", - ('\0' == MPIR_server_arguments[0]) ? - "NULL" : (char*) MPIR_server_arguments); -} - -/** - * Initialization of data structures for running under a debugger - * using the MPICH/TotalView parallel debugger interface. Before the - * spawn we need to check if we are being run under a TotalView-like - * debugger; if so then inform applications via an MCA parameter. - */ -static void orte_debugger_init_before_spawn(orte_job_t *jdata) -{ - char *env_name; - orte_app_context_t *app; - int i; - char *attach_fifo; - - if (!MPIR_being_debugged && !orte_in_parallel_debugger) { - /* if we were given a test debugger, then we still want to - * colaunch it - */ - if (NULL != orte_debugger_test_daemon) { - opal_output_verbose(2, orte_debug_output, - "%s No debugger test daemon specified", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - goto launchit; - } - /* if we were given an auto-detect rate, then we want to setup - * an event so we periodically do the check - */ - if (0 < orte_debugger_check_rate) { - opal_output_verbose(2, orte_debug_output, - "%s Setting debugger attach check rate for %d seconds", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_debugger_check_rate); - ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI); - } else if (orte_create_session_dirs) { - /* create the attachment FIFO and setup readevent - cannot be - * done if no session dirs exist! 
- */ - attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL); - if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) { - opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno); - free(attach_fifo); - return; - } - strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1); - free(attach_fifo); - open_fifo(); - } - return; - } - - launchit: - opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger"); - - /* tell the procs they are being debugged */ - (void) mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); - - for (i=0; i < jdata->apps->size; i++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - continue; - } - opal_setenv(env_name, "1", true, &app->env); - } - free(env_name); -} - -static void setup_debugger_job(void) -{ - orte_job_t *debugger; - orte_app_context_t *app; - orte_proc_t *proc; - int i, rc; - orte_node_t *node; - orte_vpid_t vpid=0; - char cwd[OPAL_PATH_MAX]; - - /* setup debugger daemon job */ - debugger = OBJ_NEW(orte_job_t); - /* create a jobid for these daemons - this is done solely - * to avoid confusing the rest of the system's bookkeeping - */ - orte_plm_base_create_jobid(debugger); - /* set the personality to ORTE */ - opal_argv_append_nosize(&debugger->personality, "orte"); - /* flag the job as being debugger daemons */ - ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON); - /* unless directed, we do not forward output */ - if (!MPIR_forward_output) { - ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_FORWARD_OUTPUT); - } - /* dont push stdin */ - debugger->stdin_target = ORTE_VPID_INVALID; - /* add it to the global job pool */ - opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger); - /* create an app_context for the debugger daemon */ - app = OBJ_NEW(orte_app_context_t); - if (NULL != orte_debugger_test_daemon) { - app->app = strdup(orte_debugger_test_daemon); - } else { - app->app = strdup((char*)MPIR_executable_path); - } - /* don't currently have an option to pass the debugger - * cwd - probably should add one someday - */ - if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { - orte_show_help("help-orterun.txt", "orterun:init-failure", - true, "get the cwd", rc); - return; - } - app->cwd = strdup(cwd); - orte_remove_attribute(&app->attributes, ORTE_APP_USER_CWD); - opal_argv_append_nosize(&app->argv, app->app); - build_debugger_args(app); - opal_pointer_array_add(debugger->apps, app); - debugger->num_apps = 1; - /* create a job map */ - debugger->map = OBJ_NEW(orte_job_map_t); - /* in building the map, we want to launch one debugger daemon - * on each node that *already has an application process on it*. - * We cannot just launch one debugger daemon on EVERY node because - * the original job may not have placed procs on every node. So - * we construct the map here by cycling across all nodes, adding - * only those nodes where num_procs > 0. 
- */ - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - /* if this node wasn't included in the vm, ignore it */ - if (NULL == node->daemon) { - continue; - } - /* if the node doesn't have any app procs on it, ignore it */ - if (node->num_procs < 1) { - continue; - } - /* this node has at least one proc, so add it to our map */ - OBJ_RETAIN(node); - opal_pointer_array_add(debugger->map->nodes, node); - debugger->map->num_nodes++; - /* add a debugger daemon to the node - note that the - * debugger daemon does NOT count against our subscribed slots - */ - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = debugger->jobid; - proc->name.vpid = vpid++; - /* set the local/node ranks - we don't actually care - * what these are, but the odls needs them - */ - proc->local_rank = 0; - proc->node_rank = 0; - proc->app_rank = proc->name.vpid; - /* flag the proc as ready for launch */ - proc->state = ORTE_PROC_STATE_INIT; - proc->app_idx = 0; - - OBJ_RETAIN(node); /* maintain accounting on object */ - proc->node = node; - /* add the proc to the job */ - opal_pointer_array_set_item(debugger->procs, proc->name.vpid, proc); - debugger->num_procs++; - - /* add the proc to the node's array */ - OBJ_RETAIN(proc); - opal_pointer_array_add(node->procs, (void*)proc); - node->num_procs++; - } - /* schedule it for launch */ - debugger->state = ORTE_JOB_STATE_INIT; - ORTE_ACTIVATE_JOB_STATE(debugger, ORTE_JOB_STATE_LAUNCH_APPS); -} - -static bool mpir_breakpoint_fired = false; - -/* - * Initialization of data structures for running under a debugger - * using the MPICH/TotalView parallel debugger interface. This stage - * of initialization must occur after spawn - * - * NOTE: We -always- perform this step to ensure that any debugger - * that attaches to us post-launch of the application can get a - * completed proctable - */ -void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) -{ - orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - orte_job_t *jdata = caddy->jdata; - orte_proc_t *proc; - orte_app_context_t *appctx; - orte_vpid_t i, j; - opal_buffer_t *buf; - int rc; - char **aliases, *aptr; - - /* if we couldn't get thru the mapper stage, we might - * enter here with no procs. 
Avoid the "zero byte malloc" - * message by checking here - */ - if (MPIR_proctable || 0 == jdata->num_procs) { - /* already initialized */ - opal_output_verbose(5, orte_debug_output, - "%s: debugger already initialized or zero procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - OBJ_RELEASE(caddy); - if (!mpir_breakpoint_fired) { - /* record that we have triggered the debugger */ - mpir_breakpoint_fired = true; - - /* trigger the debugger */ - MPIR_Breakpoint(); - - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || - ORTE_PROC_STATE_UNTERMINATED < proc->state ) { - /* proc is already dead */ - return; - } - buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ - if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, - ORTE_RML_TAG_DEBUGGER_RELEASE, - orte_rml_send_callback, NULL))) { - opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); - OBJ_RELEASE(buf); - } - } - return; - } - - /* fill in the proc table for the application processes */ - - opal_output_verbose(5, orte_debug_output, - "%s: Setting up debugger process table for applications", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - MPIR_debug_state = 1; - - /* set the total number of processes in the job */ - MPIR_proctable_size = jdata->num_procs; - - /* allocate MPIR_proctable */ - MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) * - MPIR_proctable_size); - if (MPIR_proctable == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(caddy); - return; - } - - if (orte_debugger_dump_proctable) { - opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - } - - /* initialize MPIR_proctable */ - for (j=0; j < jdata->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - /* store this data in the location whose index - * corresponds to the proc's rank - */ - i = proc->name.vpid; - if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { - continue; - } - - /* take the indicated alias as the hostname, if aliases exist */ - if (orte_retain_aliases) { - aliases = NULL; - aptr = NULL; - if (orte_get_attribute(&proc->node->attributes, ORTE_NODE_ALIAS, (void**)&aptr, OPAL_STRING)) { - aliases = opal_argv_split(aptr, ','); - free(aptr); - if (orte_use_hostname_alias <= opal_argv_count(aliases)) { - MPIR_proctable[i].host_name = strdup(aliases[orte_use_hostname_alias-1]); - } - opal_argv_free(aliases); - } - } else { - /* just use the default name */ - MPIR_proctable[i].host_name = strdup(proc->node->name); - } - - if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { - MPIR_proctable[i].executable_name = - opal_os_path( false, appctx->app, NULL ); - } else { - MPIR_proctable[i].executable_name = - opal_os_path( false, appctx->cwd, appctx->app, NULL ); - } - MPIR_proctable[i].pid = proc->pid; - if (orte_debugger_dump_proctable) { - opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", - ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, - MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); - } - } - - if (0 < opal_output_get_verbosity(orte_debug_output)) { - orte_debugger_dump(); - } - - /* if we are being launched under a debugger, then we must wait - * for it to be ready to go and do some things to start the job - */ - if (MPIR_being_debugged || NULL != orte_debugger_test_daemon || - NULL != 
getenv("ORTE_TEST_DEBUGGER_ATTACH")) { - /* if we are not launching debugger daemons, then trigger - * the debugger - otherwise, we need to wait for the debugger - * daemons to be started - */ - if ('\0' == MPIR_executable_path[0] && NULL == orte_debugger_test_daemon) { - /* record that we have triggered the debugger */ - mpir_breakpoint_fired = true; - - /* trigger the debugger */ - MPIR_Breakpoint(); - - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || - ORTE_PROC_STATE_UNTERMINATED < proc->state) { - /* proc is already dead or never registered with us (so we don't have - * contact info for him) - */ - return; - } - opal_output_verbose(2, orte_debug_output, - "%s sending debugger release to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name)); - buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ - if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, - ORTE_RML_TAG_DEBUGGER_RELEASE, - orte_rml_send_callback, NULL))) { - opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); - OBJ_RELEASE(buf); - } - } else { - /* if I am launching debugger daemons, then I need to do so now - * that the job has been started and I know which nodes have - * apps on them - */ - opal_output_verbose(2, orte_debug_output, - "%s Cospawning debugger daemons %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? - MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); - } - /* we don't have anything else to do */ - OBJ_RELEASE(caddy); - return; - } - - /* if we are not being debugged, then just cleanup and depart */ - OBJ_RELEASE(caddy); -} - -static void orte_debugger_detached(int fd, short event, void *cbdata) -{ - orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - OBJ_RELEASE(caddy); - - /* need to ensure MPIR_Breakpoint is called again if another debugger attaches */ - mpir_breakpoint_fired = false; -} - -static void open_fifo (void) -{ - if (attach_fd > 0) { - close(attach_fd); - } - - attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0); - if (attach_fd < 0) { - opal_output(0, "%s unable to open debugger attach fifo", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - /* Set this fd to be close-on-exec so that children don't see it */ - if (opal_fd_set_cloexec(attach_fd) != OPAL_SUCCESS) { - opal_output(0, "%s unable to set debugger attach fifo to CLOEXEC", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - close(attach_fd); - attach_fd = -1; - return; - } - - opal_output_verbose(2, orte_debug_output, - "%s Monitoring debugger attach fifo %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - MPIR_attach_fifo); - attach = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach); - - fifo_active = true; - opal_event_add(attach, 0); -} - -static void attach_debugger(int fd, short event, void *arg) -{ - unsigned char fifo_cmd; - int rc; - orte_timer_t *tm; - opal_event_t *attach; - - if (fifo_active) { - attach = (opal_event_t*)arg; - fifo_active = false; - - rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); - if (!rc) { - /* release the current event */ - opal_event_free(attach); - /* reopen device to clear hangup */ - open_fifo(); - return; - } - if (1 != fifo_cmd) { - /* ignore the cmd */ - fifo_active = true; - opal_event_add(attach, 0); - return; - } - } - - if (!MPIR_being_debugged && !orte_debugger_test_attach) { 
- /* false alarm - reset the read or timer event */ - if (0 == orte_debugger_check_rate) { - fifo_active = true; - opal_event_add(attach, 0); - } else if (!MPIR_being_debugged) { - tm = (orte_timer_t*)arg; - /* re-add the event */ - opal_event_evtimer_add(tm->ev, &tm->tv); - } - return; - } - - opal_output_verbose(1, orte_debug_output, - "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); - - /* a debugger has attached! All the MPIR_Proctable - * data is already available, so we only need to - * check to see if we should spawn any daemons - */ - if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) { - opal_output_verbose(2, orte_debug_output, - "%s Spawning debugger daemons %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? - MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); - } - - /* reset the read or timer event */ - if (0 == orte_debugger_check_rate) { - fifo_active = true; - opal_event_add(attach, 0); - } else if (!MPIR_being_debugged) { - tm = (orte_timer_t*)arg; - /* re-add the event */ - opal_event_evtimer_add(tm->ev, &tm->tv); - } -} - -static void build_debugger_args(orte_app_context_t *debugger) -{ - int i, j; - char mpir_arg[MPIR_MAX_ARG_LENGTH]; - - if ('\0' != MPIR_server_arguments[0]) { - j=0; - memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); - for (i=0; i < MPIR_MAX_ARG_LENGTH; i++) { - if (MPIR_server_arguments[i] == '\0') { - if (0 < j) { - opal_argv_append_nosize(&debugger->argv, mpir_arg); - memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); - j=0; - } - } else { - mpir_arg[j] = MPIR_server_arguments[i]; - j++; - } - } - } -} - -void orte_timeout_wakeup(int sd, short args, void *cbdata) -{ - char *tm; - - /* this function gets called when the job execution time - * has hit a prescribed limit - so just abort - */ - tm = getenv("MPIEXEC_TIMEOUT"); - orte_show_help("help-orterun.txt", "orterun:timeout", - true, (NULL == tm) ? "NULL" : tm); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - /* if we are testing HNP suicide, then just exit */ - if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) { - opal_output(0, "HNP exiting w/o cleanup"); - exit(1); - } - /* abort the job */ - ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); - /* set the global abnormal exit flag */ - orte_abnormal_term_ordered = true; -} diff --git a/orte/util/Makefile.am b/orte/util/Makefile.am index e0af75ecc8..6873f08328 100644 --- a/orte/util/Makefile.am +++ b/orte/util/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2016 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -57,7 +57,8 @@ headers += \ util/nidmap.h \ util/regex.h \ util/attr.h \ - util/listener.h + util/listener.h \ + util/cmd_line.h lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \ util/error_strings.c \ @@ -76,7 +77,8 @@ lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \ util/nidmap.c \ util/regex.c \ util/attr.c \ - util/listener.c + util/listener.c \ + util/cmd_line.c # Remove the generated man pages distclean-local: diff --git a/orte/util/cmd_line.c b/orte/util/cmd_line.c new file mode 100644 index 0000000000..cf5d755140 --- /dev/null +++ b/orte/util/cmd_line.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016 Intel, Inc. 
All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "orte_config.h"
+#include "orte/types.h"
+#include "orte/constants.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "opal/mca/base/base.h"
+#include "opal/util/cmd_line.h"
+#include "opal/util/printf.h"
+#include "opal/runtime/opal.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/schizo/base/base.h"
+
+#include "orte/util/cmd_line.h"
+
+int orte_cmd_line_create(opal_cmd_line_t *cmd_line,
+                         int argc, char **argv,
+                         char ***context_env, char ***global_env,
+                         bool *version, bool *help)
+{
+    int i, rc;
+
+    if (NULL != version) {
+        *version = false;
+    }
+    if (NULL != help) {
+        *help = false;
+    }
+
+    if (NULL != version) {
+        /* see if they asked for us to print version */
+        for (i=0; NULL != argv[i]; i++) {
+            if (0 == strcmp(argv[i], "--version") ||
+                0 == strcmp(argv[i], "-V")) {
+                *version = true;
+                return ORTE_SUCCESS;
+            }
+        }
+    }
+
+    /* process any mca params */
+    if (OPAL_SUCCESS != (rc = mca_base_cmd_line_process_args(argv, context_env, global_env))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+
+    opal_cmd_line_create(cmd_line, NULL);
+
+    /* init the MCA system - will just refcount if already initialized */
+    opal_init_util(NULL, NULL);
+
+    /* open the SCHIZO framework so we can define the cmd line options */
+    if (ORTE_SUCCESS != (rc = mca_base_framework_open(&orte_schizo_base_framework, 0))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    if (ORTE_SUCCESS != (rc = orte_schizo_base_select())) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* define the cli options */
+    if (ORTE_SUCCESS != (rc = orte_schizo.define_cli(cmd_line))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    /* close the framework for bookkeeping purposes */
+    mca_base_framework_close(&orte_schizo_base_framework);
+
+    /* decrement the opal refcount */
+    opal_finalize_util();
+
+    /* now that options have been defined, finish setup */
+    mca_base_cmd_line_setup(cmd_line);
+
+
+    /* Check for help request - must do this after we setup
+     * the cmd line so the help messages can display */
+    if (NULL != help) {
+        for (i=0; NULL != argv[i]; i++) {
+            if (0 == strcmp(argv[i], "--help") ||
+                0 == strcmp(argv[i], "-h")) {
+                *help = true;
+                return ORTE_SUCCESS;
+            }
+        }
+    }
+
+    /* parse the result to get values */
+    if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(cmd_line, true,
+                                                  argc, argv)) ) {
+        if (OPAL_ERR_SILENT != rc) {
+            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
+                    opal_strerror(rc));
+        }
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
diff --git a/orte/util/cmd_line.h b/orte/util/cmd_line.h
new file mode 100644
index 0000000000..231fa1d96c
--- /dev/null
+++ b/orte/util/cmd_line.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/** @file:
+ *
+ * Helper for building the command line of the ORTE tools.
+ *
+ * Processes any MCA parameters, defines the supported options via the schizo framework, and parses argv into the caller's opal_cmd_line_t.
+ *
+ */
+
+#ifndef _ORTE_CMD_LINE_H_
+#define _ORTE_CMD_LINE_H_
+
+#include "orte_config.h"
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
+#include "orte/types.h"
+
+#include "opal/util/cmd_line.h"
+
+BEGIN_C_DECLS
+
+ORTE_DECLSPEC int orte_cmd_line_create(opal_cmd_line_t *cmd_line,
+                                       int argc, char **argv,
+                                       char ***context_env, char ***global_env,
+                                       bool *version, bool *help);
+
+END_C_DECLS
+#endif
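A tool that adopts the new helper would be expected to drive it roughly as sketched below. This is a usage sketch, not code from this patch: tool_parse_cmd_line() is a hypothetical wrapper, and the caller remains responsible for interrogating and destructing the opal_cmd_line_t afterwards.

/* Hypothetical caller of the new orte_cmd_line_create() helper */
#include "orte/constants.h"
#include "orte/util/cmd_line.h"

int tool_parse_cmd_line(int argc, char **argv,
                        char ***app_env, char ***global_env)
{
    opal_cmd_line_t cmd_line;
    bool want_version = false, want_help = false;
    int rc;

    rc = orte_cmd_line_create(&cmd_line, argc, argv,
                              app_env, global_env,
                              &want_version, &want_help);
    if (ORTE_SUCCESS != rc) {
        return rc;                  /* parse errors already reported unless silent */
    }

    if (want_version || want_help) {
        /* the tool prints its own version/help text and exits */
        return ORTE_SUCCESS;
    }

    /* ... query options with opal_cmd_line_is_taken() / get_param() ... */

    OBJ_DESTRUCT(&cmd_line);
    return ORTE_SUCCESS;
}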