From 795140e59053b45661db9eef328cc30f1264f253 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 17 Jun 2018 02:40:12 -0700 Subject: [PATCH] Make use of "instant-on" feature optional The PMIx support for "instant on" remains experimental, so disable it by default. Provide an MCA param and corresponding command line option to enable it at runtime. Signed-off-by: Ralph Castain --- orte/mca/odls/base/odls_base_default_fns.c | 23 +++++++--------------- orte/mca/schizo/ompi/schizo_ompi.c | 8 +++++++- orte/runtime/orte_globals.c | 3 +++ orte/runtime/orte_globals.h | 3 +++ orte/runtime/orte_mca_params.c | 9 ++++++++- 5 files changed, 28 insertions(+), 18 deletions(-) mode change 100755 => 100644 orte/mca/odls/base/odls_base_default_fns.c diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c old mode 100755 new mode 100644 index d9b3b858a9..f98dd7468b --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -100,7 +100,6 @@ #include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/odls_private.h" -#if 0 static void setup_cbfunc(int status, opal_list_t *info, void *provided_cbdata, @@ -132,9 +131,8 @@ static void setup_cbfunc(int status, /* move to next stage */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG); -} -#endif +} /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW */ @@ -433,8 +431,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, } /* get any application prep info */ -#if 0 - if (NULL != opal_pmix.server_setup_application) { + if (orte_enable_instant_on_support && NULL != opal_pmix.server_setup_application) { /* we don't want to block here because it could * take some indeterminate time to get the info */ if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_application(jdata->jobid, NULL, setup_cbfunc, jdata))) { @@ -442,7 +439,6 @@ int 
orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, } return rc; } -#endif /* move to next stage */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG); @@ -457,13 +453,11 @@ static void fm_release(void *cbdata) OBJ_RELEASE(bptr); } -#if 0 static void ls_cbunc(int status, void *cbdata) { opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata; OPAL_PMIX_WAKEUP_THREAD(lock); } -#endif int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, orte_jobid_t *job) @@ -801,11 +795,11 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, goto REPORT_ERROR; } -#if 0 /* if we have local support setup info, then execute it here - we * have to do so AFTER we register the nspace so the PMIx server * has the nspace info it needs */ - if (0 < opal_list_get_size(&local_support) && + if (orte_enable_instant_on_support && + 0 < opal_list_get_size(&local_support) && NULL != opal_pmix.server_setup_local_support) { if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_local_support(jdata->jobid, &local_support, ls_cbunc, &lock))) { @@ -815,8 +809,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } else { lock.active = false; // we won't get a callback } -#endif - lock.active = false; // we won't get a callback /* if we have a file map, then we need to load it */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&bptr, OPAL_BUFFER)) { @@ -1067,11 +1059,10 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name)); - // if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) { + if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) { /* dump what is going to be exec'd */ - opal_dss.dump(0, app, ORTE_APP_CONTEXT); - // } - exit(1); + opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT); + } if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) { /* error 
message already output */ diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 5e1daf9e21..ccde331d80 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -509,6 +509,12 @@ static opal_cmd_line_init_t cmd_line_init[] = { "Forward mpirun port to compute node daemons so all will use it", OPAL_CMD_LINE_OTYPE_LAUNCH }, + /* enable instant-on support */ + { "orte_enable_instant_on_support", '\0', "enable-instant-on-support", "enable-instant-on-support", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable PMIx-based instant on launch support (experimental)", + OPAL_CMD_LINE_OTYPE_LAUNCH }, + /* End of list */ { NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } @@ -874,7 +880,7 @@ static int setup_fork(orte_job_t *jdata, tmp_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); assert (NULL != tmp_app); orte_get_attribute(&tmp_app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING); - } + } for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) { char *newenv; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 1f7fc2ec7f..4f043329d1 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -205,6 +205,9 @@ char *orte_job_ident = NULL; bool orte_execute_quiet = false; bool orte_report_silent_errors = false; +/* enable PMIx-based "instant on" support */ +bool orte_enable_instant_on_support = false; + /* See comment in orte/tools/orterun/debuggers.c about this MCA param */ bool orte_in_parallel_debugger = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 1ae4de3eee..3c93c6dbe2 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -588,6 +588,9 @@ ORTE_DECLSPEC extern char *orte_daemon_cores; /* Max time to wait for stack straces to return */ ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout; +/* enable PMIx-based "instant on" support */ 
+ORTE_DECLSPEC extern bool orte_enable_instant_on_support; + END_C_DECLS #endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index afad291116..0053a663e4 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -13,7 +13,7 @@ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. @@ -790,5 +790,12 @@ int orte_register_params(void) OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL, &orte_data_server_uri); + orte_enable_instant_on_support = false; + (void) mca_base_var_register ("orte", "orte", NULL, "enable_instant_on_support", + "Enable PMIx-based instant on launch support (experimental)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &orte_enable_instant_on_support); + return ORTE_SUCCESS; }