From 3913595e109100af411435160d2c43e89440b144 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 29 May 2016 18:56:18 -0700 Subject: [PATCH] Enable simulation of large-scale clusters by allowing multiple daemons/node. Specifying the ras_base_multiplier parameter to be greater than 1 will cause ORTE to replicate each allocated node by that factor. A daemon will be spawned for each replica, thus letting ORTE function as if it were on a much larger cluster. Note that this cannot be used for MPI performance testing. It is really only useful for ORTE scaling tests. It also only works with the rsh/ssh launcher. --- orte/mca/ess/base/ess_base_std_orted.c | 11 ++++++++++ orte/mca/plm/base/plm_base_launch_support.c | 9 ++++++-- orte/mca/ras/base/base.h | 2 ++ orte/mca/ras/base/ras_base_frame.c | 14 +++++++++++- orte/mca/ras/base/ras_base_node.c | 22 ++++++++++++++++--- orte/mca/schizo/ompi/schizo_ompi.c | 3 ++- .../data_type_support/orte_dt_copy_fns.c | 16 +++++++++++--- orte/runtime/orte_mca_params.c | 15 +++++++++++++ orte/tools/orterun/orterun.c | 1 + orte/util/attr.h | 1 + 10 files changed, 84 insertions(+), 10 deletions(-) diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 560fbc2eaa..041f27784f 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -245,6 +245,17 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_session_dir define"; goto error; } + /* if we have multiple daemons/node, then add our pid to the name */ + if (NULL != (param = getenv("OMPI_MCA_ras_base_multiplier")) && + 1 < strtol(param, NULL, 10)) { + if (0 > asprintf(&param, "%s.%lu", orte_process_info.top_session_dir, (unsigned long)orte_process_info.pid)) { + ret = ORTE_ERR_OUT_OF_RESOURCE; + error = "create top session dir"; + goto error; + } + free(orte_process_info.top_session_dir); + orte_process_info.top_session_dir = param; + } /* clear the session directory just in case there are * stale 
directories laying around */ diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index e52347a65e..22b69dfc63 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1528,6 +1528,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) bool default_hostfile_used; char *hosts; bool singleton=false; + bool multi_sim = false; OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setup_vm", @@ -1617,7 +1618,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) * look across all jobs and ensure that the "VM" contains * all nodes with application procs on them */ - if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL)) { + multi_sim = orte_get_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM, NULL, OPAL_BOOL); + if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL) || multi_sim) { OBJ_CONSTRUCT(&nodes, opal_list_t); /* loop across all nodes and include those that have * num_procs > 0 && no daemon already on them @@ -1645,7 +1647,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) /* not to be used */ continue; } - if (0 < node->num_procs) { + if (0 < node->num_procs || multi_sim) { /* retain a copy for our use in case the item gets * destructed along the way */ @@ -1653,6 +1655,9 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) opal_list_append(&nodes, &node->super); } } + if (multi_sim) { + goto process; + } /* see if anybody had procs */ if (0 == opal_list_get_size(&nodes)) { /* if the HNP has some procs, then we are still good */ diff --git a/orte/mca/ras/base/base.h b/orte/mca/ras/base/base.h index 64cccb14e1..e766dc86ce 100644 --- a/orte/mca/ras/base/base.h +++ b/orte/mca/ras/base/base.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. 
+ * Copyright (c) 2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -49,6 +50,7 @@ typedef struct orte_ras_base_t { bool allocation_read; orte_ras_base_module_t *active_module; int total_slots_alloc; + int multiplier; } orte_ras_base_t; ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; diff --git a/orte/mca/ras/base/ras_base_frame.c b/orte/mca/ras/base/ras_base_frame.c index 2af38350a8..a7a0918c35 100644 --- a/orte/mca/ras/base/ras_base_frame.c +++ b/orte/mca/ras/base/ras_base_frame.c @@ -50,6 +50,18 @@ */ orte_ras_base_t orte_ras_base = {0}; +static int ras_register(mca_base_register_flag_t flags) +{ + orte_ras_base.multiplier = 1; + mca_base_var_register("orte", "ras", "base", "multiplier", + "Simulate a larger cluster by launching N daemons/node", + MCA_BASE_VAR_TYPE_INT, + NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier); + return ORTE_SUCCESS; +} + static int orte_ras_base_close(void) { /* Close selected component */ @@ -76,5 +88,5 @@ static int orte_ras_base_open(mca_base_open_flag_t flags) } MCA_BASE_FRAMEWORK_DECLARE(orte, ras, "ORTE Resource Allocation Subsystem", - NULL, orte_ras_base_open, orte_ras_base_close, + ras_register, orte_ras_base_open, orte_ras_base_close, mca_ras_base_static_components, 0); diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index d895cedcda..ae11c44db5 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -44,7 +44,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) opal_list_item_t* item; orte_std_cntr_t num_nodes; int rc, i; - orte_node_t *node, *hnp_node; + orte_node_t *node, *hnp_node, *nptr; char *ptr; bool hnp_alone = true; orte_attribute_t *kv; @@ -61,10 +61,16 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)num_nodes)); + /* mark the job as being a large-cluster sim if that was 
requested */ + if (1 < orte_ras_base.multiplier) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM, + ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } + /* set the size of the global array - this helps minimize time * spent doing realloc's */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes))) { + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes * orte_ras_base.multiplier))) { ORTE_ERROR_LOG(rc); return rc; } @@ -139,6 +145,12 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) } /* don't keep duplicate copy */ OBJ_RELEASE(node); + /* create copies, if required */ + for (i=1; i < orte_ras_base.multiplier; i++) { + opal_dss.copy((void**)&node, hnp_node, ORTE_NODE); + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED); + node->index = opal_pointer_array_add(orte_node_pool, node); + } } else { /* insert the object onto the orte_nodes global array */ OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, @@ -166,7 +178,11 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) } /* indicate the HNP is not alone */ hnp_alone = false; - } + for (i=1; i < orte_ras_base.multiplier; i++) { + opal_dss.copy((void**)&nptr, node, ORTE_NODE); + nptr->index = opal_pointer_array_add(orte_node_pool, nptr); + } + } } /* if we didn't find any fqdn names in the allocation, then diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index f197d0a33d..0e8e19c148 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -941,8 +941,9 @@ static int setup_fork(orte_job_t *jdata, opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env); free(param); - /* forcibly set the local tmpdir base to match ours */ + /* forcibly set the local tmpdir base and top session dir to match ours */ opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env); + 
opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env); /* MPI-3 requires we provide some further info to the procs, * so we pass them as envars to avoid introducing further diff --git a/orte/runtime/data_type_support/orte_dt_copy_fns.c b/orte/runtime/data_type_support/orte_dt_copy_fns.c index 36ee0cbf52..2b245953fb 100644 --- a/orte/runtime/data_type_support/orte_dt_copy_fns.c +++ b/orte/runtime/data_type_support/orte_dt_copy_fns.c @@ -12,7 +12,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -66,8 +66,18 @@ int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t type) */ int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, opal_data_type_t type) { - (*dest) = src; - OBJ_RETAIN(src); + orte_node_t *node; + + node = OBJ_NEW(orte_node_t); + node->name = strdup(src->name); + node->state = src->state; + node->slots = src->slots; + node->slots_inuse = src->slots_inuse; + node->slots_max = src->slots_max; + node->topology = src->topology; + node->flags = src->flags; + (*dest) = node; + return ORTE_SUCCESS; } diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index bf3a852844..9e9e800f10 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -50,6 +50,7 @@ static char *orte_fork_agent_string = NULL; static char *orte_tmpdir_base = NULL; static char *orte_local_tmpdir_base = NULL; static char *orte_remote_tmpdir_base = NULL; +static char *orte_top_session_dir = NULL; int orte_register_params(void) { @@ -150,6 +151,20 @@ int orte_register_params(void) orte_process_info.tmpdir_base = strdup (orte_remote_tmpdir_base); } + orte_top_session_dir = NULL; + (void) mca_base_var_register 
("orte", "orte", NULL, "top_session_dir", + "Top of the session directory tree for applications", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ, + &orte_top_session_dir); + + if (NULL != orte_top_session_dir) { + if (NULL != orte_process_info.top_session_dir) { + free(orte_process_info.top_session_dir); + } + orte_process_info.top_session_dir = strdup(orte_top_session_dir); + } + orte_prohibited_session_dirs = NULL; (void) mca_base_var_register ("orte", "orte", NULL, "no_session_dirs", "Prohibited locations for session directories (multiple locations separated by ',', default=NULL)", diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index ee6e802663..be6fad05f2 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -290,6 +290,7 @@ int orterun(int argc, char *argv[]) DONE: /* cleanup and leave */ orte_submit_finalize(); + orte_finalize(); if (orte_debug_flag) { fprintf(stderr, "exiting with status %d\n", orte_exit_status); diff --git a/orte/util/attr.h b/orte/util/attr.h index 1bf95e0831..e87a498c7a 100644 --- a/orte/util/attr.h +++ b/orte/util/attr.h @@ -138,6 +138,7 @@ typedef uint16_t orte_job_flags_t; #define ORTE_JOB_MERGE_STDERR_STDOUT (ORTE_JOB_START_KEY + 46) // bool - merge stderr into stdout stream #define ORTE_JOB_TAG_OUTPUT (ORTE_JOB_START_KEY + 47) // bool - tag stdout/stderr #define ORTE_JOB_TIMESTAMP_OUTPUT (ORTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr +#define ORTE_JOB_MULTI_DAEMON_SIM (ORTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster #define ORTE_JOB_MAX_KEY 300