diff --git a/orte/mca/ess/slurmd/Makefile.am b/orte/mca/ess/slurmd/Makefile.am
new file mode 100644
index 0000000000..6dc5a217ec
--- /dev/null
+++ b/orte/mca/ess/slurmd/Makefile.am
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+sources = \
+        ess_slurmd.h \
+        ess_slurmd_component.c \
+        ess_slurmd_module.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if OMPI_BUILD_ess_slurmd_DSO
+component_noinst =
+component_install = mca_ess_slurmd.la
+else
+component_noinst = libmca_ess_slurmd.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_ess_slurmd_la_SOURCES = $(sources)
+mca_ess_slurmd_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_ess_slurmd_la_SOURCES =$(sources)
+libmca_ess_slurmd_la_LDFLAGS = -module -avoid-version
diff --git a/orte/mca/ess/slurmd/configure.m4 b/orte/mca/ess/slurmd/configure.m4
new file mode 100644
index 0000000000..d7895d1a78
--- /dev/null
+++ b/orte/mca/ess/slurmd/configure.m4
@@ -0,0 +1,37 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ess_slurmd_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+AC_DEFUN([MCA_ess_slurmd_CONFIG],[
+    OMPI_CHECK_SLURM([ess_slurmd], [ess_slurmd_good=1], [ess_slurmd_good=0])
+
+    # If the SLURM check succeeded, export the wrapper flags and run the
+    # action-if-found; otherwise run the action-if-not-found
+    AS_IF([test "$ess_slurmd_good" = "1"],
+          [ess_slurmd_WRAPPER_EXTRA_LDFLAGS="$ess_slurmd_LDFLAGS"
+           ess_slurmd_WRAPPER_EXTRA_LIBS="$ess_slurmd_LIBS"
+           $1],
+          [$2])
+
+    # set build flags to use in makefile
+    AC_SUBST([ess_slurmd_CPPFLAGS])
+    AC_SUBST([ess_slurmd_LDFLAGS])
+    AC_SUBST([ess_slurmd_LIBS])
+])dnl
diff --git a/orte/mca/ess/slurmd/configure.params b/orte/mca/ess/slurmd/configure.params
new file mode 100644
index 0000000000..08f3f59a30
--- /dev/null
+++ b/orte/mca/ess/slurmd/configure.params
@@ -0,0 +1,27 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+PARAM_CONFIG_FILES="Makefile"
+#
+# Set the config priority so that, if we can build,
+# all the SLURM and supporting components will build
+
+PARAM_CONFIG_PRIORITY=10
diff --git a/orte/mca/ess/slurmd/ess_slurmd.h b/orte/mca/ess/slurmd/ess_slurmd.h
new file mode 100644
index 0000000000..409d23c88f
--- /dev/null
+++ b/orte/mca/ess/slurmd/ess_slurmd.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef ORTE_ESS_SLURMD_H
+#define ORTE_ESS_SLURMD_H
+
+/* Pull in what the declarations below rely on, so that this header
+ * can be included first (or alone) by any source file.
+ */
+#include "orte_config.h"
+
+#include "orte/mca/ess/ess.h"
+
+BEGIN_C_DECLS
+
+ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_slurmd_component;
+
+/*
+ * Component open / close / query
+ */
+int orte_ess_slurmd_component_open(void);
+int orte_ess_slurmd_component_close(void);
+int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority);
+
+END_C_DECLS
+
+#endif /* ORTE_ESS_SLURMD_H */
diff --git a/orte/mca/ess/slurmd/ess_slurmd_component.c b/orte/mca/ess/slurmd/ess_slurmd_component.c
new file mode 100644
index 0000000000..5e279337fa
--- /dev/null
+++ b/orte/mca/ess/slurmd/ess_slurmd_component.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * These symbols are in a file by themselves to provide nice linker
+ * semantics.  Since linkers generally pull in symbols by object
+ * files, keeping these symbols as the only symbols in this file
+ * prevents utility programs such as "ompi_info" from having to import
+ * entire components just to query their version and parameters.
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "orte/util/proc_info.h"
+
+#include "orte/mca/ess/ess.h"
+#include "orte/mca/ess/slurmd/ess_slurmd.h"
+
+extern orte_ess_base_module_t orte_ess_slurmd_module;
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+orte_ess_base_component_t mca_ess_slurmd_component = {
+    {
+        ORTE_ESS_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "slurmd",
+        ORTE_MAJOR_VERSION,
+        ORTE_MINOR_VERSION,
+        ORTE_RELEASE_VERSION,
+
+        /* Component open, close and query functions */
+        orte_ess_slurmd_component_open,
+        orte_ess_slurmd_component_close,
+        orte_ess_slurmd_component_query
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    }
+};
+
+
+int
+orte_ess_slurmd_component_open(void)
+{
+    return ORTE_SUCCESS;
+}
+
+
+int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority)
+{
+    /* Are we running under a SLURM job? Were
+     * we given a path back to the HNP? If the
+     * answer to the first is "yes" and the second
+     * is "no", then we were not launched
+     * by mpirun but are in a slurm world
+     */
+
+    if (NULL != getenv("SLURM_JOBID") &&
+        NULL == orte_process_info.my_hnp_uri) {
+        *priority = 30;
+        *module = (mca_base_module_t *)&orte_ess_slurmd_module;
+        return ORTE_SUCCESS;
+    }
+
+    /* Sadly, no */
+    *priority = -1;
+    *module = NULL;
+    return ORTE_ERROR;
+}
+
+
+int
+orte_ess_slurmd_component_close(void)
+{
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/ess/slurmd/ess_slurmd_module.c b/orte/mca/ess/slurmd/ess_slurmd_module.c
new file mode 100644
index 0000000000..a15ecc2455
--- /dev/null
+++ b/orte/mca/ess/slurmd/ess_slurmd_module.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif  /* HAVE_STRING_H */
+#include <stdlib.h>  /* getenv, strtol, putenv, free */
+#ifdef HAVE_NETDB_H
+#include <netdb.h>
+#endif
+#ifdef HAVE_IFADDRS_H
+#include <ifaddrs.h>
+#endif
+
+
+#include "opal/util/opal_environ.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "opal/util/argv.h"
+#include "opal/class/opal_pointer_array.h"
+#include "opal/util/if.h"
+#include "opal/util/net.h"
+#include "opal/dss/dss.h"
+#include "opal/mca/paffinity/paffinity.h"
+
+#include "orte/util/proc_info.h"
+#include "orte/util/show_help.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/util/nidmap.h"
+#include "orte/util/regex.h"
+#include "orte/mca/rml/base/rml_contact.h"
+#include "orte/runtime/orte_wait.h"
+
+#include "orte/mca/ess/ess.h"
+#include "orte/mca/ess/base/base.h"
+#include "orte/mca/ess/slurmd/ess_slurmd.h"
+
+static int rte_init(char flags);
+static int rte_finalize(void);
+static uint8_t proc_get_locality(orte_process_name_t *proc);
+static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
+static char* proc_get_hostname(orte_process_name_t *proc);
+static uint32_t proc_get_arch(orte_process_name_t *proc);
+static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc);
+static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc);
+static int update_arch(orte_process_name_t *proc, uint32_t arch);
+static int update_pidmap(opal_byte_object_t *bo);
+static int update_nidmap(opal_byte_object_t *bo);
+
+orte_ess_base_module_t orte_ess_slurmd_module = {
+    rte_init,
+    rte_finalize,
+    orte_ess_base_app_abort,
+    proc_get_locality,
+    proc_get_daemon,
+    proc_get_hostname,
+    proc_get_arch,
+    proc_get_local_rank,
+    proc_get_node_rank,
+    update_arch,
+    update_pidmap,
+    update_nidmap,
+    NULL /* ft_event */
+};
+
+/**** MODULE FUNCTIONS ****/
+
+/*
+ * Bootstrap an application proc that srun launched directly: take our
+ * name and our daemon's vpid from the SLURM_* envars, then build the
+ * nidmap and the pidmap for the whole step from the step's node list,
+ * tasks-per-node string and distribution mode.
+ *
+ * The flags argument is not used.  Returns ORTE_SUCCESS; on failure the
+ * failed step is reported through orte_show_help, the temporary node
+ * list, ppn array and pmap scratch object are released, and the error
+ * code is returned.
+ */
+static int rte_init(char flags)
+{
+    int ret;
+    char *error = NULL;
+    int32_t jobfam, stepid;
+    char **nodes = NULL;
+    char *envar;
+    int i, j;
+    orte_nid_t *node;
+    orte_jmap_t *jmap;
+    orte_pmap_t pmap;
+    orte_vpid_t vpid;
+    int local_rank;
+    int nodeid;
+    int num_nodes;
+    int cpus_per_task;
+    char *regexp, *tasks_per_node;
+    int *ppn = NULL;
+    bool block=false, cyclic=false;
+    bool pmap_constructed = false;
+
+    /* run the prolog */
+    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
+        error = "orte_ess_base_std_prolog";
+        goto error;
+    }
+
+
+    /* Only application procs can use this module. Since we
+     * were directly launched by srun, we need to bootstrap
+     * our own global info so we can startup. Srun will have
+     * provided that info in our environment, so get it from there
+     */
+
+    /* get the slurm jobid - this will be our job family */
+    envar = getenv("SLURM_JOBID");
+    /* don't need to check this for NULL - if it was, we would
+     * never have been selected anyway
+     */
+    jobfam = strtol(envar, NULL, 10);
+    /* get the slurm stepid - this will be our local jobid */
+    if (NULL == (envar = getenv("SLURM_STEPID"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_STEPID";
+        goto error;
+    }
+    stepid = strtol(envar, NULL, 10);
+    /* now build the jobid - shift as unsigned, since left-shifting a
+     * signed jobid above 32767 would overflow (undefined behavior)
+     */
+    ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(((uint32_t)jobfam) << 16, stepid);
+
+    /* get the slurm procid - this will be our vpid */
+    if (NULL == (envar = getenv("SLURM_PROCID"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_PROCID";
+        goto error;
+    }
+    ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
+
+    /* get our local rank */
+    if (NULL == (envar = getenv("SLURM_LOCALID"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_LOCALID";
+        goto error;
+    }
+    local_rank = strtol(envar, NULL, 10);
+
+    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
+                         "%s local rank %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         local_rank));
+
+    /* get the number of procs in this job */
+    if (NULL == (envar = getenv("SLURM_STEP_NUM_TASKS"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_STEP_NUM_TASKS";
+        goto error;
+    }
+    orte_process_info.num_procs = strtol(envar, NULL, 10);
+
+    /* get my local nodeid */
+    if (NULL == (envar = getenv("SLURM_NODEID"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_NODEID";
+        goto error;
+    }
+    nodeid = strtol(envar, NULL, 10);
+    ORTE_PROC_MY_DAEMON->jobid = 0;
+    ORTE_PROC_MY_DAEMON->vpid = nodeid;
+
+    /* get the number of ppn */
+    if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_STEP_TASKS_PER_NODE";
+        goto error;
+    }
+
+    /* get the number of CPUs per task that the user provided to slurm */
+    if (NULL != (envar = getenv("SLURM_CPUS_PER_TASK"))) {
+        cpus_per_task = strtol(envar, NULL, 10);
+        if(0 >= cpus_per_task) {
+            ret = ORTE_ERR_BAD_PARAM;
+            error = "got bad value from SLURM_CPUS_PER_TASK";
+            goto error;
+        }
+    } else {
+        cpus_per_task = 1;
+    }
+
+    /* get the node list */
+    if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) {
+        ret = ORTE_ERR_NOT_FOUND;
+        error = "could not get SLURM_STEP_NODELIST";
+        goto error;
+    }
+    /* break that down into a list of nodes */
+    if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(regexp, &nodes))) {
+        error = "could not parse node list";
+        goto error;
+    }
+    num_nodes = opal_argv_count(nodes);
+    orte_process_info.num_nodes = num_nodes;
+
+    /* compute the ppn */
+    if (ORTE_SUCCESS != (ret = orte_regex_extract_ppn(num_nodes, tasks_per_node, &ppn))) {
+        error = "could not determine #procs on each node";
+        goto error;
+    }
+    /* for slurm, we have to normalize the ppn by the cpus_per_task */
+    for (i=0; i < num_nodes; i++) {
+        ppn[i] /= cpus_per_task;
+    }
+
+    /* get the distribution (i.e., mapping) mode */
+    if (NULL == (envar = getenv("SLURM_DISTRIBUTION")) ||
+        0 == strcmp(envar, "block")) {
+        /* assume byslot mapping */
+        block = true;
+    } else if (0 == strcmp(envar, "cyclic")) {
+        /* bynode mapping */
+        cyclic = true;
+    } else {
+        /* cannot currently support other mapping modes */
+        ret = ORTE_ERR_NOT_SUPPORTED;
+        error = "distribution/mapping mode not supported";
+        goto error;
+    }
+    /* NOTE: the original author also listed the SLURM_DIST_PLANESIZE and
+     * SLURM_DIST_LLLP envars at this point; neither is consulted
+     */
+
+    /* setup the nidmap arrays */
+    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
+        ORTE_ERROR_LOG(ret);
+        error = "orte_util_nidmap_init";
+        goto error;
+    }
+
+    /* set the size of the nidmap storage so we minimize realloc's */
+    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
+        error = "could not set pointer array size for nidmap";
+        goto error;
+    }
+
+    /* construct the nidmap */
+    for (i=0; i < num_nodes; i++) {
+        node = OBJ_NEW(orte_nid_t);
+        node->name = strdup(nodes[i]);
+        node->daemon = i;
+        node->index = opal_pointer_array_add(&orte_nidmap, node);
+    }
+    opal_argv_free(nodes);
+    nodes = NULL;
+
+    /* create a job map for this job */
+    jmap = OBJ_NEW(orte_jmap_t);
+    jmap->job = ORTE_PROC_MY_NAME->jobid;
+    opal_pointer_array_add(&orte_jobmap, jmap);
+    /* update the num procs */
+    jmap->num_procs = orte_process_info.num_procs;
+    /* set the size of the pidmap storage so we minimize realloc's */
+    if (ORTE_SUCCESS != (ret = opal_value_array_set_size(&jmap->pmap, jmap->num_procs))) {
+        ORTE_ERROR_LOG(ret);
+        error = "could not set value array size for pidmap";
+        goto error;
+    }
+
+    /* construct the pidmap */
+    OBJ_CONSTRUCT(&pmap, orte_pmap_t);
+    pmap_constructed = true;
+    if (block) {
+        /* for each node, cycle through the ppn */
+        vpid = 0;
+        for (i=0; i < num_nodes; i++) {
+            node = (orte_nid_t*)orte_nidmap.addr[i];
+            /* compute the vpid for each proc on this node
+             * and add a pmap entry for it
+             */
+            for (j=0; j < ppn[i]; j++) {
+                pmap.node = node->index;
+                pmap.local_rank = j;
+                pmap.node_rank = j;
+                if (ORTE_SUCCESS != (ret = opal_value_array_set_item(&jmap->pmap, vpid, &pmap))) {
+                    ORTE_ERROR_LOG(ret);
+                    error = "could not set pmap values";
+                    goto error;
+                }
+                OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
+                                     "%s node %d name %s rank %d",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     (int) node->index, node->name, (int)vpid));
+                vpid++;
+            }
+        }
+    } else if (cyclic) {
+        /* cycle across the nodes: pass j hands one proc to every node
+         * that still has a slot left, so that proc is rank j on its
+         * node and ranks ascend with vpid exactly as in block mode
+         */
+        vpid = 0;
+        for (j=0; vpid < orte_process_info.num_procs; j++) {
+            bool placed = false;
+            for (i=0; i < num_nodes && vpid < orte_process_info.num_procs; i++) {
+                if (j < ppn[i]) {
+                    node = (orte_nid_t*)orte_nidmap.addr[i];
+                    pmap.node = node->index;
+                    pmap.local_rank = j;
+                    pmap.node_rank = j;
+                    if (ORTE_SUCCESS != (ret = opal_value_array_set_item(&jmap->pmap, vpid, &pmap))) {
+                        ORTE_ERROR_LOG(ret);
+                        error = "could not set pmap values";
+                        goto error;
+                    }
+                    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
+                                         "%s node %d name %s rank %d",
+                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                         (int) node->index, node->name, (int)vpid));
+                    vpid++;
+                    placed = true;
+                }
+            }
+            if (!placed) {
+                /* every node is full but procs remain: bail out
+                 * instead of spinning here forever
+                 */
+                ret = ORTE_ERR_BAD_PARAM;
+                error = "SLURM reported fewer slots than procs";
+                goto error;
+            }
+        }
+    }
+    OBJ_DESTRUCT(&pmap);
+    pmap_constructed = false;
+    free(ppn);
+    ppn = NULL;
+
+    /* ensure we pick the correct critical components */
+    putenv("OMPI_MCA_grpcomm=hier");
+    putenv("OMPI_MCA_routed=direct");
+
+    /* now use the default procedure to finish my setup */
+    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
+        ORTE_ERROR_LOG(ret);
+        error = "orte_ess_base_app_setup";
+        goto error;
+    }
+
+    return ORTE_SUCCESS;
+
+error:
+    /* release whatever a partially completed setup still holds */
+    if (NULL != nodes) {
+        opal_argv_free(nodes);
+    }
+    free(ppn);
+    if (pmap_constructed) {
+        OBJ_DESTRUCT(&pmap);
+    }
+
+    orte_show_help("help-orte-runtime.txt",
+                   "orte_init:startup:internal-failure",
+                   true, error, ORTE_ERROR_NAME(ret), ret);
+
+    return ret;
+}
+
+/* Run the standard app finalize, then tear down the nidmap and jobmap
+ * arrays.  Returns the status of the app finalize. */
+static int rte_finalize(void)
+{
+    int ret;
+
+    /* use the default procedure to finish */
+    if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
+        ORTE_ERROR_LOG(ret);
+    }
+
+    /* deconstruct my nidmap and jobmap arrays */
+    orte_util_nidmap_finalize();
+
+    return ret;
+}
+
+/* Classify proc as local or remote by comparing the daemon vpid stored
+ * for its node with our own daemon's vpid.  A proc with no nidmap entry
+ * is logged as not found and reported as non-local. */
+static uint8_t proc_get_locality(orte_process_name_t *proc)
+{
+    orte_nid_t *nid;
+
+    if (NULL == (nid = orte_util_lookup_nid(proc))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return OPAL_PROC_NON_LOCAL;
+    }
+
+    if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) {
+        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                             "%s ess:slurmd: proc %s is LOCAL",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_NAME_PRINT(proc)));
+        return (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s is REMOTE",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc)));
+
+    return OPAL_PROC_NON_LOCAL;
+
+}
+
+/* Return the daemon vpid recorded for proc's node, or ORTE_VPID_INVALID
+ * (without logging) when proc has no nidmap entry. */
+static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
+{
+    orte_nid_t *nid;
+
+    if (NULL == (nid = orte_util_lookup_nid(proc))) {
+        /* don't generate an error message here - it could be a call to
+         * get a route to a proc in an unknown job. Let the caller decide
+         * if an error message is required
+         */
+        return ORTE_VPID_INVALID;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s is hosted by daemon %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         ORTE_VPID_PRINT(nid->daemon)));
+
+    return nid->daemon;
+}
+
+/* Return the name stored for proc's node - a pointer into the nidmap
+ * entry, not a copy - or NULL when proc has no nidmap entry. */
+static char* proc_get_hostname(orte_process_name_t *proc)
+{
+    orte_nid_t *nid;
+
+    if (NULL == (nid = orte_util_lookup_nid(proc))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return NULL;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s is on host %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         nid->name));
+
+    return nid->name;
+}
+
+/* Return the arch stored for proc's node, or 0 when proc has no nidmap
+ * entry. */
+static uint32_t proc_get_arch(orte_process_name_t *proc)
+{
+    orte_nid_t *nid;
+
+    if (NULL == (nid = orte_util_lookup_nid(proc))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return 0;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s has arch %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         nid->arch));
+
+    return nid->arch;
+}
+
+/* Store arch on proc's node entry.  Returns ORTE_ERR_NOT_FOUND when proc
+ * has no nidmap entry, ORTE_SUCCESS otherwise. */
+static int update_arch(orte_process_name_t *proc, uint32_t arch)
+{
+    orte_nid_t *nid;
+
+    if (NULL == (nid = orte_util_lookup_nid(proc))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: updating proc %s to arch %0x",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         arch));
+
+    nid->arch = arch;
+
+    return ORTE_SUCCESS;
+}
+
+/* Return proc's rank among the procs on its node as stored in the
+ * pidmap, or ORTE_LOCAL_RANK_INVALID when proc has no pidmap entry. */
+static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
+{
+    orte_pmap_t *pmap;
+
+    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        return ORTE_LOCAL_RANK_INVALID;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s has local rank %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         (int)pmap->local_rank));
+
+    return pmap->local_rank;
+}
+
+/* Return proc's node rank as stored in the pidmap, or
+ * ORTE_NODE_RANK_INVALID (without logging) when proc has no entry. */
+static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
+{
+    orte_pmap_t *pmap;
+
+    if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
+        return ORTE_NODE_RANK_INVALID;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: proc %s has node rank %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc),
+                         (int)pmap->node_rank));
+
+    return pmap->node_rank;
+}
+
+/* Decode the pidmap carried in bo; returns the decoder's status. */
+static int update_pidmap(opal_byte_object_t *bo)
+{
+    int ret;
+
+    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
+                         "%s ess:slurmd: updating pidmap",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* build the pmap */
+    if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo))) {
+        ORTE_ERROR_LOG(ret);
+    }
+
+    return ret;
+}
+
+/* Decode the nodemap carried in bo; returns the decoder's status. */
+static int update_nidmap(opal_byte_object_t *bo)
+{
+    int rc;
+    /* decode the nidmap - the util will know what to do */
+    if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    return rc;
+}
+
+
+
+#if 0
+/*** AVAILABLE SLURM ENVARS ***/
+SLURM_JOB_ID=38749
+SLURM_JOB_NUM_NODES=1
+SLURM_JOB_NODELIST=odin097
+SLURM_JOB_CPUS_PER_NODE=4
+SLURM_JOBID=38749
+SLURM_NNODES=1
+SLURM_NODELIST=odin097
+SLURM_TASKS_PER_NODE=2
+SLURM_PRIO_PROCESS=0
+SLURM_UMASK=0022
+SLURM_NPROCS=2
+SLURM_CPUS_PER_TASK=1
+SLURM_STEPID=1
+SLURM_SRUN_COMM_PORT=33650
+SLURM_STEP_ID=1
+SLURM_STEP_NODELIST=odin097
+SLURM_STEP_NUM_NODES=1
+SLURM_STEP_NUM_TASKS=2
+SLURM_STEP_TASKS_PER_NODE=2
+SLURM_STEP_LAUNCHER_HOSTNAME=(null)
+SLURM_STEP_LAUNCHER_PORT=33650
+SLURM_SRUN_COMM_HOST=129.79.240.100
+SLURM_TASK_PID=5528
+SLURM_CPUS_ON_NODE=4
+SLURM_NODEID=0
+SLURM_PROCID=1
+SLURM_LOCALID=1
+SLURM_LAUNCH_NODE_IPADDR=129.79.240.100
+SLURM_GTIDS=0,1
+SLURM_CHECKPOINT_PATH=/nfs/rinfs/san/homedirs/rhc
+SLURMD_NODENAME=odin097
+#endif
diff --git a/orte/mca/routed/direct/Makefile.am b/orte/mca/routed/direct/Makefile.am
new file mode 100644
index 0000000000..e588130a9f
--- /dev/null
+++ b/orte/mca/routed/direct/Makefile.am
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2007      Los Alamos National Security, LLC.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+sources = \
+        routed_direct.h \
+        routed_direct.c \
+        routed_direct_component.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if OMPI_BUILD_routed_direct_DSO
+component_noinst =
+component_install = mca_routed_direct.la
+else
+component_noinst = libmca_routed_direct.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_routed_direct_la_SOURCES = $(sources)
+mca_routed_direct_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_routed_direct_la_SOURCES = $(sources)
+libmca_routed_direct_la_LDFLAGS = -module -avoid-version
+
diff --git a/orte/mca/routed/direct/configure.params b/orte/mca/routed/direct/configure.params
new file mode 100644
index 0000000000..83e06349bb
--- /dev/null
+++ b/orte/mca/routed/direct/configure.params
@@ -0,0 +1,14 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2007      Los Alamos National Security, LLC.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_CONFIG_FILES="Makefile"
diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c
new file mode 100644
index 0000000000..eb8092f756
--- /dev/null
+++ b/orte/mca/routed/direct/routed_direct.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2007      Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * "direct" routed module: the route to any valid target is the target
+ * itself, so route bookkeeping, routing trees and lifelines are no-ops.
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "opal/threads/condition.h"
+#include "opal/runtime/opal_progress.h"
+#include "opal/dss/dss_types.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/name_fns.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "orte/mca/rml/base/rml_contact.h"
+
+#include "orte/mca/routed/base/base.h"
+#include "routed_direct.h"
+
+static int init(void);
+static int finalize(void);
+static int delete_route(orte_process_name_t *proc);
+static int update_route(orte_process_name_t *target,
+                        orte_process_name_t *route);
+static orte_process_name_t get_route(orte_process_name_t *target);
+static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
+static int route_lost(const orte_process_name_t *route);
+static bool route_is_defined(const orte_process_name_t *target);
+static int update_routing_tree(void);
+static orte_vpid_t get_routing_tree(opal_list_t *children);
+static int get_wireup_info(opal_buffer_t *buf);
+static int set_lifeline(orte_process_name_t *proc);
+
+#if OPAL_ENABLE_FT == 1
+static int direct_ft_event(int state);
+#endif
+
+orte_routed_module_t orte_routed_direct_module = {
+    init,
+    finalize,
+    delete_route,
+    update_route,
+    get_route,
+    init_routes,
+    route_lost,
+    route_is_defined,
+    set_lifeline,
+    update_routing_tree,
+    get_routing_tree,
+    get_wireup_info,
+#if OPAL_ENABLE_FT == 1
+    direct_ft_event
+#else
+    NULL
+#endif
+};
+
+/* module-private state */
+static opal_condition_t cond;
+static opal_mutex_t lock;
+
+
+static int init(void)
+{
+    /* construct the module-private condition and lock */
+    OBJ_CONSTRUCT(&cond, opal_condition_t);
+    OBJ_CONSTRUCT(&lock, opal_mutex_t);
+
+    return ORTE_SUCCESS;
+}
+
+static int finalize(void)
+{
+    /* tear down what init() constructed */
+    OBJ_DESTRUCT(&cond);
+    OBJ_DESTRUCT(&lock);
+
+    return ORTE_SUCCESS;
+}
+
+static int delete_route(orte_process_name_t *proc)
+{
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_direct_delete_route for %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(proc)));
+
+    /* no routes are stored, so there is nothing to remove */
+
+    return ORTE_SUCCESS;
+}
+
+static int update_route(orte_process_name_t *target,
+                        orte_process_name_t *route)
+{
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_direct_update: %s --> %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(target),
+                         ORTE_NAME_PRINT(route)));
+
+    /* no routes are stored, so there is nothing to record */
+
+    return ORTE_SUCCESS;
+}
+
+
+/* Return the next hop toward target.  Routing is direct, so that is
+ * the target itself; a target with an invalid jobid or vpid yields
+ * the invalid name instead. */
+static orte_process_name_t get_route(orte_process_name_t *target)
+{
+    /* by default the hop is the target itself */
+    orte_process_name_t *hop = target;
+
+    if (ORTE_JOBID_INVALID == target->jobid ||
+        ORTE_VPID_INVALID == target->vpid) {
+        /* ...unless the name itself is unusable */
+        hop = ORTE_NAME_INVALID;
+    }
+
+    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
+                         "%s routed_direct_get(%s) --> %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(target),
+                         ORTE_NAME_PRINT(hop)));
+
+    return *hop;
+}
+
+
+static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
+{
+    int rc;
+
+    /* a NULL ndat means we are being called during orte_init, and
+     * there is nothing to set up in that case
+     */
+    if (NULL == ndat) {
+        return ORTE_SUCCESS;
+    }
+
+    /* a non-NULL ndat means the proc wants a route to a process
+     * outside of our job family. Everything goes direct anyway, so
+     * all that is left to do is to record the contact info that
+     * came with the request
+     */
+    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
+                         "%s routed_direct: init routes w/non-NULL data",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    rc = orte_rml_base_update_contact_info(ndat);
+    if (ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+    }
+
+    return rc;
+}
+
+static int route_lost(const orte_process_name_t *route)
+{
+    /* with no lifeline, a lost route needs no reaction */
+    return ORTE_SUCCESS;
+}
+
+
+static bool route_is_defined(const orte_process_name_t *target)
+{
+    /* every target is reported as routable */
+    return true;
+}
+
+static int set_lifeline(orte_process_name_t *proc)
+{
+    /* lifelines are not tracked by this module */
+    return ORTE_SUCCESS;
+}
+
+static int update_routing_tree(void)
+{
+    /* this is a meaningless command for a direct as I am not allowed to route */
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
+static orte_vpid_t get_routing_tree(opal_list_t *children)
+{
+    /* this is a meaningless command for a direct as I am not allowed to route */
+    return ORTE_VPID_INVALID;
+}
+
+static int get_wireup_info(opal_buffer_t *buf)
+{
+    /* this is a meaningless command for a direct as I am not allowed to route */
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
+
+#if OPAL_ENABLE_FT == 1
+/*
+ * Checkpoint/restart hook.  Only a restart has anything to redo; the
+ * checkpoint, continue and terminate states - and any state we do not
+ * recognize - are deliberately no-ops that report success.
+ */
+static int direct_ft_event(int state)
+{
+    if (OPAL_CRS_RESTART != state) {
+        return ORTE_SUCCESS;
+    }
+
+    /******** Restart Recovery ********/
+    /*
+     * Re-exchange the routes
+     */
+    return orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL);
+}
+#endif
+
diff --git a/orte/mca/routed/direct/routed_direct.h b/orte/mca/routed/direct/routed_direct.h
new file mode 100644
index 0000000000..2ea8a8f455
--- /dev/null
+++ b/orte/mca/routed/direct/routed_direct.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2007      Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_ROUTED_DIRECT_H
+#define MCA_ROUTED_DIRECT_H
+
+#include "orte_config.h"
+#include "orte/types.h"
+
+#include "orte/mca/routed/routed.h"
+
+BEGIN_C_DECLS
+
+ORTE_MODULE_DECLSPEC extern orte_routed_component_t mca_routed_direct_component;
+
+extern orte_routed_module_t orte_routed_direct_module;
+
+END_C_DECLS
+
+#endif
diff --git a/orte/mca/routed/direct/routed_direct_component.c b/orte/mca/routed/direct/routed_direct_component.c
new file mode 100644
index 0000000000..98e680fef6
--- /dev/null
+++ b/orte/mca/routed/direct/routed_direct_component.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2007      Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2008 The Trustees of Indiana University.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include "opal/class/opal_hash_table.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_param.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "orte/mca/routed/base/base.h"
+#include "routed_direct.h"
+
+static int orte_routed_direct_component_query(mca_base_module_t **module, int *priority);
+
+/**
+ * Public component struct for the "direct" routed component
+ */
+orte_routed_component_t mca_routed_direct_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        ORTE_ROUTED_BASE_VERSION_2_0_0,
+
+        "direct", /* MCA component name */
+        ORTE_MAJOR_VERSION,  /* MCA component major version */
+        ORTE_MINOR_VERSION,  /* MCA component minor version */
+        ORTE_RELEASE_VERSION,  /* MCA component release version */
+        NULL,
+        NULL,
+        orte_routed_direct_component_query
+    },
+    {
+        /* This component can be checkpointed */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    }
+};
+
+static int orte_routed_direct_component_query(mca_base_module_t **module, int *priority)
+{
+    /* always offer the module, at priority 0 - select it by request */
+    *module = (mca_base_module_t *) &orte_routed_direct_module;
+    *priority = 0;
+    return ORTE_SUCCESS;
+}