1
1
openmpi/orte/mca/schizo/ompi/schizo_ompi.c
Ralph Castain aa9e5a1a27 Add support for Singularity containers, including a .m4 file for checking if Singularity is available and an orte/schizo component for setting the proper support if a container was given as the executable
Cleanup the configury so we properly check for Singularity under the various typical use-cases

Bring the Singularity support online. We have to turn "off" the sm BTL as it segfaults from inside the container - root cause remains unclear. Also turned "off" the various OPAL shmem components in case they are involved and someone else tries to use them. Happily, the vader BTL works just fine!
2016-02-13 04:40:22 -08:00

674 строки
26 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/types.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/show_help.h"
#include "opal/mca/shmem/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/schizo/schizo.h"
static int parse_cli(char *personality,
int argc, int start, char **argv);
static int parse_env(char *personality,
char *path,
opal_cmd_line_t *cmd_line,
char **srcenv,
char ***dstenv);
static int setup_fork(orte_job_t *jdata,
orte_app_context_t *context);
static int setup_child(orte_job_t *jobdat,
orte_proc_t *child,
orte_app_context_t *app);
orte_schizo_base_module_t orte_schizo_ompi_module = {
parse_cli,
parse_env,
setup_fork,
setup_child
};
static int parse_cli(char *personality,
int argc, int start, char **argv)
{
int i, j, k;
bool ignore;
char *no_dups[] = {
"grpcomm",
"odls",
"rml",
"routed",
NULL
};
if (0 != strcmp(personality, "ompi")) {
return ORTE_ERR_TAKE_NEXT_OPTION;
}
for (i = 0; i < (argc-start); ++i) {
if (0 == strcmp("-mca", argv[i]) ||
0 == strcmp("--mca", argv[i]) ) {
/* ignore this one */
if (0 == strcmp(argv[i+1], "mca_base_env_list")) {
i += 2;
continue;
}
/* It would be nice to avoid increasing the length
* of the orted cmd line by removing any non-ORTE
* params. However, this raises a problem since
* there could be OPAL directives that we really
* -do- want the orted to see - it's only the OMPI
* related directives we could ignore. This becomes
* a very complicated procedure, however, since
* the OMPI mca params are not cleanly separated - so
* filtering them out is nearly impossible.
*
* see if this is already present so we at least can
* avoid growing the cmd line with duplicates
*/
ignore = false;
if (NULL != orted_cmd_line) {
for (j=0; NULL != orted_cmd_line[j]; j++) {
if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
/* already here - if the value is the same,
* we can quitely ignore the fact that they
* provide it more than once. However, some
* frameworks are known to have problems if the
* value is different. We don't have a good way
* to know this, but we at least make a crude
* attempt here to protect ourselves.
*/
if (0 == strcmp(argv[i+2], orted_cmd_line[j+1])) {
/* values are the same */
ignore = true;
break;
} else {
/* values are different - see if this is a problem */
for (k=0; NULL != no_dups[k]; k++) {
if (0 == strcmp(no_dups[k], argv[i+1])) {
/* print help message
* and abort as we cannot know which one is correct
*/
orte_show_help("help-orterun.txt", "orterun:conflicting-params",
true, orte_basename, argv[i+1],
argv[i+2], orted_cmd_line[j+1]);
return ORTE_ERR_BAD_PARAM;
}
}
/* this passed muster - just ignore it */
ignore = true;
break;
}
}
}
}
if (!ignore) {
opal_argv_append_nosize(&orted_cmd_line, argv[i]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
}
i += 2;
}
}
return ORTE_SUCCESS;
}
static int parse_env(char *personality,
char *path,
opal_cmd_line_t *cmd_line,
char **srcenv,
char ***dstenv)
{
int i, j;
char *param;
char *value;
char *env_set_flag;
char **vars;
if (0 != strcmp(personality, "ompi")) {
return ORTE_ERR_TAKE_NEXT_OPTION;
}
for (i = 0; NULL != srcenv[i]; ++i) {
if (0 == strncmp("OMPI_", srcenv[i], 5)) {
/* check for duplicate in app->env - this
* would have been placed there by the
* cmd line processor. By convention, we
* always let the cmd line override the
* environment
*/
param = strdup(srcenv[i]);
value = strchr(param, '=');
*value = '\0';
value++;
opal_setenv(param, value, false, dstenv);
free(param);
}
}
/* set necessary env variables for external usage from tune conf file*/
int set_from_file = 0;
vars = NULL;
if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) &&
NULL != vars) {
for (i=0; NULL != vars[i]; i++) {
value = strchr(vars[i], '=');
/* terminate the name of the param */
*value = '\0';
/* step over the equals */
value++;
/* overwrite any prior entry */
opal_setenv(vars[i], value, true, dstenv);
/* save it for any comm_spawn'd apps */
opal_setenv(vars[i], value, true, &orte_forwarded_envars);
}
set_from_file = 1;
opal_argv_free(vars);
}
/* Did the user request to export any environment variables on the cmd line? */
env_set_flag = getenv("OMPI_MCA_mca_base_env_list");
if (opal_cmd_line_is_taken(cmd_line, "x")) {
if (NULL != env_set_flag) {
orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
return ORTE_ERR_FATAL;
}
j = opal_cmd_line_get_ninsts(cmd_line, "x");
for (i = 0; i < j; ++i) {
param = opal_cmd_line_get_param(cmd_line, "x", i, 0);
if (NULL != (value = strchr(param, '='))) {
/* terminate the name of the param */
*value = '\0';
/* step over the equals */
value++;
/* overwrite any prior entry */
opal_setenv(param, value, true, dstenv);
/* save it for any comm_spawn'd apps */
opal_setenv(param, value, true, &orte_forwarded_envars);
} else {
value = getenv(param);
if (NULL != value) {
/* overwrite any prior entry */
opal_setenv(param, value, true, dstenv);
/* save it for any comm_spawn'd apps */
opal_setenv(param, value, true, &orte_forwarded_envars);
} else {
opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
}
}
}
} else if (NULL != env_set_flag) {
/* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file.
* If this is the case, error out.
*/
if (!set_from_file) {
/* set necessary env variables for external usage */
vars = NULL;
if (OPAL_SUCCESS == mca_base_var_process_env_list(&vars) &&
NULL != vars) {
for (i=0; NULL != vars[i]; i++) {
value = strchr(vars[i], '=');
/* terminate the name of the param */
*value = '\0';
/* step over the equals */
value++;
/* overwrite any prior entry */
opal_setenv(vars[i], value, true, dstenv);
/* save it for any comm_spawn'd apps */
opal_setenv(vars[i], value, true, &orte_forwarded_envars);
}
opal_argv_free(vars);
}
} else {
orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
return ORTE_ERR_FATAL;
}
}
/* If the user specified --path, store it in the user's app
environment via the OMPI_exec_path variable. */
if (NULL != path) {
asprintf(&value, "OMPI_exec_path=%s", path);
opal_argv_append_nosize(dstenv, value);
/* save it for any comm_spawn'd apps */
opal_argv_append_nosize(&orte_forwarded_envars, value);
free(value);
}
return ORTE_SUCCESS;
}
static int setup_fork(orte_job_t *jdata,
orte_app_context_t *app)
{
int i;
char *param;
bool oversubscribed;
orte_node_t *node;
char **envcpy, **nps, **firstranks;
char *npstring, *firstrankstring;
char *num_app_ctx;
if (0 != strcmp(jdata->personality, "ompi")) {
return ORTE_ERR_TAKE_NEXT_OPTION;
}
/* see if the mapper thinks we are oversubscribed */
oversubscribed = false;
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) {
oversubscribed = true;
}
/* setup base environment: copy the current environ and merge
in the app context environ */
if (NULL != app->env) {
/* manually free original context->env to avoid a memory leak */
char **tmp = app->env;
envcpy = opal_environ_merge(orte_launch_environ, app->env);
if (NULL != tmp) {
opal_argv_free(tmp);
}
} else {
envcpy = opal_argv_copy(orte_launch_environ);
}
app->env = envcpy;
/* special case handling for --prefix: this is somewhat icky,
but at least some users do this. :-\ It is possible that
when using --prefix, the user will also "-x PATH" and/or
"-x LD_LIBRARY_PATH", which would therefore clobber the
work that was done in the prior pls to ensure that we have
the prefix at the beginning of the PATH and
LD_LIBRARY_PATH. So examine the context->env and see if we
find PATH or LD_LIBRARY_PATH. If found, that means the
prior work was clobbered, and we need to re-prefix those
variables. */
param = NULL;
orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING);
for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) {
char *newenv;
/* Reset PATH */
if (0 == strncmp("PATH=", app->env[i], 5)) {
asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5);
opal_setenv("PATH", newenv, true, &app->env);
free(newenv);
}
/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) {
asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16);
opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env);
free(newenv);
}
}
if (NULL != param) {
free(param);
}
/* pass my contact info to the local proc so we can talk */
opal_setenv("OMPI_MCA_orte_local_daemon_uri", orte_process_info.my_daemon_uri, true, &app->env);
/* pass the hnp's contact info to the local proc in case it
* needs it
*/
if (NULL != orte_process_info.my_hnp_uri) {
opal_setenv("OMPI_MCA_orte_hnp_uri", orte_process_info.my_hnp_uri, true, &app->env);
}
/* setup yield schedule - do not override any user-supplied directive! */
if (oversubscribed) {
opal_setenv("OMPI_MCA_mpi_yield_when_idle", "1", false, &app->env);
} else {
opal_setenv("OMPI_MCA_mpi_yield_when_idle", "0", false, &app->env);
}
/* set the app_context number into the environment */
asprintf(&param, "%ld", (long)app->idx);
opal_setenv("OMPI_MCA_orte_app_num", param, true, &app->env);
free(param);
/* although the total_slots_alloc is the universe size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here. Also required by the ompi_attributes code!
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
asprintf(&param, "%ld", (long)jdata->total_slots_alloc);
opal_setenv("OMPI_UNIVERSE_SIZE", param, true, &app->env);
free(param);
/* pass the number of nodes involved in this job */
asprintf(&param, "%ld", (long)(jdata->map->num_nodes));
opal_setenv("OMPI_MCA_orte_num_nodes", param, true, &app->env);
free(param);
/* pass a param telling the child what type and model of cpu we are on,
* if we know it. If hwloc has the value, use what it knows. Otherwise,
* see if we were explicitly given it and use that value.
*/
hwloc_obj_t obj;
char *htmp;
if (NULL != opal_hwloc_topology) {
obj = hwloc_get_root_obj(opal_hwloc_topology);
if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) ||
NULL != (htmp = orte_local_cpu_type)) {
opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env);
}
if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
NULL != (htmp = orte_local_cpu_model)) {
opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env);
}
} else {
if (NULL != orte_local_cpu_type) {
opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env);
}
if (NULL != orte_local_cpu_model) {
opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env);
}
}
/* get shmem's best component name so we can provide a hint to the shmem
* framework. the idea here is to have someone figure out what component to
* select (via the shmem framework) and then have the rest of the
* components in shmem obey that decision. for more details take a look at
* the shmem framework in opal.
*/
if (NULL != (param = opal_shmem_base_best_runnable_component_name())) {
opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env);
free(param);
}
/* Set an info MCA param that tells the launched processes that
* any binding policy was applied by us (e.g., so that
* MPI_INIT doesn't try to bind itself)
*/
opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env);
/* tell the ESS to avoid the singleton component - but don't override
* anything that may have been provided elsewhere
*/
opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env);
/* ensure that the spawned process ignores direct launch components,
* but do not overrride anything we were given */
opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env);
/* since we want to pass the name as separate components, make sure
* that the "name" environmental variable is cleared!
*/
opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env);
asprintf(&param, "%ld", (long)jdata->num_procs);
opal_setenv("OMPI_MCA_orte_ess_num_procs", param, true, &app->env);
/* although the num_procs is the comm_world size, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_SIZE", param, true, &app->env);
free(param);
/* users would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
asprintf(&param, "%ld", (long)jdata->num_local_procs);
opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env);
free(param);
/* forcibly set the local tmpdir base to match ours */
opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
/* MPI-3 requires we provide some further info to the procs,
* so we pass them as envars to avoid introducing further
* ORTE calls in the MPI layer
*/
asprintf(&num_app_ctx, "%lu", (unsigned long)jdata->num_apps);
/* build some common envars we need to pass for MPI-3 compatibility */
nps = NULL;
firstranks = NULL;
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(app->num_procs));
opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(app->first_rank));
}
npstring = opal_argv_join(nps, ' ');
firstrankstring = opal_argv_join(firstranks, ' ');
opal_argv_free(nps);
opal_argv_free(firstranks);
/* add the MPI-3 envars */
opal_setenv("OMPI_NUM_APP_CTX", num_app_ctx, true, &app->env);
opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env);
opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env);
free(num_app_ctx);
free(firstrankstring);
free(npstring);
return ORTE_SUCCESS;
}
static int setup_child(orte_job_t *jdata,
orte_proc_t *child,
orte_app_context_t *app)
{
char *param, *value;
int rc;
int32_t nrestarts=0, *nrptr;
if (0 != strcmp(jdata->personality, "ompi")) {
return ORTE_ERR_TAKE_NEXT_OPTION;
}
/* setup the jobid */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
free(value);
/* setup the vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);
/* although the vpid IS the process' rank within the job, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
free(value); /* done with this now */
/* users would appreciate being given a public environmental variable
* that also represents the local rank value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
return rc;
}
asprintf(&value, "%lu", (unsigned long) child->local_rank);
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
free(value);
/* users would appreciate being given a public environmental variable
* that also represents the node rank value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
if (ORTE_NODE_RANK_INVALID == child->node_rank) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
return rc;
}
asprintf(&value, "%lu", (unsigned long) child->node_rank);
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
/* set an mca param for it too */
opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
free(value);
/* provide the identifier for the PMIx connection - the
* PMIx connection is made prior to setting the process
* name itself. Although in most cases the ID and the
* process name are the same, it isn't necessarily
* required */
orte_util_convert_process_name_to_string(&value, &child->name);
opal_setenv("PMIX_ID", value, true, &app->env);
free(value);
nrptr = &nrestarts;
if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
/* pass the number of restarts for this proc - will be zero for
* an initial start, but procs would like to know if they are being
* restarted so they can take appropriate action
*/
asprintf(&value, "%d", nrestarts);
opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
free(value);
}
/* if the proc should not barrier in orte_init, tell it */
if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
|| 0 < nrestarts) {
opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
}
/* if we are using staged execution, tell it */
if (orte_staged_execution) {
opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
}
/* if the proc isn't going to forward IO, then we need to flag that
* it has "completed" iof termination as otherwise it will never fire
*/
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
}
/* construct the proc's session dir name */
if (NULL != orte_process_info.tmpdir_base) {
value = strdup(orte_process_info.tmpdir_base);
} else {
value = NULL;
}
param = NULL;
if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
orte_process_info.nodename,
NULL, &child->name))) {
ORTE_ERROR_LOG(rc);
if (NULL != value) {
free(value);
}
return rc;
}
free(value);
/* pass an envar so the proc can find any files it had prepositioned */
opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);
/* if the user wanted the cwd to be the proc's session dir, then
* switch to that location now
*/
if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
/* create the session dir - may not exist */
if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
ORTE_ERROR_LOG(rc);
/* doesn't exist with correct permissions, and/or we can't
* create it - either way, we are done
*/
free(param);
return rc;
}
/* change to it */
if (0 != chdir(param)) {
free(param);
return ORTE_ERROR;
}
/* It seems that chdir doesn't
* adjust the $PWD enviro variable when it changes the directory. This
* can cause a user to get a different response when doing getcwd vs
* looking at the enviro variable. To keep this consistent, we explicitly
* ensure that the PWD enviro variable matches the CWD we moved to.
*
* NOTE: if a user's program does a chdir(), then $PWD will once
* again not match getcwd! This is beyond our control - we are only
* ensuring they start out matching.
*/
opal_setenv("PWD", param, true, &app->env);
/* update the initial wdir value too */
opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
}
free(param);
return ORTE_SUCCESS;
}