From 0d4e6482cd39adf9533b3bb8be2ef147a7bbca41 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 17 Sep 2004 00:59:14 +0000 Subject: [PATCH] Continuing the cleanup process. Few minor fixes here and there - mostly just NULLing pointers that were free'd. Console now can connect to any universe, regardless of scope. This commit was SVN r2734. --- src/mpi/runtime/ompi_mpi_init.c | 6 + src/runtime/ompi_rte_parse_cmd_line.c | 46 ++++++- src/runtime/ompi_rte_parse_daemon_cmd_line.c | 18 +++ src/runtime/ompi_rte_parse_environ.c | 27 ++-- src/runtime/universe_exists.c | 55 ++++++-- src/tools/console/ompiconsole.c | 125 +++++++++++++++++-- src/tools/mpirun/mpirun2.c | 17 ++- src/tools/ompid/ompid.c | 103 ++++++++++----- src/tools/openmpi/openmpi.c | 5 + 9 files changed, 332 insertions(+), 70 deletions(-) diff --git a/src/mpi/runtime/ompi_mpi_init.c b/src/mpi/runtime/ompi_mpi_init.c index c59f27da4a..51b6cc4ba8 100644 --- a/src/mpi/runtime/ompi_mpi_init.c +++ b/src/mpi/runtime/ompi_mpi_init.c @@ -107,6 +107,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) */ universe = strdup(ompi_universe_info.name); free(ompi_universe_info.name); + ompi_universe_info.name = NULL; pid = getpid(); if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) { ompi_output(0, "mpi_init: error creating unique universe name"); @@ -117,15 +118,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) ompi_process_info.seed = true; if (NULL != ompi_universe_info.ns_replica) { free(ompi_universe_info.ns_replica); + ompi_universe_info.ns_replica = NULL; } if (NULL != ompi_process_info.ns_replica) { free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; } if (NULL != ompi_universe_info.gpr_replica) { free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; } if (NULL != ompi_process_info.gpr_replica) { free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; } 
} @@ -139,6 +144,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /***** SET MY NAME *****/ if (NULL != ompi_process_info.name) { /* should NOT have been previously set */ free(ompi_process_info.name); + ompi_process_info.name = NULL; } if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */ diff --git a/src/runtime/ompi_rte_parse_cmd_line.c b/src/runtime/ompi_rte_parse_cmd_line.c index 2349b519ae..18b2acb56d 100644 --- a/src/runtime/ompi_rte_parse_cmd_line.c +++ b/src/runtime/ompi_rte_parse_cmd_line.c @@ -15,6 +15,7 @@ #include #include "mca/oob/base/base.h" +#include "mca/ns/base/base.h" #include "util/output.h" #include "util/cmd_line.h" @@ -30,9 +31,6 @@ void ompi_rte_parse_cmd_line(ompi_cmd_line_t *cmd_line) /* get universe name and store it, if user specified it */ /* otherwise, stick with default name */ - if (NULL != ompi_universe_info.name) { - universe = strdup(ompi_universe_info.name); /* save the current value, if exists */ - } if (ompi_cmd_line_is_taken(cmd_line, "universe") || ompi_cmd_line_is_taken(cmd_line, "u")) { @@ -52,23 +50,44 @@ void ompi_rte_parse_cmd_line(ompi_cmd_line_t *cmd_line) if (NULL != (tmp = strchr(universe, '@'))) { /* remote name includes remote uid */ *tmp = '\0'; tmp++; + if (NULL != ompi_universe_info.host) { /* overwrite it */ + free(ompi_universe_info.host); + ompi_universe_info.host = NULL; + } ompi_universe_info.host = strdup(tmp); + if (NULL != ompi_universe_info.uid) { + free(ompi_universe_info.uid); + ompi_universe_info.uid = NULL; + } ompi_universe_info.uid = strdup(universe); } else { /* no remote id - just remote host */ + if (NULL != ompi_universe_info.host) { + free(ompi_universe_info.host); + ompi_universe_info.host = NULL; + } ompi_universe_info.host = strdup(universe); } } else { /* no remote host - just universe name provided */ + if (NULL != ompi_universe_info.name) { + free(ompi_universe_info.name); + ompi_universe_info.name = NULL; + } 
ompi_universe_info.name = strdup(universe); } } /* copy the universe name into the process_info structure */ if (NULL != ompi_universe_info.name) { + if (NULL != ompi_process_info.my_universe) { + free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; + } ompi_process_info.my_universe = strdup(ompi_universe_info.name); } else { /* set it to default value */ ompi_universe_info.name = strdup("default-universe"); if (NULL != ompi_process_info.my_universe) { /* overwrite it */ free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; } ompi_process_info.my_universe = strdup(ompi_universe_info.name); } @@ -84,6 +103,7 @@ void ompi_rte_parse_cmd_line(ompi_cmd_line_t *cmd_line) } if (NULL != ompi_process_info.tmpdir_base) { /* overwrite it */ free(ompi_process_info.tmpdir_base); + ompi_process_info.tmpdir_base = NULL; } ompi_process_info.tmpdir_base = strdup(ompi_cmd_line_get_param(cmd_line, "tmpdir", 0, 0)); setenv("OMPI_tmpdir_base", ompi_process_info.tmpdir_base, 1); @@ -96,6 +116,16 @@ void ompi_rte_parse_cmd_line(ompi_cmd_line_t *cmd_line) return; } nsreplica = strdup(ompi_cmd_line_get_param(cmd_line, "nsreplica", 0, 0)); + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + ompi_universe_info.ns_replica = NULL; + } + ompi_universe_info.ns_replica = strdup(nsreplica); + if (NULL == ompi_process_info.ns_replica) { + ompi_process_info.ns_replica = ns_base_create_process_name(0,0,0); + } + mca_oob_parse_contact_info(ompi_universe_info.ns_replica, + ompi_process_info.ns_replica, NULL); setenv("OMPI_MCA_ns_base_replica", nsreplica, 1); /* set the ns_replica enviro variable */ } /* otherwise, leave it alone */ @@ -106,6 +136,16 @@ void ompi_rte_parse_cmd_line(ompi_cmd_line_t *cmd_line) return; } gprreplica = strdup(ompi_cmd_line_get_param(cmd_line, "gprreplica", 0, 0)); + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; + 
} + ompi_universe_info.gpr_replica = strdup(gprreplica); /* registry contact info comes from --gprreplica, not --nsreplica */ + if (NULL == ompi_process_info.gpr_replica) { + ompi_process_info.gpr_replica = ns_base_create_process_name(0,0,0); + } + mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, + ompi_process_info.gpr_replica, NULL); setenv("OMPI_MCA_gpr_base_replica", gprreplica, 1); /* set the gpr_replica enviro variable */ } /* otherwise leave it alone */ } diff --git a/src/runtime/ompi_rte_parse_daemon_cmd_line.c b/src/runtime/ompi_rte_parse_daemon_cmd_line.c index 1c24a6ddf0..6d8b4875ca 100644 --- a/src/runtime/ompi_rte_parse_daemon_cmd_line.c +++ b/src/runtime/ompi_rte_parse_daemon_cmd_line.c @@ -11,6 +11,8 @@ #include +#include "mca/ns/base/base.h" + #include "util/output.h" #include "util/cmd_line.h" #include "util/sys_info.h" @@ -34,6 +36,10 @@ void ompi_rte_parse_daemon_cmd_line(ompi_cmd_line_t *cmd_line) fprintf(stderr, "error retrieving seed contact info - please report error to bugs@open-mpi.org\n"); exit(1); } + if (NULL != ompi_universe_info.seed_contact_info) { /* overwrite it */ + free(ompi_universe_info.seed_contact_info); + ompi_universe_info.seed_contact_info = NULL; + } ompi_universe_info.seed_contact_info = strdup(ompi_cmd_line_get_param(cmd_line, "seedcontact", 0, 0)); setenv("OMPI_universe_contact", ompi_universe_info.seed_contact_info, 1); } @@ -51,6 +57,10 @@ void ompi_rte_parse_daemon_cmd_line(ompi_cmd_line_t *cmd_line) fprintf(stderr, "error retrieving universe scope - please report error to bugs@open-mpi.org\n"); exit(1); } + if (NULL != ompi_universe_info.scope) { + free(ompi_universe_info.scope); + ompi_universe_info.scope = NULL; + } ompi_universe_info.scope = strdup(ompi_cmd_line_get_param(cmd_line, "scope", 0, 0)); setenv("OMPI_universe_scope", ompi_universe_info.scope, 1); } @@ -74,6 +84,10 @@ void ompi_rte_parse_daemon_cmd_line(ompi_cmd_line_t *cmd_line) fprintf(stderr, "error retrieving script file name - please report error to bugs@open-mpi.org\n"); exit(1); } + if (NULL != 
ompi_universe_info.scriptfile) { + free(ompi_universe_info.scriptfile); + ompi_universe_info.scriptfile = NULL; + } ompi_universe_info.scriptfile = strdup(ompi_cmd_line_get_param(cmd_line, "script", 0, 0)); setenv("OMPI_universe_script", ompi_universe_info.scriptfile, 1); } @@ -84,6 +98,10 @@ void ompi_rte_parse_daemon_cmd_line(ompi_cmd_line_t *cmd_line) fprintf(stderr, "error retrieving host file name - please report error to bugs@open-mpi.org\n"); exit(1); } + if (NULL != ompi_universe_info.hostfile) { + free(ompi_universe_info.hostfile); + ompi_universe_info.hostfile = NULL; + } ompi_universe_info.hostfile = strdup(ompi_cmd_line_get_param(cmd_line, "hostfile", 0, 0)); setenv("OMPI_universe_hostfile", ompi_universe_info.hostfile, 1); } diff --git a/src/runtime/ompi_rte_parse_environ.c b/src/runtime/ompi_rte_parse_environ.c index d24e1a06ce..4ac53bb238 100644 --- a/src/runtime/ompi_rte_parse_environ.c +++ b/src/runtime/ompi_rte_parse_environ.c @@ -47,12 +47,14 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* contact info passed */ if (NULL != ompi_universe_info.seed_contact_info) { /* overwrite */ free(ompi_universe_info.seed_contact_info); + ompi_universe_info.seed_contact_info = NULL; } ompi_universe_info.seed_contact_info = strdup(enviro_val); mca_oob_set_contact_info(ompi_universe_info.seed_contact_info); } else { if (NULL != ompi_universe_info.seed_contact_info) { free(ompi_universe_info.seed_contact_info); + ompi_universe_info.seed_contact_info = NULL; } } @@ -66,6 +68,7 @@ void ompi_rte_parse_environ(void) } else { if (NULL != ompi_process_info.gpr_replica) { free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; } } @@ -79,6 +82,7 @@ void ompi_rte_parse_environ(void) } else { if (NULL != ompi_process_info.ns_replica) { free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; } } @@ -93,21 +97,17 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* scope passed */ if (NULL != 
ompi_universe_info.scope) { /* overwrite */ free(ompi_universe_info.scope); + ompi_universe_info.scope = NULL; } ompi_universe_info.scope = strdup(enviro_val); } else { if (NULL != ompi_universe_info.scope) { free(ompi_universe_info.scope); + ompi_universe_info.scope = NULL; } ompi_universe_info.scope = strdup("exclusive"); } - /*** FOR DEBUGGING PURPOSES IN THIS EARLY STAGE - FORCE PUBLIC */ - if (NULL != ompi_universe_info.scope) { - free(ompi_universe_info.scope); - } - ompi_universe_info.scope = strdup("public"); - enviro_val = getenv("OMPI_universe_persistent"); if (NULL != enviro_val) { /* persistence flag passed */ ompi_universe_info.persistence = true; @@ -115,9 +115,6 @@ void ompi_rte_parse_environ(void) ompi_universe_info.persistence = false; } - /*** FOR DEBUGGING PURPOSES IN THIS EARLY STAGE - FORCE PERSISTENCE */ - ompi_universe_info.persistence = true; - enviro_val = getenv("OMPI_universe_console"); if (NULL != enviro_val) { /* console flag passed */ ompi_universe_info.console = true; @@ -129,11 +126,13 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* scriptfile passed */ if (NULL != ompi_universe_info.scriptfile) { /* overwrite */ free(ompi_universe_info.scriptfile); + ompi_universe_info.scriptfile = NULL; } ompi_universe_info.scriptfile = strdup(enviro_val); } else { if (NULL != ompi_universe_info.scriptfile) { free(ompi_universe_info.scriptfile); + ompi_universe_info.scriptfile = NULL; } } @@ -141,28 +140,34 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* hostfile passed */ if (NULL != ompi_universe_info.hostfile) { /* overwrite */ free(ompi_universe_info.hostfile); + ompi_universe_info.hostfile = NULL; } ompi_universe_info.hostfile = strdup(enviro_val); } else { if (NULL != ompi_universe_info.hostfile) { free(ompi_universe_info.hostfile); + ompi_universe_info.hostfile = NULL; } } if (NULL != ompi_universe_info.name) { free(ompi_universe_info.name); + ompi_universe_info.name = NULL; } ompi_universe_info.name = 
strdup("default-universe"); if (NULL != ompi_process_info.my_universe) { free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; } ompi_process_info.my_universe = strdup("default-universe"); if (NULL != ompi_universe_info.host) { free(ompi_universe_info.host); + ompi_universe_info.host = NULL; } ompi_universe_info.host = strdup(ompi_system_info.nodename); if (NULL != ompi_universe_info.uid) { free(ompi_universe_info.uid); + ompi_universe_info.uid = NULL; } ompi_universe_info.uid = strdup(ompi_system_info.user); @@ -170,10 +175,12 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* universe name passed in environment */ if (NULL != ompi_universe_info.name) { /* got something in it - overwrite */ free(ompi_universe_info.name); + ompi_universe_info.name = NULL; } ompi_universe_info.name = strdup(enviro_val); if (NULL != ompi_process_info.my_universe) { free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; } ompi_process_info.my_universe = strdup(enviro_val); } @@ -182,11 +189,13 @@ void ompi_rte_parse_environ(void) if (NULL != enviro_val) { /* tmpdir base passed in environment */ if (NULL != ompi_process_info.tmpdir_base) { /* overwrite it */ free(ompi_process_info.tmpdir_base); + ompi_process_info.tmpdir_base = NULL; } ompi_process_info.tmpdir_base = strdup(enviro_val); } else { if (NULL != ompi_process_info.tmpdir_base) { free(ompi_process_info.tmpdir_base); + ompi_process_info.tmpdir_base = NULL; } } diff --git a/src/runtime/universe_exists.c b/src/runtime/universe_exists.c index 87dcd9a236..85c68c14e5 100644 --- a/src/runtime/universe_exists.c +++ b/src/runtime/universe_exists.c @@ -35,9 +35,9 @@ static struct timeval ompi_rte_ping_wait = {2, 0}; int ompi_rte_universe_exists() { char *contact_file; - int ret, i; + int ret; ompi_process_name_t proc={0,0,0}; - bool ns_found, gpr_found, ping_success; + bool ns_found=false, gpr_found=false, ping_success=false; /* if both ns_replica and gpr_replica were 
provided, check for contact with them */ if (NULL != ompi_universe_info.ns_replica && NULL != ompi_universe_info.gpr_replica) { @@ -50,11 +50,13 @@ int ompi_rte_universe_exists() free(ompi_universe_info.ns_replica); if (NULL != ompi_process_info.ns_replica) { free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; } } else { /* name server found, now try gpr */ ns_found = true; if (NULL != ompi_process_info.ns_replica) { free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; } ompi_process_info.ns_replica = ns_base_copy_process_name(&proc); } @@ -68,10 +70,12 @@ int ompi_rte_universe_exists() free(ompi_universe_info.gpr_replica); if (NULL != ompi_process_info.gpr_replica) { free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; } } else { if (NULL != ompi_process_info.gpr_replica) { free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; } ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc); gpr_found = true; @@ -129,20 +133,22 @@ int ompi_rte_universe_exists() ompi_output(0, "contact info read"); } - if (!ompi_universe_info.persistence || /* not persistent... */ - (0 == strncmp(ompi_universe_info.scope, "exclusive", strlen("exclusive")))) { /* ...or no connection allowed */ - /* also need to check "local" and that we did not specify the exact - * matching universe name - */ - if (ompi_rte_debug_flag) { - ompi_output(0, "connection not allowed"); + if (!ompi_universe_info.console) { /* if we aren't trying to connect a console */ + if (!ompi_universe_info.persistence || /* not persistent... 
*/ + (0 == strncmp(ompi_universe_info.scope, "exclusive", strlen("exclusive")))) { /* ...or no connection allowed */ + /* also need to check "local" and that we did not specify the exact + * matching universe name + */ + if (ompi_rte_debug_flag) { + ompi_output(0, "connection not allowed"); + } + return OMPI_ERR_NO_CONNECTION_ALLOWED; } - return OMPI_ERR_NO_CONNECTION_ALLOWED; } - if (ompi_rte_debug_flag) { - ompi_output(0, "contact info to set: %s", ompi_universe_info.seed_contact_info); - } + if (ompi_rte_debug_flag) { + ompi_output(0, "contact info to set: %s", ompi_universe_info.seed_contact_info); + } /* if persistent, set contact info... */ @@ -174,11 +180,34 @@ int ompi_rte_universe_exists() } /* set the my_universe field */ + if (NULL != ompi_process_info.my_universe) { + free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; + } ompi_process_info.my_universe = strdup(ompi_universe_info.name); + + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; + } ompi_process_info.ns_replica = ns_base_copy_process_name(&proc); + + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; + } ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc); + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + ompi_universe_info.ns_replica = NULL; + } ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info); + + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; + } ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info); /* request ns_replica and gpr_replica info for this process diff --git a/src/tools/console/ompiconsole.c b/src/tools/console/ompiconsole.c index e1f49cbc51..78ea9da001 100644 --- a/src/tools/console/ompiconsole.c +++ 
b/src/tools/console/ompiconsole.c @@ -13,6 +13,10 @@ #include "util/cmd_line.h" #include "util/proc_info.h" #include "util/pack.h" +#include "util/session_dir.h" +#include "util/output.h" +#include "util/os_path.h" +#include "util/universe_setup_file_io.h" #include "runtime/runtime.h" #include "mca/base/base.h" @@ -28,6 +32,8 @@ static char *ompi_getinputline(void); static void ompi_console_sendcmd(ompi_daemon_cmd_flag_t usercmd); +static struct timeval ompi_rte_ping_wait = {2, 0}; + int main(int argc, char *argv[]) { @@ -43,8 +49,8 @@ int main(int argc, char *argv[]) ompi_cmd_line_t *cmd_line; bool allow_multi_user_threads = false; bool have_hidden_threads = false; - bool exit_cmd; - char *usercmd, *str_response; + bool exit_cmd, ping_success; + char *usercmd, *str_response, *contact_file; ompi_buffer_t buffer; ompi_process_name_t seed={0,0,0}; int recv_tag; @@ -140,12 +146,100 @@ int main(int argc, char *argv[]) fprintf(stderr, "check local univ\n"); - if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) { - fprintf(stderr, "could not contact local universe %s\n", ompi_universe_info.name); + /* check to see if local universe already exists */ + if (OMPI_SUCCESS != ompi_session_dir(false, + ompi_process_info.tmpdir_base, + ompi_system_info.user, + ompi_system_info.nodename, + NULL, + ompi_universe_info.name, + NULL, + NULL)) { /* not found */ + if (ompi_rte_debug_flag) { + ompi_output(0, "could not find universe session dir"); + } + exit(1); /* a missing session dir is fatal whether or not debug output is enabled */ + } + + if (ompi_rte_debug_flag) { + ompi_output(0, "check for contact info file"); + } + + /* check for "contact-info" file. if present, read it in. 
*/ + contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir, + "universe-setup.txt", NULL); + + if (OMPI_SUCCESS != (ret = ompi_read_universe_setup_file(contact_file))) { + if (ompi_rte_debug_flag) { + ompi_output(0, "could not read contact file %s", contact_file); + } + exit(ret); + } + + if (ompi_rte_debug_flag) { + ompi_output(0, "contact info read"); + } + + /* if persistent, set contact info... */ + if (OMPI_SUCCESS != mca_oob_set_contact_info(ompi_universe_info.seed_contact_info)) { /* set contact info */ + if (ompi_rte_debug_flag) { + ompi_output(0, "error setting oob contact info - please report error to bugs@open-mpi.org\n"); + } exit(1); } - fprintf(stderr, "init stage 2\n"); + mca_oob_parse_contact_info(ompi_universe_info.seed_contact_info, &seed, NULL); + + if (ompi_rte_debug_flag) { + ompi_output(0, "contact info set: %s", ompi_universe_info.seed_contact_info); + ompi_output(0, "issuing ping: %d %d %d", seed.cellid, seed.jobid, seed.vpid); + } + + + /* ...and ping to verify it's alive */ + ping_success = false; + if (OMPI_SUCCESS == mca_oob_ping(&seed, &ompi_rte_ping_wait)) { + ping_success = true; + } + if (!ping_success) { + if (ompi_rte_debug_flag) { + ompi_output(0, "ping failed"); + } + exit(1); + } + + /* set the my_universe field */ + if (NULL != ompi_process_info.my_universe) { + free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; + } + ompi_process_info.my_universe = strdup(ompi_universe_info.name); + + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; + } + ompi_process_info.ns_replica = ns_base_copy_process_name(&seed); + + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; + } + ompi_process_info.gpr_replica = ns_base_copy_process_name(&seed); + + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + 
ompi_universe_info.ns_replica = NULL; + } + ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info); + + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; + } + ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info); + + fprintf(stderr, "init stage 2\n"); /* setup the rest of the rte */ if (OMPI_SUCCESS != (ret = ompi_rte_init_stage2(&allow_multi_user_threads, @@ -156,6 +250,11 @@ int main(int argc, char *argv[]) } /***** SET MY NAME *****/ + if (NULL != ompi_process_info.name) { /* should not have been previously set */ + free(ompi_process_info.name); + ompi_process_info.name = NULL; + } + jobid = ompi_name_server.create_jobid(); vpid = ompi_name_server.reserve_range(jobid, 1); ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); @@ -163,14 +262,6 @@ int main(int argc, char *argv[]) fprintf(stderr, "my name: [%d,%d,%d]\n", ompi_process_info.name->cellid, ompi_process_info.name->jobid, ompi_process_info.name->vpid); - /* - * Register my process info with my replica. - */ - if (OMPI_SUCCESS != (ret = ompi_rte_register())) { - fprintf(stderr, "ompi_rte_init: failed in ompi_rte_register()\n"); - return ret; - } - /* finalize the rte startup */ if (OMPI_SUCCESS != (ret = ompi_rte_init_finalstage(&allow_multi_user_threads, &have_hidden_threads))) { @@ -178,6 +269,14 @@ int main(int argc, char *argv[]) return ret; } + /* + * Register my process info with my replica. 
+ */ + if (OMPI_SUCCESS != (ret = ompi_rte_register())) { + fprintf(stderr, "ompi_rte_init: failed in ompi_rte_register()\n"); + return ret; + } + exit_cmd = false; while (!exit_cmd) { diff --git a/src/tools/mpirun/mpirun2.c b/src/tools/mpirun/mpirun2.c index 8077a14f6b..2785d2452e 100644 --- a/src/tools/mpirun/mpirun2.c +++ b/src/tools/mpirun/mpirun2.c @@ -179,6 +179,7 @@ main(int argc, char *argv[]) */ universe = strdup(ompi_universe_info.name); free(ompi_universe_info.name); + ompi_universe_info.name = NULL; pid = getpid(); if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) { ompi_output(0, "mpi_init: error creating unique universe name"); @@ -189,15 +190,19 @@ main(int argc, char *argv[]) ompi_process_info.seed = true; if (NULL != ompi_universe_info.ns_replica) { free(ompi_universe_info.ns_replica); + ompi_universe_info.ns_replica = NULL; } if (NULL != ompi_process_info.ns_replica) { free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; } if (NULL != ompi_universe_info.gpr_replica) { free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; } if (NULL != ompi_process_info.gpr_replica) { free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; } } @@ -209,10 +214,12 @@ main(int argc, char *argv[]) } /***** SET MY NAME *****/ + if (NULL != ompi_process_info.name) { /* should NOT have been set yet */ + free(ompi_process_info.name); + ompi_process_info.name = NULL; + } + if (ompi_process_info.seed) { - if (NULL != ompi_process_info.name) { /* overwrite it */ - free(ompi_process_info.name); - } ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0); } else { /* if not seed, then we joined universe - get jobid and name */ jobid = ompi_name_server.create_jobid(); @@ -256,6 +263,10 @@ main(int argc, char *argv[]) /* if i'm the seed, get my contact info and write my setup file for others to find */ if (ompi_process_info.seed) { + if (NULL != 
ompi_universe_info.seed_contact_info) { + free(ompi_universe_info.seed_contact_info); + ompi_universe_info.seed_contact_info = NULL; + } ompi_universe_info.seed_contact_info = mca_oob_get_contact_info(); contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); diff --git a/src/tools/ompid/ompid.c b/src/tools/ompid/ompid.c index 174f9ff596..9b018cc091 100644 --- a/src/tools/ompid/ompid.c +++ b/src/tools/ompid/ompid.c @@ -52,7 +52,10 @@ int main(int argc, char *argv[]) bool allow_multi_user_threads = false; bool have_hidden_threads = false; char *jobid_str, *procid_str, *enviro_val, *contact_file; - char *filenm; + char *filenm, *universe; + pid_t pid; + mca_ns_base_jobid_t jobid; + mca_ns_base_vpid_t vpid; /* * Intialize the Open MPI environment @@ -165,6 +168,44 @@ int main(int argc, char *argv[]) */ ompi_rte_parse_daemon_cmd_line(cmd_line); + /* check for existing universe to join */ + if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) { + if (ompi_rte_debug_flag) { + ompi_output(0, "ompi_mpi_init: could not join existing universe"); + } + if (OMPI_ERR_NOT_FOUND != ret) { + /* if it exists but no contact could be established, + * define unique name based on current one. 
+ * and start new universe with me as seed + */ + universe = strdup(ompi_universe_info.name); + free(ompi_universe_info.name); + ompi_universe_info.name = NULL; + pid = getpid(); + if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) { + ompi_output(0, "mpi_init: error creating unique universe name"); + } + } + + ompi_process_info.my_universe = strdup(ompi_universe_info.name); + ompi_process_info.seed = true; + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + ompi_universe_info.ns_replica = NULL; + } + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + ompi_process_info.ns_replica = NULL; + } + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + ompi_universe_info.gpr_replica = NULL; + } + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + ompi_process_info.gpr_replica = NULL; + } + } /* setup the rest of the rte */ if (OMPI_SUCCESS != (ret = ompi_rte_init_stage2(&allow_multi_user_threads, @@ -175,23 +216,20 @@ int main(int argc, char *argv[]) } /***** SET MY NAME *****/ - if (ompi_process_info.seed) { - if (ompi_daemon_debug) { - ompi_output(0, "ompid: seed flag set"); - } - if (NULL != ompi_process_info.name) { /* overwrite it */ - free(ompi_process_info.name); - } - ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0); - } else { - if (ompi_daemon_debug) { - ompi_output(0, "ompid: seed flag NOT set"); - } - if (NULL != ompi_process_info.name) { /* overwrite it */ - free(ompi_process_info.name); - } - ompi_process_info.name = ompi_rte_get_self(); - } + if (NULL != ompi_process_info.name) { /* should not have been previously set */ + free(ompi_process_info.name); + ompi_process_info.name = NULL; + } + + if (NULL != ompi_rte_get_self()) { /* name set in environment - record name */ + ompi_process_info.name = ompi_rte_get_self(); + } else if (NULL == 
ompi_process_info.ns_replica) { /* couldn't join existing univ */ + ompi_process_info.name = ompi_name_server.create_process_name(0,0,0); + } else { /* name server exists elsewhere - get a name for me */ + jobid = ompi_name_server.create_jobid(); + vpid = ompi_name_server.reserve_range(jobid, 1); + ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); + } /* setup my session directory */ jobid_str = ompi_name_server.get_jobid_string(ompi_process_info.name); @@ -219,13 +257,6 @@ int main(int argc, char *argv[]) exit(-1); } - /* - * Register my process info with my replica. - */ - if (OMPI_SUCCESS != (ret = ompi_rte_register())) { - ompi_output(0, "ompi_rte_init: failed in ompi_rte_register"); - return ret; - } /* finalize the rte startup */ if (OMPI_SUCCESS != (ret = ompi_rte_init_finalstage(&allow_multi_user_threads, @@ -235,8 +266,21 @@ int main(int argc, char *argv[]) return ret; } + /* + * Register my process info with my replica. Note that this must be done + * after the rte init is completed. 
+ */ + if (OMPI_SUCCESS != (ret = ompi_rte_register())) { + ompi_output(0, "ompid: failed in ompi_rte_register()"); + return ret; + } + /* if i'm the seed, get my contact info and write my setup file for others to find */ if (ompi_process_info.seed) { + if (NULL != ompi_universe_info.seed_contact_info) { + free(ompi_universe_info.seed_contact_info); + ompi_universe_info.seed_contact_info = NULL; + } ompi_universe_info.seed_contact_info = mca_oob_get_contact_info(); contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); @@ -298,9 +342,11 @@ int main(int argc, char *argv[]) ompi_process_info.name->jobid, ompi_process_info.name->vpid); } - /* remove the universe-setup file */ - filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); - unlink(filenm); + /* if i'm the seed, remove the universe-setup file */ + if (ompi_process_info.seed) { + filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); + unlink(filenm); + } /* finalize the system */ ompi_rte_finalize(); @@ -322,7 +368,6 @@ static void ompi_daemon_recv(int status, ompi_process_name_t* sender, ompi_buffer_t answer; ompi_daemon_cmd_flag_t command; int ret; - int32_t str_len; char *contact_info; OMPI_THREAD_LOCK(&ompi_daemon_mutex); diff --git a/src/tools/openmpi/openmpi.c b/src/tools/openmpi/openmpi.c index 1d5795695c..052e547948 100644 --- a/src/tools/openmpi/openmpi.c +++ b/src/tools/openmpi/openmpi.c @@ -156,6 +156,7 @@ int main(int argc, char **argv) */ universe = strdup(ompi_universe_info.name); free(ompi_universe_info.name); + ompi_universe_info.name = NULL; pid = getpid(); if (0 < asprintf(&ompi_universe_info.name, "%s-%d", universe, pid)) { fprintf(stderr, "error creating unique universe name - please report error to bugs@open-mpi.org\n"); @@ -163,6 +164,10 @@ int main(int argc, char **argv) } } + if (NULL != ompi_process_info.my_universe) { + 
free(ompi_process_info.my_universe); + ompi_process_info.my_universe = NULL; + } ompi_process_info.my_universe = strdup(ompi_universe_info.name); /* ensure the enviro variables do NOT specify any replicas so that seed