diff --git a/src/mpi/runtime/ompi_mpi_init.c b/src/mpi/runtime/ompi_mpi_init.c index 705761a47e..c59f27da4a 100644 --- a/src/mpi/runtime/ompi_mpi_init.c +++ b/src/mpi/runtime/ompi_mpi_init.c @@ -67,6 +67,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) ompi_proc_t** procs; size_t nprocs; char *error, *jobid_str, *procid_str; + char *universe; + pid_t pid; /* Become an OMPI process */ @@ -98,6 +100,33 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) if (ompi_rte_debug_flag) { ompi_output(0, "ompi_mpi_init: could not join existing universe"); } + if (OMPI_ERR_NOT_FOUND != ret) { + /* if it exists but no contact could be established, + * define unique name based on current one. + * and start new universe with me as seed + */ + universe = strdup(ompi_universe_info.name); + free(ompi_universe_info.name); + pid = getpid(); + if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) { + ompi_output(0, "mpi_init: error creating unique universe name"); + } + } + + ompi_process_info.my_universe = strdup(ompi_universe_info.name); + ompi_process_info.seed = true; + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + } + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + } + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + } + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + } } /* start the rest of the rte */ @@ -111,16 +140,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) if (NULL != ompi_process_info.name) { /* should NOT have been previously set */ free(ompi_process_info.name); } - if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */ - if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */ - ompi_process_info.name = ompi_name_server.create_process_name(0,0,0); - } else { /* name server exists elsewhere - get a name for me */ - jobid = ompi_name_server.create_jobid(); - vpid = ompi_name_server.reserve_range(jobid, 1); - ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); - } - } else { /* name set in environment - record it */ + + if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */ ompi_process_info.name = ompi_rte_get_self(); + } else if (NULL == ompi_process_info.ns_replica) { /* singleton - couldn't join existing univ */ + ompi_process_info.name = ompi_name_server.create_process_name(0,0,0); + } else { /* singleton - name server exists elsewhere - get a name for me */ + jobid = ompi_name_server.create_jobid(); + vpid = ompi_name_server.reserve_range(jobid, 1); + ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); } /* setup my session directory */ diff --git a/src/runtime/universe_exists.c b/src/runtime/universe_exists.c index 17eccb8311..87dcd9a236 100644 --- a/src/runtime/universe_exists.c +++ b/src/runtime/universe_exists.c @@ -53,6 +53,10 @@ int ompi_rte_universe_exists() } } else { /* name server found, now try gpr */ ns_found = true; + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + } + ompi_process_info.ns_replica = ns_base_copy_process_name(&proc); } mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL); @@ -66,6 +70,10 @@ int ompi_rte_universe_exists() free(ompi_process_info.gpr_replica); } } else { + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + } + ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc); gpr_found = true; } @@ -155,11 +163,8 @@ int ompi_rte_universe_exists() /* ...and ping to verify it's alive */ ping_success = false; - for (i=0; i<5 && !ping_success; i++) { - ompi_output(0, "univ_exists: attempting ping number %d", i); - if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) { - ping_success = true; - } + if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) { + ping_success = true; } if (!ping_success) { if (ompi_rte_debug_flag) { @@ -173,6 +178,9 @@ int ompi_rte_universe_exists() ompi_process_info.ns_replica = ns_base_copy_process_name(&proc); ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc); + ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info); + ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info); + /* request ns_replica and gpr_replica info for this process * only request info required - check ns_found/gpr_found */ diff --git a/src/tools/mpirun/mpirun2.c b/src/tools/mpirun/mpirun2.c index 281e45469c..8077a14f6b 100644 --- a/src/tools/mpirun/mpirun2.c +++ b/src/tools/mpirun/mpirun2.c @@ -41,12 +41,14 @@ main(int argc, char *argv[]) ompi_cmd_line_t *cmd_line = NULL; ompi_list_t *nodelist = NULL; ompi_list_t schedlist; - mca_ns_base_jobid_t new_jobid; + mca_ns_base_jobid_t new_jobid, jobid; + mca_ns_base_vpid_t vpid; int num_procs = 1; ompi_rte_node_schedule_t *sched; char cwd[MAXPATHLEN]; char *my_contact_info, *tmp, *jobid_str, *procid_str; - char *contact_file, *filenm; + char *contact_file, *filenm, *universe; + pid_t pid; /* * Intialize our Open MPI environment @@ -165,14 +167,39 @@ main(int argc, char *argv[]) */ ompi_rte_parse_daemon_cmd_line(cmd_line); - /* eventually, this is where we will check for existing universe and - * spin one up if it isn't there. for now, though - * temporarily force to be a seed. - * - */ - ompi_process_info.seed = true; - ompi_process_info.ns_replica = NULL; - ompi_process_info.gpr_replica = NULL; + /* check for existing universe to join */ + if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) { + if (ompi_rte_debug_flag) { + ompi_output(0, "ompi_mpi_init: could not join existing universe"); + } + if (OMPI_ERR_NOT_FOUND != ret) { + /* if it exists but no contact could be established, + * define unique name based on current one. + * and start new universe with me as seed + */ + universe = strdup(ompi_universe_info.name); + free(ompi_universe_info.name); + pid = getpid(); + if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) { + ompi_output(0, "mpi_init: error creating unique universe name"); + } + } + + ompi_process_info.my_universe = strdup(ompi_universe_info.name); + ompi_process_info.seed = true; + if (NULL != ompi_universe_info.ns_replica) { + free(ompi_universe_info.ns_replica); + } + if (NULL != ompi_process_info.ns_replica) { + free(ompi_process_info.ns_replica); + } + if (NULL != ompi_universe_info.gpr_replica) { + free(ompi_universe_info.gpr_replica); + } + if (NULL != ompi_process_info.gpr_replica) { + free(ompi_process_info.gpr_replica); + } + } /* setup rest of rte */ if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) { @@ -187,11 +214,10 @@ main(int argc, char *argv[]) free(ompi_process_info.name); } ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0); - } else { /* if not seed, then someone spawned me - must have provided name info */ - if (NULL != ompi_process_info.name) { /* overwrite it */ - free(ompi_process_info.name); - } - ompi_process_info.name = ompi_rte_get_self(); + } else { /* if not seed, then we joined universe - get jobid and name */ + jobid = ompi_name_server.create_jobid(); + vpid = ompi_name_server.reserve_range(jobid, 1); + ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); } /* setup my session directory */ @@ -263,7 +289,11 @@ main(int argc, char *argv[]) ompi_list_append(&schedlist, (ompi_list_item_t*) sched); ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv)); /* set initial contact info */ - my_contact_info = mca_oob_get_contact_info(); + if (ompi_process_info.seed) { /* i'm the seed - direct them towards me */ + my_contact_info = mca_oob_get_contact_info(); + } else { /* i'm not the seed - direct them to it */ + my_contact_info = strdup(ompi_universe_info.ns_replica); + } mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env)); asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info); ompi_argv_append(&(sched->envc), &(sched->env), tmp); @@ -326,8 +356,10 @@ main(int argc, char *argv[]) * for now, though, remove the universe-setup.txt file so the directories * can cleanup */ - filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); - unlink(filenm); + if (ompi_process_info.seed) { + filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); + unlink(filenm); + } /* finalize the system */ ompi_event_fini();