1
1

Allow mpirun2 and mpi_init to cleanly detect and join an existing universe. Will continue testing to quickly move away from a non-responsive existing universe.

This commit was SVN r2729.
Этот коммит содержится в:
Ralph Castain 2004-09-16 19:45:32 +00:00
родитель 31bacaee5a
Коммит f6dc129754
3 изменённых файлов: 100 добавлений и 32 удалений

Просмотреть файл

@ -67,6 +67,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
ompi_proc_t** procs; ompi_proc_t** procs;
size_t nprocs; size_t nprocs;
char *error, *jobid_str, *procid_str; char *error, *jobid_str, *procid_str;
char *universe;
pid_t pid;
/* Become an OMPI process */ /* Become an OMPI process */
@ -98,6 +100,33 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (ompi_rte_debug_flag) { if (ompi_rte_debug_flag) {
ompi_output(0, "ompi_mpi_init: could not join existing universe"); ompi_output(0, "ompi_mpi_init: could not join existing universe");
} }
if (OMPI_ERR_NOT_FOUND != ret) {
/* if it exists but no contact could be established,
* define unique name based on current one.
* and start new universe with me as seed
*/
universe = strdup(ompi_universe_info.name);
free(ompi_universe_info.name);
pid = getpid();
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
ompi_output(0, "mpi_init: error creating unique universe name");
}
}
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
ompi_process_info.seed = true;
if (NULL != ompi_universe_info.ns_replica) {
free(ompi_universe_info.ns_replica);
}
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
if (NULL != ompi_universe_info.gpr_replica) {
free(ompi_universe_info.gpr_replica);
}
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
} }
/* start the rest of the rte */ /* start the rest of the rte */
@ -111,16 +140,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */ if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
free(ompi_process_info.name); free(ompi_process_info.name);
} }
if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */
if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */ if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
} else { /* name server exists elsewhere - get a name for me */
jobid = ompi_name_server.create_jobid();
vpid = ompi_name_server.reserve_range(jobid, 1);
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
}
} else { /* name set in environment - record it */
ompi_process_info.name = ompi_rte_get_self(); ompi_process_info.name = ompi_rte_get_self();
} else if (NULL == ompi_process_info.ns_replica) { /* singleton - couldn't join existing univ */
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
} else { /* singleton - name server exists elsewhere - get a name for me */
jobid = ompi_name_server.create_jobid();
vpid = ompi_name_server.reserve_range(jobid, 1);
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
} }
/* setup my session directory */ /* setup my session directory */

Просмотреть файл

@ -53,6 +53,10 @@ int ompi_rte_universe_exists()
} }
} else { /* name server found, now try gpr */ } else { /* name server found, now try gpr */
ns_found = true; ns_found = true;
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
} }
mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL); mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL);
@ -66,6 +70,10 @@ int ompi_rte_universe_exists()
free(ompi_process_info.gpr_replica); free(ompi_process_info.gpr_replica);
} }
} else { } else {
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
gpr_found = true; gpr_found = true;
} }
@ -155,11 +163,8 @@ int ompi_rte_universe_exists()
/* ...and ping to verify it's alive */ /* ...and ping to verify it's alive */
ping_success = false; ping_success = false;
for (i=0; i<5 && !ping_success; i++) { if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
ompi_output(0, "univ_exists: attempting ping number %d", i); ping_success = true;
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
ping_success = true;
}
} }
if (!ping_success) { if (!ping_success) {
if (ompi_rte_debug_flag) { if (ompi_rte_debug_flag) {
@ -173,6 +178,9 @@ int ompi_rte_universe_exists()
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc); ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc); ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info);
ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info);
/* request ns_replica and gpr_replica info for this process /* request ns_replica and gpr_replica info for this process
* only request info required - check ns_found/gpr_found * only request info required - check ns_found/gpr_found
*/ */

Просмотреть файл

@ -41,12 +41,14 @@ main(int argc, char *argv[])
ompi_cmd_line_t *cmd_line = NULL; ompi_cmd_line_t *cmd_line = NULL;
ompi_list_t *nodelist = NULL; ompi_list_t *nodelist = NULL;
ompi_list_t schedlist; ompi_list_t schedlist;
mca_ns_base_jobid_t new_jobid; mca_ns_base_jobid_t new_jobid, jobid;
mca_ns_base_vpid_t vpid;
int num_procs = 1; int num_procs = 1;
ompi_rte_node_schedule_t *sched; ompi_rte_node_schedule_t *sched;
char cwd[MAXPATHLEN]; char cwd[MAXPATHLEN];
char *my_contact_info, *tmp, *jobid_str, *procid_str; char *my_contact_info, *tmp, *jobid_str, *procid_str;
char *contact_file, *filenm; char *contact_file, *filenm, *universe;
pid_t pid;
/* /*
* Intialize our Open MPI environment * Intialize our Open MPI environment
@ -165,14 +167,39 @@ main(int argc, char *argv[])
*/ */
ompi_rte_parse_daemon_cmd_line(cmd_line); ompi_rte_parse_daemon_cmd_line(cmd_line);
/* eventually, this is where we will check for existing universe and /* check for existing universe to join */
* spin one up if it isn't there. for now, though if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
* temporarily force to be a seed. if (ompi_rte_debug_flag) {
* ompi_output(0, "ompi_mpi_init: could not join existing universe");
*/ }
ompi_process_info.seed = true; if (OMPI_ERR_NOT_FOUND != ret) {
ompi_process_info.ns_replica = NULL; /* if it exists but no contact could be established,
ompi_process_info.gpr_replica = NULL; * define unique name based on current one.
* and start new universe with me as seed
*/
universe = strdup(ompi_universe_info.name);
free(ompi_universe_info.name);
pid = getpid();
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
ompi_output(0, "mpi_init: error creating unique universe name");
}
}
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
ompi_process_info.seed = true;
if (NULL != ompi_universe_info.ns_replica) {
free(ompi_universe_info.ns_replica);
}
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
if (NULL != ompi_universe_info.gpr_replica) {
free(ompi_universe_info.gpr_replica);
}
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
}
/* setup rest of rte */ /* setup rest of rte */
if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) { if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) {
@ -187,11 +214,10 @@ main(int argc, char *argv[])
free(ompi_process_info.name); free(ompi_process_info.name);
} }
ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0); ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
} else { /* if not seed, then someone spawned me - must have provided name info */ } else { /* if not seed, then we joined universe - get jobid and name */
if (NULL != ompi_process_info.name) { /* overwrite it */ jobid = ompi_name_server.create_jobid();
free(ompi_process_info.name); vpid = ompi_name_server.reserve_range(jobid, 1);
} ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
ompi_process_info.name = ompi_rte_get_self();
} }
/* setup my session directory */ /* setup my session directory */
@ -263,7 +289,11 @@ main(int argc, char *argv[])
ompi_list_append(&schedlist, (ompi_list_item_t*) sched); ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv)); ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
/* set initial contact info */ /* set initial contact info */
my_contact_info = mca_oob_get_contact_info(); if (ompi_process_info.seed) { /* i'm the seed - direct them towards me */
my_contact_info = mca_oob_get_contact_info();
} else { /* i'm not the seed - direct them to it */
my_contact_info = strdup(ompi_universe_info.ns_replica);
}
mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env)); mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env));
asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info); asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info);
ompi_argv_append(&(sched->envc), &(sched->env), tmp); ompi_argv_append(&(sched->envc), &(sched->env), tmp);
@ -326,8 +356,10 @@ main(int argc, char *argv[])
* for now, though, remove the universe-setup.txt file so the directories * for now, though, remove the universe-setup.txt file so the directories
* can cleanup * can cleanup
*/ */
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL); if (ompi_process_info.seed) {
unlink(filenm); filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
unlink(filenm);
}
/* finalize the system */ /* finalize the system */
ompi_event_fini(); ompi_event_fini();