1
1

Allow mpirun2 and mpi_init to cleanly detect and join an existing universe. Testing will continue on making both move away quickly from an existing universe that is non-responsive.

This commit was SVN r2729.
Этот коммит содержится в:
Ralph Castain 2004-09-16 19:45:32 +00:00
родитель 31bacaee5a
Коммит f6dc129754
3 изменённых файла: 100 добавлений и 32 удаления

Просмотреть файл

@ -67,6 +67,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
ompi_proc_t** procs;
size_t nprocs;
char *error, *jobid_str, *procid_str;
char *universe;
pid_t pid;
/* Become an OMPI process */
@ -98,6 +100,33 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (ompi_rte_debug_flag) {
ompi_output(0, "ompi_mpi_init: could not join existing universe");
}
if (OMPI_ERR_NOT_FOUND != ret) {
/* if it exists but no contact could be established,
* define unique name based on current one.
* and start new universe with me as seed
*/
universe = strdup(ompi_universe_info.name);
free(ompi_universe_info.name);
pid = getpid();
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
ompi_output(0, "mpi_init: error creating unique universe name");
}
}
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
ompi_process_info.seed = true;
if (NULL != ompi_universe_info.ns_replica) {
free(ompi_universe_info.ns_replica);
}
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
if (NULL != ompi_universe_info.gpr_replica) {
free(ompi_universe_info.gpr_replica);
}
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
}
/* start the rest of the rte */
@ -111,17 +140,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
free(ompi_process_info.name);
}
if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */
if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */
if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */
ompi_process_info.name = ompi_rte_get_self();
} else if (NULL == ompi_process_info.ns_replica) { /* singleton - couldn't join existing univ */
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
} else { /* name server exists elsewhere - get a name for me */
} else { /* singleton - name server exists elsewhere - get a name for me */
jobid = ompi_name_server.create_jobid();
vpid = ompi_name_server.reserve_range(jobid, 1);
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
}
} else { /* name set in environment - record it */
ompi_process_info.name = ompi_rte_get_self();
}
/* setup my session directory */
jobid_str = ompi_name_server.get_jobid_string(ompi_process_info.name);

Просмотреть файл

@ -53,6 +53,10 @@ int ompi_rte_universe_exists()
}
} else { /* name server found, now try gpr */
ns_found = true;
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
}
mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL);
@ -66,6 +70,10 @@ int ompi_rte_universe_exists()
free(ompi_process_info.gpr_replica);
}
} else {
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
gpr_found = true;
}
@ -155,12 +163,9 @@ int ompi_rte_universe_exists()
/* ...and ping to verify it's alive */
ping_success = false;
for (i=0; i<5 && !ping_success; i++) {
ompi_output(0, "univ_exists: attempting ping number %d", i);
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
ping_success = true;
}
}
if (!ping_success) {
if (ompi_rte_debug_flag) {
ompi_output(0, "ping failed");
@ -173,6 +178,9 @@ int ompi_rte_universe_exists()
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info);
ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info);
/* request ns_replica and gpr_replica info for this process
* only request info required - check ns_found/gpr_found
*/

Просмотреть файл

@ -41,12 +41,14 @@ main(int argc, char *argv[])
ompi_cmd_line_t *cmd_line = NULL;
ompi_list_t *nodelist = NULL;
ompi_list_t schedlist;
mca_ns_base_jobid_t new_jobid;
mca_ns_base_jobid_t new_jobid, jobid;
mca_ns_base_vpid_t vpid;
int num_procs = 1;
ompi_rte_node_schedule_t *sched;
char cwd[MAXPATHLEN];
char *my_contact_info, *tmp, *jobid_str, *procid_str;
char *contact_file, *filenm;
char *contact_file, *filenm, *universe;
pid_t pid;
/*
* Initialize our Open MPI environment
@ -165,14 +167,39 @@ main(int argc, char *argv[])
*/
ompi_rte_parse_daemon_cmd_line(cmd_line);
/* eventually, this is where we will check for existing universe and
* spin one up if it isn't there. for now, though
* temporarily force to be a seed.
*
/* check for existing universe to join */
if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
if (ompi_rte_debug_flag) {
ompi_output(0, "ompi_mpi_init: could not join existing universe");
}
if (OMPI_ERR_NOT_FOUND != ret) {
/* if it exists but no contact could be established,
* define unique name based on current one.
* and start new universe with me as seed
*/
universe = strdup(ompi_universe_info.name);
free(ompi_universe_info.name);
pid = getpid();
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
ompi_output(0, "mpi_init: error creating unique universe name");
}
}
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
ompi_process_info.seed = true;
ompi_process_info.ns_replica = NULL;
ompi_process_info.gpr_replica = NULL;
if (NULL != ompi_universe_info.ns_replica) {
free(ompi_universe_info.ns_replica);
}
if (NULL != ompi_process_info.ns_replica) {
free(ompi_process_info.ns_replica);
}
if (NULL != ompi_universe_info.gpr_replica) {
free(ompi_universe_info.gpr_replica);
}
if (NULL != ompi_process_info.gpr_replica) {
free(ompi_process_info.gpr_replica);
}
}
/* setup rest of rte */
if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) {
@ -187,11 +214,10 @@ main(int argc, char *argv[])
free(ompi_process_info.name);
}
ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
} else { /* if not seed, then someone spawned me - must have provided name info */
if (NULL != ompi_process_info.name) { /* overwrite it */
free(ompi_process_info.name);
}
ompi_process_info.name = ompi_rte_get_self();
} else { /* if not seed, then we joined universe - get jobid and name */
jobid = ompi_name_server.create_jobid();
vpid = ompi_name_server.reserve_range(jobid, 1);
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
}
/* setup my session directory */
@ -263,7 +289,11 @@ main(int argc, char *argv[])
ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
/* set initial contact info */
if (ompi_process_info.seed) { /* i'm the seed - direct them towards me */
my_contact_info = mca_oob_get_contact_info();
} else { /* i'm not the seed - direct them to it */
my_contact_info = strdup(ompi_universe_info.ns_replica);
}
mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env));
asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info);
ompi_argv_append(&(sched->envc), &(sched->env), tmp);
@ -326,8 +356,10 @@ main(int argc, char *argv[])
* for now, though, remove the universe-setup.txt file so the directories
* can cleanup
*/
if (ompi_process_info.seed) {
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
unlink(filenm);
}
/* finalize the system */
ompi_event_fini();