Allow mpirun2 and mpi_init to cleanly detect and join an existing universe. Will continue testing to quickly move away from a non-responsive existing universe.
This commit was SVN r2729.
Этот коммит содержится в:
родитель
31bacaee5a
Коммит
f6dc129754
@ -67,6 +67,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
ompi_proc_t** procs;
|
||||
size_t nprocs;
|
||||
char *error, *jobid_str, *procid_str;
|
||||
char *universe;
|
||||
pid_t pid;
|
||||
|
||||
/* Become an OMPI process */
|
||||
|
||||
@ -98,6 +100,33 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
if (ompi_rte_debug_flag) {
|
||||
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
||||
}
|
||||
if (OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* if it exists but no contact could be established,
|
||||
* define unique name based on current one.
|
||||
* and start new universe with me as seed
|
||||
*/
|
||||
universe = strdup(ompi_universe_info.name);
|
||||
free(ompi_universe_info.name);
|
||||
pid = getpid();
|
||||
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
|
||||
ompi_output(0, "mpi_init: error creating unique universe name");
|
||||
}
|
||||
}
|
||||
|
||||
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
|
||||
ompi_process_info.seed = true;
|
||||
if (NULL != ompi_universe_info.ns_replica) {
|
||||
free(ompi_universe_info.ns_replica);
|
||||
}
|
||||
if (NULL != ompi_process_info.ns_replica) {
|
||||
free(ompi_process_info.ns_replica);
|
||||
}
|
||||
if (NULL != ompi_universe_info.gpr_replica) {
|
||||
free(ompi_universe_info.gpr_replica);
|
||||
}
|
||||
if (NULL != ompi_process_info.gpr_replica) {
|
||||
free(ompi_process_info.gpr_replica);
|
||||
}
|
||||
}
|
||||
|
||||
/* start the rest of the rte */
|
||||
@ -111,17 +140,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
|
||||
free(ompi_process_info.name);
|
||||
}
|
||||
if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */
|
||||
if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */
|
||||
|
||||
if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */
|
||||
ompi_process_info.name = ompi_rte_get_self();
|
||||
} else if (NULL == ompi_process_info.ns_replica) { /* singleton - couldn't join existing univ */
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
|
||||
} else { /* name server exists elsewhere - get a name for me */
|
||||
} else { /* singleton - name server exists elsewhere - get a name for me */
|
||||
jobid = ompi_name_server.create_jobid();
|
||||
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||
}
|
||||
} else { /* name set in environment - record it */
|
||||
ompi_process_info.name = ompi_rte_get_self();
|
||||
}
|
||||
|
||||
/* setup my session directory */
|
||||
jobid_str = ompi_name_server.get_jobid_string(ompi_process_info.name);
|
||||
|
@ -53,6 +53,10 @@ int ompi_rte_universe_exists()
|
||||
}
|
||||
} else { /* name server found, now try gpr */
|
||||
ns_found = true;
|
||||
if (NULL != ompi_process_info.ns_replica) {
|
||||
free(ompi_process_info.ns_replica);
|
||||
}
|
||||
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
|
||||
}
|
||||
|
||||
mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL);
|
||||
@ -66,6 +70,10 @@ int ompi_rte_universe_exists()
|
||||
free(ompi_process_info.gpr_replica);
|
||||
}
|
||||
} else {
|
||||
if (NULL != ompi_process_info.gpr_replica) {
|
||||
free(ompi_process_info.gpr_replica);
|
||||
}
|
||||
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
|
||||
gpr_found = true;
|
||||
}
|
||||
|
||||
@ -155,12 +163,9 @@ int ompi_rte_universe_exists()
|
||||
|
||||
/* ...and ping to verify it's alive */
|
||||
ping_success = false;
|
||||
for (i=0; i<5 && !ping_success; i++) {
|
||||
ompi_output(0, "univ_exists: attempting ping number %d", i);
|
||||
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
|
||||
ping_success = true;
|
||||
}
|
||||
}
|
||||
if (!ping_success) {
|
||||
if (ompi_rte_debug_flag) {
|
||||
ompi_output(0, "ping failed");
|
||||
@ -173,6 +178,9 @@ int ompi_rte_universe_exists()
|
||||
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
|
||||
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
|
||||
|
||||
ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info);
|
||||
ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info);
|
||||
|
||||
/* request ns_replica and gpr_replica info for this process
|
||||
* only request info required - check ns_found/gpr_found
|
||||
*/
|
||||
|
@ -41,12 +41,14 @@ main(int argc, char *argv[])
|
||||
ompi_cmd_line_t *cmd_line = NULL;
|
||||
ompi_list_t *nodelist = NULL;
|
||||
ompi_list_t schedlist;
|
||||
mca_ns_base_jobid_t new_jobid;
|
||||
mca_ns_base_jobid_t new_jobid, jobid;
|
||||
mca_ns_base_vpid_t vpid;
|
||||
int num_procs = 1;
|
||||
ompi_rte_node_schedule_t *sched;
|
||||
char cwd[MAXPATHLEN];
|
||||
char *my_contact_info, *tmp, *jobid_str, *procid_str;
|
||||
char *contact_file, *filenm;
|
||||
char *contact_file, *filenm, *universe;
|
||||
pid_t pid;
|
||||
|
||||
/*
|
||||
* Intialize our Open MPI environment
|
||||
@ -165,14 +167,39 @@ main(int argc, char *argv[])
|
||||
*/
|
||||
ompi_rte_parse_daemon_cmd_line(cmd_line);
|
||||
|
||||
/* eventually, this is where we will check for existing universe and
|
||||
* spin one up if it isn't there. for now, though
|
||||
* temporarily force to be a seed.
|
||||
*
|
||||
/* check for existing universe to join */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
|
||||
if (ompi_rte_debug_flag) {
|
||||
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
||||
}
|
||||
if (OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* if it exists but no contact could be established,
|
||||
* define unique name based on current one.
|
||||
* and start new universe with me as seed
|
||||
*/
|
||||
universe = strdup(ompi_universe_info.name);
|
||||
free(ompi_universe_info.name);
|
||||
pid = getpid();
|
||||
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
|
||||
ompi_output(0, "mpi_init: error creating unique universe name");
|
||||
}
|
||||
}
|
||||
|
||||
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
|
||||
ompi_process_info.seed = true;
|
||||
ompi_process_info.ns_replica = NULL;
|
||||
ompi_process_info.gpr_replica = NULL;
|
||||
if (NULL != ompi_universe_info.ns_replica) {
|
||||
free(ompi_universe_info.ns_replica);
|
||||
}
|
||||
if (NULL != ompi_process_info.ns_replica) {
|
||||
free(ompi_process_info.ns_replica);
|
||||
}
|
||||
if (NULL != ompi_universe_info.gpr_replica) {
|
||||
free(ompi_universe_info.gpr_replica);
|
||||
}
|
||||
if (NULL != ompi_process_info.gpr_replica) {
|
||||
free(ompi_process_info.gpr_replica);
|
||||
}
|
||||
}
|
||||
|
||||
/* setup rest of rte */
|
||||
if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) {
|
||||
@ -187,11 +214,10 @@ main(int argc, char *argv[])
|
||||
free(ompi_process_info.name);
|
||||
}
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
|
||||
} else { /* if not seed, then someone spawned me - must have provided name info */
|
||||
if (NULL != ompi_process_info.name) { /* overwrite it */
|
||||
free(ompi_process_info.name);
|
||||
}
|
||||
ompi_process_info.name = ompi_rte_get_self();
|
||||
} else { /* if not seed, then we joined universe - get jobid and name */
|
||||
jobid = ompi_name_server.create_jobid();
|
||||
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||
}
|
||||
|
||||
/* setup my session directory */
|
||||
@ -263,7 +289,11 @@ main(int argc, char *argv[])
|
||||
ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
|
||||
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
|
||||
/* set initial contact info */
|
||||
if (ompi_process_info.seed) { /* i'm the seed - direct them towards me */
|
||||
my_contact_info = mca_oob_get_contact_info();
|
||||
} else { /* i'm not the seed - direct them to it */
|
||||
my_contact_info = strdup(ompi_universe_info.ns_replica);
|
||||
}
|
||||
mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env));
|
||||
asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info);
|
||||
ompi_argv_append(&(sched->envc), &(sched->env), tmp);
|
||||
@ -326,8 +356,10 @@ main(int argc, char *argv[])
|
||||
* for now, though, remove the universe-setup.txt file so the directories
|
||||
* can cleanup
|
||||
*/
|
||||
if (ompi_process_info.seed) {
|
||||
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
|
||||
unlink(filenm);
|
||||
}
|
||||
|
||||
/* finalize the system */
|
||||
ompi_event_fini();
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user