Allow mpirun2 and mpi_init to cleanly detect and join an existing universe. Will continue testing to quickly move away from a non-responsive existing universe.
This commit was SVN r2729.
Этот коммит содержится в:
родитель
31bacaee5a
Коммит
f6dc129754
@ -67,6 +67,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
ompi_proc_t** procs;
|
ompi_proc_t** procs;
|
||||||
size_t nprocs;
|
size_t nprocs;
|
||||||
char *error, *jobid_str, *procid_str;
|
char *error, *jobid_str, *procid_str;
|
||||||
|
char *universe;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
/* Become an OMPI process */
|
/* Become an OMPI process */
|
||||||
|
|
||||||
@ -98,6 +100,33 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
if (ompi_rte_debug_flag) {
|
if (ompi_rte_debug_flag) {
|
||||||
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
||||||
}
|
}
|
||||||
|
if (OMPI_ERR_NOT_FOUND != ret) {
|
||||||
|
/* if it exists but no contact could be established,
|
||||||
|
* define unique name based on current one.
|
||||||
|
* and start new universe with me as seed
|
||||||
|
*/
|
||||||
|
universe = strdup(ompi_universe_info.name);
|
||||||
|
free(ompi_universe_info.name);
|
||||||
|
pid = getpid();
|
||||||
|
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
|
||||||
|
ompi_output(0, "mpi_init: error creating unique universe name");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
|
||||||
|
ompi_process_info.seed = true;
|
||||||
|
if (NULL != ompi_universe_info.ns_replica) {
|
||||||
|
free(ompi_universe_info.ns_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_process_info.ns_replica) {
|
||||||
|
free(ompi_process_info.ns_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_universe_info.gpr_replica) {
|
||||||
|
free(ompi_universe_info.gpr_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_process_info.gpr_replica) {
|
||||||
|
free(ompi_process_info.gpr_replica);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* start the rest of the rte */
|
/* start the rest of the rte */
|
||||||
@ -111,16 +140,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
|
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
|
||||||
free(ompi_process_info.name);
|
free(ompi_process_info.name);
|
||||||
}
|
}
|
||||||
if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */
|
|
||||||
if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */
|
if (NULL != ompi_rte_get_self()) { /* name set in environment - nonsingleton - record name */
|
||||||
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
|
|
||||||
} else { /* name server exists elsewhere - get a name for me */
|
|
||||||
jobid = ompi_name_server.create_jobid();
|
|
||||||
vpid = ompi_name_server.reserve_range(jobid, 1);
|
|
||||||
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
|
||||||
}
|
|
||||||
} else { /* name set in environment - record it */
|
|
||||||
ompi_process_info.name = ompi_rte_get_self();
|
ompi_process_info.name = ompi_rte_get_self();
|
||||||
|
} else if (NULL == ompi_process_info.ns_replica) { /* singleton - couldn't join existing univ */
|
||||||
|
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
|
||||||
|
} else { /* singleton - name server exists elsewhere - get a name for me */
|
||||||
|
jobid = ompi_name_server.create_jobid();
|
||||||
|
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||||
|
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
|
@ -53,6 +53,10 @@ int ompi_rte_universe_exists()
|
|||||||
}
|
}
|
||||||
} else { /* name server found, now try gpr */
|
} else { /* name server found, now try gpr */
|
||||||
ns_found = true;
|
ns_found = true;
|
||||||
|
if (NULL != ompi_process_info.ns_replica) {
|
||||||
|
free(ompi_process_info.ns_replica);
|
||||||
|
}
|
||||||
|
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL);
|
mca_oob_parse_contact_info(ompi_universe_info.gpr_replica, &proc, NULL);
|
||||||
@ -66,6 +70,10 @@ int ompi_rte_universe_exists()
|
|||||||
free(ompi_process_info.gpr_replica);
|
free(ompi_process_info.gpr_replica);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (NULL != ompi_process_info.gpr_replica) {
|
||||||
|
free(ompi_process_info.gpr_replica);
|
||||||
|
}
|
||||||
|
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
|
||||||
gpr_found = true;
|
gpr_found = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,11 +163,8 @@ int ompi_rte_universe_exists()
|
|||||||
|
|
||||||
/* ...and ping to verify it's alive */
|
/* ...and ping to verify it's alive */
|
||||||
ping_success = false;
|
ping_success = false;
|
||||||
for (i=0; i<5 && !ping_success; i++) {
|
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
|
||||||
ompi_output(0, "univ_exists: attempting ping number %d", i);
|
ping_success = true;
|
||||||
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
|
|
||||||
ping_success = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!ping_success) {
|
if (!ping_success) {
|
||||||
if (ompi_rte_debug_flag) {
|
if (ompi_rte_debug_flag) {
|
||||||
@ -173,6 +178,9 @@ int ompi_rte_universe_exists()
|
|||||||
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
|
ompi_process_info.ns_replica = ns_base_copy_process_name(&proc);
|
||||||
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
|
ompi_process_info.gpr_replica = ns_base_copy_process_name(&proc);
|
||||||
|
|
||||||
|
ompi_universe_info.ns_replica = strdup(ompi_universe_info.seed_contact_info);
|
||||||
|
ompi_universe_info.gpr_replica = strdup(ompi_universe_info.seed_contact_info);
|
||||||
|
|
||||||
/* request ns_replica and gpr_replica info for this process
|
/* request ns_replica and gpr_replica info for this process
|
||||||
* only request info required - check ns_found/gpr_found
|
* only request info required - check ns_found/gpr_found
|
||||||
*/
|
*/
|
||||||
|
@ -41,12 +41,14 @@ main(int argc, char *argv[])
|
|||||||
ompi_cmd_line_t *cmd_line = NULL;
|
ompi_cmd_line_t *cmd_line = NULL;
|
||||||
ompi_list_t *nodelist = NULL;
|
ompi_list_t *nodelist = NULL;
|
||||||
ompi_list_t schedlist;
|
ompi_list_t schedlist;
|
||||||
mca_ns_base_jobid_t new_jobid;
|
mca_ns_base_jobid_t new_jobid, jobid;
|
||||||
|
mca_ns_base_vpid_t vpid;
|
||||||
int num_procs = 1;
|
int num_procs = 1;
|
||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
char cwd[MAXPATHLEN];
|
char cwd[MAXPATHLEN];
|
||||||
char *my_contact_info, *tmp, *jobid_str, *procid_str;
|
char *my_contact_info, *tmp, *jobid_str, *procid_str;
|
||||||
char *contact_file, *filenm;
|
char *contact_file, *filenm, *universe;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Intialize our Open MPI environment
|
* Intialize our Open MPI environment
|
||||||
@ -165,14 +167,39 @@ main(int argc, char *argv[])
|
|||||||
*/
|
*/
|
||||||
ompi_rte_parse_daemon_cmd_line(cmd_line);
|
ompi_rte_parse_daemon_cmd_line(cmd_line);
|
||||||
|
|
||||||
/* eventually, this is where we will check for existing universe and
|
/* check for existing universe to join */
|
||||||
* spin one up if it isn't there. for now, though
|
if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
|
||||||
* temporarily force to be a seed.
|
if (ompi_rte_debug_flag) {
|
||||||
*
|
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
||||||
*/
|
}
|
||||||
ompi_process_info.seed = true;
|
if (OMPI_ERR_NOT_FOUND != ret) {
|
||||||
ompi_process_info.ns_replica = NULL;
|
/* if it exists but no contact could be established,
|
||||||
ompi_process_info.gpr_replica = NULL;
|
* define unique name based on current one.
|
||||||
|
* and start new universe with me as seed
|
||||||
|
*/
|
||||||
|
universe = strdup(ompi_universe_info.name);
|
||||||
|
free(ompi_universe_info.name);
|
||||||
|
pid = getpid();
|
||||||
|
if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) && ompi_rte_debug_flag) {
|
||||||
|
ompi_output(0, "mpi_init: error creating unique universe name");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ompi_process_info.my_universe = strdup(ompi_universe_info.name);
|
||||||
|
ompi_process_info.seed = true;
|
||||||
|
if (NULL != ompi_universe_info.ns_replica) {
|
||||||
|
free(ompi_universe_info.ns_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_process_info.ns_replica) {
|
||||||
|
free(ompi_process_info.ns_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_universe_info.gpr_replica) {
|
||||||
|
free(ompi_universe_info.gpr_replica);
|
||||||
|
}
|
||||||
|
if (NULL != ompi_process_info.gpr_replica) {
|
||||||
|
free(ompi_process_info.gpr_replica);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* setup rest of rte */
|
/* setup rest of rte */
|
||||||
if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) {
|
if (OMPI_SUCCESS != ompi_rte_init_stage2(&multi_thread, &hidden_thread)) {
|
||||||
@ -187,11 +214,10 @@ main(int argc, char *argv[])
|
|||||||
free(ompi_process_info.name);
|
free(ompi_process_info.name);
|
||||||
}
|
}
|
||||||
ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
|
ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
|
||||||
} else { /* if not seed, then someone spawned me - must have provided name info */
|
} else { /* if not seed, then we joined universe - get jobid and name */
|
||||||
if (NULL != ompi_process_info.name) { /* overwrite it */
|
jobid = ompi_name_server.create_jobid();
|
||||||
free(ompi_process_info.name);
|
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||||
}
|
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||||
ompi_process_info.name = ompi_rte_get_self();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
@ -263,7 +289,11 @@ main(int argc, char *argv[])
|
|||||||
ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
|
ompi_list_append(&schedlist, (ompi_list_item_t*) sched);
|
||||||
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
|
ompi_cmd_line_get_tail(cmd_line, &(sched->argc), &(sched->argv));
|
||||||
/* set initial contact info */
|
/* set initial contact info */
|
||||||
my_contact_info = mca_oob_get_contact_info();
|
if (ompi_process_info.seed) { /* i'm the seed - direct them towards me */
|
||||||
|
my_contact_info = mca_oob_get_contact_info();
|
||||||
|
} else { /* i'm not the seed - direct them to it */
|
||||||
|
my_contact_info = strdup(ompi_universe_info.ns_replica);
|
||||||
|
}
|
||||||
mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env));
|
mca_pcm_base_build_base_env(environ, &(sched->envc), &(sched->env));
|
||||||
asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info);
|
asprintf(&tmp, "OMPI_MCA_ns_base_replica=%s", my_contact_info);
|
||||||
ompi_argv_append(&(sched->envc), &(sched->env), tmp);
|
ompi_argv_append(&(sched->envc), &(sched->env), tmp);
|
||||||
@ -326,8 +356,10 @@ main(int argc, char *argv[])
|
|||||||
* for now, though, remove the universe-setup.txt file so the directories
|
* for now, though, remove the universe-setup.txt file so the directories
|
||||||
* can cleanup
|
* can cleanup
|
||||||
*/
|
*/
|
||||||
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
|
if (ompi_process_info.seed) {
|
||||||
unlink(filenm);
|
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
|
||||||
|
unlink(filenm);
|
||||||
|
}
|
||||||
|
|
||||||
/* finalize the system */
|
/* finalize the system */
|
||||||
ompi_event_fini();
|
ompi_event_fini();
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user