Work around the mca_oob_ping problem by performing multiple rapid checks - works just fine.
We now have the ability to generate and join a persistent universe. You can create one in two ways: (a) Issue the "openmpi" command. This will fork/exec a seed daemon on your local host. You can specify a universe name; otherwise it will just use the default. (b) Issue the "ompid -seed" command. This starts the seed up directly and takes all the same options as openmpi. I will be adjusting mpirun2 and mpi_init to allow connection to existing persistent universes, but they don't do it right now. The ompiconsole program simply issues an exit command to the persistent universe, so you can use it to shut the universe down if you like (a kill -9 works too). This commit was SVN r2629.
Этот коммит содержится в:
родитель
63237b9b26
Коммит
57ceb5225e
@ -29,15 +29,15 @@
|
||||
#include "runtime/runtime.h"
|
||||
|
||||
|
||||
static struct timeval ompi_rte_ping_wait = {30, 0};
|
||||
static struct timeval ompi_rte_ping_wait = {2, 0};
|
||||
|
||||
|
||||
int ompi_rte_universe_exists()
|
||||
{
|
||||
char *contact_file;
|
||||
int ret;
|
||||
int ret, i;
|
||||
ompi_process_name_t proc={0,0,0};
|
||||
bool ns_found, gpr_found;
|
||||
bool ns_found, gpr_found, ping_success;
|
||||
|
||||
/* if both ns_replica and gpr_replica were provided, check for contact with them */
|
||||
if (NULL != ompi_universe_info.ns_replica && NULL != ompi_universe_info.gpr_replica) {
|
||||
@ -154,7 +154,13 @@ int ompi_rte_universe_exists()
|
||||
|
||||
|
||||
/* ...and ping to verify it's alive */
|
||||
if (OMPI_SUCCESS != mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
|
||||
ping_success = false;
|
||||
for (i=0; i<5 && !ping_success; i++) {
|
||||
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
|
||||
ping_success = true;
|
||||
}
|
||||
}
|
||||
if (!ping_success) {
|
||||
if (ompi_rte_debug_flag) {
|
||||
ompi_output(0, "ping failed");
|
||||
}
|
||||
|
@ -130,9 +130,6 @@ main(int argc, char *argv[])
|
||||
printf("num_procs: %d\n", num_procs);
|
||||
}
|
||||
|
||||
/* get the rte command line options */
|
||||
ompi_rte_parse_cmd_line(cmd_line);
|
||||
|
||||
/*
|
||||
* Start the Open MPI Run Time Environment
|
||||
*/
|
||||
|
@ -59,11 +59,6 @@ int main(int argc, char **argv)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* get the system info and setup defaults */
|
||||
ompi_sys_info();
|
||||
ompi_universe_info.host = strdup(ompi_system_info.nodename);
|
||||
ompi_universe_info.uid = strdup(ompi_system_info.user);
|
||||
|
||||
/* give myself default bootstrap name */
|
||||
ompi_process_info.name = ns_base_create_process_name(MCA_NS_BASE_CELLID_MAX,
|
||||
MCA_NS_BASE_JOBID_MAX,
|
||||
@ -114,18 +109,29 @@ int main(int argc, char **argv)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* parse the cmd_line for rte options - override settings from enviro, where necessary
|
||||
* copy everything into enviro variables for passing later on
|
||||
*/
|
||||
ompi_rte_parse_cmd_line(cmd_line);
|
||||
|
||||
/* start the initial barebones RTE (just OOB) so we can check universe existence */
|
||||
if (OMPI_SUCCESS != (ret = mca_base_open())) {
|
||||
/* JMS show_help */
|
||||
printf("show_help: mca_base_open failed\n");
|
||||
exit(ret);
|
||||
}
|
||||
ompi_rte_init_stage1(&multi_thread, &hidden_thread); /* gets universe and tmpdir enviro variables */
|
||||
|
||||
multi_thread = true;
|
||||
hidden_thread = false;
|
||||
if (OMPI_SUCCESS != ompi_rte_init_stage1(&multi_thread, &hidden_thread)) {
|
||||
printf("show_help: openmpi failed in ompi_rte_init\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* parse environmental variables and fill corresponding info structures
|
||||
* need the oob to be open so we can pass the contact info we extract
|
||||
*/
|
||||
ompi_rte_parse_environ();
|
||||
|
||||
/* parse the cmd_line for rte options - override settings from enviro, where necessary
|
||||
* copy everything into enviro variables for passing later on
|
||||
*/
|
||||
ompi_rte_parse_cmd_line(cmd_line);
|
||||
|
||||
/* parse the cmd_line for daemon options - gets all the options relating
|
||||
* specifically to seed behavior, but also gets
|
||||
@ -172,25 +178,16 @@ int main(int argc, char **argv)
|
||||
fprintf(stderr, "unable to fork - please report error to bugs@open-mpi.org\n");
|
||||
exit(1);
|
||||
} else if (pid != 0) {
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
exit(0); /* parent goes bye-bye */
|
||||
}
|
||||
if (0 > execvp("ompid", argv)) {
|
||||
fprintf(stderr, "unable to exec daemon - please report error to bugs@open-mpi.org\n");
|
||||
fprintf(stderr, "errno: %s\n", strerror(errno));
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
exit(1);
|
||||
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "local universe check reports not implemented code\n");
|
||||
}
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
return -1;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user