1
1

Work around the mca_oob_ping problem by doing multiple rapid checks - this works just fine.

We now have the ability to generate and join a persistent universe. You can create one in two ways:

(a) issue the "openmpi" command. This will fork/exec a seed daemon on your local host. You can specify a universe name or else it will just use the default.

(b) issue the "ompid -seed" command. This starts the seed up directly, and it takes all the same options as openmpi.

I will be adjusting mpirun2 and mpi_init to allow connection to existing persistent universes, but they don't do it right now. The ompiconsole program simply issues an exit command to the persistent universe, so you can use it to shut the universe down if you like (a kill -9 works too).

This commit was SVN r2629.
Этот коммит содержится в:
Ralph Castain 2004-09-13 14:14:00 +00:00
родитель 63237b9b26
Коммит 57ceb5225e
3 изменённых файлов: 27 добавлений и 27 удалений

Просмотреть файл

@ -29,15 +29,15 @@
#include "runtime/runtime.h" #include "runtime/runtime.h"
static struct timeval ompi_rte_ping_wait = {30, 0}; static struct timeval ompi_rte_ping_wait = {2, 0};
int ompi_rte_universe_exists() int ompi_rte_universe_exists()
{ {
char *contact_file; char *contact_file;
int ret; int ret, i;
ompi_process_name_t proc={0,0,0}; ompi_process_name_t proc={0,0,0};
bool ns_found, gpr_found; bool ns_found, gpr_found, ping_success;
/* if both ns_replica and gpr_replica were provided, check for contact with them */ /* if both ns_replica and gpr_replica were provided, check for contact with them */
if (NULL != ompi_universe_info.ns_replica && NULL != ompi_universe_info.gpr_replica) { if (NULL != ompi_universe_info.ns_replica && NULL != ompi_universe_info.gpr_replica) {
@ -154,7 +154,13 @@ int ompi_rte_universe_exists()
/* ...and ping to verify it's alive */ /* ...and ping to verify it's alive */
if (OMPI_SUCCESS != mca_oob_ping(&proc, &ompi_rte_ping_wait)) { ping_success = false;
for (i=0; i<5 && !ping_success; i++) {
if (OMPI_SUCCESS == mca_oob_ping(&proc, &ompi_rte_ping_wait)) {
ping_success = true;
}
}
if (!ping_success) {
if (ompi_rte_debug_flag) { if (ompi_rte_debug_flag) {
ompi_output(0, "ping failed"); ompi_output(0, "ping failed");
} }

Просмотреть файл

@ -130,9 +130,6 @@ main(int argc, char *argv[])
printf("num_procs: %d\n", num_procs); printf("num_procs: %d\n", num_procs);
} }
/* get the rte command line options */
ompi_rte_parse_cmd_line(cmd_line);
/* /*
* Start the Open MPI Run Time Environment * Start the Open MPI Run Time Environment
*/ */

Просмотреть файл

@ -59,11 +59,6 @@ int main(int argc, char **argv)
return ret; return ret;
} }
/* get the system info and setup defaults */
ompi_sys_info();
ompi_universe_info.host = strdup(ompi_system_info.nodename);
ompi_universe_info.uid = strdup(ompi_system_info.user);
/* give myself default bootstrap name */ /* give myself default bootstrap name */
ompi_process_info.name = ns_base_create_process_name(MCA_NS_BASE_CELLID_MAX, ompi_process_info.name = ns_base_create_process_name(MCA_NS_BASE_CELLID_MAX,
MCA_NS_BASE_JOBID_MAX, MCA_NS_BASE_JOBID_MAX,
@ -114,18 +109,29 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
/* parse the cmd_line for rte options - override settings from enviro, where necessary
* copy everything into enviro variables for passing later on
*/
ompi_rte_parse_cmd_line(cmd_line);
/* start the initial barebones RTE (just OOB) so we can check universe existence */ /* start the initial barebones RTE (just OOB) so we can check universe existence */
if (OMPI_SUCCESS != (ret = mca_base_open())) { if (OMPI_SUCCESS != (ret = mca_base_open())) {
/* JMS show_help */ /* JMS show_help */
printf("show_help: mca_base_open failed\n"); printf("show_help: mca_base_open failed\n");
exit(ret); exit(ret);
} }
ompi_rte_init_stage1(&multi_thread, &hidden_thread); /* gets universe and tmpdir enviro variables */
multi_thread = true;
hidden_thread = false;
if (OMPI_SUCCESS != ompi_rte_init_stage1(&multi_thread, &hidden_thread)) {
printf("show_help: openmpi failed in ompi_rte_init\n");
exit(1);
}
/* parse environmental variables and fill corresponding info structures
* need the oob to be open so we can pass the contact info we extract
*/
ompi_rte_parse_environ();
/* parse the cmd_line for rte options - override settings from enviro, where necessary
* copy everything into enviro variables for passing later on
*/
ompi_rte_parse_cmd_line(cmd_line);
/* parse the cmd_line for daemon options - gets all the options relating /* parse the cmd_line for daemon options - gets all the options relating
* specifically to seed behavior, but also gets * specifically to seed behavior, but also gets
@ -172,25 +178,16 @@ int main(int argc, char **argv)
fprintf(stderr, "unable to fork - please report error to bugs@open-mpi.org\n"); fprintf(stderr, "unable to fork - please report error to bugs@open-mpi.org\n");
exit(1); exit(1);
} else if (pid != 0) { } else if (pid != 0) {
ompi_rte_finalize();
mca_base_close();
ompi_finalize();
exit(0); /* parent goes bye-bye */ exit(0); /* parent goes bye-bye */
} }
if (0 > execvp("ompid", argv)) { if (0 > execvp("ompid", argv)) {
fprintf(stderr, "unable to exec daemon - please report error to bugs@open-mpi.org\n"); fprintf(stderr, "unable to exec daemon - please report error to bugs@open-mpi.org\n");
fprintf(stderr, "errno: %s\n", strerror(errno)); fprintf(stderr, "errno: %s\n", strerror(errno));
ompi_rte_finalize();
mca_base_close();
ompi_finalize();
exit(1); exit(1);
} }
} else { } else {
fprintf(stderr, "local universe check reports not implemented code\n"); fprintf(stderr, "local universe check reports not implemented code\n");
} }
ompi_rte_finalize();
mca_base_close();
ompi_finalize();
return -1; return -1;
} }