From b7b0ee1905531a5640841dc9a31a13fe6822ff65 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 24 Aug 2004 03:04:41 +0000 Subject: [PATCH] Checkpointing the openmpi program. Added ability to write/read universe setup files in session directory. Added logic for connecting to existing universe. This commit was SVN r2274. --- src/tools/openmpi/openmpi.c | 94 ++++++++++----------- src/tools/openmpi/openmpi.h | 2 + src/util/Makefile.am | 2 + src/util/universe_setup_file_io.c | 134 ++++++++++++++++++++++++++++++ src/util/universe_setup_file_io.h | 12 +++ 5 files changed, 196 insertions(+), 48 deletions(-) create mode 100644 src/util/universe_setup_file_io.c create mode 100644 src/util/universe_setup_file_io.h diff --git a/src/tools/openmpi/openmpi.c b/src/tools/openmpi/openmpi.c index 4595e05631..a176dd873e 100644 --- a/src/tools/openmpi/openmpi.c +++ b/src/tools/openmpi/openmpi.c @@ -25,20 +25,23 @@ #include "util/session_dir.h" #include "util/printf.h" #include "util/daemon_init.h" +#include "util/universe_setup_file_io.h" #include "mca/base/base.h" #include "mca/oob/base/base.h" #include "tools/openmpi/openmpi.h" ompi_universe_t ompi_universe = { - /* .name = */ NULL, - /* .host = */ NULL, - /* .uid = */ NULL, - /* .persistence = */ false, - /* .silent_mode = */ false, - /* .script_mode = */ false, - /* .web_server = */ false, - /* .console_connected = */ false + /* .name = */ NULL, + /* .host = */ NULL, + /* .uid = */ NULL, + /* .persistence = */ false, + /* .silent_mode = */ false, + /* .script_mode = */ false, + /* .web_server = */ false, + /* .socket_contact_info = */ NULL, + /* .oob_contact_info = */ NULL, + /* .console_connected = */ false }; @@ -54,7 +57,6 @@ int main(int argc, char **argv) bool persistent, silent, script, webserver; bool multi_thread = false; bool hidden_thread = false; - FILE *fp; tmp = universe_name = remote_host = remote_uid = script_file = NULL; persistent = silent = script = webserver = false; @@ -198,26 +200,37 @@ int main(int argc, char **argv) ompi_universe.name, NULL, NULL)) { /* found */ fprintf(stderr, "think i found something\n"); /* check for "contact-info" file. if present, read it in. if not present, wait one second (might - * be race condition) and try again. second failure => abnormal termination, go ahead and create own - * file and assume prior seed daemon died + * be race condition) and try again. */ - /* read the universe info - see if it's persistent */ - /* if not persistent, augment universe name until unique, create own universe and continue */ + if (OMPI_SUCCESS != ompi_session_dir(true, tmpdir, ompi_system_info.user, ompi_system_info.nodename, NULL, + ompi_universe.name, NULL, NULL)) { + fprintf(stderr, "couldn't update the process info structure - please report error to bugs@open-mpi.org\n"); + exit(1); + } + contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir, + "universe-setup.txt", NULL); + + if (OMPI_SUCCESS != (ret = ompi_read_universe_setup_file(contact_file))) { + if (OMPI_ERR_NOT_FOUND == ret) { /* couldn't find file - assume prior seed daemon died */ + goto STARTUP; + } else { + fprintf(stderr, "couldn't read contact info: %s\n", contact_file); + exit(1); + } + } + + if (!ompi_universe.persistence) { /* if not persistent, define our own name and start new universe */ + /* derive unique name based on current one */ + goto STARTUP; + } + /* if persistent, use contact info to connect */ - /* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */ - /* /\* first failure - try to start universe and then try again *\/ */ - /* if (NULL == (tmp = ompi_universe_init(tmpdir, ompi_system_info.user, */ - /* ompi_universe.name))) { /\* couldn't create universe - error *\/ */ - /* fprintf(stderr, "could not create universe session directory tree - please report error to bugs@open-mpi.org\n"); */ - /* exit(1); */ - /* } */ - /* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */ - /* /\* second failure - we're doomed *\/ */ - /* fprintf(stderr, "could not connect to universe - please report error to bugs@open-mpi.org\n"); */ - /* exit(1); */ - /* } */ - /* } */ + if (OMPI_ERROR == ompi_universe_connect(ompi_universe.oob_contact_info)) { /* try to connect */ + /* universe must have died - try starting up new one */ + goto STARTUP; + } + } else { fprintf(stderr, "session dir not found - creating it - calling univ_init\n"); /* setup universe session directory */ @@ -228,6 +241,7 @@ int main(int argc, char **argv) } /* convert myself to be the seed daemon */ + STARTUP: ompi_process_info.seed = true; ompi_process_info.my_universe = strdup(ompi_universe.name); @@ -259,30 +273,14 @@ int main(int argc, char **argv) /* save all pertinent info in universe file */ contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir, - "universe-setup.txt", NULL); - fp = fopen(contact_file, "w"); - if (NULL == fp) { - ompi_output(0, "cannot open file to save contact info"); + "universe-setup.txt", NULL); + + if (OMPI_SUCCESS != ompi_write_universe_setup_file(contact_file)) { + fprintf(stderr, "couldn't write universe setup file: %s\n", contact_file); exit(1); } - fprintf(fp, "name: %s\n", ompi_universe.name); - fprintf(fp, "host: %s\n", ompi_universe.host); - fprintf(fp, "user: %s\n", ompi_universe.uid); - if (persistent) { - fprintf(fp, "state: persistent\n"); - } else { - fprintf(fp, "state: non-persistent\n"); - } - if (silent) { - fprintf(fp, "mode: silent\n"); - } else { - fprintf(fp, "mode: console\n"); - } - if (webserver) { - fprintf(fp, "socket: %s\n", socket_contact_info); - } - fprintf(fp, "oob: %s\n", oob_contact_info); - fclose(fp); + + /* put info on the registry */ } } diff --git a/src/tools/openmpi/openmpi.h b/src/tools/openmpi/openmpi.h index ba692968ca..82f0140848 100644 --- a/src/tools/openmpi/openmpi.h +++ b/src/tools/openmpi/openmpi.h @@ -15,6 +15,8 @@ struct ompi_universe_t { bool silent_mode; bool script_mode; bool web_server; + char *socket_contact_info; + char *oob_contact_info; bool console_connected; }; typedef struct ompi_universe_t ompi_universe_t; diff --git a/src/util/Makefile.am b/src/util/Makefile.am index ab5dab2e30..412bae2ba5 100644 --- a/src/util/Makefile.am +++ b/src/util/Makefile.am @@ -31,6 +31,7 @@ headers = \ pow2.h \ session_dir.h \ daemon_init.h \ + universe_setup_file_io.h \ strncpy.h libutil_la_SOURCES = \ @@ -54,6 +55,7 @@ libutil_la_SOURCES = \ pow2.c \ session_dir.c \ daemon_init.c \ + universe_setup_file_io.c \ strncpy.c # Conditionally install the header files diff --git a/src/util/universe_setup_file_io.c b/src/util/universe_setup_file_io.c new file mode 100644 index 0000000000..a0300227dc --- /dev/null +++ b/src/util/universe_setup_file_io.c @@ -0,0 +1,134 @@ +/* + * + * $HEADER$ + * + * $Id: ompi_universe_setup_file I/O functions $ + * + */ +#include "ompi_config.h" + +#include +#include +#include +#include +#include + +#include "include/constants.h" + +#include "util/output.h" +#include "tools/openmpi/openmpi.h" +#include "util/universe_setup_file_io.h" + +char *ompi_getline_buffer(FILE *fp); + +int ompi_write_universe_setup_file(char *filename) +{ + FILE *fp; + + fp = fopen(filename, "w"); + if (NULL == fp) { + ompi_output(0, "cannot open file to save contact info"); + return OMPI_ERROR; + } + fprintf(fp, "%ld name: %s\n", strlen(ompi_universe.name), ompi_universe.name); + fprintf(fp, "%ld host: %s\n", strlen(ompi_universe.host), ompi_universe.host); + fprintf(fp, "%ld user: %s\n", strlen(ompi_universe.uid), ompi_universe.uid); + if (ompi_universe.persistence) { + fprintf(fp, "state: persistent\n"); + } else { + fprintf(fp, "state: non-persistent\n"); + } + if (ompi_universe.silent_mode) { + fprintf(fp, "mode: silent\n"); + } else { + fprintf(fp, "mode: console\n"); + } + if (ompi_universe.web_server) { + fprintf(fp, "%ld socket: %s\n", strlen(ompi_universe.socket_contact_info), + ompi_universe.socket_contact_info); + } else { + fprintf(fp, "0\n"); + } + fprintf(fp, "%ld oob: %s\n", strlen(ompi_universe.oob_contact_info), + ompi_universe.oob_contact_info); + fclose(fp); + return OMPI_SUCCESS; +} + +int ompi_read_universe_setup_file(char *filename) +{ + char persist[20], mode[10]; + FILE *fp; + + fp = fopen(filename, "r"); + if (NULL == fp) { /* failed on first read - wait and try again */ + sleep(1); + fp = fopen(filename, "r"); + if (NULL == fp) { /* failed twice - give up */ + return OMPI_ERR_NOT_FOUND; + } + } + + ompi_universe.name = ompi_getline_buffer(fp); + fscanf(fp, "name: %s\n", ompi_universe.name); + + ompi_universe.host = ompi_getline_buffer(fp); + fscanf(fp, "host: %s\n", ompi_universe.host); + + ompi_universe.uid = ompi_getline_buffer(fp); + fscanf(fp, "user: %s\n", ompi_universe.uid); + + fscanf(fp, "state: %s", persist); + if (0 == strncmp(persist, "persistent", strlen("persistent"))) { + ompi_universe.persistence = true; + } else if (0 == strncmp(persist, "non-persistent", strlen("non-persistent"))) { + ompi_universe.persistence = false; + } else { + return OMPI_ERROR; + } + + fscanf(fp, "mode: %s", mode); + if (0 == strncmp(mode, "silent", strlen("silent"))) { + ompi_universe.silent_mode = true; + } else if (0 == strncmp(mode, "console", strlen("console"))) { + ompi_universe.silent_mode = false; + } else { + return OMPI_ERROR; + } + + ompi_universe.socket_contact_info = ompi_getline_buffer(fp); + if (NULL != ompi_universe.socket_contact_info) { + fscanf(fp, "socket: %s", ompi_universe.socket_contact_info); + ompi_universe.web_server = true; + } else { + ompi_universe.web_server = false; + } + + ompi_universe.oob_contact_info = ompi_getline_buffer(fp); + fscanf(fp, "oob: %s", ompi_universe.oob_contact_info); + + fclose(fp); + + return OMPI_SUCCESS; +} + +char *ompi_getline_buffer(FILE *fp) +{ + int len, i, in_val; + char in_buf[100], in_char, *buffer; + + i = 0; + while ((EOF != (in_val = fgetc(fp))) && (' ' != (in_char = (char)in_val))) { + in_buf[i] = in_char; + i++; + } + in_buf[i] = '\0'; + len = atoi(in_buf); + if (len > 0) { + buffer = (char*)malloc((len+1)*sizeof(char)); + } else { + buffer = NULL; + } + + return buffer; +} diff --git a/src/util/universe_setup_file_io.h b/src/util/universe_setup_file_io.h new file mode 100644 index 0000000000..098b0d360a --- /dev/null +++ b/src/util/universe_setup_file_io.h @@ -0,0 +1,12 @@ +/* + * + * $HEADER$ + * + * $Id: ompi_universe_setup_file I/O functions $ + * + */ + +int ompi_write_universe_setup_file(char *filename); + +int ompi_read_universe_setup_file(char *filename); +