1
1

Checkpointing the openmpi program. Added ability to write/read universe setup files in session directory. Added logic for connecting to existing universe.

This commit was SVN r2274.
Этот коммит содержится в:
Ralph Castain 2004-08-24 03:04:41 +00:00
родитель e6b35105c0
Коммит b7b0ee1905
5 изменённых файлов: 196 добавлений и 48 удалений

Просмотреть файл

@ -25,20 +25,23 @@
#include "util/session_dir.h"
#include "util/printf.h"
#include "util/daemon_init.h"
#include "util/universe_setup_file_io.h"
#include "mca/base/base.h"
#include "mca/oob/base/base.h"
#include "tools/openmpi/openmpi.h"
ompi_universe_t ompi_universe = {
/* .name = */ NULL,
/* .host = */ NULL,
/* .uid = */ NULL,
/* .persistence = */ false,
/* .silent_mode = */ false,
/* .script_mode = */ false,
/* .web_server = */ false,
/* .console_connected = */ false
/* .name = */ NULL,
/* .host = */ NULL,
/* .uid = */ NULL,
/* .persistence = */ false,
/* .silent_mode = */ false,
/* .script_mode = */ false,
/* .web_server = */ false,
/* .socket_contact_info = */ NULL,
/* .oob_contact_info = */ NULL,
/* .console_connected = */ false
};
@ -54,7 +57,6 @@ int main(int argc, char **argv)
bool persistent, silent, script, webserver;
bool multi_thread = false;
bool hidden_thread = false;
FILE *fp;
tmp = universe_name = remote_host = remote_uid = script_file = NULL;
persistent = silent = script = webserver = false;
@ -198,26 +200,37 @@ int main(int argc, char **argv)
ompi_universe.name, NULL, NULL)) { /* found */
fprintf(stderr, "think i found something\n");
/* check for "contact-info" file. if present, read it in. if not present, wait one second (might
* be race condition) and try again. second failure => abnormal termination, go ahead and create own
* file and assume prior seed daemon died
* be race condition) and try again.
*/
/* read the universe info - see if it's persistent */
/* if not persistent, augment universe name until unique, create own universe and continue */
if (OMPI_SUCCESS != ompi_session_dir(true, tmpdir, ompi_system_info.user, ompi_system_info.nodename, NULL,
ompi_universe.name, NULL, NULL)) {
fprintf(stderr, "couldn't update the process info structure - please report error to bugs@open-mpi.org\n");
exit(1);
}
contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir,
"universe-setup.txt", NULL);
if (OMPI_SUCCESS != (ret = ompi_read_universe_setup_file(contact_file))) {
if (OMPI_ERR_NOT_FOUND == ret) { /* couldn't find file - assume prior seed daemon died */
goto STARTUP;
} else {
fprintf(stderr, "couldn't read contact info: %s\n", contact_file);
exit(1);
}
}
if (!ompi_universe.persistence) { /* if not persistent, define our own name and start new universe */
/* derive unique name based on current one */
goto STARTUP;
}
/* if persistent, use contact info to connect */
/* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */
/* /\* first failure - try to start universe and then try again *\/ */
/* if (NULL == (tmp = ompi_universe_init(tmpdir, ompi_system_info.user, */
/* ompi_universe.name))) { /\* couldn't create universe - error *\/ */
/* fprintf(stderr, "could not create universe session directory tree - please report error to bugs@open-mpi.org\n"); */
/* exit(1); */
/* } */
/* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */
/* /\* second failure - we're doomed *\/ */
/* fprintf(stderr, "could not connect to universe - please report error to bugs@open-mpi.org\n"); */
/* exit(1); */
/* } */
/* } */
if (OMPI_ERROR == ompi_universe_connect(ompi_universe.oob_contact_info)) { /* try to connect */
/* universe must have died - try starting up new one */
goto STARTUP;
}
} else {
fprintf(stderr, "session dir not found - creating it - calling univ_init\n");
/* setup universe session directory */
@ -228,6 +241,7 @@ int main(int argc, char **argv)
}
/* convert myself to be the seed daemon */
STARTUP:
ompi_process_info.seed = true;
ompi_process_info.my_universe = strdup(ompi_universe.name);
@ -259,30 +273,14 @@ int main(int argc, char **argv)
/* save all pertinent info in universe file */
contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir,
"universe-setup.txt", NULL);
fp = fopen(contact_file, "w");
if (NULL == fp) {
ompi_output(0, "cannot open file to save contact info");
"universe-setup.txt", NULL);
if (OMPI_SUCCESS != ompi_write_universe_setup_file(contact_file)) {
fprintf(stderr, "couldn't write universe setup file: %s\n", contact_file);
exit(1);
}
fprintf(fp, "name: %s\n", ompi_universe.name);
fprintf(fp, "host: %s\n", ompi_universe.host);
fprintf(fp, "user: %s\n", ompi_universe.uid);
if (persistent) {
fprintf(fp, "state: persistent\n");
} else {
fprintf(fp, "state: non-persistent\n");
}
if (silent) {
fprintf(fp, "mode: silent\n");
} else {
fprintf(fp, "mode: console\n");
}
if (webserver) {
fprintf(fp, "socket: %s\n", socket_contact_info);
}
fprintf(fp, "oob: %s\n", oob_contact_info);
fclose(fp);
/* put info on the registry */
}
}

Просмотреть файл

@ -15,6 +15,8 @@ struct ompi_universe_t {
bool silent_mode;
bool script_mode;
bool web_server;
char *socket_contact_info;
char *oob_contact_info;
bool console_connected;
};
typedef struct ompi_universe_t ompi_universe_t;

Просмотреть файл

@ -31,6 +31,7 @@ headers = \
pow2.h \
session_dir.h \
daemon_init.h \
universe_setup_file_io.h \
strncpy.h
libutil_la_SOURCES = \
@ -54,6 +55,7 @@ libutil_la_SOURCES = \
pow2.c \
session_dir.c \
daemon_init.c \
universe_setup_file_io.c \
strncpy.c
# Conditionally install the header files

134
src/util/universe_setup_file_io.c Обычный файл
Просмотреть файл

@ -0,0 +1,134 @@
/*
*
* $HEADER$
*
* $Id: ompi_universe_setup_file I/O functions $
*
*/
#include "ompi_config.h"
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include "include/constants.h"
#include "util/output.h"
#include "tools/openmpi/openmpi.h"
#include "util/universe_setup_file_io.h"
char *ompi_getline_buffer(FILE *fp);
/*
 * Emit one "<len> <label>: <value>" record, where <len> is strlen(value).
 * The length is cast to unsigned long so the conversion specifier matches
 * the argument type on every platform.
 */
static void setup_file_write_field(FILE *fp, const char *label, const char *value)
{
    fprintf(fp, "%lu %s: %s\n", (unsigned long)strlen(value), label, value);
}

/*
 * Save the global ompi_universe description to a text file.
 *
 * Records are written one per line, in this order:
 *   <len> name: <name>
 *   <len> host: <host>
 *   <len> user: <uid>
 *   state: persistent | non-persistent
 *   mode: silent | console
 *   <len> socket: <socket contact info>   (a bare "0" if web_server is false)
 *   <len> oob: <oob contact info>
 *
 * filename - path of the file to create (an existing file is truncated).
 *
 * Returns OMPI_SUCCESS when the whole file was written and closed, and
 * OMPI_ERROR when a required string is NULL, the file cannot be opened, or
 * a write/close error is reported by the C library.
 */
int ompi_write_universe_setup_file(char *filename)
{
    FILE *fp;
    int write_failed;

    /* strlen() and "%s" on a NULL pointer are undefined - refuse up front */
    if (NULL == filename ||
        NULL == ompi_universe.name ||
        NULL == ompi_universe.host ||
        NULL == ompi_universe.uid ||
        NULL == ompi_universe.oob_contact_info ||
        (ompi_universe.web_server && NULL == ompi_universe.socket_contact_info)) {
        ompi_output(0, "incomplete universe info - cannot save contact info");
        return OMPI_ERROR;
    }

    fp = fopen(filename, "w");
    if (NULL == fp) {
        ompi_output(0, "cannot open file to save contact info");
        return OMPI_ERROR;
    }

    setup_file_write_field(fp, "name", ompi_universe.name);
    setup_file_write_field(fp, "host", ompi_universe.host);
    setup_file_write_field(fp, "user", ompi_universe.uid);

    fprintf(fp, "state: %s\n",
            ompi_universe.persistence ? "persistent" : "non-persistent");
    fprintf(fp, "mode: %s\n",
            ompi_universe.silent_mode ? "silent" : "console");

    if (ompi_universe.web_server) {
        setup_file_write_field(fp, "socket", ompi_universe.socket_contact_info);
    } else {
        /* zero length with no label marks "no web server" */
        fprintf(fp, "0\n");
    }

    setup_file_write_field(fp, "oob", ompi_universe.oob_contact_info);

    /* buffered output may only fail at flush time, so check fclose() too */
    write_failed = ferror(fp);
    if (0 != fclose(fp)) {
        write_failed = 1;
    }
    if (write_failed) {
        ompi_output(0, "error while writing contact info file");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
/*
 * Parse the decimal length that starts a record, skipping leading
 * whitespace. The first character after the digits is left unread.
 *
 * Returns 0 on success, -1 if no digit is found or the value is too large.
 */
static int setup_file_read_length(FILE *fp, size_t *len)
{
    /* keeps value * 10 + 9 strictly below the size_t maximum, so that
     * len + 1 can never wrap in the caller */
    const size_t limit = ((size_t)-1 - 10) / 10;
    size_t value = 0;
    int ndigits = 0;
    int c;

    do {
        c = fgetc(fp);
    } while (' ' == c || '\t' == c || '\r' == c || '\n' == c);

    while (c >= '0' && c <= '9') {
        if (value > limit) {
            return -1;
        }
        value = value * 10 + (size_t)(c - '0');
        ndigits++;
        c = fgetc(fp);
    }
    if (EOF != c) {
        ungetc(c, fp);
    }
    if (0 == ndigits) {
        return -1;
    }

    *len = value;
    return 0;
}

/*
 * Read one "<len> <label>: <value>\n" record.
 *
 * On success *value is a malloc'ed, NUL-terminated copy of exactly <len>
 * bytes, which the caller owns. A bare "0" line (length zero, no label)
 * means the field is absent: *value is set to NULL and OMPI_SUCCESS is
 * returned. Any other deviation from the layout yields OMPI_ERROR with
 * *value set to NULL.
 */
static int setup_file_read_field(FILE *fp, const char *label, char **value)
{
    size_t label_len = strlen(label);
    size_t len, i;
    char *buf;
    int c;

    *value = NULL;

    if (0 != setup_file_read_length(fp, &len)) {
        return OMPI_ERROR;
    }

    c = fgetc(fp);
    if ('\n' == c || EOF == c) {
        /* a length with nothing after it is only valid as "0" */
        return (0 == len) ? OMPI_SUCCESS : OMPI_ERROR;
    }
    if (' ' != c) {
        return OMPI_ERROR;
    }

    for (i = 0; i < label_len; i++) {
        if (fgetc(fp) != (int)(unsigned char)label[i]) {
            return OMPI_ERROR;
        }
    }
    if (':' != fgetc(fp) || ' ' != fgetc(fp)) {
        return OMPI_ERROR;
    }

    buf = malloc(len + 1);
    if (NULL == buf) {
        return OMPI_ERROR;
    }
    /* the stored length bounds the read, so a corrupt file cannot overrun
     * the buffer and values containing blanks are preserved */
    if (len != fread(buf, 1, len, fp) || '\n' != fgetc(fp)) {
        free(buf);
        return OMPI_ERROR;
    }
    buf[len] = '\0';

    *value = buf;
    return OMPI_SUCCESS;
}

/*
 * Load a universe setup file into the global ompi_universe.
 *
 * Expected layout, one record per line:
 *   <len> name: <name>
 *   <len> host: <host>
 *   <len> user: <uid>
 *   state: persistent | non-persistent
 *   mode: silent | console
 *   <len> socket: <socket contact info>   (or a bare "0": no web server)
 *   <len> oob: <oob contact info>
 *
 * The length prefixes are parsed here rather than through
 * ompi_getline_buffer() because the length itself is needed to bound each
 * read.
 *
 * filename - path of the setup file.
 *
 * Returns OMPI_SUCCESS after a complete parse. Returns OMPI_ERR_NOT_FOUND
 * if the file cannot be opened on two attempts one second apart, and
 * OMPI_ERROR if the contents do not match the layout. ompi_universe is only
 * modified on success. Strings it pointed to before are not freed, because
 * their ownership is not visible from here.
 */
int ompi_read_universe_setup_file(char *filename)
{
    char line[64];
    char *name = NULL, *host = NULL, *uid = NULL;
    char *socket_info = NULL, *oob_info = NULL;
    int persistent, silent;
    FILE *fp;

    fp = fopen(filename, "r");
    if (NULL == fp) { /* failed on first read - wait and try again */
        sleep(1);
        fp = fopen(filename, "r");
        if (NULL == fp) { /* failed twice - give up */
            return OMPI_ERR_NOT_FOUND;
        }
    }

    /* name, host and user must all be present */
    if (OMPI_SUCCESS != setup_file_read_field(fp, "name", &name) || NULL == name ||
        OMPI_SUCCESS != setup_file_read_field(fp, "host", &host) || NULL == host ||
        OMPI_SUCCESS != setup_file_read_field(fp, "user", &uid) || NULL == uid) {
        goto fail;
    }

    /* state and mode are fixed keyword lines; compare the whole line */
    if (NULL == fgets(line, sizeof(line), fp)) {
        goto fail;
    }
    if (0 == strcmp(line, "state: persistent\n")) {
        persistent = 1;
    } else if (0 == strcmp(line, "state: non-persistent\n")) {
        persistent = 0;
    } else {
        goto fail;
    }

    if (NULL == fgets(line, sizeof(line), fp)) {
        goto fail;
    }
    if (0 == strcmp(line, "mode: silent\n")) {
        silent = 1;
    } else if (0 == strcmp(line, "mode: console\n")) {
        silent = 0;
    } else {
        goto fail;
    }

    /* socket record is optional - an absent one means no web server */
    if (OMPI_SUCCESS != setup_file_read_field(fp, "socket", &socket_info)) {
        goto fail;
    }
    if (OMPI_SUCCESS != setup_file_read_field(fp, "oob", &oob_info) || NULL == oob_info) {
        goto fail;
    }

    fclose(fp);

    /* everything parsed - commit to the global in one go */
    ompi_universe.name = name;
    ompi_universe.host = host;
    ompi_universe.uid = uid;
    ompi_universe.persistence = persistent ? true : false;
    ompi_universe.silent_mode = silent ? true : false;
    ompi_universe.socket_contact_info = socket_info;
    ompi_universe.web_server = (NULL != socket_info) ? true : false;
    ompi_universe.oob_contact_info = oob_info;

    return OMPI_SUCCESS;

fail:
    free(name);
    free(host);
    free(uid);
    free(socket_info);
    free(oob_info);
    fclose(fp);
    return OMPI_ERROR;
}
/*
 * Read the decimal length token at the current position of fp and return a
 * buffer large enough to hold a string of that length.
 *
 * Leading whitespace is skipped. The token ends at the first blank or
 * newline (which is consumed) or at end of file, so a bare "0" line is read
 * as its own token instead of running into the next record.
 *
 * fp - stream positioned at (or on whitespace before) a length token.
 *
 * Returns a zero-filled buffer of <length> + 1 bytes that the caller must
 * free(). Returns NULL if the length is zero or negative, the token is not
 * a plain decimal number, the token is too long, or allocation fails.
 */
char *ompi_getline_buffer(FILE *fp)
{
    enum { TOKEN_MAX = 32 };   /* far more than any long needs in decimal */
    char in_buf[TOKEN_MAX];
    size_t used = 0;
    int too_long = 0;
    int in_val;
    long len;
    char *end;

    do {
        in_val = fgetc(fp);
    } while (' ' == in_val || '\t' == in_val || '\r' == in_val || '\n' == in_val);

    while (EOF != in_val && ' ' != in_val && '\n' != in_val) {
        if (used < sizeof(in_buf) - 1) {
            in_buf[used++] = (char)in_val;
        } else {
            /* keep consuming the token, but never write past in_buf */
            too_long = 1;
        }
        in_val = fgetc(fp);
    }
    in_buf[used] = '\0';

    if (too_long || 0 == used) {
        return NULL;
    }

    errno = 0;
    len = strtol(in_buf, &end, 10);
    if ('\0' != *end || ERANGE == errno || len <= 0) {
        return NULL;
    }

    /* zero-filled so the buffer is a valid empty string until it is filled */
    return calloc((size_t)len + 1, sizeof(char));
}

12
src/util/universe_setup_file_io.h Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
/*
*
* $HEADER$
*
* $Id: ompi_universe_setup_file I/O functions $
*
*/
/*
 * Write the contents of the global ompi_universe structure to the text
 * file named by filename.
 *
 * Returns OMPI_SUCCESS on success and OMPI_ERROR on failure (for example
 * when the file cannot be opened for writing).
 */
int ompi_write_universe_setup_file(char *filename);
/*
 * Read a universe setup file and store what it describes in the global
 * ompi_universe structure. If the file cannot be opened, the open is
 * retried once after one second.
 *
 * Returns OMPI_SUCCESS on success, OMPI_ERR_NOT_FOUND if the file cannot
 * be opened on either attempt, and OMPI_ERROR when the contents are not
 * recognized (for example an unknown state or mode value).
 */
int ompi_read_universe_setup_file(char *filename);