Checkpointing the openmpi program. Added the ability to write/read universe setup files in the session directory. Added logic for connecting to an existing universe.
This commit was SVN r2274.
Этот коммит содержится в:
родитель
e6b35105c0
Коммит
b7b0ee1905
@ -25,20 +25,23 @@
|
||||
#include "util/session_dir.h"
|
||||
#include "util/printf.h"
|
||||
#include "util/daemon_init.h"
|
||||
#include "util/universe_setup_file_io.h"
|
||||
#include "mca/base/base.h"
|
||||
#include "mca/oob/base/base.h"
|
||||
#include "tools/openmpi/openmpi.h"
|
||||
|
||||
|
||||
ompi_universe_t ompi_universe = {
|
||||
/* .name = */ NULL,
|
||||
/* .host = */ NULL,
|
||||
/* .uid = */ NULL,
|
||||
/* .persistence = */ false,
|
||||
/* .silent_mode = */ false,
|
||||
/* .script_mode = */ false,
|
||||
/* .web_server = */ false,
|
||||
/* .console_connected = */ false
|
||||
/* .name = */ NULL,
|
||||
/* .host = */ NULL,
|
||||
/* .uid = */ NULL,
|
||||
/* .persistence = */ false,
|
||||
/* .silent_mode = */ false,
|
||||
/* .script_mode = */ false,
|
||||
/* .web_server = */ false,
|
||||
/* .socket_contact_info = */ NULL,
|
||||
/* .oob_contact_info = */ NULL,
|
||||
/* .console_connected = */ false
|
||||
};
|
||||
|
||||
|
||||
@ -54,7 +57,6 @@ int main(int argc, char **argv)
|
||||
bool persistent, silent, script, webserver;
|
||||
bool multi_thread = false;
|
||||
bool hidden_thread = false;
|
||||
FILE *fp;
|
||||
|
||||
tmp = universe_name = remote_host = remote_uid = script_file = NULL;
|
||||
persistent = silent = script = webserver = false;
|
||||
@ -198,26 +200,37 @@ int main(int argc, char **argv)
|
||||
ompi_universe.name, NULL, NULL)) { /* found */
|
||||
fprintf(stderr, "think i found something\n");
|
||||
/* check for "contact-info" file. if present, read it in. if not present, wait one second (might
|
||||
* be race condition) and try again. second failure => abnormal termination, go ahead and create own
|
||||
* file and assume prior seed daemon died
|
||||
* be race condition) and try again.
|
||||
*/
|
||||
/* read the universe info - see if it's persistent */
|
||||
/* if not persistent, augment universe name until unique, create own universe and continue */
|
||||
if (OMPI_SUCCESS != ompi_session_dir(true, tmpdir, ompi_system_info.user, ompi_system_info.nodename, NULL,
|
||||
ompi_universe.name, NULL, NULL)) {
|
||||
fprintf(stderr, "couldn't update the process info structure - please report error to bugs@open-mpi.org\n");
|
||||
exit(1);
|
||||
}
|
||||
contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
|
||||
if (OMPI_SUCCESS != (ret = ompi_read_universe_setup_file(contact_file))) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) { /* couldn't find file - assume prior seed daemon died */
|
||||
goto STARTUP;
|
||||
} else {
|
||||
fprintf(stderr, "couldn't read contact info: %s\n", contact_file);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!ompi_universe.persistence) { /* if not persistent, define our own name and start new universe */
|
||||
/* derive unique name based on current one */
|
||||
goto STARTUP;
|
||||
}
|
||||
|
||||
/* if persistent, use contact info to connect */
|
||||
|
||||
/* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */
|
||||
/* /\* first failure - try to start universe and then try again *\/ */
|
||||
/* if (NULL == (tmp = ompi_universe_init(tmpdir, ompi_system_info.user, */
|
||||
/* ompi_universe.name))) { /\* couldn't create universe - error *\/ */
|
||||
/* fprintf(stderr, "could not create universe session directory tree - please report error to bugs@open-mpi.org\n"); */
|
||||
/* exit(1); */
|
||||
/* } */
|
||||
/* if (OMPI_ERROR == ompi_universe_connect(tmp)) { /\* try to connect *\/ */
|
||||
/* /\* second failure - we're doomed *\/ */
|
||||
/* fprintf(stderr, "could not connect to universe - please report error to bugs@open-mpi.org\n"); */
|
||||
/* exit(1); */
|
||||
/* } */
|
||||
/* } */
|
||||
if (OMPI_ERROR == ompi_universe_connect(ompi_universe.oob_contact_info)) { /* try to connect */
|
||||
/* universe must have died - try starting up new one */
|
||||
goto STARTUP;
|
||||
}
|
||||
|
||||
} else {
|
||||
fprintf(stderr, "session dir not found - creating it - calling univ_init\n");
|
||||
/* setup universe session directory */
|
||||
@ -228,6 +241,7 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
/* convert myself to be the seed daemon */
|
||||
STARTUP:
|
||||
ompi_process_info.seed = true;
|
||||
ompi_process_info.my_universe = strdup(ompi_universe.name);
|
||||
|
||||
@ -259,30 +273,14 @@ int main(int argc, char **argv)
|
||||
|
||||
/* save all pertinent info in universe file */
|
||||
contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
fp = fopen(contact_file, "w");
|
||||
if (NULL == fp) {
|
||||
ompi_output(0, "cannot open file to save contact info");
|
||||
"universe-setup.txt", NULL);
|
||||
|
||||
if (OMPI_SUCCESS != ompi_write_universe_setup_file(contact_file)) {
|
||||
fprintf(stderr, "couldn't write universe setup file: %s\n", contact_file);
|
||||
exit(1);
|
||||
}
|
||||
fprintf(fp, "name: %s\n", ompi_universe.name);
|
||||
fprintf(fp, "host: %s\n", ompi_universe.host);
|
||||
fprintf(fp, "user: %s\n", ompi_universe.uid);
|
||||
if (persistent) {
|
||||
fprintf(fp, "state: persistent\n");
|
||||
} else {
|
||||
fprintf(fp, "state: non-persistent\n");
|
||||
}
|
||||
if (silent) {
|
||||
fprintf(fp, "mode: silent\n");
|
||||
} else {
|
||||
fprintf(fp, "mode: console\n");
|
||||
}
|
||||
if (webserver) {
|
||||
fprintf(fp, "socket: %s\n", socket_contact_info);
|
||||
}
|
||||
fprintf(fp, "oob: %s\n", oob_contact_info);
|
||||
fclose(fp);
|
||||
|
||||
/* put info on the registry */
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -15,6 +15,8 @@ struct ompi_universe_t {
|
||||
bool silent_mode;
|
||||
bool script_mode;
|
||||
bool web_server;
|
||||
char *socket_contact_info;
|
||||
char *oob_contact_info;
|
||||
bool console_connected;
|
||||
};
|
||||
typedef struct ompi_universe_t ompi_universe_t;
|
||||
|
@ -31,6 +31,7 @@ headers = \
|
||||
pow2.h \
|
||||
session_dir.h \
|
||||
daemon_init.h \
|
||||
universe_setup_file_io.h \
|
||||
strncpy.h
|
||||
|
||||
libutil_la_SOURCES = \
|
||||
@ -54,6 +55,7 @@ libutil_la_SOURCES = \
|
||||
pow2.c \
|
||||
session_dir.c \
|
||||
daemon_init.c \
|
||||
universe_setup_file_io.c \
|
||||
strncpy.c
|
||||
|
||||
# Conditionally install the header files
|
||||
|
134
src/util/universe_setup_file_io.c
Обычный файл
134
src/util/universe_setup_file_io.c
Обычный файл
@ -0,0 +1,134 @@
|
||||
/*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* $Id: ompi_universe_setup_file I/O functions $
|
||||
*
|
||||
*/
|
||||
#include "ompi_config.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>

#include "include/constants.h"

#include "util/output.h"
#include "tools/openmpi/openmpi.h"
#include "util/universe_setup_file_io.h"
|
||||
|
||||
/* Forward declaration: the helper is defined at the end of this file and is
 * not declared in universe_setup_file_io.h. */
char *ompi_getline_buffer(FILE *fp);
|
||||
|
||||
int ompi_write_universe_setup_file(char *filename)
|
||||
{
|
||||
FILE *fp;
|
||||
|
||||
fp = fopen(filename, "w");
|
||||
if (NULL == fp) {
|
||||
ompi_output(0, "cannot open file to save contact info");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
fprintf(fp, "%ld name: %s\n", strlen(ompi_universe.name), ompi_universe.name);
|
||||
fprintf(fp, "%ld host: %s\n", strlen(ompi_universe.host), ompi_universe.host);
|
||||
fprintf(fp, "%ld user: %s\n", strlen(ompi_universe.uid), ompi_universe.uid);
|
||||
if (ompi_universe.persistence) {
|
||||
fprintf(fp, "state: persistent\n");
|
||||
} else {
|
||||
fprintf(fp, "state: non-persistent\n");
|
||||
}
|
||||
if (ompi_universe.silent_mode) {
|
||||
fprintf(fp, "mode: silent\n");
|
||||
} else {
|
||||
fprintf(fp, "mode: console\n");
|
||||
}
|
||||
if (ompi_universe.web_server) {
|
||||
fprintf(fp, "%ld socket: %s\n", strlen(ompi_universe.socket_contact_info),
|
||||
ompi_universe.socket_contact_info);
|
||||
} else {
|
||||
fprintf(fp, "0\n");
|
||||
}
|
||||
fprintf(fp, "%ld oob: %s\n", strlen(ompi_universe.oob_contact_info),
|
||||
ompi_universe.oob_contact_info);
|
||||
fclose(fp);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_read_universe_setup_file(char *filename)
|
||||
{
|
||||
char persist[20], mode[10];
|
||||
FILE *fp;
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) { /* failed on first read - wait and try again */
|
||||
sleep(1);
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) { /* failed twice - give up */
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
ompi_universe.name = ompi_getline_buffer(fp);
|
||||
fscanf(fp, "name: %s\n", ompi_universe.name);
|
||||
|
||||
ompi_universe.host = ompi_getline_buffer(fp);
|
||||
fscanf(fp, "host: %s\n", ompi_universe.host);
|
||||
|
||||
ompi_universe.uid = ompi_getline_buffer(fp);
|
||||
fscanf(fp, "user: %s\n", ompi_universe.uid);
|
||||
|
||||
fscanf(fp, "state: %s", persist);
|
||||
if (0 == strncmp(persist, "persistent", strlen("persistent"))) {
|
||||
ompi_universe.persistence = true;
|
||||
} else if (0 == strncmp(persist, "non-persistent", strlen("non-persistent"))) {
|
||||
ompi_universe.persistence = false;
|
||||
} else {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
fscanf(fp, "mode: %s", mode);
|
||||
if (0 == strncmp(mode, "silent", strlen("silent"))) {
|
||||
ompi_universe.silent_mode = true;
|
||||
} else if (0 == strncmp(mode, "console", strlen("console"))) {
|
||||
ompi_universe.silent_mode = false;
|
||||
} else {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ompi_universe.socket_contact_info = ompi_getline_buffer(fp);
|
||||
if (NULL != ompi_universe.socket_contact_info) {
|
||||
fscanf(fp, "socket: %s", ompi_universe.socket_contact_info);
|
||||
ompi_universe.web_server = true;
|
||||
} else {
|
||||
ompi_universe.web_server = false;
|
||||
}
|
||||
|
||||
ompi_universe.oob_contact_info = ompi_getline_buffer(fp);
|
||||
fscanf(fp, "oob: %s", ompi_universe.oob_contact_info);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Read a decimal length token from `fp` and allocate a buffer able to hold
 * a string of that length plus its terminating NUL.
 *
 * Leading whitespace is skipped, then characters are consumed up to and
 * including the first whitespace character (blank or newline) or EOF.
 *
 * Returns a zero-filled buffer of (length + 1) bytes that the caller must
 * free(), or NULL when `fp` is NULL, the token is not a positive decimal
 * number, the token is too long, or the allocation fails.
 */
char *ompi_getline_buffer(FILE *fp)
{
    char in_buf[100];
    char *end;
    size_t used = 0;
    int too_long = 0;
    int in_val;
    long len;

    if (NULL == fp) {
        return NULL;
    }

    /* skip whitespace (e.g. the newline that ended the previous record) */
    do {
        in_val = fgetc(fp);
    } while (EOF != in_val && isspace(in_val));

    /* collect the token; keep consuming an overlong one so the stream stays
     * positioned after it, but never write past the end of in_buf */
    while (EOF != in_val && !isspace(in_val)) {
        if (used < sizeof(in_buf) - 1) {
            in_buf[used++] = (char)in_val;
        } else {
            too_long = 1;
        }
        in_val = fgetc(fp);
    }
    in_buf[used] = '\0';

    if (too_long) {
        return NULL;
    }

    /* strtol lets us reject empty and non-numeric tokens, unlike atoi */
    len = strtol(in_buf, &end, 10);
    if (end == in_buf || '\0' != *end || len <= 0) {
        return NULL;
    }

    /* an out-of-range token saturates at LONG_MAX; calloc then reports the
     * failure by returning NULL */
    return calloc((size_t)len + 1, sizeof(char));
}
|
12
src/util/universe_setup_file_io.h
Обычный файл
12
src/util/universe_setup_file_io.h
Обычный файл
@ -0,0 +1,12 @@
|
||||
/*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* $Id: ompi_universe_setup_file I/O functions $
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Write the contents of the global ompi_universe to the text file `filename`.
 * Returns OMPI_SUCCESS on success, OMPI_ERROR on failure (for example when
 * the file cannot be opened for writing).
 */
int ompi_write_universe_setup_file(char *filename);
|
||||
|
||||
/*
 * Fill the global ompi_universe from the text file `filename`.
 * Returns OMPI_SUCCESS on success, OMPI_ERR_NOT_FOUND when the file cannot
 * be opened (a second attempt is made after one second), and OMPI_ERROR
 * when the state or mode entry is not recognized.
 */
int ompi_read_universe_setup_file(char *filename);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user