1
1

Only orted was saving the universe contact info - this should actually be saved by whoever believes they are the "seed". Modified files to ensure this happens. Also includes a checkpoint of the probe and remote launch functions.

This commit was SVN r5746.
Этот коммит содержится в:
Ralph Castain 2005-05-18 16:31:03 +00:00
родитель c71f3f7152
Коммит 1b42e973d5
5 изменённых файлов: 55 добавлений и 38 удалений

Просмотреть файл

@ -35,6 +35,7 @@
#include "util/sys_info.h" #include "util/sys_info.h"
#include "util/proc_info.h" #include "util/proc_info.h"
#include "util/univ_info.h" #include "util/univ_info.h"
#include "util/os_path.h"
/** /**
* Leave ORTE. * Leave ORTE.
@ -46,6 +47,16 @@
*/ */
int orte_finalize(void) int orte_finalize(void)
{ {
char *contact_path;
/* if I'm the seed, remove the universe contact info file */
if (orte_process_info.seed) {
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL);
unlink(contact_path);
free(contact_path);
}
/* rmgr close depends on wait/iof */ /* rmgr close depends on wait/iof */
orte_rmgr_base_close(); orte_rmgr_base_close();
orte_wait_finalize(); orte_wait_finalize();

Просмотреть файл

@ -46,6 +46,8 @@
#include "util/session_dir.h" #include "util/session_dir.h"
#include "util/sys_info.h" #include "util/sys_info.h"
#include "util/cmd_line.h" #include "util/cmd_line.h"
#include "util/universe_setup_file_io.h"
#include "util/os_path.h"
#include "runtime/runtime.h" #include "runtime/runtime.h"
#include "runtime/runtime_internal.h" #include "runtime/runtime_internal.h"
@ -57,6 +59,7 @@ int orte_init_stage1(void)
char *universe; char *universe;
char *jobid_str = NULL; char *jobid_str = NULL;
char *procid_str = NULL; char *procid_str = NULL;
char *contact_path = NULL;
pid_t pid; pid_t pid;
orte_universe_t univ; orte_universe_t univ;
@ -204,9 +207,6 @@ int orte_init_stage1(void)
orte_process_info.ns_replica_uri = strdup(univ.seed_uri); orte_process_info.ns_replica_uri = strdup(univ.seed_uri);
orte_process_info.gpr_replica_uri = strdup(univ.seed_uri); orte_process_info.gpr_replica_uri = strdup(univ.seed_uri);
} else { } else {
if (orte_debug_flag) {
ompi_output(0, "orte_init: could not join existing universe");
}
if (ORTE_ERR_NOT_FOUND != ret) { if (ORTE_ERR_NOT_FOUND != ret) {
/* if it exists but no contact could be established, /* if it exists but no contact could be established,
* define unique name based on current one. * define unique name based on current one.
@ -221,6 +221,9 @@ int orte_init_stage1(void)
return ret; return ret;
} }
} }
ompi_output(0, "Could not join an existing universe");
ompi_output(0, "Establishing a new one named: %s",
orte_universe_info.name);
orte_process_info.seed = true; orte_process_info.seed = true;
/* since we are seed, ensure that all replica info is NULL'd */ /* since we are seed, ensure that all replica info is NULL'd */
@ -266,6 +269,11 @@ int orte_init_stage1(void)
return ret; return ret;
} }
/* if I'm the seed, set the seed uri to be me! */
if (orte_process_info.seed) {
orte_universe_info.seed_uri = orte_rml.get_uri();
}
/* setup my session directory */ /* setup my session directory */
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) { if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
@ -306,6 +314,33 @@ int orte_init_stage1(void)
free(procid_str); free(procid_str);
} }
/* if i'm the seed, get my contact info and write my setup file for others to find */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
orte_universe_info.seed_uri = NULL;
}
if (NULL == (orte_universe_info.seed_uri = orte_rml.get_uri())) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL);
if (orte_debug_flag) {
ompi_output(0, "[%lu,%lu,%lu] contact_file %s",
ORTE_NAME_ARGS(orte_process_info.my_name), contact_path);
}
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
if (orte_debug_flag) {
ompi_output(0, "[%lu,%lu,%lu] couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
} else if (orte_debug_flag) {
ompi_output(0, "[%lu,%lu,%lu] wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
free(contact_path);
}
/* set contact info for ns/gpr */ /* set contact info for ns/gpr */
if(NULL != orte_process_info.ns_replica_uri) { if(NULL != orte_process_info.ns_replica_uri) {
orte_rml.set_uri(orte_process_info.ns_replica_uri); orte_rml.set_uri(orte_process_info.ns_replica_uri);

Просмотреть файл

@ -455,7 +455,6 @@ static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata) static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
{ {
int rc;
orte_setup_hnp_cb_data_t *data; orte_setup_hnp_cb_data_t *data;
OMPI_THREAD_LOCK(&orte_setup_hnp_mutex); OMPI_THREAD_LOCK(&orte_setup_hnp_mutex);

Просмотреть файл

@ -148,7 +148,6 @@ int main(int argc, char *argv[])
int ret = 0; int ret = 0;
int fd; int fd;
ompi_cmd_line_t *cmd_line = NULL; ompi_cmd_line_t *cmd_line = NULL;
char *contact_path = NULL;
char *log_path = NULL; char *log_path = NULL;
char log_file[PATH_MAX]; char log_file[PATH_MAX];
char *jobidstring; char *jobidstring;
@ -276,29 +275,6 @@ int main(int argc, char *argv[])
return ret; return ret;
} }
/* if i'm the seed, get my contact info and write my setup file for others to find */
if (orte_process_info.seed) {
if (NULL != orte_universe_info.seed_uri) {
free(orte_universe_info.seed_uri);
orte_universe_info.seed_uri = NULL;
}
orte_universe_info.seed_uri = orte_rml.get_uri();
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
"universe-setup.txt", NULL);
if (orted_globals.debug_daemons) {
ompi_output(0, "ompid: contact_file %s", contact_path);
}
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
} else if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
}
}
if (orted_globals.debug_daemons) { if (orted_globals.debug_daemons) {
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name)); ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
} }
@ -332,9 +308,6 @@ int main(int argc, char *argv[])
} }
/* cleanup */ /* cleanup */
if (NULL != contact_path) {
unlink(contact_path);
}
if (NULL != log_path) { if (NULL != log_path) {
unlink(log_path); unlink(log_path);
} }

Просмотреть файл

@ -286,14 +286,13 @@ int main(int argc, char *argv[])
/* see if a universe already exists on this machine */ /* see if a universe already exists on this machine */
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) { if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
/* universe is here! send info back and die */ /* universe is here! send info back and die */
} else {
/* existing universe is not here or does not allow contact.
* ensure we have a unique universe name, fork/exec an appropriate
* daemon, and then tell whomever spawned us how to talk to the new
* daemon
*/
} }
/* existing universe is not here or does not allow contact.
* ensure we have a unique universe name, fork/exec an appropriate
* daemon, and then tell whomever spawned us how to talk to the new
* daemon
*/
/* cleanup */ /* cleanup */
if (NULL != contact_path) { if (NULL != contact_path) {