Only orted was saving the universe contact info; this should actually be saved by whichever process believes it is the "seed". Modified files to ensure this happens. Also includes a checkpoint of the probe and remote launch functions.
This commit was SVN r5746.
Этот коммит содержится в:
родитель
c71f3f7152
Коммит
1b42e973d5
@ -35,6 +35,7 @@
|
||||
#include "util/sys_info.h"
|
||||
#include "util/proc_info.h"
|
||||
#include "util/univ_info.h"
|
||||
#include "util/os_path.h"
|
||||
|
||||
/**
|
||||
* Leave ORTE.
|
||||
@ -46,6 +47,16 @@
|
||||
*/
|
||||
int orte_finalize(void)
|
||||
{
|
||||
char *contact_path;
|
||||
|
||||
/* if I'm the seed, remove the universe contact info file */
|
||||
if (orte_process_info.seed) {
|
||||
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
unlink(contact_path);
|
||||
free(contact_path);
|
||||
}
|
||||
|
||||
/* rmgr close depends on wait/iof */
|
||||
orte_rmgr_base_close();
|
||||
orte_wait_finalize();
|
||||
|
@ -46,6 +46,8 @@
|
||||
#include "util/session_dir.h"
|
||||
#include "util/sys_info.h"
|
||||
#include "util/cmd_line.h"
|
||||
#include "util/universe_setup_file_io.h"
|
||||
#include "util/os_path.h"
|
||||
|
||||
#include "runtime/runtime.h"
|
||||
#include "runtime/runtime_internal.h"
|
||||
@ -57,6 +59,7 @@ int orte_init_stage1(void)
|
||||
char *universe;
|
||||
char *jobid_str = NULL;
|
||||
char *procid_str = NULL;
|
||||
char *contact_path = NULL;
|
||||
pid_t pid;
|
||||
orte_universe_t univ;
|
||||
|
||||
@ -204,9 +207,6 @@ int orte_init_stage1(void)
|
||||
orte_process_info.ns_replica_uri = strdup(univ.seed_uri);
|
||||
orte_process_info.gpr_replica_uri = strdup(univ.seed_uri);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
ompi_output(0, "orte_init: could not join existing universe");
|
||||
}
|
||||
if (ORTE_ERR_NOT_FOUND != ret) {
|
||||
/* if it exists but no contact could be established,
|
||||
* define unique name based on current one.
|
||||
@ -221,6 +221,9 @@ int orte_init_stage1(void)
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
ompi_output(0, "Could not join an existing universe");
|
||||
ompi_output(0, "Establishing a new one named: %s",
|
||||
orte_universe_info.name);
|
||||
|
||||
orte_process_info.seed = true;
|
||||
/* since we are seed, ensure that all replica info is NULL'd */
|
||||
@ -266,6 +269,11 @@ int orte_init_stage1(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* if I'm the seed, set the seed uri to be me! */
|
||||
if (orte_process_info.seed) {
|
||||
orte_universe_info.seed_uri = orte_rml.get_uri();
|
||||
}
|
||||
|
||||
/* setup my session directory */
|
||||
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -306,6 +314,33 @@ int orte_init_stage1(void)
|
||||
free(procid_str);
|
||||
}
|
||||
|
||||
/* if i'm the seed, get my contact info and write my setup file for others to find */
|
||||
if (orte_process_info.seed) {
|
||||
if (NULL != orte_universe_info.seed_uri) {
|
||||
free(orte_universe_info.seed_uri);
|
||||
orte_universe_info.seed_uri = NULL;
|
||||
}
|
||||
if (NULL == (orte_universe_info.seed_uri = orte_rml.get_uri())) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
if (orte_debug_flag) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] contact_file %s",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), contact_path);
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
|
||||
if (orte_debug_flag) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
} else if (orte_debug_flag) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
free(contact_path);
|
||||
}
|
||||
|
||||
/* set contact info for ns/gpr */
|
||||
if(NULL != orte_process_info.ns_replica_uri) {
|
||||
orte_rml.set_uri(orte_process_info.ns_replica_uri);
|
||||
|
@ -455,7 +455,6 @@ static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
||||
|
||||
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
orte_setup_hnp_cb_data_t *data;
|
||||
|
||||
OMPI_THREAD_LOCK(&orte_setup_hnp_mutex);
|
||||
|
@ -148,7 +148,6 @@ int main(int argc, char *argv[])
|
||||
int ret = 0;
|
||||
int fd;
|
||||
ompi_cmd_line_t *cmd_line = NULL;
|
||||
char *contact_path = NULL;
|
||||
char *log_path = NULL;
|
||||
char log_file[PATH_MAX];
|
||||
char *jobidstring;
|
||||
@ -276,29 +275,6 @@ int main(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* if i'm the seed, get my contact info and write my setup file for others to find */
|
||||
if (orte_process_info.seed) {
|
||||
if (NULL != orte_universe_info.seed_uri) {
|
||||
free(orte_universe_info.seed_uri);
|
||||
orte_universe_info.seed_uri = NULL;
|
||||
}
|
||||
orte_universe_info.seed_uri = orte_rml.get_uri();
|
||||
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "ompid: contact_file %s", contact_path);
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
} else if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (orted_globals.debug_daemons) {
|
||||
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
@ -332,9 +308,6 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != contact_path) {
|
||||
unlink(contact_path);
|
||||
}
|
||||
if (NULL != log_path) {
|
||||
unlink(log_path);
|
||||
}
|
||||
|
@ -286,14 +286,13 @@ int main(int argc, char *argv[])
|
||||
/* see if a universe already exists on this machine */
|
||||
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
|
||||
/* universe is here! send info back and die */
|
||||
} else {
|
||||
/* existing universe is not here or does not allow contact.
|
||||
* ensure we have a unique universe name, fork/exec an appropriate
|
||||
* daemon, and then tell whomever spawned us how to talk to the new
|
||||
* daemon
|
||||
*/
|
||||
}
|
||||
|
||||
/* existing universe is not here or does not allow contact.
|
||||
* ensure we have a unique universe name, fork/exec an appropriate
|
||||
* daemon, and then tell whomever spawned us how to talk to the new
|
||||
* daemon
|
||||
*/
|
||||
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != contact_path) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user