Only orted was saving the universe contact info - this should actually be saved by whoever believes it is the "seed". Modified files to ensure this happens. Also includes a checkpoint of the probe and remote launch functions.
This commit was SVN r5746.
Этот коммит содержится в:
родитель
c71f3f7152
Коммит
1b42e973d5
@ -35,6 +35,7 @@
|
|||||||
#include "util/sys_info.h"
|
#include "util/sys_info.h"
|
||||||
#include "util/proc_info.h"
|
#include "util/proc_info.h"
|
||||||
#include "util/univ_info.h"
|
#include "util/univ_info.h"
|
||||||
|
#include "util/os_path.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Leave ORTE.
|
* Leave ORTE.
|
||||||
@ -46,6 +47,16 @@
|
|||||||
*/
|
*/
|
||||||
int orte_finalize(void)
|
int orte_finalize(void)
|
||||||
{
|
{
|
||||||
|
char *contact_path;
|
||||||
|
|
||||||
|
/* if I'm the seed, remove the universe contact info file */
|
||||||
|
if (orte_process_info.seed) {
|
||||||
|
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||||
|
"universe-setup.txt", NULL);
|
||||||
|
unlink(contact_path);
|
||||||
|
free(contact_path);
|
||||||
|
}
|
||||||
|
|
||||||
/* rmgr close depends on wait/iof */
|
/* rmgr close depends on wait/iof */
|
||||||
orte_rmgr_base_close();
|
orte_rmgr_base_close();
|
||||||
orte_wait_finalize();
|
orte_wait_finalize();
|
||||||
|
@ -46,6 +46,8 @@
|
|||||||
#include "util/session_dir.h"
|
#include "util/session_dir.h"
|
||||||
#include "util/sys_info.h"
|
#include "util/sys_info.h"
|
||||||
#include "util/cmd_line.h"
|
#include "util/cmd_line.h"
|
||||||
|
#include "util/universe_setup_file_io.h"
|
||||||
|
#include "util/os_path.h"
|
||||||
|
|
||||||
#include "runtime/runtime.h"
|
#include "runtime/runtime.h"
|
||||||
#include "runtime/runtime_internal.h"
|
#include "runtime/runtime_internal.h"
|
||||||
@ -57,6 +59,7 @@ int orte_init_stage1(void)
|
|||||||
char *universe;
|
char *universe;
|
||||||
char *jobid_str = NULL;
|
char *jobid_str = NULL;
|
||||||
char *procid_str = NULL;
|
char *procid_str = NULL;
|
||||||
|
char *contact_path = NULL;
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
orte_universe_t univ;
|
orte_universe_t univ;
|
||||||
|
|
||||||
@ -204,9 +207,6 @@ int orte_init_stage1(void)
|
|||||||
orte_process_info.ns_replica_uri = strdup(univ.seed_uri);
|
orte_process_info.ns_replica_uri = strdup(univ.seed_uri);
|
||||||
orte_process_info.gpr_replica_uri = strdup(univ.seed_uri);
|
orte_process_info.gpr_replica_uri = strdup(univ.seed_uri);
|
||||||
} else {
|
} else {
|
||||||
if (orte_debug_flag) {
|
|
||||||
ompi_output(0, "orte_init: could not join existing universe");
|
|
||||||
}
|
|
||||||
if (ORTE_ERR_NOT_FOUND != ret) {
|
if (ORTE_ERR_NOT_FOUND != ret) {
|
||||||
/* if it exists but no contact could be established,
|
/* if it exists but no contact could be established,
|
||||||
* define unique name based on current one.
|
* define unique name based on current one.
|
||||||
@ -221,6 +221,9 @@ int orte_init_stage1(void)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ompi_output(0, "Could not join an existing universe");
|
||||||
|
ompi_output(0, "Establishing a new one named: %s",
|
||||||
|
orte_universe_info.name);
|
||||||
|
|
||||||
orte_process_info.seed = true;
|
orte_process_info.seed = true;
|
||||||
/* since we are seed, ensure that all replica info is NULL'd */
|
/* since we are seed, ensure that all replica info is NULL'd */
|
||||||
@ -266,6 +269,11 @@ int orte_init_stage1(void)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if I'm the seed, set the seed uri to be me! */
|
||||||
|
if (orte_process_info.seed) {
|
||||||
|
orte_universe_info.seed_uri = orte_rml.get_uri();
|
||||||
|
}
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
|
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -306,6 +314,33 @@ int orte_init_stage1(void)
|
|||||||
free(procid_str);
|
free(procid_str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if i'm the seed, get my contact info and write my setup file for others to find */
|
||||||
|
if (orte_process_info.seed) {
|
||||||
|
if (NULL != orte_universe_info.seed_uri) {
|
||||||
|
free(orte_universe_info.seed_uri);
|
||||||
|
orte_universe_info.seed_uri = NULL;
|
||||||
|
}
|
||||||
|
if (NULL == (orte_universe_info.seed_uri = orte_rml.get_uri())) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
||||||
|
"universe-setup.txt", NULL);
|
||||||
|
if (orte_debug_flag) {
|
||||||
|
ompi_output(0, "[%lu,%lu,%lu] contact_file %s",
|
||||||
|
ORTE_NAME_ARGS(orte_process_info.my_name), contact_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
|
||||||
|
if (orte_debug_flag) {
|
||||||
|
ompi_output(0, "[%lu,%lu,%lu] couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||||
|
}
|
||||||
|
} else if (orte_debug_flag) {
|
||||||
|
ompi_output(0, "[%lu,%lu,%lu] wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||||
|
}
|
||||||
|
free(contact_path);
|
||||||
|
}
|
||||||
|
|
||||||
/* set contact info for ns/gpr */
|
/* set contact info for ns/gpr */
|
||||||
if(NULL != orte_process_info.ns_replica_uri) {
|
if(NULL != orte_process_info.ns_replica_uri) {
|
||||||
orte_rml.set_uri(orte_process_info.ns_replica_uri);
|
orte_rml.set_uri(orte_process_info.ns_replica_uri);
|
||||||
|
@ -455,7 +455,6 @@ static void orte_setup_hnp_recv(int status, orte_process_name_t* sender,
|
|||||||
|
|
||||||
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
static void orte_setup_hnp_wait(pid_t wpid, int status, void *cbdata)
|
||||||
{
|
{
|
||||||
int rc;
|
|
||||||
orte_setup_hnp_cb_data_t *data;
|
orte_setup_hnp_cb_data_t *data;
|
||||||
|
|
||||||
OMPI_THREAD_LOCK(&orte_setup_hnp_mutex);
|
OMPI_THREAD_LOCK(&orte_setup_hnp_mutex);
|
||||||
|
@ -148,7 +148,6 @@ int main(int argc, char *argv[])
|
|||||||
int ret = 0;
|
int ret = 0;
|
||||||
int fd;
|
int fd;
|
||||||
ompi_cmd_line_t *cmd_line = NULL;
|
ompi_cmd_line_t *cmd_line = NULL;
|
||||||
char *contact_path = NULL;
|
|
||||||
char *log_path = NULL;
|
char *log_path = NULL;
|
||||||
char log_file[PATH_MAX];
|
char log_file[PATH_MAX];
|
||||||
char *jobidstring;
|
char *jobidstring;
|
||||||
@ -276,29 +275,6 @@ int main(int argc, char *argv[])
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if i'm the seed, get my contact info and write my setup file for others to find */
|
|
||||||
if (orte_process_info.seed) {
|
|
||||||
if (NULL != orte_universe_info.seed_uri) {
|
|
||||||
free(orte_universe_info.seed_uri);
|
|
||||||
orte_universe_info.seed_uri = NULL;
|
|
||||||
}
|
|
||||||
orte_universe_info.seed_uri = orte_rml.get_uri();
|
|
||||||
contact_path = orte_os_path(false, orte_process_info.universe_session_dir,
|
|
||||||
"universe-setup.txt", NULL);
|
|
||||||
if (orted_globals.debug_daemons) {
|
|
||||||
ompi_output(0, "ompid: contact_file %s", contact_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (OMPI_SUCCESS != (ret = orte_write_universe_setup_file(contact_path, &orte_universe_info))) {
|
|
||||||
if (orted_globals.debug_daemons) {
|
|
||||||
ompi_output(0, "[%lu,%lu,%lu] ompid: couldn't write setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
||||||
}
|
|
||||||
} else if (orted_globals.debug_daemons) {
|
|
||||||
ompi_output(0, "[%lu,%lu,%lu] ompid: wrote setup file", ORTE_NAME_ARGS(orte_process_info.my_name));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (orted_globals.debug_daemons) {
|
if (orted_globals.debug_daemons) {
|
||||||
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
|
ompi_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||||
}
|
}
|
||||||
@ -332,9 +308,6 @@ int main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
if (NULL != contact_path) {
|
|
||||||
unlink(contact_path);
|
|
||||||
}
|
|
||||||
if (NULL != log_path) {
|
if (NULL != log_path) {
|
||||||
unlink(log_path);
|
unlink(log_path);
|
||||||
}
|
}
|
||||||
|
@ -286,14 +286,13 @@ int main(int argc, char *argv[])
|
|||||||
/* see if a universe already exists on this machine */
|
/* see if a universe already exists on this machine */
|
||||||
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
|
if (ORTE_SUCCESS == (ret = orte_universe_exists(&univ))) {
|
||||||
/* universe is here! send info back and die */
|
/* universe is here! send info back and die */
|
||||||
|
} else {
|
||||||
|
/* existing universe is not here or does not allow contact.
|
||||||
|
* ensure we have a unique universe name, fork/exec an appropriate
|
||||||
|
* daemon, and then tell whomever spawned us how to talk to the new
|
||||||
|
* daemon
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
/* existing universe is not here or does not allow contact.
|
|
||||||
* ensure we have a unique universe name, fork/exec an appropriate
|
|
||||||
* daemon, and then tell whomever spawned us how to talk to the new
|
|
||||||
* daemon
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
if (NULL != contact_path) {
|
if (NULL != contact_path) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user