MPI_Init will now detect and join a persistent universe - hooray! Fixed the session_dir cleanup process so it is kinder to the universe-setup file (i.e., leaves it alone), thus allowing persistent universes to retain their contact info on the session_dir tree. Adjusted mpirun2, ompid, and ompiconsole accordingly.
Put some error protection in ompi_rte_monitor. This commit was SVN r2678.
Этот коммит содержится в:
родитель
5de6a8c211
Коммит
70dae461e4
@ -60,6 +60,8 @@ int ompi_mpi_thread_provided = MPI_THREAD_SINGLE;
|
||||
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
{
|
||||
int ret, param;
|
||||
mca_ns_base_jobid_t jobid;
|
||||
mca_ns_base_vpid_t vpid;
|
||||
bool allow_multi_user_threads;
|
||||
bool have_hidden_threads;
|
||||
ompi_proc_t** procs;
|
||||
@ -73,6 +75,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* parse environment variables and fill corresponding info structures */
|
||||
ompi_rte_parse_environ();
|
||||
|
||||
/* Open up the MCA */
|
||||
|
||||
if (OMPI_SUCCESS != (ret = mca_base_open())) {
|
||||
@ -88,8 +93,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* parse environment variables and fill corresponding info structures */
|
||||
ompi_rte_parse_environ();
|
||||
/* check for existing universe to join */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
|
||||
if (ompi_rte_debug_flag) {
|
||||
ompi_output(0, "ompi_mpi_init: could not join existing universe");
|
||||
}
|
||||
}
|
||||
|
||||
/* start the rest of the rte */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_init_stage2(&allow_multi_user_threads,
|
||||
@ -102,7 +111,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
if (NULL != ompi_process_info.name) { /* should NOT have been previously set */
|
||||
free(ompi_process_info.name);
|
||||
}
|
||||
ompi_process_info.name = ompi_rte_get_self();
|
||||
if (NULL == ompi_rte_get_self()) { /* no name set in environment - must be singleton */
|
||||
if (NULL == ompi_process_info.ns_replica) { /* couldn't join existing univ */
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0,0,0);
|
||||
} else { /* name server exists elsewhere - get a name for me */
|
||||
jobid = ompi_name_server.create_jobid();
|
||||
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||
}
|
||||
} else { /* name set in environment - record it */
|
||||
ompi_process_info.name = ompi_rte_get_self();
|
||||
}
|
||||
|
||||
/* setup my session directory */
|
||||
jobid_str = ompi_name_server.get_jobid_string(ompi_process_info.name);
|
||||
|
@ -41,6 +41,11 @@ int ompi_rte_register(void)
|
||||
void *addr;
|
||||
int rc,size;
|
||||
|
||||
/* protect against error */
|
||||
if (NULL == jobid) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* setup keys and segment for this job */
|
||||
sprintf(segment, "job-%s", jobid);
|
||||
keys[0] = ompi_name_server.get_proc_name_string(ompi_process_info.name);
|
||||
@ -80,7 +85,12 @@ int ompi_rte_unregister(void)
|
||||
char *jobid = ompi_name_server.get_jobid_string(ompi_process_info.name);
|
||||
char *keys[2];
|
||||
int rc;
|
||||
|
||||
|
||||
/* protect against error */
|
||||
if (NULL == jobid) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* setup keys and segment for this job */
|
||||
sprintf(segment, "job-%s", jobid);
|
||||
free(jobid);
|
||||
|
@ -155,6 +155,22 @@ int main(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
/***** SET MY NAME *****/
|
||||
jobid = ompi_name_server.create_jobid();
|
||||
vpid = ompi_name_server.reserve_range(jobid, 1);
|
||||
ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
|
||||
|
||||
fprintf(stderr, "my name: [%d,%d,%d]\n", ompi_process_info.name->cellid,
|
||||
ompi_process_info.name->jobid, ompi_process_info.name->vpid);
|
||||
|
||||
/*
|
||||
* Register my process info with my replica.
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_register())) {
|
||||
fprintf(stderr, "ompi_rte_init: failed in ompi_rte_register()\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* finalize the rte startup */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_init_finalstage(&allow_multi_user_threads,
|
||||
&have_hidden_threads))) {
|
||||
@ -162,21 +178,6 @@ int main(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
/***** SET MY NAME *****/
|
||||
/* jobid = ompi_name_server.create_jobid(); */
|
||||
/* vpid = ompi_name_server.reserve_range(jobid, 1); */
|
||||
/* ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid); */
|
||||
|
||||
/* fprintf(stderr, "my name: [%d,%d,%d]\n", ompi_process_info.name->cellid, */
|
||||
/* ompi_process_info.name->jobid, ompi_process_info.name->vpid); */
|
||||
|
||||
|
||||
/* /\* register the console callback function *\/ */
|
||||
/* ret = mca_oob_recv_packed_nb(MCA_OOB_NAME_ANY, MCA_OOB_TAG_DAEMON, 0, ompi_console_recv, NULL); */
|
||||
/* if(ret != OMPI_SUCCESS && ret != OMPI_ERR_NOT_IMPLEMENTED) { */
|
||||
/* printf("daemon callback not registered: error code %d", ret); */
|
||||
/* return ret; */
|
||||
/* } */
|
||||
|
||||
exit_cmd = false;
|
||||
while (!exit_cmd) {
|
||||
@ -221,10 +222,13 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "finalize rte\n");
|
||||
ompi_rte_finalize();
|
||||
fprintf(stderr, "close mca\n");
|
||||
mca_base_close();
|
||||
fprintf(stderr, "finalize ompi\n");
|
||||
ompi_finalize();
|
||||
return 0;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
|
@ -45,7 +45,7 @@ main(int argc, char *argv[])
|
||||
ompi_rte_node_schedule_t *sched;
|
||||
char cwd[MAXPATHLEN];
|
||||
char *my_contact_info, *tmp, *jobid_str, *procid_str;
|
||||
char *contact_file;
|
||||
char *contact_file, *filenm;
|
||||
|
||||
/*
|
||||
* Initialize our Open MPI environment
|
||||
@ -328,6 +328,15 @@ main(int argc, char *argv[])
|
||||
*/
|
||||
if (NULL != nodelist) ompi_rte_deallocate_resources(new_jobid, nodelist);
|
||||
if (NULL != cmd_line) OBJ_RELEASE(cmd_line);
|
||||
|
||||
/* eventually, mpirun won't be the seed and so won't have to do this.
|
||||
* for now, though, remove the universe-setup.txt file so the directories
|
||||
* can be cleaned up
|
||||
*/
|
||||
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
|
||||
unlink(filenm);
|
||||
|
||||
/* finalize the system */
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
|
@ -52,6 +52,7 @@ int main(int argc, char *argv[])
|
||||
bool allow_multi_user_threads = false;
|
||||
bool have_hidden_threads = false;
|
||||
char *jobid_str, *procid_str, *enviro_val, *contact_file;
|
||||
char *filenm;
|
||||
|
||||
/*
|
||||
* Initialize the Open MPI environment
|
||||
@ -239,12 +240,16 @@ int main(int argc, char *argv[])
|
||||
ompi_universe_info.seed_contact_info = mca_oob_get_contact_info();
|
||||
contact_file = ompi_os_path(false, ompi_process_info.universe_session_dir,
|
||||
"universe-setup.txt", NULL);
|
||||
ompi_output(0, "ompid: contact_file %s", contact_file);
|
||||
|
||||
if (OMPI_SUCCESS != (ret = ompi_write_universe_setup_file(contact_file))) {
|
||||
if (ompi_daemon_debug) {
|
||||
ompi_output(0, "[%d,%d,%d] ompid: couldn't write setup file", ompi_process_info.name->cellid,
|
||||
ompi_process_info.name->jobid, ompi_process_info.name->vpid);
|
||||
}
|
||||
} else if (ompi_daemon_debug) {
|
||||
ompi_output(0, "[%d,%d,%d] ompid: wrote setup file", ompi_process_info.name->cellid,
|
||||
ompi_process_info.name->jobid, ompi_process_info.name->vpid);
|
||||
}
|
||||
}
|
||||
|
||||
@ -293,7 +298,11 @@ int main(int argc, char *argv[])
|
||||
ompi_process_info.name->jobid, ompi_process_info.name->vpid);
|
||||
}
|
||||
|
||||
/* remove the universe-setup file */
|
||||
filenm = ompi_os_path(false, ompi_process_info.universe_session_dir, "universe-setup.txt", NULL);
|
||||
unlink(filenm);
|
||||
|
||||
/* finalize the system */
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
@ -325,10 +334,11 @@ static void ompi_daemon_recv(int status, ompi_process_name_t* sender,
|
||||
|
||||
if (OMPI_SUCCESS != ompi_buffer_init(&answer, 0)) {
|
||||
/* RHC -- not sure what to do if this fails */
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != ompi_unpack(buffer, &command, 1, OMPI_DAEMON_OOB_PACK_CMD)) {
|
||||
goto RETURN_ERROR;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/**** EXIT COMMAND ****/
|
||||
@ -355,8 +365,10 @@ static void ompi_daemon_recv(int status, ompi_process_name_t* sender,
|
||||
}
|
||||
|
||||
}
|
||||
CLEANUP:
|
||||
ompi_buffer_free(answer);
|
||||
|
||||
RETURN_ERROR:
|
||||
DONE:
|
||||
/* reissue the non-blocking receive */
|
||||
ret = mca_oob_recv_packed_nb(MCA_OOB_NAME_ANY, MCA_OOB_TAG_DAEMON, 0, ompi_daemon_recv, NULL);
|
||||
if(ret != OMPI_SUCCESS && ret != OMPI_ERR_NOT_IMPLEMENTED) {
|
||||
|
@ -347,7 +347,8 @@ ompi_dir_empty(char *pathname)
|
||||
if ((0 != strcmp(ep->d_name, ".")) &&
|
||||
(0 != strcmp(ep->d_name, "..")) &&
|
||||
(DT_DIR != ep->d_type) &&
|
||||
(0 != strncmp(ep->d_name, "output-", strlen("output-")))) {
|
||||
(0 != strncmp(ep->d_name, "output-", strlen("output-"))) &&
|
||||
(0 != strcmp(ep->d_name, "universe-setup.txt"))) {
|
||||
filenm = ompi_os_path(false, pathname, ep->d_name, NULL);
|
||||
unlink(filenm);
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user