1
1

bproc cleanup from release branch

This commit was SVN r9054.
Этот коммит содержится в:
Tim Woodall 2006-02-16 00:16:22 +00:00
родитель 650dc99387
Коммит fc751171cd
4 изменённых файлов: 217 добавлений и 19 удалений

Просмотреть файл

@ -387,6 +387,7 @@ static void orte_pls_bproc_setup_env(char *** env)
{ {
char ** merged; char ** merged;
char * var; char * var;
char * param;
int rc; int rc;
int num_env; int num_env;
@ -433,6 +434,14 @@ static void orte_pls_bproc_setup_env(char *** env)
opal_setenv(var,orte_process_info.gpr_replica_uri, true, env); opal_setenv(var,orte_process_info.gpr_replica_uri, true, env);
free(var); free(var);
/* universe directory - needs to match orted */
var = mca_base_param_environ_variable("universe", NULL, NULL);
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host, orte_universe_info.name);
opal_setenv(var, param, true, env);
free(param);
free(var);
/* merge in environment */ /* merge in environment */
merged = opal_environ_merge(*env, environ); merged = opal_environ_merge(*env, environ);
opal_argv_free(*env); opal_argv_free(*env);
@ -572,12 +581,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
} }
/* launch the daemons */ /* launch the daemons */
mca_pls_bproc_component.num_daemons = num_daemons; mca_pls_bproc_component.num_daemons += num_daemons;
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp); rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
if(rc != num_daemons) { if(rc != num_daemons) {
opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true, opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
num_daemons, rc, orted_path); num_daemons, rc, orted_path);
mca_pls_bproc_component.num_daemons = 0; mca_pls_bproc_component.num_daemons -= num_daemons;
rc = ORTE_ERROR; rc = ORTE_ERROR;
goto cleanup; goto cleanup;
} }
@ -672,7 +681,7 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid,
goto cleanup; goto cleanup;
} }
/* set out app context */ /* set up app context */
asprintf(&param, "%d", app_context); asprintf(&param, "%d", app_context);
var = mca_base_param_environ_variable("pls", "bproc", "app_context"); var = mca_base_param_environ_variable("pls", "bproc", "app_context");
opal_setenv(var, param, true, &map->app->env); opal_setenv(var, param, true, &map->app->env);
@ -937,17 +946,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) {
} }
if(NULL != pids) if(NULL != pids)
free(pids); free(pids);
/* kill daemons */ /* dont kill daemons - allow them to do cleanup when they see the job aborts */
if(ORTE_SUCCESS != (rc = orte_pls_base_get_node_pids(jobid, &pids, &num_pids)))
return rc;
for(i=0; i<num_pids; i++) {
if(mca_pls_bproc_component.debug) {
opal_output(0, "orte_pls_bproc: killing daemon: %d\n", pids[i]);
}
kill(pids[i], mca_pls_bproc_component.terminate_sig);
}
if(NULL != pids)
free(pids);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -977,7 +976,14 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
/** /**
* Module cleanup * Module cleanup
*/ */
int orte_pls_bproc_finalize(void) { int orte_pls_bproc_finalize(void)
{
/* wait for all daemons */
OPAL_THREAD_LOCK(&mca_pls_bproc_component.lock);
while(mca_pls_bproc_component.num_daemons || mca_pls_bproc_component.num_procs) {
opal_condition_wait(&mca_pls_bproc_component.condition, &mca_pls_bproc_component.lock);
}
OPAL_THREAD_UNLOCK(&mca_pls_bproc_component.lock);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -286,8 +286,10 @@ int main(int argc, char *argv[])
} }
/* Set signal handlers to catch kill signals so we can properly clean up /* Set signal handlers to catch kill signals so we can properly clean up
* after ourselves * after ourselves. Set new process group so that we don't receive signals
* from controlling terminal/parent.
*/ */
setpgid(0,0);
opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL,
signal_callback, NULL); signal_callback, NULL);
opal_event_add(&term_handler, NULL); opal_event_add(&term_handler, NULL);
@ -351,7 +353,7 @@ int main(int argc, char *argv[])
opal_setenv(var, "1", true, &environ); opal_setenv(var, "1", true, &environ);
} }
/* Setup callback on jobid */ /* setup callback on jobid */
ret = orte_rmgr_base_proc_stage_gate_subscribe(orted_globals.bootproxy, job_state_callback, NULL, ORTE_PROC_STATE_TERMINATION); ret = orte_rmgr_base_proc_stage_gate_subscribe(orted_globals.bootproxy, job_state_callback, NULL, ORTE_PROC_STATE_TERMINATION);
if(ORTE_SUCCESS != ret) { if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
@ -370,11 +372,13 @@ int main(int argc, char *argv[])
} }
OPAL_THREAD_UNLOCK(&orted_globals.mutex); OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* cleanup session directory */
orte_session_dir_cleanup(orted_globals.bootproxy);
/* Finalize and clean up */ /* Finalize and clean up */
if (ORTE_SUCCESS != (ret = orte_finalize())) { if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
} }
exit(ret); exit(ret);
} }
@ -438,7 +442,6 @@ int main(int argc, char *argv[])
static void signal_callback(int fd, short flags, void *arg) static void signal_callback(int fd, short flags, void *arg)
{ {
OPAL_TRACE(1); OPAL_TRACE(1);
orted_globals.exit_condition = true; orted_globals.exit_condition = true;
opal_condition_signal(&orted_globals.condition); opal_condition_signal(&orted_globals.condition);
} }
@ -609,3 +612,4 @@ void job_state_callback(orte_gpr_notify_data_t *data, void *cbdata)
return; return;
} }

Просмотреть файл

@ -61,6 +61,7 @@
static int orte_check_dir(bool create, char *directory); static int orte_check_dir(bool create, char *directory);
static void orte_dir_empty(char *pathname); static void orte_dir_empty(char *pathname);
static void orte_dir_empty_all(char *pathname);
static bool orte_is_empty(char *pathname); static bool orte_is_empty(char *pathname);
@ -357,6 +358,83 @@ int orte_session_dir(bool create, char *prfx, char *usr, char *hostid,
return return_code; return return_code;
} }
/*
* A job has aborted - so force cleanup.
*/
int
orte_session_dir_cleanup(orte_jobid_t jobid)
{
int rc;
char *tmp;
char *job, *job_session_dir;
/* need to setup the top_session_dir with the prefix */
tmp = opal_os_path(false,
orte_process_info.tmpdir_base,
orte_process_info.top_session_dir, NULL);
/* define the proc and job session directories for this process */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&job, jobid))) {
ORTE_ERROR_LOG(rc);
free(tmp);
return rc;
}
if (0 > asprintf(&job_session_dir, "%s%s%s",
orte_process_info.universe_session_dir,
orte_system_info.path_sep, job)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(tmp);
free(job);
return ORTE_ERR_OUT_OF_RESOURCE;
}
orte_dir_empty_all(job_session_dir);
orte_dir_empty(orte_process_info.universe_session_dir);
orte_dir_empty(tmp);
if (orte_is_empty(job_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
}
rmdir(job_session_dir);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
}
goto CLEANUP;
}
if (orte_is_empty(orte_process_info.universe_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found univ session dir empty - deleting");
}
rmdir(orte_process_info.universe_session_dir);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: univ session dir not empty - leaving");
}
goto CLEANUP;
}
if (orte_is_empty(tmp)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
}
rmdir(tmp);
} else {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
}
}
CLEANUP:
free(tmp);
free(job);
free(job_session_dir);
return ORTE_SUCCESS;
}
int int
orte_session_dir_finalize(orte_process_name_t *proc) orte_session_dir_finalize(orte_process_name_t *proc)
@ -554,6 +632,105 @@ orte_dir_empty(char *pathname)
#endif #endif
} }
/*
 * Report whether a directory entry must be left alone by
 * orte_dir_empty_all(): the "." and ".." links, entries whose name
 * starts with "output-", and the universe contact file
 * (universe-setup.txt).
 */
static bool
orte_dir_entry_is_protected(const char *name)
{
    return (0 == strcmp(name, ".")) ||
           (0 == strcmp(name, "..")) ||
           (0 == strncmp(name, "output-", strlen("output-"))) ||
           (0 == strcmp(name, "universe-setup.txt"));
}

/*
 * Recursively empty a directory.
 *
 * Every entry of pathname that is not protected (see
 * orte_dir_entry_is_protected()) is removed: sub-directories are
 * emptied recursively and then removed, everything else is unlinked.
 * pathname itself is not removed.  Failures of the individual remove
 * calls are ignored - this is a best-effort cleanup.
 *
 * pathname - directory to empty; NULL or an unreadable directory is
 *            silently ignored
 */
static void
orte_dir_empty_all(char *pathname)
{
#ifndef WIN32
    DIR *dp;
    struct dirent *ep;
    char *filenm;
    struct stat buf;
    bool is_dir;

    if (NULL == pathname) {  /* protect against error */
        return;
    }

    dp = opendir(pathname);
    if (NULL == dp) {
        return;
    }

    while (NULL != (ep = readdir(dp))) {
        if (orte_dir_entry_is_protected(ep->d_name)) {
            continue;
        }

        filenm = opal_os_path(false, pathname, ep->d_name, NULL);
        if (NULL == filenm) {
            /* NOTE(review): assumes opal_os_path() reports failure with NULL */
            continue;
        }

        /* Is it a directory?  d_type is only a hint: a filesystem may
         * report DT_UNKNOWN, in which case stat() has to decide.  A
         * failing stat() (e.g. a dangling link) is treated as "not a
         * directory" so that the entry is still unlinked.
         * NOTE(review): stat() follows symbolic links - confirm that
         * links to directories cannot appear in a session directory. */
#ifdef HAVE_STRUCT_DIRENT_D_TYPE
        if (DT_UNKNOWN != ep->d_type) {
            is_dir = (DT_DIR == ep->d_type);
        } else
#endif
        {
            is_dir = (0 == stat(filenm, &buf)) && S_ISDIR(buf.st_mode);
        }

        if (is_dir) {
            orte_dir_empty_all(filenm);
            rmdir(filenm);
        } else {
            unlink(filenm);
        }
        free(filenm);
    }
    closedir(dp);
#else
    char search_path[MAX_PATH];
    HANDLE file;
    WIN32_FIND_DATA file_data;
    char *file_name;

    if (NULL == pathname) {
        return;
    }

    /* build "<pathname>\*", refusing paths that do not fit the buffer
     * (sizeof("\\*") already counts the terminating NUL) */
    if (strlen(pathname) + sizeof("\\*") > sizeof(search_path)) {
        return;
    }
    strcpy(search_path, pathname);
    strcat(search_path, "\\*");

    file = FindFirstFile(search_path, &file_data);
    if (INVALID_HANDLE_VALUE == file) {
        /* nothing was opened, so there is no handle to close */
        return;
    }

    do {
        /* "continue" in a do/while jumps to the loop condition, so the
         * search still advances to the next entry */
        if (orte_dir_entry_is_protected(file_data.cFileName)) {
            continue;
        }

        file_name = opal_os_path(false, pathname, file_data.cFileName, NULL);
        if (NULL == file_name) {
            continue;
        }

        if (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
            orte_dir_empty_all(file_name);
            RemoveDirectory(file_name);
        } else {
            DeleteFile(file_name);
        }
        free(file_name);
    } while (0 != FindNextFile(file, &file_data));

    /* FindClose() takes the search handle, not the find-data buffer */
    FindClose(file);
#endif
}
/* tests if the directory is empty */ /* tests if the directory is empty */
static bool orte_is_empty(char *pathname) static bool orte_is_empty(char *pathname)
{ {

Просмотреть файл

@ -146,3 +146,14 @@ OMPI_DECLSPEC int orte_session_dir(bool create, char *prefix, char *user, char *
* properly cleaned up. * properly cleaned up.
*/ */
OMPI_DECLSPEC int orte_session_dir_finalize(orte_process_name_t *proc); OMPI_DECLSPEC int orte_session_dir_finalize(orte_process_name_t *proc);
/** The orte_session_dir_cleanup() function performs a cleanup of the
 * session directory tree when a job is aborted. It recursively empties
 * the session directory of the given job and then backs up the tree,
 * removing the job, universe and top-level session directories in that
 * order and stopping at the first one that is not empty.
 *
 * @param jobid The job whose session directory is to be cleaned up.
 * @retval ORTE_SUCCESS The cleanup pass was carried out. A directory
 *                      that is left in place because it is not empty
 *                      is not reported as an error.
 * @retval ORTE_ERR_OUT_OF_RESOURCE The path of the job session
 *                      directory could not be built.
 * @retval other        The error code returned when the jobid cannot
 *                      be converted to a string.
 */
OMPI_DECLSPEC int orte_session_dir_cleanup(orte_jobid_t jobid);