diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c
index 2ef562efc7..67d73cfff8 100644
--- a/orte/mca/pls/bproc/pls_bproc.c
+++ b/orte/mca/pls/bproc/pls_bproc.c
@@ -387,6 +387,7 @@ static void orte_pls_bproc_setup_env(char *** env)
 {
     char ** merged;
     char * var;
+    char * param;
     int rc;
     int num_env;
 
@@ -433,6 +434,14 @@ static void orte_pls_bproc_setup_env(char *** env)
     opal_setenv(var,orte_process_info.gpr_replica_uri, true, env);
     free(var);
 
+    /* universe directory - needs to match orted */
+    var = mca_base_param_environ_variable("universe", NULL, NULL);
+    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
+                orte_universe_info.host, orte_universe_info.name);
+    opal_setenv(var, param, true, env);
+    free(param);
+    free(var);
+
     /* merge in environment */
     merged = opal_environ_merge(*env, environ);
     opal_argv_free(*env);
@@ -572,12 +581,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
     }
 
     /* launch the daemons */
-    mca_pls_bproc_component.num_daemons = num_daemons;
+    mca_pls_bproc_component.num_daemons += num_daemons;
     rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
     if(rc != num_daemons) {
         opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
                        num_daemons, rc, orted_path);
-        mca_pls_bproc_component.num_daemons = 0;
+        mca_pls_bproc_component.num_daemons -= num_daemons;
         rc = ORTE_ERROR;
         goto cleanup;
     }
@@ -672,7 +681,7 @@ static int orte_pls_bproc_launch_app(orte_cellid_t cellid, orte_jobid_t jobid,
         goto cleanup;
     }
 
-    /* set out app context */
+    /* set up app context */
     asprintf(&param, "%d", app_context);
     var = mca_base_param_environ_variable("pls", "bproc", "app_context");
     opal_setenv(var, param, true, &map->app->env);
@@ -937,17 +946,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) {
     }
     if(NULL != pids) free(pids);
 
-    /* kill daemons */
-    if(ORTE_SUCCESS != (rc = orte_pls_base_get_node_pids(jobid, &pids, &num_pids)))
-        return rc;
-    for(i=0; i asprintf(&job_session_dir, "%s%s%s",
+                        orte_process_info.universe_session_dir,
+                        orte_system_info.path_sep, job)) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        free(tmp);
+        free(job);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    orte_dir_empty_all(job_session_dir);
+    orte_dir_empty(orte_process_info.universe_session_dir);
+    orte_dir_empty(tmp);
+
+    if (orte_is_empty(job_session_dir)) {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
+        }
+        rmdir(job_session_dir);
+    } else {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
+        }
+        goto CLEANUP;
+    }
+
+    if (orte_is_empty(orte_process_info.universe_session_dir)) {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: found univ session dir empty - deleting");
+        }
+        rmdir(orte_process_info.universe_session_dir);
+    } else {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: univ session dir not empty - leaving");
+        }
+        goto CLEANUP;
+    }
+
+    if (orte_is_empty(tmp)) {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
+        }
+        rmdir(tmp);
+    } else {
+        if (orte_debug_flag) {
+            opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
+        }
+    }
+
+CLEANUP:
+    free(tmp);
+    free(job);
+    free(job_session_dir);
+    return ORTE_SUCCESS;
+}
+
 int
 orte_session_dir_finalize(orte_process_name_t *proc)
@@ -554,6 +632,126 @@ orte_dir_empty(char *pathname)
 #endif
 }
 
+
+/*
+ * Recursively empty a session directory.  Every entry below pathname is
+ * removed (sub-directories are emptied first and then removed), except
+ * entries named "output-*" and the universe contact file
+ * (universe-setup.txt), which are left in place.  pathname itself is
+ * not removed.  Cleanup is best effort: failures to open, unlink or
+ * rmdir an entry are ignored.
+ */
+static void
+orte_dir_empty_all(char *pathname)
+{
+#ifndef WIN32
+    DIR *dp;
+    struct dirent *ep;
+    char *filenm;
+    struct stat buf;
+    bool is_dir;
+
+    if (NULL == pathname) {  /* protect against error */
+        return;
+    }
+
+    dp = opendir(pathname);
+    if (NULL == dp) {
+        return;
+    }
+
+    while (NULL != (ep = readdir(dp))) {
+        /* leave alone:
+         *  - . and ..
+         *  - entries starting with "output-"
+         *  - universe contact (universe-setup.txt)
+         */
+        if ((0 == strcmp(ep->d_name, ".")) ||
+            (0 == strcmp(ep->d_name, "..")) ||
+            (0 == strncmp(ep->d_name, "output-", strlen("output-"))) ||
+            (0 == strcmp(ep->d_name, "universe-setup.txt"))) {
+            continue;
+        }
+
+        filenm = opal_os_path(false, pathname, ep->d_name, NULL);
+        if (NULL == filenm) {  /* could not build the path - skip entry */
+            continue;
+        }
+
+        /* is it a directory?  lstat (not stat), so that a symlink is
+         * unlinked rather than followed out of the session tree */
+#ifdef HAVE_STRUCT_DIRENT_D_TYPE
+        if (DT_UNKNOWN != ep->d_type) {
+            is_dir = (DT_DIR == ep->d_type);
+        } else
+#endif /* have dirent.d_type */
+        {
+            /* d_type is unavailable or was not filled in */
+            is_dir = (0 == lstat(filenm, &buf)) && S_ISDIR(buf.st_mode);
+        }
+
+        if (is_dir) {
+            orte_dir_empty_all(filenm);
+            rmdir(filenm);
+        } else {
+            unlink(filenm);
+        }
+        free(filenm);
+    }
+    closedir(dp);
+#else
+    HANDLE file;
+    WIN32_FIND_DATA file_data;
+    char search_path[MAX_PATH];
+    char *file_name;
+    size_t len;
+
+    if (NULL == pathname) {
+        return;
+    }
+
+    /* build "<pathname>\*", refusing a path that would overflow */
+    len = strlen(pathname);
+    if (len + sizeof("\\*") > sizeof(search_path)) {
+        return;
+    }
+    memcpy(search_path, pathname, len);
+    memcpy(search_path + len, "\\*", sizeof("\\*"));
+
+    file = FindFirstFile(search_path, &file_data);
+    if (INVALID_HANDLE_VALUE == file) {
+        return;  /* no search handle was opened, nothing to close */
+    }
+
+    do {
+        /* same exclusions as the non-Windows branch */
+        if ((0 == strcmp(file_data.cFileName, ".")) ||
+            (0 == strcmp(file_data.cFileName, "..")) ||
+            (0 == strncmp(file_data.cFileName, "output-", strlen("output-"))) ||
+            (0 == strcmp(file_data.cFileName, "universe-setup.txt"))) {
+            continue;
+        }
+
+        file_name = opal_os_path(false, pathname, file_data.cFileName, NULL);
+        if (NULL == file_name) {
+            continue;
+        }
+
+        if (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+            orte_dir_empty_all(file_name);
+            RemoveDirectory(file_name);
+        } else {
+            DeleteFile(file_name);
+        }
+        free(file_name);
+    } while (0 != FindNextFile(file, &file_data));
+
+    /* FindClose takes the search handle, not the find-data buffer */
+    FindClose(file);
+#endif
+}
+
+
 /* tests if the directory is empty */
 static bool
 orte_is_empty(char *pathname) {
diff --git a/orte/util/session_dir.h b/orte/util/session_dir.h
index 2aad74d4e0..cb10765ada 100644
--- a/orte/util/session_dir.h
+++ b/orte/util/session_dir.h
@@ -146,3 +146,14 
@@ OMPI_DECLSPEC int orte_session_dir(bool create, char *prefix, char *user, char *
  *                properly cleaned up.
  */
 OMPI_DECLSPEC int orte_session_dir_finalize(orte_process_name_t *proc);
+
+/** The orte_session_dir_cleanup() function cleans up the session
+ * directory tree when a job is aborted. It empties the job's session
+ * directory, then walks back up the tree removing each empty level.
+ *
+ * @param jobid The job to clean up after.
+ * @retval ORTE_SUCCESS The cleanup ran; non-empty directories are kept.
+ * @retval ORTE_ERR_OUT_OF_RESOURCE A directory path could not be
+ *                allocated (the implementation may return other errors).
+ */
+OMPI_DECLSPEC int orte_session_dir_cleanup(orte_jobid_t jobid);