diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 05c2b18ddb..0c801c05c3 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -59,7 +59,6 @@ int orte_ess_base_app_setup(void) { int ret; char *error = NULL; - char *jobid_str, *procid_str; /* Setup the communication infrastructure */ @@ -120,39 +119,20 @@ int orte_ess_base_app_setup(void) } /* setup my session directory */ - if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - error = "orte_convert_jobid_to_string"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) { - ORTE_ERROR_LOG(ret); - error = "orte_convert_vpid_to_string"; - goto error; - } - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, - "%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s", + "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? 
"UNDEF" : orte_process_info.tmpdir_base, - orte_process_info.nodename, jobid_str, procid_str)); + orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, - jobid_str, procid_str))) { - if (jobid_str != NULL) free(jobid_str); - if (procid_str != NULL) free(procid_str); + ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } - if (NULL != jobid_str) { - free(jobid_str); - } - if (NULL != procid_str) { - free(procid_str); - } /* Once the session directory location has been established, set the opal_output env file location to be in the diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index a7a6808307..4c249b3e5d 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -61,7 +61,6 @@ int orte_ess_base_orted_setup(void) { int ret; char *error = NULL; - char *jobid_str, *procid_str; /* some environments allow remote launches - e.g., ssh - so * open the PLM and select something @@ -136,39 +135,20 @@ int orte_ess_base_orted_setup(void) } /* setup my session directory */ - if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - error = "orte_convert_jobid_to_string"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) { - ORTE_ERROR_LOG(ret); - error = "orte_convert_vpid_to_string"; - goto error; - } - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, - "%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s", + "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? 
"UNDEF" : orte_process_info.tmpdir_base, - orte_process_info.nodename, jobid_str, procid_str)); + orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, - jobid_str, procid_str))) { - if (jobid_str != NULL) free(jobid_str); - if (procid_str != NULL) free(procid_str); + ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } - if (NULL != jobid_str) { - free(jobid_str); - } - if (NULL != procid_str) { - free(procid_str); - } /* setup the routed info - the selected routed component * will know what to do. diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index 0ecb432731..4ec77d1faf 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -99,8 +99,7 @@ int orte_ess_base_tool_setup(void) if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL, &orte_process_info.tmpdir_base, &orte_process_info.top_session_dir, - orte_process_info.nodename, NULL, - NULL, NULL))) { + orte_process_info.nodename, NULL, NULL))) { ORTE_ERROR_LOG(ret); error = "define session dir names"; goto error; diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index dfa604176c..50f68e13c6 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -91,7 +91,7 @@ static int rte_init(char flags) { int ret; char *error = NULL; - char *jobid_str, *procid_str, *contact_path; + char *contact_path; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; @@ -248,39 +248,20 @@ static int rte_init(char flags) #endif /* setup my session directory */ - if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - error = "orte_convert_jobid_to_string"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) { - 
ORTE_ERROR_LOG(ret); - error = "orte_convert_vpid_to_string"; - goto error; - } - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, - "%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s", + "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, - orte_process_info.nodename, jobid_str, procid_str)); + orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, - jobid_str, procid_str))) { - if (jobid_str != NULL) free(jobid_str); - if (procid_str != NULL) free(procid_str); + ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } - if (NULL != jobid_str) { - free(jobid_str); - } - if (NULL != procid_str) { - free(procid_str); - } /* Once the session directory location has been established, set the opal_output hnp file location to be in the diff --git a/orte/util/name_fns.h b/orte/util/name_fns.h index ecfd06e9c5..db80974796 100644 --- a/orte/util/name_fns.h +++ b/orte/util/name_fns.h @@ -61,6 +61,12 @@ ORTE_DECLSPEC char* orte_util_print_vpids(const orte_vpid_t vpid); (((n) >> 16) & 0x0000ffff) +/* a macro for extracting the local jobid from the jobid - i.e., + * the non-mpirun-specific id field of the jobid + */ +#define ORTE_LOCAL_JOBID(n) \ + ( (n) & 0x0000ffff) + /* a macro for identifying that a proc is a daemon */ #define ORTE_PROC_IS_DAEMON(n) \ !((n) & 0x0000ffff) diff --git a/orte/util/session_dir.c b/orte/util/session_dir.c index dca53d4f4a..8479aa898f 100644 --- a/orte/util/session_dir.c +++ b/orte/util/session_dir.c @@ -112,13 +112,16 @@ orte_session_dir_get_name(char **fulldirpath, char **return_frontend, char *hostid, char *batchid, - char *job, char *proc) { + orte_process_name_t *proc) { char *hostname = NULL, *batchname = NULL, *sessions = NULL, *user = NULL, *prefix = NULL, - *frontend = 
NULL; + *frontend = NULL, + *jobfam = NULL, + *job = NULL, + *vpidstr = NULL; bool prefix_provided = false; int exit_status = ORTE_SUCCESS; #ifndef __WINDOWS__ @@ -181,16 +184,6 @@ orte_session_dir_get_name(char **fulldirpath, else batchname = strdup("0"); - /* - * Check: Can't give a proc without a job - */ - if( NULL == job && - NULL != proc) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - exit_status = ORTE_ERR_BAD_PARAM; - goto cleanup; - } - /* * get the front part of the session directory * Will look something like: @@ -210,29 +203,65 @@ orte_session_dir_get_name(char **fulldirpath, /* * Construct the session directory */ - /* If we were given a 'proc' then we can construct it fully into: - * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOBID/PROC + /* If we were given a valid vpid then we can construct it fully into: + * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID */ if( NULL != proc) { - sessions = opal_os_path( false, frontend, job, proc, NULL ); - if( NULL == sessions ) { - ORTE_ERROR_LOG(ORTE_ERROR); - exit_status = ORTE_ERROR; - goto cleanup; + if (ORTE_VPID_INVALID != proc->vpid) { + + if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + exit_status = ORTE_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + exit_status = ORTE_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + exit_status = ORTE_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + sessions = opal_os_path( false, frontend, jobfam, job, vpidstr, NULL ); + if( NULL == sessions ) { + ORTE_ERROR_LOG(ORTE_ERROR); + exit_status = ORTE_ERROR; + goto cleanup; + } } - } - /* If we were given a 'job' then we can construct it partially into: - * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOBID - */ - else if(NULL != job) 
{ - sessions = opal_os_path( false, frontend, job, NULL ); - if( NULL == sessions ) { - ORTE_ERROR_LOG(ORTE_ERROR); - exit_status = ORTE_ERROR; - goto cleanup; + /* If we were given a valid jobid then we can construct it partially into: + * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID + */ + else if (ORTE_JOBID_INVALID != proc->jobid) { + if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + exit_status = ORTE_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + exit_status = ORTE_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + sessions = opal_os_path( false, frontend, jobfam, job, NULL ); + if( NULL == sessions ) { + ORTE_ERROR_LOG(ORTE_ERROR); + exit_status = ORTE_ERROR; + goto cleanup; + } + } /* if both are invalid */ + else { + sessions = strdup(frontend); /* must dup this to avoid double-free later */ } - } - /* If we were not given either then we just set it to frontend + + } /* If we were not given a proc at all, then we just set it to frontend */ else { sessions = strdup(frontend); /* must dup this to avoid double-free later */ @@ -314,6 +343,9 @@ orte_session_dir_get_name(char **fulldirpath, free(user); if (NULL != prefix) free(prefix); if (NULL != frontend) free(frontend); + if (NULL != jobfam) free(jobfam); + if (NULL != job) free(job); + if (NULL != vpidstr) free(vpidstr); return exit_status; } @@ -323,7 +355,7 @@ orte_session_dir_get_name(char **fulldirpath, */ int orte_session_dir(bool create, char *prefix, char *hostid, - char *batchid, char *job, char *proc) + char *batchid, orte_process_name_t *proc) { char *fulldirpath = NULL, *frontend = NULL, @@ -355,8 +387,7 @@ int orte_session_dir(bool create, &prefix, &frontend, hostid, - batchid, job, - proc) ) ) { + batchid, proc) ) ) { if (ORTE_ERR_FATAL == rtn) { /* this indicates we definitely need to abort, so * don't try the NULL 
prefix @@ -450,7 +481,7 @@ int orte_session_dir(bool create, /* * Set the process session directory */ - if (NULL != proc) { + if (NULL != proc && ORTE_VPID_INVALID != proc->vpid) { /* proc may be NULL: orte_session_dir_get_name accepts it */ if (create) { /* overwrite if creating */ if (NULL != orte_process_info.proc_session_dir) { free(orte_process_info.proc_session_dir); @@ -471,7 +502,7 @@ int orte_session_dir(bool create, /* * Set the job session directory */ - if (NULL != job) { + if (NULL != proc && ORTE_JOBID_INVALID != proc->jobid) { /* same NULL guard as for the vpid above */ if (create) { /* overwrite if creating */ if (NULL != orte_process_info.job_session_dir) { free(orte_process_info.job_session_dir); @@ -481,12 +512,6 @@ int orte_session_dir(bool create, if (NULL == orte_process_info.job_session_dir) { orte_process_info.job_session_dir = strdup(fulldirpath); } - - /* Strip off last part of directory structure */ - sav = opal_dirname(fulldirpath); - free(fulldirpath); - fulldirpath = sav; - sav = NULL; } if (orte_debug_flag) { @@ -519,41 +544,51 @@ orte_session_dir_cleanup(orte_jobid_t jobid) { int rc; char *tmp; - char *job=NULL, *job_session_dir=NULL; + char *jobfam=NULL, *job=NULL, *job_session_dir=NULL; /* need to setup the top_session_dir with the prefix */ tmp = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, NULL); + + /* we can only blow away session directories for our job family */ + if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto CLEANUP; + } if (ORTE_JOBID_WILDCARD != jobid) { - /* define the proc and job session directories for this process */ - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, jobid))) { - ORTE_ERROR_LOG(rc); - free(tmp); - return rc; + + if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(jobid))) { /* dir is named by the local jobid, as in orte_session_dir_get_name */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto CLEANUP; } - job_session_dir = opal_os_path( false, orte_process_info.top_session_dir, - job, NULL ); + + job_session_dir =
opal_os_path(false, tmp, jobfam, job, NULL ); if( NULL == job_session_dir ) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - free(tmp); - free(job); - return ORTE_ERR_OUT_OF_RESOURCE; + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto CLEANUP; } opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file); } else { /* if we want the session_dir removed for ALL jobids, then - * just recursively blow the whole session away, saving only - * output files + * just recursively blow the whole session away for our job family, + * saving only output files */ - opal_os_dirpath_destroy(tmp, true, orte_dir_check_file_output); + job_session_dir = opal_os_path(false, tmp, jobfam, NULL); + opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file_output); } - opal_os_dirpath_destroy(tmp, - false, orte_dir_check_file); + /* now attempt to eliminate the top level directory itself - this + * will fail if anything is present, but ensures we cleanup if + * we are the last one out + */ + opal_os_dirpath_destroy(tmp, false, orte_dir_check_file); if (NULL != job_session_dir && opal_os_dirpath_is_empty(job_session_dir)) { if (orte_debug_flag) { @@ -581,6 +616,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid) CLEANUP: free(tmp); if (NULL != job) free(job); + if (NULL != jobfam) free(jobfam); if (NULL != job_session_dir) free(job_session_dir); return ORTE_SUCCESS; } diff --git a/orte/util/session_dir.h b/orte/util/session_dir.h index 21f32d8677..627069857e 100644 --- a/orte/util/session_dir.h +++ b/orte/util/session_dir.h @@ -112,13 +112,8 @@ BEGIN_C_DECLS * @param batchid Batch job name, used in batch scheduling * systems. NULL indicates that the default of "0" is * to be used. - * @param job String version of the jobid for which a session - * directory is to be created/found. NULL indicates - * that only the universe directory is to be - * created/found. - * @param vpid String version of the vpid for which a session - * directory is to be created/found. 
NULL indicates - * that only the job directory is to be created/found. + * @param proc Pointer to the process name whose session dir is desired; + * an invalid vpid stops at the job-level directory * * @retval ORTE_SUCCESS The directory was found and/or created with * the proper permissions. @@ -126,7 +121,7 @@ BEGIN_C_DECLS * "false") or created (if create is "true"). */ ORTE_DECLSPEC int orte_session_dir(bool create, char *prefix, char *hostid, - char *batchid, char *job, char *vpid); + char *batchid, orte_process_name_t *proc); /* * Construct the session directory name from the input parameters. @@ -137,7 +132,7 @@ ORTE_DECLSPEC int orte_session_dir_get_name(char **fulldirpath, char **frontend, char *hostid, char *batchid, - char *job, char *proc); + orte_process_name_t *proc); /** The orte_session_dir_finalize() function performs a cleanup of the * session directory tree. It first removes the session directory for