This fixes ticket #1426 - mpirun is cleaning up ALL session dirs
Mpirun - and the orteds - were doing their best to whack all session dirs on their nodes just in case there was something lingering due to an abnormal termination. Unfortunately, they were -too- good at it: they were whacking all session directories under the user's name, even those belonging to other mpiruns! This adds another layer to the session dir tree so that we can denote which jobs come from our own job family, and restricts the cleanup operation to only session dirs from within our own job family. So we'll still clean up anything left behind by our own mpirun, but won't whack the session dirs of any other mpirun from this user. Call it being polite... This commit was SVN r19083.
Этот коммит содержится в:
родитель
49d9f614d0
Коммит
01a7259a7d
@ -59,7 +59,6 @@ int orte_ess_base_app_setup(void)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
char *jobid_str, *procid_str;
|
|
||||||
|
|
||||||
/* Setup the communication infrastructure */
|
/* Setup the communication infrastructure */
|
||||||
|
|
||||||
@ -120,39 +119,20 @@ int orte_ess_base_app_setup(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_jobid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_vpid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||||
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s",
|
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, jobid_str, procid_str));
|
orte_process_info.nodename));
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||||
orte_process_info.tmpdir_base,
|
orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, NULL,
|
orte_process_info.nodename, NULL,
|
||||||
jobid_str, procid_str))) {
|
ORTE_PROC_MY_NAME))) {
|
||||||
if (jobid_str != NULL) free(jobid_str);
|
|
||||||
if (procid_str != NULL) free(procid_str);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_session_dir";
|
error = "orte_session_dir";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
if (NULL != jobid_str) {
|
|
||||||
free(jobid_str);
|
|
||||||
}
|
|
||||||
if (NULL != procid_str) {
|
|
||||||
free(procid_str);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Once the session directory location has been established, set
|
/* Once the session directory location has been established, set
|
||||||
the opal_output env file location to be in the
|
the opal_output env file location to be in the
|
||||||
|
@ -61,7 +61,6 @@ int orte_ess_base_orted_setup(void)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
char *jobid_str, *procid_str;
|
|
||||||
|
|
||||||
/* some environments allow remote launches - e.g., ssh - so
|
/* some environments allow remote launches - e.g., ssh - so
|
||||||
* open the PLM and select something
|
* open the PLM and select something
|
||||||
@ -136,39 +135,20 @@ int orte_ess_base_orted_setup(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_jobid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_vpid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||||
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s",
|
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, jobid_str, procid_str));
|
orte_process_info.nodename));
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||||
orte_process_info.tmpdir_base,
|
orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, NULL,
|
orte_process_info.nodename, NULL,
|
||||||
jobid_str, procid_str))) {
|
ORTE_PROC_MY_NAME))) {
|
||||||
if (jobid_str != NULL) free(jobid_str);
|
|
||||||
if (procid_str != NULL) free(procid_str);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_session_dir";
|
error = "orte_session_dir";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
if (NULL != jobid_str) {
|
|
||||||
free(jobid_str);
|
|
||||||
}
|
|
||||||
if (NULL != procid_str) {
|
|
||||||
free(procid_str);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* setup the routed info - the selected routed component
|
/* setup the routed info - the selected routed component
|
||||||
* will know what to do.
|
* will know what to do.
|
||||||
|
@ -99,8 +99,7 @@ int orte_ess_base_tool_setup(void)
|
|||||||
if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL,
|
if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL,
|
||||||
&orte_process_info.tmpdir_base,
|
&orte_process_info.tmpdir_base,
|
||||||
&orte_process_info.top_session_dir,
|
&orte_process_info.top_session_dir,
|
||||||
orte_process_info.nodename, NULL,
|
orte_process_info.nodename, NULL, NULL))) {
|
||||||
NULL, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "define session dir names";
|
error = "define session dir names";
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -91,7 +91,7 @@ static int rte_init(char flags)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
char *jobid_str, *procid_str, *contact_path;
|
char *contact_path;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
orte_proc_t *proc;
|
orte_proc_t *proc;
|
||||||
@ -248,39 +248,20 @@ static int rte_init(char flags)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* setup my session directory */
|
/* setup my session directory */
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobid_str, ORTE_PROC_MY_NAME->jobid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_jobid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_util_convert_vpid_to_string(&procid_str, ORTE_PROC_MY_NAME->vpid))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
error = "orte_convert_vpid_to_string";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||||
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s\n\tjobid %s\n\tprocid %s",
|
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, jobid_str, procid_str));
|
orte_process_info.nodename));
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||||
orte_process_info.tmpdir_base,
|
orte_process_info.tmpdir_base,
|
||||||
orte_process_info.nodename, NULL,
|
orte_process_info.nodename, NULL,
|
||||||
jobid_str, procid_str))) {
|
ORTE_PROC_MY_NAME))) {
|
||||||
if (jobid_str != NULL) free(jobid_str);
|
|
||||||
if (procid_str != NULL) free(procid_str);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_session_dir";
|
error = "orte_session_dir";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
if (NULL != jobid_str) {
|
|
||||||
free(jobid_str);
|
|
||||||
}
|
|
||||||
if (NULL != procid_str) {
|
|
||||||
free(procid_str);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Once the session directory location has been established, set
|
/* Once the session directory location has been established, set
|
||||||
the opal_output hnp file location to be in the
|
the opal_output hnp file location to be in the
|
||||||
|
@ -61,6 +61,12 @@ ORTE_DECLSPEC char* orte_util_print_vpids(const orte_vpid_t vpid);
|
|||||||
(((n) >> 16) & 0x0000ffff)
|
(((n) >> 16) & 0x0000ffff)
|
||||||
|
|
||||||
|
|
||||||
|
/* a macro for extracting the local jobid from the jobid - i.e.,
|
||||||
|
* the non-mpirun-specific id field of the jobid
|
||||||
|
*/
|
||||||
|
#define ORTE_LOCAL_JOBID(n) \
|
||||||
|
( (n) & 0x0000ffff)
|
||||||
|
|
||||||
/* a macro for identifying that a proc is a daemon */
|
/* a macro for identifying that a proc is a daemon */
|
||||||
#define ORTE_PROC_IS_DAEMON(n) \
|
#define ORTE_PROC_IS_DAEMON(n) \
|
||||||
!((n) & 0x0000ffff)
|
!((n) & 0x0000ffff)
|
||||||
|
@ -112,13 +112,16 @@ orte_session_dir_get_name(char **fulldirpath,
|
|||||||
char **return_frontend,
|
char **return_frontend,
|
||||||
char *hostid,
|
char *hostid,
|
||||||
char *batchid,
|
char *batchid,
|
||||||
char *job, char *proc) {
|
orte_process_name_t *proc) {
|
||||||
char *hostname = NULL,
|
char *hostname = NULL,
|
||||||
*batchname = NULL,
|
*batchname = NULL,
|
||||||
*sessions = NULL,
|
*sessions = NULL,
|
||||||
*user = NULL,
|
*user = NULL,
|
||||||
*prefix = NULL,
|
*prefix = NULL,
|
||||||
*frontend = NULL;
|
*frontend = NULL,
|
||||||
|
*jobfam = NULL,
|
||||||
|
*job = NULL,
|
||||||
|
*vpidstr = NULL;
|
||||||
bool prefix_provided = false;
|
bool prefix_provided = false;
|
||||||
int exit_status = ORTE_SUCCESS;
|
int exit_status = ORTE_SUCCESS;
|
||||||
#ifndef __WINDOWS__
|
#ifndef __WINDOWS__
|
||||||
@ -181,16 +184,6 @@ orte_session_dir_get_name(char **fulldirpath,
|
|||||||
else
|
else
|
||||||
batchname = strdup("0");
|
batchname = strdup("0");
|
||||||
|
|
||||||
/*
|
|
||||||
* Check: Can't give a proc without a job
|
|
||||||
*/
|
|
||||||
if( NULL == job &&
|
|
||||||
NULL != proc) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
||||||
exit_status = ORTE_ERR_BAD_PARAM;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* get the front part of the session directory
|
* get the front part of the session directory
|
||||||
* Will look something like:
|
* Will look something like:
|
||||||
@ -210,29 +203,65 @@ orte_session_dir_get_name(char **fulldirpath,
|
|||||||
/*
|
/*
|
||||||
* Construct the session directory
|
* Construct the session directory
|
||||||
*/
|
*/
|
||||||
/* If we were given a 'proc' then we can construct it fully into:
|
/* If we were given a valid vpid then we can construct it fully into:
|
||||||
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOBID/PROC
|
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
|
||||||
*/
|
*/
|
||||||
if( NULL != proc) {
|
if( NULL != proc) {
|
||||||
sessions = opal_os_path( false, frontend, job, proc, NULL );
|
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||||
if( NULL == sessions ) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||||
exit_status = ORTE_ERROR;
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
goto cleanup;
|
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
sessions = opal_os_path( false, frontend, jobfam, job, vpidstr, NULL );
|
||||||
|
if( NULL == sessions ) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
|
exit_status = ORTE_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
/* If we were given a valid jobid then we can construct it partially into:
|
||||||
/* If we were given a 'job' then we can construct it partially into:
|
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
|
||||||
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOBID
|
*/
|
||||||
*/
|
else if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||||
else if(NULL != job) {
|
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||||
sessions = opal_os_path( false, frontend, job, NULL );
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
if( NULL == sessions ) {
|
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
goto cleanup;
|
||||||
exit_status = ORTE_ERROR;
|
}
|
||||||
goto cleanup;
|
|
||||||
|
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
sessions = opal_os_path( false, frontend, jobfam, job, NULL );
|
||||||
|
if( NULL == sessions ) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||||
|
exit_status = ORTE_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
} /* if both are invalid */
|
||||||
|
else {
|
||||||
|
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||||
}
|
}
|
||||||
}
|
|
||||||
/* If we were not given either then we just set it to frontend
|
} /* If we were not given a proc at all, then we just set it to frontend
|
||||||
*/
|
*/
|
||||||
else {
|
else {
|
||||||
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||||
@ -314,6 +343,9 @@ orte_session_dir_get_name(char **fulldirpath,
|
|||||||
free(user);
|
free(user);
|
||||||
if (NULL != prefix) free(prefix);
|
if (NULL != prefix) free(prefix);
|
||||||
if (NULL != frontend) free(frontend);
|
if (NULL != frontend) free(frontend);
|
||||||
|
if (NULL != jobfam) free(jobfam);
|
||||||
|
if (NULL != job) free(job);
|
||||||
|
if (NULL != vpidstr) free(vpidstr);
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
@ -323,7 +355,7 @@ orte_session_dir_get_name(char **fulldirpath,
|
|||||||
*/
|
*/
|
||||||
int orte_session_dir(bool create,
|
int orte_session_dir(bool create,
|
||||||
char *prefix, char *hostid,
|
char *prefix, char *hostid,
|
||||||
char *batchid, char *job, char *proc)
|
char *batchid, orte_process_name_t *proc)
|
||||||
{
|
{
|
||||||
char *fulldirpath = NULL,
|
char *fulldirpath = NULL,
|
||||||
*frontend = NULL,
|
*frontend = NULL,
|
||||||
@ -355,8 +387,7 @@ int orte_session_dir(bool create,
|
|||||||
&prefix,
|
&prefix,
|
||||||
&frontend,
|
&frontend,
|
||||||
hostid,
|
hostid,
|
||||||
batchid, job,
|
batchid, proc) ) ) {
|
||||||
proc) ) ) {
|
|
||||||
if (ORTE_ERR_FATAL == rtn) {
|
if (ORTE_ERR_FATAL == rtn) {
|
||||||
/* this indicates we definitely need to abort, so
|
/* this indicates we definitely need to abort, so
|
||||||
* don't try the NULL prefix
|
* don't try the NULL prefix
|
||||||
@ -450,7 +481,7 @@ int orte_session_dir(bool create,
|
|||||||
/*
|
/*
|
||||||
* Set the process session directory
|
* Set the process session directory
|
||||||
*/
|
*/
|
||||||
if (NULL != proc) {
|
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||||
if (create) { /* overwrite if creating */
|
if (create) { /* overwrite if creating */
|
||||||
if (NULL != orte_process_info.proc_session_dir) {
|
if (NULL != orte_process_info.proc_session_dir) {
|
||||||
free(orte_process_info.proc_session_dir);
|
free(orte_process_info.proc_session_dir);
|
||||||
@ -471,7 +502,7 @@ int orte_session_dir(bool create,
|
|||||||
/*
|
/*
|
||||||
* Set the job session directory
|
* Set the job session directory
|
||||||
*/
|
*/
|
||||||
if (NULL != job) {
|
if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||||
if (create) { /* overwrite if creating */
|
if (create) { /* overwrite if creating */
|
||||||
if (NULL != orte_process_info.job_session_dir) {
|
if (NULL != orte_process_info.job_session_dir) {
|
||||||
free(orte_process_info.job_session_dir);
|
free(orte_process_info.job_session_dir);
|
||||||
@ -481,12 +512,6 @@ int orte_session_dir(bool create,
|
|||||||
if (NULL == orte_process_info.job_session_dir) {
|
if (NULL == orte_process_info.job_session_dir) {
|
||||||
orte_process_info.job_session_dir = strdup(fulldirpath);
|
orte_process_info.job_session_dir = strdup(fulldirpath);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Strip off last part of directory structure */
|
|
||||||
sav = opal_dirname(fulldirpath);
|
|
||||||
free(fulldirpath);
|
|
||||||
fulldirpath = sav;
|
|
||||||
sav = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
@ -519,41 +544,51 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
char *tmp;
|
char *tmp;
|
||||||
char *job=NULL, *job_session_dir=NULL;
|
char *jobfam=NULL, *job=NULL, *job_session_dir=NULL;
|
||||||
|
|
||||||
/* need to setup the top_session_dir with the prefix */
|
/* need to setup the top_session_dir with the prefix */
|
||||||
tmp = opal_os_path(false,
|
tmp = opal_os_path(false,
|
||||||
orte_process_info.tmpdir_base,
|
orte_process_info.tmpdir_base,
|
||||||
orte_process_info.top_session_dir, NULL);
|
orte_process_info.top_session_dir, NULL);
|
||||||
|
|
||||||
|
/* we can only blow away session directories for our job family */
|
||||||
|
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
|
|
||||||
if (ORTE_JOBID_WILDCARD != jobid) {
|
if (ORTE_JOBID_WILDCARD != jobid) {
|
||||||
/* define the proc and job session directories for this process */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job, jobid))) {
|
if (0 > asprintf(&job, "%d", jobid)) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
free(tmp);
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
return rc;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
job_session_dir = opal_os_path( false, orte_process_info.top_session_dir,
|
|
||||||
job, NULL );
|
job_session_dir = opal_os_path(false, tmp, jobfam, job, NULL );
|
||||||
if( NULL == job_session_dir ) {
|
if( NULL == job_session_dir ) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
free(tmp);
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
free(job);
|
goto CLEANUP;
|
||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_os_dirpath_destroy(job_session_dir,
|
opal_os_dirpath_destroy(job_session_dir,
|
||||||
true, orte_dir_check_file);
|
true, orte_dir_check_file);
|
||||||
} else {
|
} else {
|
||||||
/* if we want the session_dir removed for ALL jobids, then
|
/* if we want the session_dir removed for ALL jobids, then
|
||||||
* just recursively blow the whole session away, saving only
|
* just recursively blow the whole session away for our job family,
|
||||||
* output files
|
* saving only output files
|
||||||
*/
|
*/
|
||||||
opal_os_dirpath_destroy(tmp, true, orte_dir_check_file_output);
|
job_session_dir = opal_os_path(false, tmp, jobfam, NULL);
|
||||||
|
opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_os_dirpath_destroy(tmp,
|
/* now attempt to eliminate the top level directory itself - this
|
||||||
false, orte_dir_check_file);
|
* will fail if anything is present, but ensures we cleanup if
|
||||||
|
* we are the last one out
|
||||||
|
*/
|
||||||
|
opal_os_dirpath_destroy(tmp, false, orte_dir_check_file);
|
||||||
|
|
||||||
if (NULL != job_session_dir && opal_os_dirpath_is_empty(job_session_dir)) {
|
if (NULL != job_session_dir && opal_os_dirpath_is_empty(job_session_dir)) {
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
@ -581,6 +616,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
|
|||||||
CLEANUP:
|
CLEANUP:
|
||||||
free(tmp);
|
free(tmp);
|
||||||
if (NULL != job) free(job);
|
if (NULL != job) free(job);
|
||||||
|
if (NULL != jobfam) free(jobfam);
|
||||||
if (NULL != job_session_dir) free(job_session_dir);
|
if (NULL != job_session_dir) free(job_session_dir);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -112,13 +112,8 @@ BEGIN_C_DECLS
|
|||||||
* @param batchid Batch job name, used in batch scheduling
|
* @param batchid Batch job name, used in batch scheduling
|
||||||
* systems. NULL indicates that the default of "0" is
|
* systems. NULL indicates that the default of "0" is
|
||||||
* to be used.
|
* to be used.
|
||||||
* @param job String version of the jobid for which a session
|
* @param proc Pointer to a process name for which the session
|
||||||
* directory is to be created/found. NULL indicates
|
* dir name is desired
|
||||||
* that only the universe directory is to be
|
|
||||||
* created/found.
|
|
||||||
* @param vpid String version of the vpid for which a session
|
|
||||||
* directory is to be created/found. NULL indicates
|
|
||||||
* that only the job directory is to be created/found.
|
|
||||||
*
|
*
|
||||||
* @retval ORTE_SUCCESS The directory was found and/or created with
|
* @retval ORTE_SUCCESS The directory was found and/or created with
|
||||||
* the proper permissions.
|
* the proper permissions.
|
||||||
@ -126,7 +121,7 @@ BEGIN_C_DECLS
|
|||||||
* "false") or created (if create is "true").
|
* "false") or created (if create is "true").
|
||||||
*/
|
*/
|
||||||
ORTE_DECLSPEC int orte_session_dir(bool create, char *prefix, char *hostid,
|
ORTE_DECLSPEC int orte_session_dir(bool create, char *prefix, char *hostid,
|
||||||
char *batchid, char *job, char *vpid);
|
char *batchid, orte_process_name_t *proc);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Construct the session directory name from the input parameters.
|
* Construct the session directory name from the input parameters.
|
||||||
@ -137,7 +132,7 @@ ORTE_DECLSPEC int orte_session_dir_get_name(char **fulldirpath,
|
|||||||
char **frontend,
|
char **frontend,
|
||||||
char *hostid,
|
char *hostid,
|
||||||
char *batchid,
|
char *batchid,
|
||||||
char *job, char *proc);
|
orte_process_name_t *proc);
|
||||||
|
|
||||||
/** The orte_session_dir_finalize() function performs a cleanup of the
|
/** The orte_session_dir_finalize() function performs a cleanup of the
|
||||||
* session directory tree. It first removes the session directory for
|
* session directory tree. It first removes the session directory for
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user