Merge pull request #2042 from artpol84/pmix_sdirs
Several fixes related to session directories:
This commit is contained in:
Commit
9eba1b0b75
@ -75,6 +75,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session
|
||||
#define OPAL_PMIX_NSDIR "pmix.nsdir" // (char*) sub-tmpdir assigned to namespace
|
||||
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
|
||||
#define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories
|
||||
|
||||
/* information about relative ranks as assigned by the RM */
|
||||
#define OPAL_PMIX_JOBID "pmix.jobid" // (uint32_t) jobid assigned by scheduler
|
||||
|
@ -136,10 +136,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename));
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_session_dir";
|
||||
goto error;
|
||||
@ -149,29 +146,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
||||
proc-specific session directory. */
|
||||
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
|
||||
"output-", NULL, NULL);
|
||||
/* store the session directory location */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_NSDIR);
|
||||
kv.type = OPAL_STRING;
|
||||
kv.data.string = strdup(orte_process_info.job_session_dir);
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
error = "opal pmix put job sessiondir";
|
||||
goto error;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_PROCDIR);
|
||||
kv.type = OPAL_STRING;
|
||||
kv.data.string = strdup(orte_process_info.proc_session_dir);
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
error = "opal pmix put proc sessiondir";
|
||||
goto error;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
}
|
||||
/* Setup the communication infrastructure */
|
||||
/*
|
||||
|
@ -237,10 +237,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
/* take a pass thru the session directory code to fillin the
|
||||
* tmpdir names - don't create anything yet
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_session_dir define";
|
||||
goto error;
|
||||
@ -250,10 +247,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
*/
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
/* now actually create the directory tree */
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_session_dir";
|
||||
goto error;
|
||||
@ -277,11 +271,8 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
/* define a log file name in the session directory */
|
||||
snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
|
||||
jobidstring, orte_process_info.nodename);
|
||||
log_path = opal_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
log_file,
|
||||
NULL);
|
||||
log_path = opal_os_path(false, orte_process_info.top_session_dir,
|
||||
log_file, NULL);
|
||||
|
||||
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
|
||||
if (fd < 0) {
|
||||
|
@ -145,10 +145,9 @@ int orte_ess_base_tool_setup(void)
|
||||
* tmp base where any other session directories on
|
||||
* this node might be located
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL,
|
||||
&orte_process_info.tmpdir_base,
|
||||
&orte_process_info.top_session_dir,
|
||||
orte_process_info.nodename, NULL))) {
|
||||
|
||||
ret = orte_session_setup_base(NULL);
|
||||
if (ORTE_SUCCESS != ret ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "define session dir names";
|
||||
goto error;
|
||||
|
@ -138,7 +138,7 @@ static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
char *contact_path, *jobfam_dir;
|
||||
char *contact_path;
|
||||
orte_job_t *jdata;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
@ -294,10 +294,7 @@ static int rte_init(void)
|
||||
/* take a pass thru the session directory code to fillin the
|
||||
* tmpdir names - don't create anything yet
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
|
||||
error = "orte_session_dir define";
|
||||
goto error;
|
||||
}
|
||||
@ -307,10 +304,7 @@ static int rte_init(void)
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* now actually create the directory tree */
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename,
|
||||
ORTE_PROC_MY_NAME))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
|
||||
error = "orte_session_dir";
|
||||
goto error;
|
||||
}
|
||||
@ -586,9 +580,12 @@ static int rte_init(void)
|
||||
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
|
||||
"output-", NULL, NULL);
|
||||
/* save my contact info in a file for others to find */
|
||||
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
|
||||
free(jobfam_dir);
|
||||
if( NULL == orte_process_info.jobfam_session_dir ){
|
||||
/* has to be set here! */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
goto error;
|
||||
}
|
||||
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir, "contact.txt", NULL);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s writing contact file %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -758,10 +755,9 @@ static int rte_init(void)
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
}
|
||||
/* remove my contact info file, if we have session directories */
|
||||
if (NULL != orte_process_info.job_session_dir) {
|
||||
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
|
||||
free(jobfam_dir);
|
||||
if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
|
||||
"contact.txt", NULL);
|
||||
unlink(contact_path);
|
||||
free(contact_path);
|
||||
}
|
||||
@ -775,7 +771,6 @@ static int rte_init(void)
|
||||
static int rte_finalize(void)
|
||||
{
|
||||
char *contact_path;
|
||||
char *jobfam_dir;
|
||||
|
||||
if (signals_set) {
|
||||
/* Remove the epipe handler */
|
||||
@ -816,10 +811,9 @@ static int rte_finalize(void)
|
||||
(void) mca_base_framework_close(&opal_pstat_base_framework);
|
||||
|
||||
/* remove my contact info file, if we have session directories */
|
||||
if (NULL != orte_process_info.job_session_dir) {
|
||||
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
|
||||
free(jobfam_dir);
|
||||
if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
|
||||
"contact.txt", NULL);
|
||||
unlink(contact_path);
|
||||
free(contact_path);
|
||||
}
|
||||
|
@ -94,6 +94,7 @@ static int rte_init(void)
|
||||
uint16_t u16, *u16ptr;
|
||||
char **peers=NULL, *mycpuset, **cpusets=NULL;
|
||||
opal_process_name_t wildcard_rank, pname;
|
||||
bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false;
|
||||
size_t i;
|
||||
|
||||
/* run the prolog */
|
||||
@ -242,6 +243,63 @@ static int rte_init(void)
|
||||
free(string_key);
|
||||
}
|
||||
|
||||
/* retrieve temp directories info */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* We want to provide user with ability
|
||||
* to override RM settings at his own risk
|
||||
*/
|
||||
if( NULL == orte_process_info.top_session_dir ){
|
||||
orte_process_info.top_session_dir = val;
|
||||
} else {
|
||||
/* keep the MCA setting */
|
||||
tdir_mca_override = true;
|
||||
free(val);
|
||||
}
|
||||
val = NULL;
|
||||
}
|
||||
|
||||
if( !tdir_mca_override ){
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* We want to provide user with ability
|
||||
* to override RM settings at his own risk
|
||||
*/
|
||||
if( NULL == orte_process_info.job_session_dir ){
|
||||
orte_process_info.job_session_dir = val;
|
||||
} else {
|
||||
/* keep the MCA setting */
|
||||
free(val);
|
||||
tdir_mca_override = true;
|
||||
}
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if( !tdir_mca_override ){
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* We want to provide user with ability
|
||||
* to override RM settings at his own risk
|
||||
*/
|
||||
if( NULL == orte_process_info.proc_session_dir ){
|
||||
orte_process_info.proc_session_dir = val;
|
||||
} else {
|
||||
/* keep the MCA setting */
|
||||
tdir_mca_override = true;
|
||||
free(val);
|
||||
}
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if( !tdir_mca_override ){
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL);
|
||||
if (OPAL_SUCCESS == ret ) {
|
||||
orte_process_info.rm_session_dirs = bool_val;
|
||||
}
|
||||
}
|
||||
|
||||
/* retrieve our topology */
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
|
||||
|
@ -105,6 +105,17 @@ static void recv_ack(int status, orte_process_name_t* sender,
|
||||
void* cbdata);
|
||||
static void write_handler(int fd, short event, void *cbdata);
|
||||
|
||||
static char *filem_session_dir()
|
||||
{
|
||||
char *session_dir = orte_process_info.jobfam_session_dir;
|
||||
if( NULL == session_dir ){
|
||||
/* if no job family session dir was provided -
|
||||
* use the job session dir */
|
||||
session_dir = orte_process_info.job_session_dir;
|
||||
}
|
||||
return session_dir;
|
||||
}
|
||||
|
||||
static int raw_init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&incoming_files, opal_list_t);
|
||||
@ -657,25 +668,26 @@ static int create_link(char *my_dir, char *path,
|
||||
static int raw_link_local_files(orte_job_t *jdata,
|
||||
orte_app_context_t *app)
|
||||
{
|
||||
char *my_dir, *path=NULL;
|
||||
char *session_dir, *path=NULL;
|
||||
orte_proc_t *proc;
|
||||
char *prefix;
|
||||
int i, j, rc;
|
||||
orte_filem_raw_incoming_t *inbnd;
|
||||
opal_list_item_t *item;
|
||||
char **files=NULL, *bname, *filestring;
|
||||
|
||||
/* check my session directory for files I have received and
|
||||
/* check my jobfam session directory for files I have received and
|
||||
* symlink them to the proc-level session directory of each
|
||||
* local process in the job
|
||||
*
|
||||
* TODO: @rhc - please check that I've correctly interpret your
|
||||
* intention here
|
||||
*/
|
||||
my_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
|
||||
/* setup */
|
||||
if (NULL != orte_process_info.tmpdir_base) {
|
||||
prefix = strdup(orte_process_info.tmpdir_base);
|
||||
} else {
|
||||
prefix = NULL;
|
||||
session_dir = filem_session_dir();
|
||||
if( NULL == session_dir){
|
||||
/* we were unable to find any suitable directory */
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* get the list of files this app wants */
|
||||
@ -692,10 +704,6 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
|
||||
/* if there are no files to link, then ignore this */
|
||||
if (NULL == files) {
|
||||
free(my_dir);
|
||||
if (NULL != prefix) {
|
||||
free(prefix);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -736,10 +744,8 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
|
||||
/* get the session dir name in absolute form */
|
||||
path = NULL;
|
||||
rc = orte_session_dir_get_name(&path, &prefix, NULL,
|
||||
orte_process_info.nodename,
|
||||
&proc->name);
|
||||
path = orte_process_info.proc_session_dir;
|
||||
|
||||
/* create it, if it doesn't already exist */
|
||||
if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -747,11 +753,6 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
* create it - either way, we are done
|
||||
*/
|
||||
free(files);
|
||||
if (NULL != prefix) {
|
||||
free(prefix);
|
||||
}
|
||||
free(path);
|
||||
free(my_dir);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -775,13 +776,8 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
inbnd->file));
|
||||
/* cycle thru the link points and create symlinks to them */
|
||||
for (j=0; NULL != inbnd->link_pts[j]; j++) {
|
||||
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[j]))) {
|
||||
if (ORTE_SUCCESS != (rc = create_link(session_dir, path, inbnd->link_pts[j]))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(my_dir);
|
||||
free(path);
|
||||
if (NULL != prefix) {
|
||||
free(prefix);
|
||||
}
|
||||
free(files);
|
||||
return rc;
|
||||
}
|
||||
@ -796,13 +792,8 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
}
|
||||
}
|
||||
}
|
||||
free(path);
|
||||
}
|
||||
opal_argv_free(files);
|
||||
if (NULL != prefix) {
|
||||
free(prefix);
|
||||
}
|
||||
free(my_dir);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -999,7 +990,7 @@ static void recv_files(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
char *file, *jobfam_dir;
|
||||
char *file, *session_dir;
|
||||
int32_t nchunk, n, nbytes;
|
||||
unsigned char data[ORTE_FILEM_RAW_CHUNK_MAX];
|
||||
int rc;
|
||||
@ -1086,9 +1077,9 @@ static void recv_files(int status, orte_process_name_t* sender,
|
||||
incoming->top = strdup(tmp);
|
||||
free(tmp);
|
||||
/* define the full path to where we will put it */
|
||||
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
incoming->fullpath = opal_os_path(false, jobfam_dir, file, NULL);
|
||||
free(jobfam_dir);
|
||||
session_dir = filem_session_dir();
|
||||
|
||||
incoming->fullpath = opal_os_path(false, session_dir, file, NULL);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
|
||||
"%s filem:raw: opening target file %s",
|
||||
|
@ -164,8 +164,7 @@ static int component_available(void)
|
||||
|
||||
/* if session directories were forbidden, then we cannot be used */
|
||||
if (!orte_create_session_dirs ||
|
||||
NULL == orte_process_info.tmpdir_base ||
|
||||
NULL == orte_process_info.top_session_dir) {
|
||||
NULL == orte_process_info.jobfam_session_dir ) {
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@ -216,9 +215,7 @@ static int component_startup(void)
|
||||
/* setup the path to the daemon rendezvous point */
|
||||
memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un));
|
||||
mca_oob_usock_component.address.sun_family = AF_UNIX;
|
||||
session = opal_os_path(false, orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
orte_process_info.jobfam_session_dir,
|
||||
session = opal_os_path(false, orte_process_info.jobfam_session_dir,
|
||||
"usock", NULL);
|
||||
if ((strlen(session) + 1) > sizeof(mca_oob_usock_component.address.sun_path)-1) {
|
||||
opal_output(0, "SESSION DIR TOO LONG");
|
||||
|
@ -943,6 +943,7 @@ static int setup_fork(orte_job_t *jdata,
|
||||
|
||||
/* forcibly set the local tmpdir base and top session dir to match ours */
|
||||
opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
|
||||
/* TODO: should we use PMIx key to pass this data? */
|
||||
opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);
|
||||
opal_setenv("OMPI_MCA_orte_jobfam_session_dir", orte_process_info.jobfam_session_dir, true, &app->env);
|
||||
|
||||
@ -1102,24 +1103,8 @@ static int setup_child(orte_job_t *jdata,
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
|
||||
}
|
||||
|
||||
/* construct the proc's session dir name */
|
||||
if (NULL != orte_process_info.tmpdir_base) {
|
||||
value = strdup(orte_process_info.tmpdir_base);
|
||||
} else {
|
||||
value = NULL;
|
||||
}
|
||||
param = NULL;
|
||||
if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(¶m, &value, NULL,
|
||||
orte_process_info.nodename,
|
||||
&child->name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
if (NULL != value) {
|
||||
free(value);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
free(value);
|
||||
/* pass an envar so the proc can find any files it had prepositioned */
|
||||
param = orte_process_info.proc_session_dir;
|
||||
opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);
|
||||
|
||||
/* if the user wanted the cwd to be the proc's session dir, then
|
||||
@ -1132,12 +1117,10 @@ static int setup_child(orte_job_t *jdata,
|
||||
/* doesn't exist with correct permissions, and/or we can't
|
||||
* create it - either way, we are done
|
||||
*/
|
||||
free(param);
|
||||
return rc;
|
||||
}
|
||||
/* change to it */
|
||||
if (0 != chdir(param)) {
|
||||
free(param);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* It seems that chdir doesn't
|
||||
@ -1154,6 +1137,5 @@ static int setup_child(orte_job_t *jdata,
|
||||
/* update the initial wdir value too */
|
||||
opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
|
||||
}
|
||||
free(param);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -2134,7 +2134,8 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
/* create the attachment FIFO and setup readevent - cannot be
|
||||
* done if no session dirs exist!
|
||||
*/
|
||||
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
|
||||
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir,
|
||||
"debugger_attach_fifo", NULL);
|
||||
if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
|
||||
opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
|
||||
free(attach_fifo);
|
||||
|
@ -262,9 +262,7 @@ int pmix_server_init(void)
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = opal_os_path(false, orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
orte_process_info.jobfam_session_dir, NULL);
|
||||
kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL);
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* use the same for the system temp directory - this is
|
||||
* where the system-level tool connections will go */
|
||||
|
@ -310,7 +310,7 @@ static int orte_cr_coord_post_restart(void) {
|
||||
* Add the previous session directory for cleanup
|
||||
*/
|
||||
opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true);
|
||||
tmp_dir = opal_dirname(orte_process_info.job_session_dir);
|
||||
tmp_dir = orte_process_info.jobfam_session_dir;
|
||||
if( NULL != tmp_dir ) {
|
||||
opal_crs_base_cleanup_append(tmp_dir, true);
|
||||
free(tmp_dir);
|
||||
|
@ -160,7 +160,7 @@ int orte_register_params(void)
|
||||
&orte_top_session_dir);
|
||||
|
||||
if (NULL != orte_top_session_dir) {
|
||||
if (NULL != orte_process_info.top_session_dir) {
|
||||
if (NULL != orte_process_info.top_session_dir) {
|
||||
free(orte_process_info.top_session_dir);
|
||||
}
|
||||
orte_process_info.top_session_dir = strdup(orte_top_session_dir);
|
||||
|
@ -182,7 +182,7 @@ int orte_list_local_hnps(opal_list_t *hnps, bool connect)
|
||||
/*
|
||||
* Check to make sure we have access to the top-level directory
|
||||
*/
|
||||
headdir = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, NULL);
|
||||
headdir = orte_process_info.top_session_dir;
|
||||
|
||||
if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(headdir, 0) )) {
|
||||
/* it is okay not to find this as there may not be any
|
||||
@ -231,7 +231,6 @@ int orte_list_local_hnps(opal_list_t *hnps, bool connect)
|
||||
cleanup:
|
||||
if( NULL != cur_dirp )
|
||||
closedir(cur_dirp);
|
||||
free(headdir);
|
||||
|
||||
return (opal_list_is_empty(hnps) ? ORTE_ERR_NOT_FOUND : ORTE_SUCCESS);
|
||||
}
|
||||
|
@ -106,6 +106,14 @@ int orte_proc_info(void)
|
||||
if (init) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
{
|
||||
int delay = 0;
|
||||
while( delay ){
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
init = true;
|
||||
|
||||
OBJ_CONSTRUCT(&orte_process_info.super, opal_proc_t);
|
||||
|
@ -122,6 +122,7 @@ struct orte_proc_info_t {
|
||||
char *jobfam_session_dir; /**< Session directory for this family of jobs (i.e., share same mpirun) */
|
||||
char *job_session_dir; /**< Session directory for job */
|
||||
char *proc_session_dir; /**< Session directory for the process */
|
||||
bool rm_session_dirs; /**< Session directories will be cleaned up by RM */
|
||||
|
||||
char *sock_stdin; /**< Path name to temp file for stdin. */
|
||||
char *sock_stdout; /**< Path name to temp file for stdout. */
|
||||
|
@ -73,10 +73,6 @@ static int orte_create_dir(char *directory);
|
||||
|
||||
static bool orte_dir_check_file(const char *root, const char *path);
|
||||
|
||||
static char *orte_build_job_session_dir(char *top_dir,
|
||||
orte_process_name_t *proc,
|
||||
orte_jobid_t jobid);
|
||||
|
||||
#define OMPI_PRINTF_FIX_STRING(a) ((NULL == a) ? "(null)" : a)
|
||||
|
||||
/****************************
|
||||
@ -112,175 +108,186 @@ static int orte_create_dir(char *directory)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct the fullpath to the session directory - it
|
||||
* will consist of "ompi.<hostname>.<effective-uid>", and
|
||||
* have subdirs:
|
||||
*
|
||||
* pid - the pid of the mpirun that oversees this job. Note
|
||||
* that direct-launched processes will have manufactured
|
||||
* this value
|
||||
*
|
||||
* jobid - jobid of the application being executed
|
||||
*
|
||||
* vpid - vpid of the process
|
||||
*/
|
||||
int
|
||||
orte_session_dir_get_name(char **fulldirpath,
|
||||
char **return_prefix, /* This will come back as the valid tmp dir */
|
||||
char **return_frontend,
|
||||
char *hostid,
|
||||
orte_process_name_t *proc) {
|
||||
char *hostname = NULL,
|
||||
*sessions = NULL,
|
||||
*prefix = NULL,
|
||||
*frontend = NULL,
|
||||
*jobfam = NULL,
|
||||
*job = NULL,
|
||||
*vpidstr = NULL;
|
||||
bool prefix_provided = false;
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
size_t len;
|
||||
uid_t uid;
|
||||
|
||||
static int _setup_tmpdir_base()
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
|
||||
/* make sure that we have tmpdir_base set
|
||||
* if we need it
|
||||
*/
|
||||
if (NULL == orte_process_info.tmpdir_base) {
|
||||
orte_process_info.tmpdir_base =
|
||||
strdup(opal_tmp_directory());
|
||||
if (NULL == orte_process_info.tmpdir_base) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
exit:
|
||||
if( ORTE_SUCCESS != rc ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int _setup_top_session_dir()
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
/* get the effective uid */
|
||||
uid_t uid = geteuid();
|
||||
|
||||
/* construct the top_session_dir if we need */
|
||||
if (NULL == orte_process_info.top_session_dir) {
|
||||
if (ORTE_SUCCESS != (rc = _setup_tmpdir_base())) {
|
||||
return rc;
|
||||
}
|
||||
if( NULL == orte_process_info.nodename ||
|
||||
NULL == orte_process_info.tmpdir_base ){
|
||||
/* we can't setup top session dir */
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (0 > asprintf(&orte_process_info.top_session_dir,
|
||||
"%s/ompi.%s.%lu", orte_process_info.tmpdir_base,
|
||||
orte_process_info.nodename, (unsigned long)uid)) {
|
||||
orte_process_info.top_session_dir = NULL;
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
exit:
|
||||
if( ORTE_SUCCESS != rc ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Make sure orte_process_info.jobfam_session_dir is populated.
 *
 * When it is not yet set, the top session directory is set up first and
 * the job family directory is then built below it as:
 *   - "<top>/pid.<pid>"      when ORTE_PROC_IS_HNP is true
 *   - "<top>/jobfam"         when proc is NULL or its jobid is invalid
 *   - "<top>/jf.<jobfamily>" otherwise
 * An already-set value is left untouched.
 *
 * proc may be NULL.
 *
 * Returns ORTE_SUCCESS on success, the error from _setup_top_session_dir()
 * if that step fails, or ORTE_ERR_OUT_OF_RESOURCE when the path cannot be
 * allocated (in which case jobfam_session_dir is reset to NULL).
 */
static int _setup_jobfam_session_dir(orte_process_name_t *proc)
{
    int rc;
    int nchars;

    /* nothing to do when the directory name is already known */
    if (NULL != orte_process_info.jobfam_session_dir) {
        return ORTE_SUCCESS;
    }

    /* the helper logs its own failure, so just propagate the code */
    if (ORTE_SUCCESS != (rc = _setup_top_session_dir())) {
        return rc;
    }

    if (ORTE_PROC_IS_HNP) {
        nchars = asprintf(&orte_process_info.jobfam_session_dir,
                          "%s/pid.%lu", orte_process_info.top_session_dir,
                          (unsigned long)orte_process_info.pid);
    } else if (NULL == proc || ORTE_JOBID_INVALID == proc->jobid) {
        /* we were not given one, so define it */
        nchars = asprintf(&orte_process_info.jobfam_session_dir,
                          "%s/jobfam", orte_process_info.top_session_dir);
    } else {
        nchars = asprintf(&orte_process_info.jobfam_session_dir,
                          "%s/jf.%d", orte_process_info.top_session_dir,
                          ORTE_JOB_FAMILY(proc->jobid));
    }

    if (0 > nchars) {
        /* the pointer's contents are undefined after a failed asprintf,
         * so reset it on every failure path, not just one of them */
        orte_process_info.jobfam_session_dir = NULL;
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
static int
|
||||
_setup_job_session_dir(orte_process_name_t *proc)
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
|
||||
/* construct the top_session_dir if we need */
|
||||
if( NULL == orte_process_info.job_session_dir ){
|
||||
if( ORTE_SUCCESS != (rc = _setup_jobfam_session_dir(proc)) ){
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||
if (0 > asprintf(&orte_process_info.job_session_dir,
|
||||
"%s/%d", orte_process_info.jobfam_session_dir,
|
||||
ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||
orte_process_info.job_session_dir = NULL;
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
} else {
|
||||
orte_process_info.job_session_dir = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
if( ORTE_SUCCESS != rc ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int
|
||||
_setup_proc_session_dir(orte_process_name_t *proc)
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
|
||||
/* construct the top_session_dir if we need */
|
||||
if( NULL == orte_process_info.proc_session_dir ){
|
||||
if( ORTE_SUCCESS != (rc = _setup_job_session_dir(proc)) ){
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||
if (0 > asprintf(&orte_process_info.proc_session_dir,
|
||||
"%s/%d", orte_process_info.job_session_dir,
|
||||
proc->vpid)) {
|
||||
orte_process_info.proc_session_dir = NULL;
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto exit;
|
||||
}
|
||||
} else {
|
||||
orte_process_info.proc_session_dir = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
exit:
|
||||
if( ORTE_SUCCESS != rc ){
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_session_setup_base(orte_process_name_t *proc)
|
||||
{
|
||||
int rc;
|
||||
|
||||
/* Ensure that system info is set */
|
||||
orte_proc_info();
|
||||
|
||||
/* get the effective uid */
|
||||
uid = geteuid();
|
||||
|
||||
/*
|
||||
* set the 'hostname'
|
||||
*/
|
||||
if( NULL != hostid) { /* User specified version */
|
||||
hostname = strdup(hostid);
|
||||
}
|
||||
else { /* check if it is set elsewhere */
|
||||
if( NULL != orte_process_info.nodename)
|
||||
hostname = strdup(orte_process_info.nodename);
|
||||
else {
|
||||
/* Couldn't find it, so fail */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
exit_status = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
/* setup job and proc session directories */
|
||||
if( ORTE_SUCCESS != (rc = _setup_job_session_dir(proc)) ){
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* construct the frontend of the session directory*/
|
||||
if (NULL != orte_process_info.top_session_dir) {
|
||||
frontend = strdup(orte_process_info.top_session_dir);
|
||||
} else { /* If not set then construct it */
|
||||
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)uid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* construct the next level down, which belongs to the
|
||||
* job family. This is related to the mpirun that launched
|
||||
* the job, or is an arbitrary (agreed upon) value if
|
||||
* direct launched */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
if (0 > asprintf(&jobfam, "pid.%lu", (unsigned long)orte_process_info.pid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
orte_process_info.jobfam_session_dir = strdup(jobfam);
|
||||
} else if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
/* we had a job family session dir passed down to us by mpirun */
|
||||
jobfam = strdup(orte_process_info.jobfam_session_dir);
|
||||
} else {
|
||||
/* we were not given one, so define it */
|
||||
if (NULL == proc) {
|
||||
jobfam = strdup("jobfam");
|
||||
} else {
|
||||
if (0 > asprintf(&jobfam, "jf.%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
orte_process_info.jobfam_session_dir = strdup(jobfam);
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct the session directory
|
||||
*/
|
||||
/* If we were given a valid vpid then we can construct it fully */
|
||||
if( NULL != proc) {
|
||||
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
sessions = opal_os_path(false, frontend, jobfam, job, vpidstr, NULL);
|
||||
if( NULL == sessions ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
} else if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
sessions = opal_os_path( false, frontend, jobfam, job, NULL );
|
||||
if( NULL == sessions ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||
}
|
||||
|
||||
} else {
|
||||
/* If we were not given a proc at all, then we just set it to frontend */
|
||||
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||
}
|
||||
|
||||
/*
|
||||
* If the user specified an invalid prefix, or no prefix at all
|
||||
* we need to keep looking
|
||||
*/
|
||||
if( NULL != fulldirpath && NULL != *fulldirpath) {
|
||||
free(*fulldirpath);
|
||||
*fulldirpath = NULL;
|
||||
}
|
||||
|
||||
if( NULL != return_prefix && NULL != *return_prefix) { /* use the user specified one, if available */
|
||||
prefix = strdup(*return_prefix);
|
||||
prefix_provided = true;
|
||||
}
|
||||
/* Try to find a proper alternative prefix */
|
||||
else if (NULL != orte_process_info.tmpdir_base) { /* stored value */
|
||||
prefix = strdup(orte_process_info.tmpdir_base);
|
||||
}
|
||||
else { /* General Environment var */
|
||||
prefix = strdup(opal_tmp_directory());
|
||||
}
|
||||
len = strlen(prefix);
|
||||
/* check for a trailing path separator */
|
||||
if (OPAL_PATH_SEP[0] == prefix[len-1]) {
|
||||
prefix[len-1] = '\0';
|
||||
if( ORTE_SUCCESS != (rc = _setup_proc_session_dir(proc)) ){
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* BEFORE doing anything else, check to see if this prefix is
|
||||
* allowed by the system
|
||||
*/
|
||||
if (NULL != orte_prohibited_session_dirs) {
|
||||
if (NULL != orte_prohibited_session_dirs ||
|
||||
NULL != orte_process_info.tmpdir_base ) {
|
||||
char **list;
|
||||
int i, len;
|
||||
/* break the string into tokens - it should be
|
||||
@ -291,97 +298,36 @@ orte_session_dir_get_name(char **fulldirpath,
|
||||
/* cycle through the list */
|
||||
for (i=0; i < len; i++) {
|
||||
/* check if prefix matches */
|
||||
if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
|
||||
if (0 == strncmp(orte_process_info.tmpdir_base, list[i], strlen(list[i]))) {
|
||||
/* this is a prohibited location */
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte:session:dir:prohibited",
|
||||
true, prefix, orte_prohibited_session_dirs);
|
||||
true, orte_process_info.tmpdir_base,
|
||||
orte_prohibited_session_dirs);
|
||||
opal_argv_free(list);
|
||||
free(prefix);
|
||||
free(sessions);
|
||||
free(hostname);
|
||||
free(frontend);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
}
|
||||
opal_argv_free(list); /* done with this */
|
||||
}
|
||||
/*
|
||||
* Construct the absolute final path, if requested
|
||||
*/
|
||||
if (NULL != fulldirpath) {
|
||||
*fulldirpath = opal_os_path(false, prefix, sessions, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the frontend and prefix, if user requested we do so
|
||||
*/
|
||||
if (NULL != return_frontend) {
|
||||
*return_frontend = strdup(frontend);
|
||||
}
|
||||
if (!prefix_provided && NULL != return_prefix) {
|
||||
*return_prefix = strdup(prefix);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if(NULL != hostname) {
|
||||
free(hostname);
|
||||
}
|
||||
if(NULL != sessions) {
|
||||
free(sessions);
|
||||
}
|
||||
if (NULL != prefix) {
|
||||
free(prefix);
|
||||
}
|
||||
if (NULL != frontend) {
|
||||
free(frontend);
|
||||
}
|
||||
if (NULL != jobfam) {
|
||||
free(jobfam);
|
||||
}
|
||||
if (NULL != job) {
|
||||
free(job);
|
||||
}
|
||||
if (NULL != vpidstr) {
|
||||
free(vpidstr);
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct the session directory and create it if necessary
|
||||
*/
|
||||
int orte_session_dir(bool create,
|
||||
char *prefix, char *hostid,
|
||||
orte_process_name_t *proc)
|
||||
int orte_session_dir(bool create, orte_process_name_t *proc)
|
||||
{
|
||||
char *fulldirpath = NULL,
|
||||
*frontend = NULL,
|
||||
*sav = NULL;
|
||||
int rc = ORTE_SUCCESS;
|
||||
char *local_prefix = NULL;
|
||||
|
||||
/* use the specified prefix, if one was given */
|
||||
if (NULL != prefix) {
|
||||
local_prefix = strdup(prefix);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the session directory full name
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&fulldirpath,
|
||||
&local_prefix,
|
||||
&frontend,
|
||||
hostid,
|
||||
proc))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_session_setup_base(proc))) {
|
||||
if (ORTE_ERR_FATAL == rc) {
|
||||
/* this indicates we should abort quietly */
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto cleanup;
|
||||
}
|
||||
/* otherwise, bark a little first */
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
@ -389,73 +335,26 @@ int orte_session_dir(bool create,
|
||||
* Now that we have the full path, go ahead and create it if necessary
|
||||
*/
|
||||
if( create ) {
|
||||
if( ORTE_SUCCESS != (rc = orte_create_dir(fulldirpath) ) ) {
|
||||
if( ORTE_SUCCESS != (rc = orte_create_dir(orte_process_info.proc_session_dir)) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* update global structure fields */
|
||||
if (NULL != orte_process_info.tmpdir_base) {
|
||||
free(orte_process_info.tmpdir_base);
|
||||
}
|
||||
orte_process_info.tmpdir_base = strdup(local_prefix);
|
||||
if (NULL != orte_process_info.top_session_dir) {
|
||||
free(orte_process_info.top_session_dir);
|
||||
orte_process_info.top_session_dir = NULL;
|
||||
}
|
||||
if (NULL != frontend) {
|
||||
orte_process_info.top_session_dir = strdup(frontend);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the process session directory
|
||||
*/
|
||||
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||
if (NULL != orte_process_info.proc_session_dir) {
|
||||
free(orte_process_info.proc_session_dir);
|
||||
}
|
||||
orte_process_info.proc_session_dir = strdup(fulldirpath);
|
||||
|
||||
/* Strip off last part of directory structure */
|
||||
sav = opal_dirname(fulldirpath);
|
||||
free(fulldirpath);
|
||||
fulldirpath = sav;
|
||||
sav = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the job session directory
|
||||
*/
|
||||
if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||
if (NULL != orte_process_info.job_session_dir) {
|
||||
free(orte_process_info.job_session_dir);
|
||||
}
|
||||
orte_process_info.job_session_dir = strdup(fulldirpath);
|
||||
}
|
||||
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "procdir: %s",
|
||||
OMPI_PRINTF_FIX_STRING(orte_process_info.proc_session_dir));
|
||||
opal_output(0, "jobdir: %s",
|
||||
OMPI_PRINTF_FIX_STRING(orte_process_info.job_session_dir));
|
||||
opal_output(0, "top: %s",
|
||||
opal_output(0, "top: %s",
|
||||
OMPI_PRINTF_FIX_STRING(orte_process_info.jobfam_session_dir));
|
||||
opal_output(0, "top: %s",
|
||||
OMPI_PRINTF_FIX_STRING(orte_process_info.top_session_dir));
|
||||
opal_output(0, "tmp: %s",
|
||||
OMPI_PRINTF_FIX_STRING(orte_process_info.tmpdir_base));
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if (NULL != local_prefix) {
|
||||
free(local_prefix);
|
||||
}
|
||||
if(NULL != fulldirpath) {
|
||||
free(fulldirpath);
|
||||
}
|
||||
if(NULL != frontend) {
|
||||
free(frontend);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -466,16 +365,14 @@ int
|
||||
orte_session_dir_cleanup(orte_jobid_t jobid)
|
||||
{
|
||||
int rc = ORTE_SUCCESS;
|
||||
char *tmp = NULL;
|
||||
char *job_session_dir=NULL;
|
||||
|
||||
if (!orte_create_session_dirs) {
|
||||
/* didn't create them */
|
||||
if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) {
|
||||
/* we haven't created them, or the RM will clean them up for us */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL == orte_process_info.tmpdir_base &&
|
||||
NULL == orte_process_info.top_session_dir) {
|
||||
if (NULL == orte_process_info.job_session_dir ||
|
||||
NULL == orte_process_info.proc_session_dir) {
|
||||
/* this should never happen - it means we are calling
|
||||
* cleanup *before* properly setting up the session
|
||||
* dir system. This leaves open the possibility of
|
||||
@ -486,37 +383,30 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* need to setup the top_session_dir with the prefix */
|
||||
tmp = opal_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir, NULL);
|
||||
|
||||
/* we can only blow away session directories for our job family */
|
||||
job_session_dir = orte_build_job_session_dir(tmp, ORTE_PROC_MY_NAME, jobid);
|
||||
if (NULL == job_session_dir) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* recursively blow the whole session away for our job family,
|
||||
* saving only output files
|
||||
*/
|
||||
opal_os_dirpath_destroy(job_session_dir, true, orte_dir_check_file);
|
||||
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
|
||||
true, orte_dir_check_file);
|
||||
|
||||
/* now attempt to eliminate the top level directory itself - this
|
||||
* will fail if anything is present, but ensures we cleanup if
|
||||
* we are the last one out
|
||||
*/
|
||||
opal_os_dirpath_destroy(tmp, false, orte_dir_check_file);
|
||||
if( NULL != orte_process_info.top_session_dir ){
|
||||
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if (NULL != job_session_dir && opal_os_dirpath_is_empty(job_session_dir)) {
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_cleanup: found job session dir empty - deleting");
|
||||
}
|
||||
rmdir(job_session_dir);
|
||||
rmdir(orte_process_info.job_session_dir);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(job_session_dir, 0)) {
|
||||
if (OPAL_ERR_NOT_FOUND ==
|
||||
opal_os_dirpath_access(orte_process_info.job_session_dir, 0)) {
|
||||
opal_output(0, "sess_dir_cleanup: job session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_cleanup: job session dir not empty - leaving");
|
||||
@ -525,24 +415,27 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(tmp)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_cleanup: found top session dir empty - deleting");
|
||||
}
|
||||
rmdir(tmp);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(tmp, 0)) {
|
||||
opal_output(0, "sess_dir_cleanup: top session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_cleanup: top session dir not empty - leaving");
|
||||
if ( NULL != orte_process_info.top_session_dir ){
|
||||
|
||||
if( opal_os_dirpath_is_empty(orte_process_info.top_session_dir) ) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_cleanup: found top session dir empty - deleting");
|
||||
}
|
||||
}
|
||||
rmdir(orte_process_info.top_session_dir);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND ==
|
||||
opal_os_dirpath_access(orte_process_info.top_session_dir, 0)) {
|
||||
opal_output(0, "sess_dir_cleanup: top session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_cleanup: top session dir not empty - leaving");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
if (NULL != tmp) free(tmp);
|
||||
if (NULL != job_session_dir) free(job_session_dir);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -551,66 +444,42 @@ int
|
||||
orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
{
|
||||
int rc;
|
||||
char *tmp;
|
||||
char *job_session_dir, *vpid, *proc_session_dir;
|
||||
|
||||
if (!orte_create_session_dirs) {
|
||||
/* didn't create them */
|
||||
if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) {
|
||||
/* we haven't created them, or the RM will clean them up for us */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL == orte_process_info.tmpdir_base &&
|
||||
NULL == orte_process_info.top_session_dir) {
|
||||
if (NULL == orte_process_info.job_session_dir ||
|
||||
NULL == orte_process_info.proc_session_dir) {
|
||||
/* this should never happen - it means we are calling
|
||||
* cleanup *before* properly setting up the session
|
||||
* dir system. Protect against the possibility of
|
||||
* dir system. This leaves open the possibility of
|
||||
* accidentally removing directories we shouldn't
|
||||
* touch by returning
|
||||
* touch
|
||||
*/
|
||||
return ORTE_ERR_NOT_INITIALIZED;
|
||||
rc = ORTE_ERR_NOT_INITIALIZED;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* need to setup the top_session_dir with the prefix */
|
||||
tmp = opal_os_path(false,
|
||||
orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir, NULL);
|
||||
|
||||
/* define the proc and job session directories for this process */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, proc->vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
job_session_dir = orte_build_job_session_dir(tmp, proc, proc->jobid);
|
||||
if( NULL == job_session_dir) {
|
||||
free(tmp);
|
||||
free(vpid);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
proc_session_dir = opal_os_path( false, job_session_dir, vpid, NULL );
|
||||
if( NULL == proc_session_dir ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(tmp);
|
||||
free(vpid);
|
||||
free(job_session_dir);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
opal_os_dirpath_destroy(proc_session_dir,
|
||||
opal_os_dirpath_destroy(orte_process_info.proc_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
opal_os_dirpath_destroy(job_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
opal_os_dirpath_destroy(tmp,
|
||||
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
if( NULL != orte_process_info.top_session_dir ){
|
||||
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(proc_session_dir)) {
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.proc_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
|
||||
}
|
||||
rmdir(proc_session_dir);
|
||||
rmdir(orte_process_info.proc_session_dir);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(proc_session_dir, 0)) {
|
||||
if (OPAL_ERR_NOT_FOUND ==
|
||||
opal_os_dirpath_access(orte_process_info.proc_session_dir, 0)) {
|
||||
opal_output(0, "sess_dir_finalize: proc session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
|
||||
@ -619,14 +488,15 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(job_session_dir)) {
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
|
||||
}
|
||||
rmdir(job_session_dir);
|
||||
rmdir(orte_process_info.job_session_dir);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(job_session_dir, 0)) {
|
||||
if (OPAL_ERR_NOT_FOUND ==
|
||||
opal_os_dirpath_access(orte_process_info.job_session_dir, 0)) {
|
||||
opal_output(0, "sess_dir_finalize: job session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
|
||||
@ -635,26 +505,25 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(tmp)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
|
||||
}
|
||||
rmdir(tmp);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(tmp, 0)) {
|
||||
opal_output(0, "sess_dir_finalize: top session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
|
||||
if(NULL != orte_process_info.top_session_dir) {
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.top_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
|
||||
}
|
||||
}
|
||||
rmdir(orte_process_info.top_session_dir);
|
||||
} else {
|
||||
if (orte_debug_flag) {
|
||||
if (OPAL_ERR_NOT_FOUND ==
|
||||
opal_os_dirpath_access(orte_process_info.top_session_dir, 0)) {
|
||||
opal_output(0, "sess_dir_finalize: top session dir does not exist");
|
||||
} else {
|
||||
opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
free(tmp);
|
||||
free(vpid);
|
||||
free(job_session_dir);
|
||||
free(proc_session_dir);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -680,33 +549,3 @@ orte_dir_check_file(const char *root, const char *path)
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static char *orte_build_job_session_dir(char *top_dir,
|
||||
orte_process_name_t *proc,
|
||||
orte_jobid_t jobid)
|
||||
{
|
||||
char *job_session_dir;
|
||||
|
||||
if (ORTE_JOBID_WILDCARD != jobid) {
|
||||
char *job = NULL;
|
||||
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
job_session_dir = NULL;
|
||||
goto out;
|
||||
}
|
||||
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, job, NULL);
|
||||
free(job);
|
||||
if (NULL == job_session_dir) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
} else {
|
||||
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, NULL);
|
||||
if( NULL == job_session_dir) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return job_session_dir;
|
||||
}
|
||||
|
@ -99,19 +99,6 @@ BEGIN_C_DECLS
|
||||
* locate an already existing universe for reconnection
|
||||
* purposes. If set to "true", then the function
|
||||
* creates the directory, if possible.
|
||||
* @param prefix A string variable indicating where the user
|
||||
* stipulated the directory should be found or
|
||||
* placed. A value of "NULL" indicates that the user
|
||||
* specified no location - hence, the function explores
|
||||
* a range of "standard" locations.
|
||||
* @param hostid Name of the host on which the session directory is
|
||||
* being built. Used to build the name of the
|
||||
* "openmpi-sessions-[user]@[host]:[batch]" branch of
|
||||
* the directory tree. NULL indicates that the nodename
|
||||
* found in orte_process_info is to be used.
|
||||
* @param batchid Batch job name, used in batch scheduling
|
||||
* systems. NULL indicates that the default of "0" is
|
||||
* to be used.
|
||||
* @param proc Pointer to a process name for which the session
|
||||
* dir name is desired
|
||||
*
|
||||
@ -120,18 +107,13 @@ BEGIN_C_DECLS
|
||||
* @retval OMPI_ERROR The directory cannot be found (if create is
|
||||
* "false") or created (if create is "true").
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_session_dir(bool create, char *prefix, char *hostid,
|
||||
orte_process_name_t *proc);
|
||||
ORTE_DECLSPEC int orte_session_dir(bool create, orte_process_name_t *proc);
|
||||
|
||||
/*
|
||||
* Construct the session directory name from the input parameters.
|
||||
* This function does no checking that the directory exists, or can be used
|
||||
* Setup session-related directory paths
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_session_dir_get_name(char **fulldirpath,
|
||||
char **prfx,
|
||||
char **frontend,
|
||||
char *hostid,
|
||||
orte_process_name_t *proc);
|
||||
ORTE_DECLSPEC int orte_session_setup_base(orte_process_name_t *proc);
|
||||
|
||||
|
||||
/** The orte_session_dir_finalize() function performs a cleanup of the
|
||||
* session directory tree. It first removes the session directory for
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user