Update the session directory structure. Restore the creation of a top-level directory based on the user ID so that everything is contained under the user's own top-level directory. Name the next level down (the "job family" level) "pid.N", where N is the pid of mpirun, when the job is launched by mpirun; otherwise name it after the job family ("jf.N"). This allows direct-launched procs to rendezvous properly.
Этот коммит содержится в:
родитель
ecbedee8bb
Коммит
ae2af61ee3
@ -205,6 +205,7 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata)
|
||||
static int component_startup(void)
|
||||
{
|
||||
int rc=ORTE_SUCCESS;
|
||||
char *session;
|
||||
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s USOCK STARTUP",
|
||||
@ -213,11 +214,18 @@ static int component_startup(void)
|
||||
/* setup the path to the daemon rendezvous point */
|
||||
memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un));
|
||||
mca_oob_usock_component.address.sun_family = AF_UNIX;
|
||||
session = opal_os_path(false, orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
orte_process_info.jobfam_session_dir,
|
||||
"usock", NULL);
|
||||
if ((strlen(session) + 1) > sizeof(mca_oob_usock_component.address.sun_path)-1) {
|
||||
opal_output(0, "SESSION DIR TOO LONG");
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
snprintf(mca_oob_usock_component.address.sun_path,
|
||||
sizeof(mca_oob_usock_component.address.sun_path)-1,
|
||||
"%s/%s/%s/0/%s", orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
ORTE_JOB_FAMILY_PRINT(ORTE_PROC_MY_NAME->jobid), "usock");
|
||||
"%s", session);
|
||||
free(session);
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"SUNPATH: %s", mca_oob_usock_component.address.sun_path);
|
||||
|
||||
|
@ -944,6 +944,7 @@ static int setup_fork(orte_job_t *jdata,
|
||||
/* forcibly set the local tmpdir base and top session dir to match ours */
|
||||
opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
|
||||
opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);
|
||||
opal_setenv("OMPI_MCA_orte_jobfam_session_dir", orte_process_info.jobfam_session_dir, true, &app->env);
|
||||
|
||||
/* MPI-3 requires we provide some further info to the procs,
|
||||
* so we pass them as envars to avoid introducing further
|
||||
|
@ -57,6 +57,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -261,9 +262,12 @@ int pmix_server_init(void)
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = strdup(orte_process_info.tmpdir_base);
|
||||
kv->data.string = opal_os_path(false, orte_process_info.tmpdir_base,
|
||||
orte_process_info.top_session_dir,
|
||||
orte_process_info.jobfam_session_dir, NULL);
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* use the same for the system temp directory */
|
||||
/* use the same for the system temp directory - this is
|
||||
* where the system-level tool connections will go */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
|
||||
kv->type = OPAL_STRING;
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -51,6 +51,7 @@ static char *orte_tmpdir_base = NULL;
|
||||
static char *orte_local_tmpdir_base = NULL;
|
||||
static char *orte_remote_tmpdir_base = NULL;
|
||||
static char *orte_top_session_dir = NULL;
|
||||
static char *orte_jobfam_session_dir = NULL;
|
||||
|
||||
int orte_register_params(void)
|
||||
{
|
||||
@ -165,6 +166,20 @@ int orte_register_params(void)
|
||||
orte_process_info.top_session_dir = strdup(orte_top_session_dir);
|
||||
}
|
||||
|
||||
orte_jobfam_session_dir = NULL;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "jobfam_session_dir",
|
||||
"The jobfamily session directory for applications",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&orte_jobfam_session_dir);
|
||||
|
||||
if (NULL != orte_jobfam_session_dir) {
|
||||
if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
free(orte_process_info.jobfam_session_dir);
|
||||
}
|
||||
orte_process_info.jobfam_session_dir = strdup(orte_jobfam_session_dir);
|
||||
}
|
||||
|
||||
orte_prohibited_session_dirs = NULL;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "no_session_dirs",
|
||||
"Prohibited locations for session directories (multiple locations separated by ',', default=NULL)",
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -80,6 +80,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
||||
.num_local_peers = 0,
|
||||
.tmpdir_base = NULL,
|
||||
.top_session_dir = NULL,
|
||||
.jobfam_session_dir = NULL,
|
||||
.job_session_dir = NULL,
|
||||
.proc_session_dir = NULL,
|
||||
.sock_stdin = NULL,
|
||||
@ -294,6 +295,11 @@ int orte_proc_info_finalize(void)
|
||||
orte_process_info.top_session_dir = NULL;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
free(orte_process_info.jobfam_session_dir);
|
||||
orte_process_info.jobfam_session_dir = NULL;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.job_session_dir) {
|
||||
free(orte_process_info.job_session_dir);
|
||||
orte_process_info.job_session_dir = NULL;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -119,6 +119,7 @@ struct orte_proc_info_t {
|
||||
*/
|
||||
char *tmpdir_base; /**< Base directory of the session dir tree */
|
||||
char *top_session_dir; /**< Top-most directory of the session tree */
|
||||
char *jobfam_session_dir; /**< Session directory for this family of jobs (i.e., share same mpirun) */
|
||||
char *job_session_dir; /**< Session directory for job */
|
||||
char *proc_session_dir; /**< Session directory for the process */
|
||||
|
||||
|
@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)
|
||||
|
||||
/*
|
||||
* Construct the fullpath to the session directory - it
|
||||
* will consist of "ompi.<hostname>.<pid>"
|
||||
* will consist of "ompi.<hostname>.<effective-uid>", and
|
||||
* have subdirs:
|
||||
*
|
||||
* pid - the pid of the mpirun that oversees this job. Note
|
||||
* that direct-launched processes will have manufactured
|
||||
* this value
|
||||
*
|
||||
* jobid - jobid of the application being executed
|
||||
*
|
||||
* vpid - vpid of the process
|
||||
*/
|
||||
int
|
||||
orte_session_dir_get_name(char **fulldirpath,
|
||||
@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
|
||||
bool prefix_provided = false;
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
size_t len;
|
||||
uid_t uid;
|
||||
|
||||
/* Ensure that system info is set */
|
||||
orte_proc_info();
|
||||
|
||||
/* get the effective uid */
|
||||
uid = geteuid();
|
||||
|
||||
/*
|
||||
* set the 'hostname'
|
||||
*/
|
||||
@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
|
||||
/* construct the frontend of the session directory*/
|
||||
if (NULL != orte_process_info.top_session_dir) {
|
||||
frontend = strdup(orte_process_info.top_session_dir);
|
||||
}
|
||||
else { /* If not set then construct it */
|
||||
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)orte_process_info.pid)) {
|
||||
} else { /* If not set then construct it */
|
||||
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)uid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* construct the next level down, which belongs to the
|
||||
* job family. This is related to the mpirun that launched
|
||||
* the job, or is an arbitrary (agreed upon) value if
|
||||
* direct launched */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
if (0 > asprintf(&jobfam, "pid.%lu", (unsigned long)orte_process_info.pid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
orte_process_info.jobfam_session_dir = strdup(jobfam);
|
||||
} else if (NULL != orte_process_info.jobfam_session_dir) {
|
||||
/* we had a job family session dir passed down to us by mpirun */
|
||||
jobfam = strdup(orte_process_info.jobfam_session_dir);
|
||||
} else {
|
||||
/* we were not given one, so define it */
|
||||
if (NULL == proc) {
|
||||
jobfam = strdup("jobfam");
|
||||
} else {
|
||||
if (0 > asprintf(&jobfam, "jf.%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
orte_process_info.jobfam_session_dir = strdup(jobfam);
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct the session directory
|
||||
*/
|
||||
/* If we were given a valid vpid then we can construct it fully into:
|
||||
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
|
||||
*/
|
||||
/* If we were given a valid vpid then we can construct it fully */
|
||||
if( NULL != proc) {
|
||||
if (ORTE_VPID_INVALID != proc->vpid) {
|
||||
|
||||
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
@ -198,17 +229,7 @@ orte_session_dir_get_name(char **fulldirpath,
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* If we were given a valid jobid then we can construct it partially into:
|
||||
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
|
||||
*/
|
||||
else if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
} else if (ORTE_JOBID_INVALID != proc->jobid) {
|
||||
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
} /* if both are invalid */
|
||||
else {
|
||||
} else {
|
||||
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||
}
|
||||
|
||||
} /* If we were not given a proc at all, then we just set it to frontend
|
||||
*/
|
||||
else {
|
||||
} else {
|
||||
/* If we were not given a proc at all, then we just set it to frontend */
|
||||
sessions = strdup(frontend); /* must dup this to avoid double-free later */
|
||||
}
|
||||
|
||||
@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
|
||||
orte_process_name_t *proc,
|
||||
orte_jobid_t jobid)
|
||||
{
|
||||
char *jobfam = NULL;
|
||||
char *job_session_dir;
|
||||
|
||||
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (ORTE_JOBID_WILDCARD != jobid) {
|
||||
char *job = NULL;
|
||||
|
||||
@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
|
||||
job_session_dir = NULL;
|
||||
goto out;
|
||||
}
|
||||
job_session_dir = opal_os_path(false, top_dir, jobfam, job, NULL);
|
||||
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, job, NULL);
|
||||
free(job);
|
||||
if (NULL == job_session_dir) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
} else {
|
||||
job_session_dir = opal_os_path(false, top_dir, jobfam, NULL);
|
||||
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, NULL);
|
||||
if( NULL == job_session_dir) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
free(jobfam);
|
||||
return job_session_dir;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user