1
1

Update the session directory structure. Restore the creation of a top-level directory based on the user ID so that everything is contained under the user's top-level directory. Name the next level down (the "job family" level) either after the pid of mpirun (as "pid.N") or, if the process was not launched by mpirun, after the job family. This allows direct-launched procs to rendezvous properly.

Этот коммит содержится в:
Ralph Castain 2016-08-15 22:46:46 -05:00
родитель ecbedee8bb
Коммит ae2af61ee3
7 изменённых файлов: 96 добавлений и 49 удалений

Просмотреть файл

@ -205,6 +205,7 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata)
static int component_startup(void)
{
int rc=ORTE_SUCCESS;
char *session;
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s USOCK STARTUP",
@ -213,11 +214,18 @@ static int component_startup(void)
/* setup the path to the daemon rendezvous point */
memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un));
mca_oob_usock_component.address.sun_family = AF_UNIX;
session = opal_os_path(false, orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
orte_process_info.jobfam_session_dir,
"usock", NULL);
if ((strlen(session) + 1) > sizeof(mca_oob_usock_component.address.sun_path)-1) {
opal_output(0, "SESSION DIR TOO LONG");
return ORTE_ERR_NOT_SUPPORTED;
}
snprintf(mca_oob_usock_component.address.sun_path,
sizeof(mca_oob_usock_component.address.sun_path)-1,
"%s/%s/%s/0/%s", orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
ORTE_JOB_FAMILY_PRINT(ORTE_PROC_MY_NAME->jobid), "usock");
"%s", session);
free(session);
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"SUNPATH: %s", mca_oob_usock_component.address.sun_path);

Просмотреть файл

@ -944,6 +944,7 @@ static int setup_fork(orte_job_t *jdata,
/* forcibly set the local tmpdir base and top session dir to match ours */
opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);
opal_setenv("OMPI_MCA_orte_jobfam_session_dir", orte_process_info.jobfam_session_dir, true, &app->env);
/* MPI-3 requires we provide some further info to the procs,
* so we pass them as envars to avoid introducing further

Просмотреть файл

@ -57,6 +57,7 @@
#include "opal/util/show_help.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/util/os_path.h"
#include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h"
@ -261,9 +262,12 @@ int pmix_server_init(void)
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
kv->type = OPAL_STRING;
kv->data.string = strdup(orte_process_info.tmpdir_base);
kv->data.string = opal_os_path(false, orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
orte_process_info.jobfam_session_dir, NULL);
opal_list_append(&info, &kv->super);
/* use the same for the system temp directory */
/* use the same for the system temp directory - this is
* where the system-level tool connections will go */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
kv->type = OPAL_STRING;

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -51,6 +51,7 @@ static char *orte_tmpdir_base = NULL;
static char *orte_local_tmpdir_base = NULL;
static char *orte_remote_tmpdir_base = NULL;
static char *orte_top_session_dir = NULL;
static char *orte_jobfam_session_dir = NULL;
int orte_register_params(void)
{
@ -165,6 +166,20 @@ int orte_register_params(void)
orte_process_info.top_session_dir = strdup(orte_top_session_dir);
}
orte_jobfam_session_dir = NULL;
(void) mca_base_var_register ("orte", "orte", NULL, "jobfam_session_dir",
"The jobfamily session directory for applications",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
&orte_jobfam_session_dir);
if (NULL != orte_jobfam_session_dir) {
if (NULL != orte_process_info.jobfam_session_dir) {
free(orte_process_info.jobfam_session_dir);
}
orte_process_info.jobfam_session_dir = strdup(orte_jobfam_session_dir);
}
orte_prohibited_session_dirs = NULL;
(void) mca_base_var_register ("orte", "orte", NULL, "no_session_dirs",
"Prohibited locations for session directories (multiple locations separated by ',', default=NULL)",

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -80,6 +80,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
.num_local_peers = 0,
.tmpdir_base = NULL,
.top_session_dir = NULL,
.jobfam_session_dir = NULL,
.job_session_dir = NULL,
.proc_session_dir = NULL,
.sock_stdin = NULL,
@ -294,6 +295,11 @@ int orte_proc_info_finalize(void)
orte_process_info.top_session_dir = NULL;
}
if (NULL != orte_process_info.jobfam_session_dir) {
free(orte_process_info.jobfam_session_dir);
orte_process_info.jobfam_session_dir = NULL;
}
if (NULL != orte_process_info.job_session_dir) {
free(orte_process_info.job_session_dir);
orte_process_info.job_session_dir = NULL;

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -119,6 +119,7 @@ struct orte_proc_info_t {
*/
char *tmpdir_base; /**< Base directory of the session dir tree */
char *top_session_dir; /**< Top-most directory of the session tree */
char *jobfam_session_dir; /**< Session directory for this family of jobs (i.e., share same mpirun) */
char *job_session_dir; /**< Session directory for job */
char *proc_session_dir; /**< Session directory for the process */

Просмотреть файл

@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)
/*
* Construct the fullpath to the session directory - it
* will consist of "ompi.<hostname>.<pid>"
* will consist of "ompi.<hostname>.<effective-uid>", and
* have subdirs:
*
* pid - the pid of the mpirun that oversees this job. Note
* that direct-launched processes will have manufactured
* this value
*
* jobid - jobid of the application being executed
*
* vpid - vpid of the process
*/
int
orte_session_dir_get_name(char **fulldirpath,
@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
bool prefix_provided = false;
int exit_status = ORTE_SUCCESS;
size_t len;
uid_t uid;
/* Ensure that system info is set */
orte_proc_info();
/* get the effective uid */
uid = geteuid();
/*
* set the 'hostname'
*/
@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
/* construct the frontend of the session directory*/
if (NULL != orte_process_info.top_session_dir) {
frontend = strdup(orte_process_info.top_session_dir);
}
else { /* If not set then construct it */
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)orte_process_info.pid)) {
} else { /* If not set then construct it */
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)uid)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* construct the next level down, which belongs to the
* job family. This is related to the mpirun that launched
* the job, or is an arbitrary (agreed upon) value if
* direct launched */
if (ORTE_PROC_IS_HNP) {
if (0 > asprintf(&jobfam, "pid.%lu", (unsigned long)orte_process_info.pid)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
orte_process_info.jobfam_session_dir = strdup(jobfam);
} else if (NULL != orte_process_info.jobfam_session_dir) {
/* we had a job family session dir passed down to us by mpirun */
jobfam = strdup(orte_process_info.jobfam_session_dir);
} else {
/* we were not given one, so define it */
if (NULL == proc) {
jobfam = strdup("jobfam");
} else {
if (0 > asprintf(&jobfam, "jf.%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}
orte_process_info.jobfam_session_dir = strdup(jobfam);
}
/*
* Construct the session directory
*/
/* If we were given a valid vpid then we can construct it fully into:
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
*/
/* If we were given a valid vpid then we can construct it fully */
if( NULL != proc) {
if (ORTE_VPID_INVALID != proc->vpid) {
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
@ -198,17 +229,7 @@ orte_session_dir_get_name(char **fulldirpath,
exit_status = ORTE_ERROR;
goto cleanup;
}
}
/* If we were given a valid jobid then we can construct it partially into:
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
*/
else if (ORTE_JOBID_INVALID != proc->jobid) {
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
} else if (ORTE_JOBID_INVALID != proc->jobid) {
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
exit_status = ORTE_ERROR;
goto cleanup;
}
} /* if both are invalid */
else {
} else {
sessions = strdup(frontend); /* must dup this to avoid double-free later */
}
} /* If we were not given a proc at all, then we just set it to frontend
*/
else {
} else {
/* If we were not given a proc at all, then we just set it to frontend */
sessions = strdup(frontend); /* must dup this to avoid double-free later */
}
@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
orte_process_name_t *proc,
orte_jobid_t jobid)
{
char *jobfam = NULL;
char *job_session_dir;
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return NULL;
}
if (ORTE_JOBID_WILDCARD != jobid) {
char *job = NULL;
@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
job_session_dir = NULL;
goto out;
}
job_session_dir = opal_os_path(false, top_dir, jobfam, job, NULL);
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, job, NULL);
free(job);
if (NULL == job_session_dir) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
}
} else {
job_session_dir = opal_os_path(false, top_dir, jobfam, NULL);
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, NULL);
if( NULL == job_session_dir) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
}
}
out:
free(jobfam);
return job_session_dir;
}