As per the pull request to pmix/master:
https://github.com/pmix/master/pull/71 Make OMPI's current version of pmix120 fail gracefully when sun_path is too long (longer than 108 chars, or 103 chars on OS X), and have OMPI return proper error messages with hints on how to fix the problem.
Этот коммит содержится в:
родитель
896f857fc4
Коммит
52080a5736
@ -143,6 +143,7 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
|
|||||||
{
|
{
|
||||||
int debug_level;
|
int debug_level;
|
||||||
char *tdir, *evar;
|
char *tdir, *evar;
|
||||||
|
char * pmix_pid;
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
|
||||||
/* initialize the output system */
|
/* initialize the output system */
|
||||||
@ -219,7 +220,14 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
|
|||||||
/* now set the address - we use the pid here to reduce collisions */
|
/* now set the address - we use the pid here to reduce collisions */
|
||||||
memset(&myaddress, 0, sizeof(struct sockaddr_un));
|
memset(&myaddress, 0, sizeof(struct sockaddr_un));
|
||||||
myaddress.sun_family = AF_UNIX;
|
myaddress.sun_family = AF_UNIX;
|
||||||
snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/pmix-%d", tdir, pid);
|
asprintf(&pmix_pid, "pmix-%d", pid);
|
||||||
|
// If the above set temporary directory name plus the pmix-PID string
|
||||||
|
// plus the '/' separator are too long, just fail, so the caller
|
||||||
|
// may provide the user with a proper help... *Cough*, *Cough* OSX...
|
||||||
|
if ((strlen(tdir) + strlen(pmix_pid) + 1) > sizeof(myaddress.sun_path)-1) {
|
||||||
|
return PMIX_ERR_INVALID_LENGTH;
|
||||||
|
}
|
||||||
|
snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/%s", tdir, pmix_pid);
|
||||||
asprintf(&myuri, "%s:%lu:%s", pmix_globals.myid.nspace, (unsigned long)pmix_globals.myid.rank, myaddress.sun_path);
|
asprintf(&myuri, "%s:%lu:%s", pmix_globals.myid.nspace, (unsigned long)pmix_globals.myid.rank, myaddress.sun_path);
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,19 +72,20 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address)
|
|||||||
{
|
{
|
||||||
int flags;
|
int flags;
|
||||||
pmix_status_t rc;
|
pmix_status_t rc;
|
||||||
unsigned int addrlen;
|
socklen_t addrlen;
|
||||||
char *ptr;
|
char *ptr;
|
||||||
|
|
||||||
/* create a listen socket for incoming connection attempts */
|
/* create a listen socket for incoming connection attempts */
|
||||||
pmix_server_globals.listen_socket = socket(PF_UNIX, SOCK_STREAM, 0);
|
pmix_server_globals.listen_socket = socket(PF_UNIX, SOCK_STREAM, 0);
|
||||||
if (pmix_server_globals.listen_socket < 0) {
|
if (pmix_server_globals.listen_socket < 0) {
|
||||||
printf("%s:%d socket() failed", __FILE__, __LINE__);
|
printf("%s:%d socket() failed\n", __FILE__, __LINE__);
|
||||||
return PMIX_ERROR;
|
return PMIX_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
addrlen = sizeof(struct sockaddr_un);
|
addrlen = sizeof(struct sockaddr_un);
|
||||||
if (bind(pmix_server_globals.listen_socket, (struct sockaddr*)address, addrlen) < 0) {
|
if (bind(pmix_server_globals.listen_socket, (struct sockaddr*)address, addrlen) < 0) {
|
||||||
printf("%s:%d bind() failed", __FILE__, __LINE__);
|
printf("%s:%d bind() failed error:%s\n", __FILE__, __LINE__,
|
||||||
|
strerror(errno));
|
||||||
return PMIX_ERROR;
|
return PMIX_ERROR;
|
||||||
}
|
}
|
||||||
/* set the mode as required */
|
/* set the mode as required */
|
||||||
@ -95,18 +96,18 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address)
|
|||||||
|
|
||||||
/* setup listen backlog to maximum allowed by kernel */
|
/* setup listen backlog to maximum allowed by kernel */
|
||||||
if (listen(pmix_server_globals.listen_socket, SOMAXCONN) < 0) {
|
if (listen(pmix_server_globals.listen_socket, SOMAXCONN) < 0) {
|
||||||
printf("%s:%d listen() failed", __FILE__, __LINE__);
|
printf("%s:%d listen() failed\n", __FILE__, __LINE__);
|
||||||
return PMIX_ERROR;
|
return PMIX_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set socket up to be non-blocking, otherwise accept could block */
|
/* set socket up to be non-blocking, otherwise accept could block */
|
||||||
if ((flags = fcntl(pmix_server_globals.listen_socket, F_GETFL, 0)) < 0) {
|
if ((flags = fcntl(pmix_server_globals.listen_socket, F_GETFL, 0)) < 0) {
|
||||||
printf("%s:%d fcntl(F_GETFL) failed", __FILE__, __LINE__);
|
printf("%s:%d fcntl(F_GETFL) failed\n", __FILE__, __LINE__);
|
||||||
return PMIX_ERROR;
|
return PMIX_ERROR;
|
||||||
}
|
}
|
||||||
flags |= O_NONBLOCK;
|
flags |= O_NONBLOCK;
|
||||||
if (fcntl(pmix_server_globals.listen_socket, F_SETFL, flags) < 0) {
|
if (fcntl(pmix_server_globals.listen_socket, F_SETFL, flags) < 0) {
|
||||||
printf("%s:%d fcntl(F_SETFL) failed", __FILE__, __LINE__);
|
printf("%s:%d fcntl(F_SETFL) failed\n", __FILE__, __LINE__);
|
||||||
return PMIX_ERROR;
|
return PMIX_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -515,7 +515,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
|||||||
/* setup the PMIx server */
|
/* setup the PMIx server */
|
||||||
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "pmix server init";
|
error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -634,7 +634,7 @@ static int rte_init(void)
|
|||||||
/* setup the PMIx server */
|
/* setup the PMIx server */
|
||||||
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "pmix server init";
|
error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,6 +43,15 @@ a core that does not exist on this node:
|
|||||||
The MCA param directing this behavior is orte_daemon_cores.
|
The MCA param directing this behavior is orte_daemon_cores.
|
||||||
Please correct the request and try again.
|
Please correct the request and try again.
|
||||||
#
|
#
|
||||||
|
[orterun:pmix-failed]
|
||||||
|
The call to pmix_server_init() failed. This may be due to your
|
||||||
|
system's limit on the path length of Unix domain sockets.
|
||||||
|
|
||||||
|
orte_proc_session_dir: %s
|
||||||
|
|
||||||
|
Please try to set TMPDIR to something short (like /tmp) or change
|
||||||
|
your computer's name (see uname -n).
|
||||||
|
#
|
||||||
[cwd]
|
[cwd]
|
||||||
A dynamic operation (%s) was requested that requires us to obtain
|
A dynamic operation (%s) was requested that requires us to obtain
|
||||||
the current working directory. Unfortunately, an error was returned
|
the current working directory. Unfortunately, an error was returned
|
||||||
|
@ -246,6 +246,9 @@ int pmix_server_init(void)
|
|||||||
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
|
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
/* memory cleanup will occur when finalize is called */
|
/* memory cleanup will occur when finalize is called */
|
||||||
|
orte_show_help("help-orterun.txt", "orterun:pmix-failed", true,
|
||||||
|
orte_process_info.proc_session_dir);
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
OPAL_LIST_DESTRUCT(&info);
|
OPAL_LIST_DESTRUCT(&info);
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user