1
1

As per the pull request to pmix/master:

https://github.com/pmix/master/pull/71

Make OMPI's current version of pmix120 fail gracefully when the
resulting sun_path is too long (longer than 108 chars, or 103 chars
on OSX), and have OMPI return a proper error message with hints on
how to fix the problem.
Этот коммит содержится в:
Rainer Keller 2016-04-07 22:12:53 +02:00
родитель 896f857fc4
Коммит 52080a5736
6 изменённых файлов: 30 добавлений и 9 удалений

Просмотреть файл

@ -143,6 +143,7 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
{
int debug_level;
char *tdir, *evar;
char * pmix_pid;
pid_t pid;
/* initialize the output system */
@ -219,7 +220,14 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
/* now set the address - we use the pid here to reduce collisions */
memset(&myaddress, 0, sizeof(struct sockaddr_un));
myaddress.sun_family = AF_UNIX;
snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/pmix-%d", tdir, pid);
asprintf(&pmix_pid, "pmix-%d", pid);
// If the above set temporary directory name plus the pmix-PID string
// plus the '/' separator are too long, just fail, so the caller
// may provide the user with a proper help... *Cough*, *Cough* OSX...
if ((strlen(tdir) + strlen(pmix_pid) + 1) > sizeof(myaddress.sun_path)-1) {
return PMIX_ERR_INVALID_LENGTH;
}
snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/%s", tdir, pmix_pid);
asprintf(&myuri, "%s:%lu:%s", pmix_globals.myid.nspace, (unsigned long)pmix_globals.myid.rank, myaddress.sun_path);

Просмотреть файл

@ -72,19 +72,20 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address)
{
int flags;
pmix_status_t rc;
unsigned int addrlen;
socklen_t addrlen;
char *ptr;
/* create a listen socket for incoming connection attempts */
pmix_server_globals.listen_socket = socket(PF_UNIX, SOCK_STREAM, 0);
if (pmix_server_globals.listen_socket < 0) {
printf("%s:%d socket() failed", __FILE__, __LINE__);
printf("%s:%d socket() failed\n", __FILE__, __LINE__);
return PMIX_ERROR;
}
addrlen = sizeof(struct sockaddr_un);
if (bind(pmix_server_globals.listen_socket, (struct sockaddr*)address, addrlen) < 0) {
printf("%s:%d bind() failed", __FILE__, __LINE__);
printf("%s:%d bind() failed error:%s\n", __FILE__, __LINE__,
strerror(errno));
return PMIX_ERROR;
}
/* set the mode as required */
@ -95,18 +96,18 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address)
/* setup listen backlog to maximum allowed by kernel */
if (listen(pmix_server_globals.listen_socket, SOMAXCONN) < 0) {
printf("%s:%d listen() failed", __FILE__, __LINE__);
printf("%s:%d listen() failed\n", __FILE__, __LINE__);
return PMIX_ERROR;
}
/* set socket up to be non-blocking, otherwise accept could block */
if ((flags = fcntl(pmix_server_globals.listen_socket, F_GETFL, 0)) < 0) {
printf("%s:%d fcntl(F_GETFL) failed", __FILE__, __LINE__);
printf("%s:%d fcntl(F_GETFL) failed\n", __FILE__, __LINE__);
return PMIX_ERROR;
}
flags |= O_NONBLOCK;
if (fcntl(pmix_server_globals.listen_socket, F_SETFL, flags) < 0) {
printf("%s:%d fcntl(F_SETFL) failed", __FILE__, __LINE__);
printf("%s:%d fcntl(F_SETFL) failed\n", __FILE__, __LINE__);
return PMIX_ERROR;
}

Просмотреть файл

@ -515,7 +515,7 @@ int orte_ess_base_orted_setup(char **hosts)
/* setup the PMIx server */
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
ORTE_ERROR_LOG(ret);
error = "pmix server init";
error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init";
goto error;
}

Просмотреть файл

@ -634,7 +634,7 @@ static int rte_init(void)
/* setup the PMIx server */
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
ORTE_ERROR_LOG(ret);
error = "pmix server init";
error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init";
goto error;
}

Просмотреть файл

@ -43,6 +43,15 @@ a core that does not exist on this node:
The MCA param directing this behavior is orte_daemon_cores.
Please correct the request and try again.
#
[orterun:pmix-failed]
The call to pmix_server_init() failed. This may be due to your
system's limit on the path length of Unix domain sockets.
orte_proc_session_dir: %s
Please try setting TMPDIR to something short (like /tmp) or change
your computer's name (see uname -n).
#
[cwd]
A dynamic operation (%s) was requested that requires us to obtain
the current working directory. Unfortunately, an error was returned

Просмотреть файл

@ -246,6 +246,9 @@ int pmix_server_init(void)
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
ORTE_ERROR_LOG(rc);
/* memory cleanup will occur when finalize is called */
orte_show_help("help-orterun.txt", "orterun:pmix-failed", true,
orte_process_info.proc_session_dir);
return rc;
}
OPAL_LIST_DESTRUCT(&info);