From 52080a57362c6cec8cd42115dfaef8f7429f5302 Mon Sep 17 00:00:00 2001 From: Rainer Keller Date: Thu, 7 Apr 2016 22:12:53 +0200 Subject: [PATCH] As per the pull request to pmix/master: https://github.com/pmix/master/pull/71 Have OMPI's current version of pmix120 fail gracefully when the socket path is too long for sun_path (longer than 108 chars, or 103 chars on OS X), and have OMPI return a proper error message with hints on how to fix it. --- opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c | 10 +++++++++- .../pmix120/pmix/src/server/pmix_server_listener.c | 13 +++++++------ orte/mca/ess/base/ess_base_std_orted.c | 2 +- orte/mca/ess/hnp/ess_hnp_module.c | 2 +- orte/orted/help-orted.txt | 9 +++++++++ orte/orted/pmix/pmix_server.c | 3 +++ 6 files changed, 30 insertions(+), 9 deletions(-) diff --git a/opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c index 04dd1eafd0..fcdfe709ab 100644 --- a/opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c @@ -143,6 +143,7 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) { int debug_level; char *tdir, *evar; + char * pmix_pid; pid_t pid; /* initialize the output system */ @@ -219,7 +220,14 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) /* now set the address - we use the pid here to reduce collisions */ memset(&myaddress, 0, sizeof(struct sockaddr_un)); myaddress.sun_family = AF_UNIX; - snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/pmix-%d", tdir, pid); + asprintf(&pmix_pid, "pmix-%d", pid); + // If the above set temporary directory name plus the pmix-PID string + // plus the '/' separator are too long, just fail, so the caller + // may provide the user with a proper help... *Cough*, *Cough* OSX... 
+ if ((strlen(tdir) + strlen(pmix_pid) + 1) > sizeof(myaddress.sun_path)-1) { + return PMIX_ERR_INVALID_LENGTH; + } + snprintf(myaddress.sun_path, sizeof(myaddress.sun_path)-1, "%s/%s", tdir, pmix_pid); asprintf(&myuri, "%s:%lu:%s", pmix_globals.myid.nspace, (unsigned long)pmix_globals.myid.rank, myaddress.sun_path); diff --git a/opal/mca/pmix/pmix120/pmix/src/server/pmix_server_listener.c b/opal/mca/pmix/pmix120/pmix/src/server/pmix_server_listener.c index 4fe76cfb9c..b0538fddc8 100644 --- a/opal/mca/pmix/pmix120/pmix/src/server/pmix_server_listener.c +++ b/opal/mca/pmix/pmix120/pmix/src/server/pmix_server_listener.c @@ -72,19 +72,20 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address) { int flags; pmix_status_t rc; - unsigned int addrlen; + socklen_t addrlen; char *ptr; /* create a listen socket for incoming connection attempts */ pmix_server_globals.listen_socket = socket(PF_UNIX, SOCK_STREAM, 0); if (pmix_server_globals.listen_socket < 0) { - printf("%s:%d socket() failed", __FILE__, __LINE__); + printf("%s:%d socket() failed\n", __FILE__, __LINE__); return PMIX_ERROR; } addrlen = sizeof(struct sockaddr_un); if (bind(pmix_server_globals.listen_socket, (struct sockaddr*)address, addrlen) < 0) { - printf("%s:%d bind() failed", __FILE__, __LINE__); + printf("%s:%d bind() failed error:%s\n", __FILE__, __LINE__, + strerror(errno)); return PMIX_ERROR; } /* set the mode as required */ @@ -95,18 +96,18 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address) /* setup listen backlog to maximum allowed by kernel */ if (listen(pmix_server_globals.listen_socket, SOMAXCONN) < 0) { - printf("%s:%d listen() failed", __FILE__, __LINE__); + printf("%s:%d listen() failed\n", __FILE__, __LINE__); return PMIX_ERROR; } /* set socket up to be non-blocking, otherwise accept could block */ if ((flags = fcntl(pmix_server_globals.listen_socket, F_GETFL, 0)) < 0) { - printf("%s:%d fcntl(F_GETFL) failed", __FILE__, __LINE__); + printf("%s:%d fcntl(F_GETFL) 
failed\n", __FILE__, __LINE__); return PMIX_ERROR; } flags |= O_NONBLOCK; if (fcntl(pmix_server_globals.listen_socket, F_SETFL, flags) < 0) { - printf("%s:%d fcntl(F_SETFL) failed", __FILE__, __LINE__); + printf("%s:%d fcntl(F_SETFL) failed\n", __FILE__, __LINE__); return PMIX_ERROR; } diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 4865412cf4..6a94286ce6 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -515,7 +515,7 @@ int orte_ess_base_orted_setup(char **hosts) /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); - error = "pmix server init"; + error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init"; goto error; } diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index ed03c13ef3..dc41fb6f9c 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -634,7 +634,7 @@ static int rte_init(void) /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); - error = "pmix server init"; + error = "Try a shorter TMPDIR var. or change your computer's name (see uname -n), since pmix_server_init"; goto error; } diff --git a/orte/orted/help-orted.txt b/orte/orted/help-orted.txt index fb271f90d8..827edeccd2 100644 --- a/orte/orted/help-orted.txt +++ b/orte/orted/help-orted.txt @@ -43,6 +43,15 @@ a core that does not exist on this node: The MCA param directing this behavior is orte_daemon_cores. Please correct the request and try again. # +[orterun:pmix-failed] +The call to pmix_server_init() failed. This may be due to your +system's limit on the path length of Unix domain sockets. + + orte_proc_session_dir: %s + +Please try to set TMPDIR to something short (like /tmp) or change +your computer's name (see uname -n). 
+# [cwd] A dynamic operation (%s) was requested that requires us to obtain the current working directory. Unfortunately, an error was returned diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index ecd800d4fe..95a0829689 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -246,6 +246,9 @@ int pmix_server_init(void) if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { ORTE_ERROR_LOG(rc); /* memory cleanup will occur when finalize is called */ + orte_show_help("help-orterun.txt", "orterun:pmix-failed", true, + orte_process_info.proc_session_dir); + return rc; } OPAL_LIST_DESTRUCT(&info);