diff --git a/contrib/cleanperms b/contrib/cleanperms new file mode 100755 index 0000000000..9b9373ca99 --- /dev/null +++ b/contrib/cleanperms @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +find . -type f -name "*.c" -perm /u+x -print -exec chmod -x {} \; +find . -type f -name Makefile.am -perm /u+x -print -exec chmod -x {} \; +find . -type f -name "*.h" -perm /u+x -print -exec chmod -x {} \; +find . -type f -name Makefile.include -perm /u+x -print -exec chmod -x {} \; +find . -type f -name Makefile -perm /u+x -print -exec chmod -x {} \; +find . -type f -name "*.m4" -perm /u+x -print -exec chmod -x {} \; +find . -type f -name "*.ac" -perm /u+x -print -exec chmod -x {} \; +find . -type f -name "*.txt" -perm /u+x -print -exec chmod -x {} \; +find . -type f -name "*.l" -perm /u+x -print -exec chmod -x {} \; diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/server/Makefile.include index e62f430538..e3577865ae 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/Makefile.include +++ b/opal/mca/pmix/pmix2x/pmix/src/server/Makefile.include @@ -11,6 +11,8 @@ # $HEADER$ # +dist_pmixdata_DATA += server/help-pmix-server.txt + headers += \ server/pmix_server_ops.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/help-pmix-server.txt b/opal/mca/pmix/pmix2x/pmix/src/server/help-pmix-server.txt new file mode 100644 index 0000000000..1266926083 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/server/help-pmix-server.txt @@ -0,0 +1,35 @@ +# -*- text -*- +# +# Copyright (c) 2016 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# +[rnd-path-too-long] +The PMIx server was unable to set up a rendezvous file due to your +system's limit on the path length of Unix domain sockets. + + Temporary directory: %s + Rendezvous filename: %s + +Please try to set TMPDIR to something short (like /tmp) or change +your computer's name to something shorter (see uname -n). 
+[listener-failed-start] +The PMIx server was unable to start its listening thread. This is +usually due to a conflicting stale named pipe from a prior failed +job, thus preventing the server from binding to its assigned socket. + + Rendezvous filename: %s + +Please remove the stale file and try again. +[data-store-failed] +The PMIx server was unable to store the specified key-value: + + Key: %s + +The precise reason for the failure was provided in the above +"error-log" message. This is probably something that should +be referred to the PMIx developers. diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c index 7d2105ad9c..24bd5c28eb 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c @@ -52,6 +52,10 @@ #include "src/util/error.h" #include "src/util/output.h" #include "src/util/pmix_environ.h" +#include "src/util/show_help.h" +#include "src/mca/base/base.h" +#include "src/mca/base/pmix_mca_base_var.h" +#include "src/mca/pinstalldirs/base/base.h" #include "src/runtime/pmix_progress_threads.h" #include "src/usock/usock.h" #include "src/sec/pmix_sec.h" @@ -121,11 +125,25 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) char *tdir, *evar; char * pmix_pid; pmix_listener_t *listener; + pmix_status_t ret; /* initialize the output system */ if (!pmix_output_init()) { + fprintf(stderr, "PMIx server was unable to initialize its output system\n"); return PMIX_ERR_INIT; } + /* initialize install dirs code */ + if (PMIX_SUCCESS != (ret = pmix_mca_base_framework_open(&pmix_pinstalldirs_base_framework, 0))) { + fprintf(stderr, "pmix_pinstalldirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of PMIX_SUCCESS)\n", + __FILE__, __LINE__, ret); + return ret; + } + + if (PMIX_SUCCESS != pmix_show_help_init()) { + fprintf(stderr, "PMIx server was unable to initialize its show_help system\n"); + 
return PMIX_ERR_INIT; + } + /* setup the globals */ pmix_globals_init(); memset(&pmix_server_globals, 0, sizeof(pmix_server_globals)); @@ -198,7 +216,9 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) if (0 > asprintf(&pmix_pid, "%s/pmix-%d", tdir, mypid)) { return PMIX_ERR_NOMEM; } + if ((strlen(pmix_pid) + 1) > sizeof(listener->address.sun_path)-1) { + pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid); free(pmix_pid); return PMIX_ERR_INVALID_LENGTH; } @@ -352,6 +372,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, return PMIX_ERR_NOMEM; } if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) { + pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid); free(pmix_pid); return PMIX_ERR_INVALID_LENGTH; } @@ -380,6 +401,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, return PMIX_ERR_NOMEM; } if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) { + pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid); free(pmix_pid); return PMIX_ERR_INVALID_LENGTH; } @@ -413,6 +435,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, } if (need_listener) { if (PMIX_SUCCESS != pmix_start_listening()) { + pmix_show_help("help-pmix-server.txt", "listener-failed-start", true, tl->address.sun_path); PMIx_server_finalize(); return PMIX_ERR_INIT; } @@ -441,6 +464,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, kv.value = &info[n].value; if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(&pmix_server_globals.gdata, &kv, 1, PMIX_KVAL))) { PMIX_ERROR_LOG(rc); + pmix_show_help("help-pmix-server.txt", "data-store-failed", true, kv.key); /* protect the incoming data */ kv.key = NULL; kv.value = NULL; diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 6f49d292fa..e0f17f004f 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ 
b/orte/mca/ess/pmi/ess_pmi_module.c @@ -314,12 +314,13 @@ static int rte_init(void) } /* retrieve the local peers */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, - ORTE_PROC_MY_NAME, &val, OPAL_STRING); + &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); /* and their cpusets, if available */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, + &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { cpusets = opal_argv_split(val, ':'); free(val); diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index c30000827f..67e53af9bf 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -272,10 +272,7 @@ int pmix_server_init(void) /* setup the local server */ if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { - ORTE_ERROR_LOG(rc); - /* memory cleanup will occur when finalize is called */ - orte_show_help("help-orterun.txt", "orterun:pmix-failed", true, - orte_process_info.proc_session_dir); + /* pmix will provide a nice show_help output here */ return rc; } OPAL_LIST_DESTRUCT(&info); diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index bbce31c952..80dc80a217 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -54,10 +54,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) { int rc; orte_proc_t *pptr; - int i, k, n, nlocalprocs; + int i, k, n; opal_list_t *info, *pmap; opal_value_t *kv; - orte_node_t *node, *n2; + orte_node_t *node, *mynode; opal_vpid_t vpid; char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist; orte_job_t *dmns; @@ -164,8 +164,8 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) OPAL_LIST_RELEASE(info); return ORTE_ERR_NOT_FOUND; } - node 
= pptr->node; - if (NULL == node) { + mynode = pptr->node; + if (NULL == mynode) { /* cannot happen */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); OPAL_LIST_RELEASE(info); @@ -175,14 +175,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_NODEID); kv->type = OPAL_UINT32; - kv->data.uint32 = node->index; + kv->data.uint32 = mynode->index; opal_list_append(info, &kv->super); /* pass our node size */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_NODE_SIZE); kv->type = OPAL_UINT32; - kv->data.uint32 = node->num_procs; + kv->data.uint32 = mynode->num_procs; opal_list_append(info, &kv->super); /* univ size */ @@ -220,43 +220,29 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.uint32 = jdata->total_slots_alloc; opal_list_append(info, &kv->super); - /* identify our local node object within the map, - * if we were included */ - node = NULL; - map = (orte_job_map_t*)jdata->map; - for (i=0; i < map->nodes->size; i++) { - if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { + /* register any local clients */ + vpid = ORTE_VPID_MAX; + for (i=0; i < mynode->procs->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) { continue; } - if (n2 == pptr->node) { - node = n2; - break; - } - } - if (NULL != node) { - vpid = ORTE_VPID_MAX; - for (i=0; i < node->procs->size; i++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; + if (pptr->name.jobid == jdata->jobid) { + if (pptr->name.vpid < vpid) { + vpid = pptr->name.vpid; } - if (pptr->name.jobid == jdata->jobid) { - if (pptr->name.vpid < vpid) { - vpid = pptr->name.vpid; - } - /* go ahead and register this client */ - if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid, - (void*)pptr, NULL, NULL))) { - ORTE_ERROR_LOG(rc); - } + /* go ahead and register this client */ + if (OPAL_SUCCESS != (rc = 
opal_pmix.server_register_client(&pptr->name, uid, gid, + (void*)pptr, NULL, NULL))) { + ORTE_ERROR_LOG(rc); } } - /* pass the local ldr */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCALLDR); - kv->type = OPAL_VPID; - kv->data.name.vpid = vpid; - opal_list_append(info, &kv->super); } + /* pass the local ldr */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCALLDR); + kv->type = OPAL_VPID; + kv->data.name.vpid = vpid; + opal_list_append(info, &kv->super); /* for each proc in this job, create an object that * includes the info describing the proc so the recipient has a complete @@ -276,13 +262,11 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) cpulist = NULL; peerlist = NULL; vpid = ORTE_VPID_MAX; - nlocalprocs = 0; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (pptr->name.jobid == jdata->jobid) { - ++nlocalprocs; opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid)); if (pptr->name.vpid < vpid) { vpid = pptr->name.vpid; @@ -315,6 +299,26 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) procs = NULL; } + /* if this is me, then pass the peers and cpusets to myself + * in order to maintain backward compatibility for the non-pmix + * components in OPAL/pmix */ + if (node == mynode) { + /* pass the list of peers */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); + kv->type = OPAL_STRING; + kv->data.string = strdup(peerlist); + opal_list_append(info, &kv->super); + + /* pass the list of cpusets */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS); + kv->type = OPAL_STRING; + kv->data.string = strdup(cpulist); + opal_list_append(info, &kv->super); + + } + /* now cycle across each proc on this node, passing all data that * varies by proc */ for (i=0; i < node->procs->size; i++) {