1
1

Provide backward-compatible keys so that the non-PMIx components in the opal/pmix framework don't have to adjust as we continue to work on finalizing the PMIx reference scheme. Activate and utilize the new PMIx show_help capability to provide more meaningful error output when the server cannot start.

Add a contrib script to clean up permissions incorrectly modified by things like SMB mounts.

dd
Этот коммит содержится в:
Ralph Castain 2016-08-13 08:14:50 -07:00
родитель d12e50b2d6
Коммит be8424b691
7 изменённых файлов: 119 добавлений и 45 удалений

11
contrib/cleanperms Исполняемый файл
Просмотреть файл

@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# cleanperms: strip the executable bits from source and build files that
# should never be executable (for example after working on an SMB mount
# that marked every file +x).
#
# Run from the top of the source tree. The path of every file that gets
# fixed is printed to stdout.

# File-name patterns to clean, processed in this order.
patterns=(
    "*.c"
    Makefile.am
    "*.h"
    Makefile.include
    Makefile
    "*.m4"
    "*.ac"
    "*.txt"
    "*.l"
)

for pattern in "${patterns[@]}"; do
    # Only regular files whose owner-execute bit is set are touched.
    # "a-x" is spelled out because a bare "-x" leaves alone any bits
    # that are covered by the caller's umask.
    find . -type f -name "$pattern" -perm /u+x -print -exec chmod a-x {} \;
done

Просмотреть файл

@ -11,6 +11,8 @@
# $HEADER$ # $HEADER$
# #
dist_pmixdata_DATA += server/help-pmix-server.txt
headers += \ headers += \
server/pmix_server_ops.h server/pmix_server_ops.h

Просмотреть файл

@ -0,0 +1,35 @@
# -*- text -*-
#
# Copyright (c) 2016 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
[rnd-path-too-long]
The PMIx server was unable to set up a rendezvous file because its path
exceeds your system's length limit for Unix domain socket paths.
Temporary directory: %s
Rendezvous filename: %s
Please try to set TMPDIR to something short (like /tmp) or change
your computer's name to something shorter (see uname -n).
[listener-failed-start]
The PMIx server was unable to start its listening thread. This is
usually due to a conflicting stale named pipe from a prior failed
job, thus preventing the server from binding to its assigned socket.
Rendezvous filename: %s
Please remove the stale file and try again.
[data-store-failed]
The PMIx server was unable to store the specified key-value:
Key: %s
The precise reason for the failure was provided in the above
"error-log" message. This is probably something that should
be referred to the PMIx developers.

Просмотреть файл

@ -52,6 +52,10 @@
#include "src/util/error.h" #include "src/util/error.h"
#include "src/util/output.h" #include "src/util/output.h"
#include "src/util/pmix_environ.h" #include "src/util/pmix_environ.h"
#include "src/util/show_help.h"
#include "src/mca/base/base.h"
#include "src/mca/base/pmix_mca_base_var.h"
#include "src/mca/pinstalldirs/base/base.h"
#include "src/runtime/pmix_progress_threads.h" #include "src/runtime/pmix_progress_threads.h"
#include "src/usock/usock.h" #include "src/usock/usock.h"
#include "src/sec/pmix_sec.h" #include "src/sec/pmix_sec.h"
@ -121,11 +125,25 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
char *tdir, *evar; char *tdir, *evar;
char * pmix_pid; char * pmix_pid;
pmix_listener_t *listener; pmix_listener_t *listener;
pmix_status_t ret;
/* initialize the output system */ /* initialize the output system */
if (!pmix_output_init()) { if (!pmix_output_init()) {
fprintf(stderr, "PMIx server was unable to initialize its output system\n");
return PMIX_ERR_INIT; return PMIX_ERR_INIT;
} }
/* initialize install dirs code */
if (PMIX_SUCCESS != (ret = pmix_mca_base_framework_open(&pmix_pinstalldirs_base_framework, 0))) {
fprintf(stderr, "pmix_pinstalldirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of PMIX_SUCCESS)\n",
__FILE__, __LINE__, ret);
return ret;
}
if (PMIX_SUCCESS != pmix_show_help_init()) {
fprintf(stderr, "PMIx server was unable to initialize its show_help system\n");
return PMIX_ERR_INIT;
}
/* setup the globals */ /* setup the globals */
pmix_globals_init(); pmix_globals_init();
memset(&pmix_server_globals, 0, sizeof(pmix_server_globals)); memset(&pmix_server_globals, 0, sizeof(pmix_server_globals));
@ -198,7 +216,9 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
if (0 > asprintf(&pmix_pid, "%s/pmix-%d", tdir, mypid)) { if (0 > asprintf(&pmix_pid, "%s/pmix-%d", tdir, mypid)) {
return PMIX_ERR_NOMEM; return PMIX_ERR_NOMEM;
} }
if ((strlen(pmix_pid) + 1) > sizeof(listener->address.sun_path)-1) { if ((strlen(pmix_pid) + 1) > sizeof(listener->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid); free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH; return PMIX_ERR_INVALID_LENGTH;
} }
@ -352,6 +372,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
return PMIX_ERR_NOMEM; return PMIX_ERR_NOMEM;
} }
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) { if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid); free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH; return PMIX_ERR_INVALID_LENGTH;
} }
@ -380,6 +401,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
return PMIX_ERR_NOMEM; return PMIX_ERR_NOMEM;
} }
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) { if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid); free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH; return PMIX_ERR_INVALID_LENGTH;
} }
@ -413,6 +435,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
} }
if (need_listener) { if (need_listener) {
if (PMIX_SUCCESS != pmix_start_listening()) { if (PMIX_SUCCESS != pmix_start_listening()) {
pmix_show_help("help-pmix-server.txt", "listener-failed-start", true, tl->address.sun_path);
PMIx_server_finalize(); PMIx_server_finalize();
return PMIX_ERR_INIT; return PMIX_ERR_INIT;
} }
@ -441,6 +464,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
kv.value = &info[n].value; kv.value = &info[n].value;
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(&pmix_server_globals.gdata, &kv, 1, PMIX_KVAL))) { if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(&pmix_server_globals.gdata, &kv, 1, PMIX_KVAL))) {
PMIX_ERROR_LOG(rc); PMIX_ERROR_LOG(rc);
pmix_show_help("help-pmix-server.txt", "data-store-failed", true, kv.key);
/* protect the incoming data */ /* protect the incoming data */
kv.key = NULL; kv.key = NULL;
kv.value = NULL; kv.value = NULL;

Просмотреть файл

@ -314,12 +314,13 @@ static int rte_init(void)
} }
/* retrieve the local peers */ /* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
ORTE_PROC_MY_NAME, &val, OPAL_STRING); &wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) { if (OPAL_SUCCESS == ret && NULL != val) {
peers = opal_argv_split(val, ','); peers = opal_argv_split(val, ',');
free(val); free(val);
/* and their cpusets, if available */ /* and their cpusets, if available */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) { if (OPAL_SUCCESS == ret && NULL != val) {
cpusets = opal_argv_split(val, ':'); cpusets = opal_argv_split(val, ':');
free(val); free(val);

Просмотреть файл

@ -272,10 +272,7 @@ int pmix_server_init(void)
/* setup the local server */ /* setup the local server */
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
ORTE_ERROR_LOG(rc); /* pmix will provide a nice show_help output here */
/* memory cleanup will occur when finalize is called */
orte_show_help("help-orterun.txt", "orterun:pmix-failed", true,
orte_process_info.proc_session_dir);
return rc; return rc;
} }
OPAL_LIST_DESTRUCT(&info); OPAL_LIST_DESTRUCT(&info);

Просмотреть файл

@ -54,10 +54,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
{ {
int rc; int rc;
orte_proc_t *pptr; orte_proc_t *pptr;
int i, k, n, nlocalprocs; int i, k, n;
opal_list_t *info, *pmap; opal_list_t *info, *pmap;
opal_value_t *kv; opal_value_t *kv;
orte_node_t *node, *n2; orte_node_t *node, *mynode;
opal_vpid_t vpid; opal_vpid_t vpid;
char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist; char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
orte_job_t *dmns; orte_job_t *dmns;
@ -164,8 +164,8 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
OPAL_LIST_RELEASE(info); OPAL_LIST_RELEASE(info);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
node = pptr->node; mynode = pptr->node;
if (NULL == node) { if (NULL == mynode) {
/* cannot happen */ /* cannot happen */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_LIST_RELEASE(info); OPAL_LIST_RELEASE(info);
@ -175,14 +175,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv = OBJ_NEW(opal_value_t); kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID); kv->key = strdup(OPAL_PMIX_NODEID);
kv->type = OPAL_UINT32; kv->type = OPAL_UINT32;
kv->data.uint32 = node->index; kv->data.uint32 = mynode->index;
opal_list_append(info, &kv->super); opal_list_append(info, &kv->super);
/* pass our node size */ /* pass our node size */
kv = OBJ_NEW(opal_value_t); kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_SIZE); kv->key = strdup(OPAL_PMIX_NODE_SIZE);
kv->type = OPAL_UINT32; kv->type = OPAL_UINT32;
kv->data.uint32 = node->num_procs; kv->data.uint32 = mynode->num_procs;
opal_list_append(info, &kv->super); opal_list_append(info, &kv->super);
/* univ size */ /* univ size */
@ -220,43 +220,29 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = jdata->total_slots_alloc; kv->data.uint32 = jdata->total_slots_alloc;
opal_list_append(info, &kv->super); opal_list_append(info, &kv->super);
/* identify our local node object within the map, /* register any local clients */
* if we were included */ vpid = ORTE_VPID_MAX;
node = NULL; for (i=0; i < mynode->procs->size; i++) {
map = (orte_job_map_t*)jdata->map; if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
for (i=0; i < map->nodes->size; i++) {
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue; continue;
} }
if (n2 == pptr->node) { if (pptr->name.jobid == jdata->jobid) {
node = n2; if (pptr->name.vpid < vpid) {
break; vpid = pptr->name.vpid;
}
}
if (NULL != node) {
vpid = ORTE_VPID_MAX;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
} }
if (pptr->name.jobid == jdata->jobid) { /* go ahead and register this client */
if (pptr->name.vpid < vpid) { if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
vpid = pptr->name.vpid; (void*)pptr, NULL, NULL))) {
} ORTE_ERROR_LOG(rc);
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
}
} }
} }
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
} }
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
/* for each proc in this job, create an object that /* for each proc in this job, create an object that
* includes the info describing the proc so the recipient has a complete * includes the info describing the proc so the recipient has a complete
@ -276,13 +262,11 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
cpulist = NULL; cpulist = NULL;
peerlist = NULL; peerlist = NULL;
vpid = ORTE_VPID_MAX; vpid = ORTE_VPID_MAX;
nlocalprocs = 0;
for (i=0; i < node->procs->size; i++) { for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue; continue;
} }
if (pptr->name.jobid == jdata->jobid) { if (pptr->name.jobid == jdata->jobid) {
++nlocalprocs;
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid)); opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
if (pptr->name.vpid < vpid) { if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid; vpid = pptr->name.vpid;
@ -315,6 +299,26 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
procs = NULL; procs = NULL;
} }
/* if this is me, then pass the peers and cpusets to myself
* in order to maintain backward compatibility for the non-pmix
* components in OPAL/pmix */
if (node == mynode) {
/* pass the list of peers */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
kv->type = OPAL_STRING;
kv->data.string = strdup(peerlist);
opal_list_append(info, &kv->super);
/* pass the list of cpusets */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
kv->type = OPAL_STRING;
kv->data.string = strdup(cpulist);
opal_list_append(info, &kv->super);
}
/* now cycle across each proc on this node, passing all data that /* now cycle across each proc on this node, passing all data that
* varies by proc */ * varies by proc */
for (i=0; i < node->procs->size; i++) { for (i=0; i < node->procs->size; i++) {