1
1

Merge pull request #1965 from rhc54/topic/pmixfix

Provide backward compatible keys so that the non-PMIx components in t…
Этот коммит содержится в:
rhc54 2016-08-13 13:48:12 -07:00 коммит произвёл GitHub
родитель d12e50b2d6 be8424b691
Коммит 2228d2efc2
7 изменённых файлов: 119 добавлений и 45 удалений

11
contrib/cleanperms Исполняемый файл
Просмотреть файл

@ -0,0 +1,11 @@
#!/usr/bin/env bash
#
# Strip stray execute permissions from source and build-system files.
#
# Walks the tree rooted at the current directory exactly once. Every
# regular file whose name matches one of the patterns below AND that has
# the owner-execute bit set is printed and then has its execute bits
# removed.
#
# Notes:
#   * "-perm /u+x" is GNU find syntax ("any of these bits set").
#   * "chmod a-x" is used instead of a bare "chmod -x": with no "who"
#     given, chmod filters the change through the umask, so a restrictive
#     umask could leave group/other execute bits behind.
#   * "-exec ... {} +" batches the matched paths into as few chmod
#     invocations as possible.

find . -type f \
    \( -name "*.c" \
    -o -name "*.h" \
    -o -name "*.l" \
    -o -name "*.m4" \
    -o -name "*.ac" \
    -o -name "*.txt" \
    -o -name Makefile \
    -o -name Makefile.am \
    -o -name Makefile.include \
    \) \
    -perm /u+x -print -exec chmod a-x {} +

Просмотреть файл

@ -11,6 +11,8 @@
# $HEADER$
#
dist_pmixdata_DATA += server/help-pmix-server.txt
headers += \
server/pmix_server_ops.h

Просмотреть файл

@ -0,0 +1,35 @@
# -*- text -*-
#
# Copyright (c) 2016 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
[rnd-path-too-long]
The PMIx server was unable to set up a rendezvous file due to your
system's restriction on the path length of Unix domain sockets.
Temporary directory: %s
Rendezvous filename: %s
Please try to set TMPDIR to something short (like /tmp) or change
your computer's name to something shorter (see uname -n).
[listener-failed-start]
The PMIx server was unable to start its listening thread. This is
usually due to a conflicting stale named pipe from a prior failed
job, thus preventing the server from binding to its assigned socket.
Rendezvous filename: %s
Please remove the stale file and try again.
[data-store-failed]
The PMIx server was unable to store the specified key-value:
Key: %s
The precise reason for the failure was provided in the above
"error-log" message. This is probably something that should
be referred to the PMIx developers.

Просмотреть файл

@ -52,6 +52,10 @@
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/util/pmix_environ.h"
#include "src/util/show_help.h"
#include "src/mca/base/base.h"
#include "src/mca/base/pmix_mca_base_var.h"
#include "src/mca/pinstalldirs/base/base.h"
#include "src/runtime/pmix_progress_threads.h"
#include "src/usock/usock.h"
#include "src/sec/pmix_sec.h"
@ -121,11 +125,25 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
char *tdir, *evar;
char * pmix_pid;
pmix_listener_t *listener;
pmix_status_t ret;
/* initialize the output system */
if (!pmix_output_init()) {
fprintf(stderr, "PMIx server was unable to initialize its output system\n");
return PMIX_ERR_INIT;
}
/* initialize install dirs code */
if (PMIX_SUCCESS != (ret = pmix_mca_base_framework_open(&pmix_pinstalldirs_base_framework, 0))) {
fprintf(stderr, "pmix_pinstalldirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of PMIX_SUCCESS)\n",
__FILE__, __LINE__, ret);
return ret;
}
if (PMIX_SUCCESS != pmix_show_help_init()) {
fprintf(stderr, "PMIx server was unable to initialize its show_help system\n");
return PMIX_ERR_INIT;
}
/* setup the globals */
pmix_globals_init();
memset(&pmix_server_globals, 0, sizeof(pmix_server_globals));
@ -198,7 +216,9 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
if (0 > asprintf(&pmix_pid, "%s/pmix-%d", tdir, mypid)) {
return PMIX_ERR_NOMEM;
}
if ((strlen(pmix_pid) + 1) > sizeof(listener->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH;
}
@ -352,6 +372,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
return PMIX_ERR_NOMEM;
}
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH;
}
@ -380,6 +401,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
return PMIX_ERR_NOMEM;
}
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
free(pmix_pid);
return PMIX_ERR_INVALID_LENGTH;
}
@ -413,6 +435,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
}
if (need_listener) {
if (PMIX_SUCCESS != pmix_start_listening()) {
pmix_show_help("help-pmix-server.txt", "listener-failed-start", true, tl->address.sun_path);
PMIx_server_finalize();
return PMIX_ERR_INIT;
}
@ -441,6 +464,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
kv.value = &info[n].value;
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(&pmix_server_globals.gdata, &kv, 1, PMIX_KVAL))) {
PMIX_ERROR_LOG(rc);
pmix_show_help("help-pmix-server.txt", "data-store-failed", true, kv.key);
/* protect the incoming data */
kv.key = NULL;
kv.value = NULL;

Просмотреть файл

@ -314,12 +314,13 @@ static int rte_init(void)
}
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
peers = opal_argv_split(val, ',');
free(val);
/* and their cpusets, if available */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
cpusets = opal_argv_split(val, ':');
free(val);

Просмотреть файл

@ -272,10 +272,7 @@ int pmix_server_init(void)
/* setup the local server */
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
ORTE_ERROR_LOG(rc);
/* memory cleanup will occur when finalize is called */
orte_show_help("help-orterun.txt", "orterun:pmix-failed", true,
orte_process_info.proc_session_dir);
/* pmix will provide a nice show_help output here */
return rc;
}
OPAL_LIST_DESTRUCT(&info);

Просмотреть файл

@ -54,10 +54,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
{
int rc;
orte_proc_t *pptr;
int i, k, n, nlocalprocs;
int i, k, n;
opal_list_t *info, *pmap;
opal_value_t *kv;
orte_node_t *node, *n2;
orte_node_t *node, *mynode;
opal_vpid_t vpid;
char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
orte_job_t *dmns;
@ -164,8 +164,8 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
OPAL_LIST_RELEASE(info);
return ORTE_ERR_NOT_FOUND;
}
node = pptr->node;
if (NULL == node) {
mynode = pptr->node;
if (NULL == mynode) {
/* cannot happen */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_LIST_RELEASE(info);
@ -175,14 +175,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID);
kv->type = OPAL_UINT32;
kv->data.uint32 = node->index;
kv->data.uint32 = mynode->index;
opal_list_append(info, &kv->super);
/* pass our node size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODE_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = node->num_procs;
kv->data.uint32 = mynode->num_procs;
opal_list_append(info, &kv->super);
/* univ size */
@ -220,43 +220,29 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = jdata->total_slots_alloc;
opal_list_append(info, &kv->super);
/* identify our local node object within the map,
* if we were included */
node = NULL;
map = (orte_job_map_t*)jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
/* register any local clients */
vpid = ORTE_VPID_MAX;
for (i=0; i < mynode->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
continue;
}
if (n2 == pptr->node) {
node = n2;
break;
}
}
if (NULL != node) {
vpid = ORTE_VPID_MAX;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
if (pptr->name.jobid == jdata->jobid) {
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
}
if (pptr->name.jobid == jdata->jobid) {
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
}
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
}
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
ORTE_ERROR_LOG(rc);
}
}
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
}
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = vpid;
opal_list_append(info, &kv->super);
/* for each proc in this job, create an object that
* includes the info describing the proc so the recipient has a complete
@ -276,13 +262,11 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
cpulist = NULL;
peerlist = NULL;
vpid = ORTE_VPID_MAX;
nlocalprocs = 0;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (pptr->name.jobid == jdata->jobid) {
++nlocalprocs;
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
if (pptr->name.vpid < vpid) {
vpid = pptr->name.vpid;
@ -315,6 +299,26 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
procs = NULL;
}
/* if this is me, then pass the peers and cpusets to myself
* in order to maintain backward compatibility for the non-pmix
* components in OPAL/pmix */
if (node == mynode) {
/* pass the list of peers */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
kv->type = OPAL_STRING;
kv->data.string = strdup(peerlist);
opal_list_append(info, &kv->super);
/* pass the list of cpusets */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
kv->type = OPAL_STRING;
kv->data.string = strdup(cpulist);
opal_list_append(info, &kv->super);
}
/* now cycle across each proc on this node, passing all data that
* varies by proc */
for (i=0; i < node->procs->size; i++) {