1
1

Roll to PMIx 1.1.4rc2 - restores some code that was incorrectly removed in the prior update

Этот коммит содержится в:
Ralph Castain 2016-04-16 18:23:41 -07:00
родитель 8cce6df688
Коммит b009e58d25
9 изменённых файлов: 122 добавления и 31 удаление

Просмотреть файл

@ -23,14 +23,14 @@ release=4
# The only requirement is that it must be entirely printable ASCII # The only requirement is that it must be entirely printable ASCII
# characters and have no white space. # characters and have no white space.
greek=rc1 greek=rc2
# If repo_rev is empty, then the repository version number will be # If repo_rev is empty, then the repository version number will be
# obtained during "make dist" via the "git describe --tags --always" # obtained during "make dist" via the "git describe --tags --always"
# command, or with the date (if "git describe" fails) in the form of # command, or with the date (if "git describe" fails) in the form of
# "date<date>". # "date<date>".
repo_rev=gitb363c5d repo_rev=gitd9fd3da
# If tarball_version is not empty, it is used as the version string in # If tarball_version is not empty, it is used as the version string in
# the tarball filename, regardless of all other versions listed in # the tarball filename, regardless of all other versions listed in
@ -44,7 +44,7 @@ tarball_version=
# The date when this release was created # The date when this release was created
date="Apr 15, 2016" date="Apr 16, 2016"
# The shared library version of each of PMIx's public libraries. # The shared library version of each of PMIx's public libraries.
# These versions are maintained in accordance with the "Library # These versions are maintained in accordance with the "Library

Просмотреть файл

@ -702,6 +702,10 @@ PMIX_EXPORT pmix_status_t PMIx_Commit(void)
pmix_cb_t *cb; pmix_cb_t *cb;
pmix_status_t rc; pmix_status_t rc;
if (pmix_globals.init_cntr <= 0) {
return PMIX_ERR_INIT;
}
/* if we are a server, or we aren't connected, don't attempt to send */ /* if we are a server, or we aren't connected, don't attempt to send */
if (pmix_globals.server) { if (pmix_globals.server) {
return PMIX_SUCCESS; // not an error return PMIX_SUCCESS; // not an error
@ -784,6 +788,10 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename,
pmix_cb_t *cb; pmix_cb_t *cb;
pmix_status_t rc; pmix_status_t rc;
if (pmix_globals.init_cntr <= 0) {
return PMIX_ERR_INIT;
}
/* create a callback object */ /* create a callback object */
cb = PMIX_NEW(pmix_cb_t); cb = PMIX_NEW(pmix_cb_t);
cb->active = true; cb->active = true;
@ -843,6 +851,10 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const char *nspace, char **nodelist
pmix_cb_t *cb; pmix_cb_t *cb;
pmix_status_t rc; pmix_status_t rc;
if (pmix_globals.init_cntr <= 0) {
return PMIX_ERR_INIT;
}
/* create a callback object */ /* create a callback object */
cb = PMIX_NEW(pmix_cb_t); cb = PMIX_NEW(pmix_cb_t);
cb->active = true; cb->active = true;

Просмотреть файл

@ -31,6 +31,10 @@ PMIX_EXPORT void PMIx_Register_errhandler(pmix_info_t info[], size_t ninfo,
pmix_errhandler_reg_cbfunc_t cbfunc, pmix_errhandler_reg_cbfunc_t cbfunc,
void *cbdata) void *cbdata)
{ {
if (pmix_globals.init_cntr <= 0) {
return;
}
/* common err handler registration */ /* common err handler registration */
if (pmix_globals.server) { if (pmix_globals.server) {
/* PMIX server: store the error handler, process info keys and call /* PMIX server: store the error handler, process info keys and call
@ -57,6 +61,10 @@ PMIX_EXPORT void PMIx_Deregister_errhandler(int errhandler_ref,
pmix_op_cbfunc_t cbfunc, pmix_op_cbfunc_t cbfunc,
void *cbdata) void *cbdata)
{ {
if (pmix_globals.init_cntr <= 0) {
return;
}
/* common err handler registration */ /* common err handler registration */
if (pmix_globals.server) { if (pmix_globals.server) {
/* PMIX server: store the error handler, process info keys and call /* PMIX server: store the error handler, process info keys and call
@ -82,6 +90,10 @@ PMIX_EXPORT pmix_status_t PMIx_Notify_error(pmix_status_t status,
{ {
int rc; int rc;
if (pmix_globals.init_cntr <= 0) {
return PMIX_ERR_INIT;
}
if (pmix_globals.server) { if (pmix_globals.server) {
rc = pmix_server_notify_error(status, procs, nprocs, error_procs, rc = pmix_server_notify_error(status, procs, nprocs, error_procs,
error_nprocs, info, ninfo, error_nprocs, info, ninfo,

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science * Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Artem Y. Polyakov <artpol84@gmail.com>. * Copyright (c) 2014-2015 Artem Y. Polyakov <artpol84@gmail.com>.
@ -586,17 +586,35 @@ PMIX_EXPORT pmix_status_t PMIx_server_register_nspace(const char nspace[], int n
static void _deregister_nspace(int sd, short args, void *cbdata) static void _deregister_nspace(int sd, short args, void *cbdata)
{ {
pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata;
pmix_nspace_t *tmp; pmix_nspace_t *nptr;
int i;
pmix_peer_t *peer;
pmix_output_verbose(2, pmix_globals.debug_output, pmix_output_verbose(2, pmix_globals.debug_output,
"pmix:server _deregister_nspace %s", "pmix:server _deregister_nspace %s",
cd->proc.nspace); cd->proc.nspace);
/* see if we already have this nspace */ /* see if we already have this nspace */
PMIX_LIST_FOREACH(tmp, &pmix_globals.nspaces, pmix_nspace_t) { PMIX_LIST_FOREACH(nptr, &pmix_globals.nspaces, pmix_nspace_t) {
if (0 == strcmp(tmp->nspace, cd->proc.nspace)) { if (0 == strcmp(nptr->nspace, cd->proc.nspace)) {
pmix_list_remove_item(&pmix_globals.nspaces, &tmp->super); /* find and remove this client from our array of local
PMIX_RELEASE(tmp); * peers - remember that it can occur multiple times
* if the peer called fork/exec and its children called
* PMIx_Init! We have to rely on none of those children
* living beyond our child as we otherwise cannot
* track them */
for (i=0; i < pmix_server_globals.clients.size; i++) {
if (NULL == (peer = (pmix_peer_t*)pmix_pointer_array_get_item(&pmix_server_globals.clients, i))) {
continue;
}
if (nptr == peer->info->nptr) {
/* remove this entry */
pmix_pointer_array_set_item(&pmix_server_globals.clients, i, NULL);
PMIX_RELEASE(peer);
}
}
pmix_list_remove_item(&pmix_globals.nspaces, &nptr->super);
PMIX_RELEASE(nptr);
break; break;
} }
} }
@ -620,8 +638,7 @@ PMIX_EXPORT void PMIx_server_deregister_nspace(const char nspace[])
PMIX_THREADSHIFT(cd, _deregister_nspace); PMIX_THREADSHIFT(cd, _deregister_nspace);
} }
static void _execute_collective(int sd, short args, void *cbdata) void pmix_server_execute_collective(int sd, short args, void *cbdata) {
{
pmix_trkr_caddy_t *tcd = (pmix_trkr_caddy_t*)cbdata; pmix_trkr_caddy_t *tcd = (pmix_trkr_caddy_t*)cbdata;
pmix_server_trkr_t *trk = tcd->trk; pmix_server_trkr_t *trk = tcd->trk;
char *data = NULL; char *data = NULL;
@ -759,7 +776,7 @@ static void _register_client(int sd, short args, void *cbdata)
* we don't want to block someone * we don't want to block someone
* here, so kick any completed trackers into a * here, so kick any completed trackers into a
* new event for processing */ * new event for processing */
PMIX_EXECUTE_COLLECTIVE(tcd, trk, _execute_collective); PMIX_EXECUTE_COLLECTIVE(tcd, trk, pmix_server_execute_collective);
} }
} }
/* also check any pending local modex requests to see if /* also check any pending local modex requests to see if
@ -803,8 +820,9 @@ PMIX_EXPORT pmix_status_t PMIx_server_register_client(const pmix_proc_t *proc,
static void _deregister_client(int sd, short args, void *cbdata) static void _deregister_client(int sd, short args, void *cbdata)
{ {
pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata;
pmix_rank_info_t *info;
pmix_nspace_t *nptr, *tmp; pmix_nspace_t *nptr, *tmp;
int i;
pmix_peer_t *peer;
pmix_output_verbose(2, pmix_globals.debug_output, pmix_output_verbose(2, pmix_globals.debug_output,
"pmix:server _deregister_client for nspace %s rank %d", "pmix:server _deregister_client for nspace %s rank %d",
@ -822,15 +840,27 @@ static void _deregister_client(int sd, short args, void *cbdata)
/* nothing to do */ /* nothing to do */
goto cleanup; goto cleanup;
} }
/* find an remove this client */ /* find and remove this client from our array of local
PMIX_LIST_FOREACH(info, &nptr->server->ranks, pmix_rank_info_t) { * peers - remember that it can occur multiple times
if (info->rank == cd->proc.rank) { * if the peer called fork/exec and its children called
pmix_list_remove_item(&nptr->server->ranks, &info->super); * PMIx_Init! We have to rely on none of those children
PMIX_RELEASE(info); * living beyond our child as we otherwise cannot
break; * track them */
for (i=0; i < pmix_server_globals.clients.size; i++) {
if (NULL == (peer = (pmix_peer_t*)pmix_pointer_array_get_item(&pmix_server_globals.clients, i))) {
continue;
}
if (nptr != peer->info->nptr) {
continue;
}
if (cd->proc.rank == peer->info->rank) {
/* remove this entry */
pmix_pointer_array_set_item(&pmix_server_globals.clients, i, NULL);
PMIX_RELEASE(peer);
} }
} }
cleanup: cleanup:
PMIX_RELEASE(cd); PMIX_RELEASE(cd);
} }
@ -2139,8 +2169,9 @@ void regevents_cbfunc (pmix_status_t status, void *cbdata)
} }
} }
reply = PMIX_NEW(pmix_buffer_t); reply = PMIX_NEW(pmix_buffer_t);
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(reply, &status, 1, PMIX_INT))) if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(reply, &status, 1, PMIX_INT))) {
PMIX_ERROR_LOG(rc); PMIX_ERROR_LOG(rc);
}
// send reply // send reply
PMIX_SERVER_QUEUE_REPLY(cd->peer, cd->hdr.tag, reply); PMIX_SERVER_QUEUE_REPLY(cd->peer, cd->hdr.tag, reply);
PMIX_RELEASE(cd); PMIX_RELEASE(cd);

Просмотреть файл

@ -91,7 +91,7 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address)
return PMIX_ERROR; return PMIX_ERROR;
} }
/* set the mode as required */ /* set the mode as required */
if (0 != chmod(address->sun_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)) { if (0 != chmod(address->sun_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH)) {
pmix_output(0, "CANNOT CHMOD %s\n", address->sun_path); pmix_output(0, "CANNOT CHMOD %s\n", address->sun_path);
return PMIX_ERROR; return PMIX_ERROR;
} }

Просмотреть файл

@ -234,6 +234,8 @@ void pmix_server_check_notifications(pmix_regevents_info_t *reginfo,
void regevents_cbfunc (pmix_status_t status, void *cbdata); void regevents_cbfunc (pmix_status_t status, void *cbdata);
void pmix_server_execute_collective(int sd, short args, void *cbdata);
extern pmix_server_module_t pmix_host_server; extern pmix_server_module_t pmix_host_server;
extern pmix_server_globals_t pmix_server_globals; extern pmix_server_globals_t pmix_server_globals;

Просмотреть файл

@ -50,6 +50,10 @@ static uint32_t current_tag = 1; // 0 is reserved for system purposes
static void lost_connection(pmix_peer_t *peer, pmix_status_t err) static void lost_connection(pmix_peer_t *peer, pmix_status_t err)
{ {
pmix_server_trkr_t *trk;
pmix_rank_info_t *rinfo, *rnext;
pmix_trkr_caddy_t *tcd;
/* stop all events */ /* stop all events */
if (peer->recv_ev_active) { if (peer->recv_ev_active) {
event_del(&peer->recv_event); event_del(&peer->recv_event);
@ -65,9 +69,42 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err)
} }
CLOSE_THE_SOCKET(peer->sd); CLOSE_THE_SOCKET(peer->sd);
if (pmix_globals.server) { if (pmix_globals.server) {
/* if I am a server, then we need to /* if I am a server, then we need to ensure that
* do some cleanup as the client has * we properly account for the loss of this client
* left us */ * from any local collectives in which it was
* participating - note that the proc would not
* have been added to any collective tracker until
* after it successfully connected */
PMIX_LIST_FOREACH(trk, &pmix_server_globals.collectives, pmix_server_trkr_t) {
/* see if this proc is participating in this tracker */
PMIX_LIST_FOREACH_SAFE(rinfo, rnext, &trk->ranks, pmix_rank_info_t) {
if (0 != strncmp(rinfo->nptr->nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN)) {
continue;
}
if (rinfo->rank != peer->info->rank) {
continue;
}
/* it is - adjust the count */
--trk->nlocal;
/* remove it from the list */
pmix_list_remove_item(&trk->ranks, &rinfo->super);
PMIX_RELEASE(rinfo);
/* check for completion */
if (pmix_list_get_size(&trk->local_cbs) == trk->nlocal) {
/* complete, so now we need to process it
* we don't want to block someone
* here, so kick any completed trackers into a
* new event for processing */
PMIX_EXECUTE_COLLECTIVE(tcd, trk, pmix_server_execute_collective);
}
}
}
/* remove this proc from the list of ranks for this nspace */
pmix_list_remove_item(&(peer->info->nptr->server->ranks), &(peer->info->super));
PMIX_RELEASE(peer->info);
/* reduce the number of local procs */
--peer->info->nptr->server->nlocalprocs;
/* do some cleanup as the client has left us */
pmix_pointer_array_set_item(&pmix_server_globals.clients, pmix_pointer_array_set_item(&pmix_server_globals.clients,
peer->index, NULL); peer->index, NULL);
PMIX_RELEASE(peer); PMIX_RELEASE(peer);

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. * Copyright (c) 2007-2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -157,9 +157,8 @@ void pmix_errhandler_invoke(pmix_status_t status,
/* We need to parse thru each registered handler and determine /* We need to parse thru each registered handler and determine
* which one to call for the specific error */ * which one to call for the specific error */
int i, idflt; int i, idflt;
size_t j, k; size_t j;
bool fired = false; bool fired = false;
bool exact_match;
pmix_error_reg_info_t *errreg, *errdflt=NULL; pmix_error_reg_info_t *errreg, *errdflt=NULL;
pmix_info_t *iptr; pmix_info_t *iptr;
@ -184,14 +183,12 @@ void pmix_errhandler_invoke(pmix_status_t status,
} }
iptr[0].value.data.integer = i; iptr[0].value.data.integer = i;
/* match error name key first */ /* match error name key first */
exact_match = false;
for (j = 0; j < errreg->ninfo; j++) { for (j = 0; j < errreg->ninfo; j++) {
if ((0 == strcmp(errreg->info[j].key, PMIX_ERROR_NAME)) && if ((0 == strcmp(errreg->info[j].key, PMIX_ERROR_NAME)) &&
(status == errreg->info[j].value.data.int32)) { (status == errreg->info[j].value.data.int32)) {
iptr[0].value.data.integer = i; iptr[0].value.data.integer = i;
errreg->errhandler(status, procs, nprocs, iptr, ninfo+1); errreg->errhandler(status, procs, nprocs, iptr, ninfo+1);
fired = true; fired = true;
exact_match = true;
break; break;
} }
} }

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved. * Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. * Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved. * All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -305,7 +305,7 @@ static int test_item5(void)
const char **ptr = tkeys; const char **ptr = tkeys;
if (_legacy || !_legacy) { if (_legacy || !_legacy) {
log_error("PMIx and SLURM/PMI1 do not set 'PMI_process_mapping' (Do not mark test as failed)\n"); log_error("PMIx and SLURM/PMI1 do not set 'PMI_process_mapping' %s\n", "(Do not mark test as failed)");
return rc; return rc;
} }