diff --git a/opal/mca/pmix/pmix114/pmix/VERSION b/opal/mca/pmix/pmix114/pmix/VERSION index 3ec0e1ba04..9d1bd79804 100644 --- a/opal/mca/pmix/pmix114/pmix/VERSION +++ b/opal/mca/pmix/pmix114/pmix/VERSION @@ -23,14 +23,14 @@ release=4 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek=rc1 +greek=rc2 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=gitb363c5d +repo_rev=gitd9fd3da # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Apr 15, 2016" +date="Apr 16, 2016" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix114/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix114/pmix/src/client/pmix_client.c index b5d0cd9bb9..10620dce81 100644 --- a/opal/mca/pmix/pmix114/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix114/pmix/src/client/pmix_client.c @@ -702,6 +702,10 @@ PMIX_EXPORT pmix_status_t PMIx_Commit(void) pmix_cb_t *cb; pmix_status_t rc; + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + /* if we are a server, or we aren't connected, don't attempt to send */ if (pmix_globals.server) { return PMIX_SUCCESS; // not an error @@ -784,6 +788,10 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, pmix_cb_t *cb; pmix_status_t rc; + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); cb->active = true; @@ -843,6 +851,10 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const char *nspace, char **nodelist pmix_cb_t *cb; pmix_status_t rc; + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); cb->active = true; diff --git a/opal/mca/pmix/pmix114/pmix/src/common/pmix_common.c b/opal/mca/pmix/pmix114/pmix/src/common/pmix_common.c index a84f9e76e7..5e83b5be5a 100644 --- a/opal/mca/pmix/pmix114/pmix/src/common/pmix_common.c +++ b/opal/mca/pmix/pmix114/pmix/src/common/pmix_common.c @@ -31,6 +31,10 @@ PMIX_EXPORT void PMIx_Register_errhandler(pmix_info_t info[], size_t ninfo, pmix_errhandler_reg_cbfunc_t cbfunc, void *cbdata) { + if (pmix_globals.init_cntr <= 0) { + return; + } + /* common err handler registration */ if (pmix_globals.server) { /* PMIX server: store the error handler, process info keys and call @@ -57,6 +61,10 @@ PMIX_EXPORT void PMIx_Deregister_errhandler(int errhandler_ref, pmix_op_cbfunc_t cbfunc, void *cbdata) { + if (pmix_globals.init_cntr <= 0) { + return; + } + /* common err handler registration */ if (pmix_globals.server) { /* PMIX server: store the error handler, process info keys and call @@ -82,6 +90,10 @@ PMIX_EXPORT pmix_status_t PMIx_Notify_error(pmix_status_t status, { int rc; + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + if (pmix_globals.server) { rc = pmix_server_notify_error(status, procs, nprocs, error_procs, error_nprocs, info, ninfo, diff --git a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server.c index 8cb3a98700..c8bb2953c2 100644 --- a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . @@ -586,17 +586,35 @@ PMIX_EXPORT pmix_status_t PMIx_server_register_nspace(const char nspace[], int n static void _deregister_nspace(int sd, short args, void *cbdata) { pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; - pmix_nspace_t *tmp; + pmix_nspace_t *nptr; + int i; + pmix_peer_t *peer; pmix_output_verbose(2, pmix_globals.debug_output, "pmix:server _deregister_nspace %s", cd->proc.nspace); /* see if we already have this nspace */ - PMIX_LIST_FOREACH(tmp, &pmix_globals.nspaces, pmix_nspace_t) { - if (0 == strcmp(tmp->nspace, cd->proc.nspace)) { - pmix_list_remove_item(&pmix_globals.nspaces, &tmp->super); - PMIX_RELEASE(tmp); + PMIX_LIST_FOREACH(nptr, &pmix_globals.nspaces, pmix_nspace_t) { + if (0 == strcmp(nptr->nspace, cd->proc.nspace)) { + /* find and remove this client from our array of local + * peers - remember that it can occur multiple times + * if the peer called fork/exec and its children called + * PMIx_Init! We have to rely on none of those children + * living beyond our child as we otherwise cannot + * track them */ + for (i=0; i < pmix_server_globals.clients.size; i++) { + if (NULL == (peer = (pmix_peer_t*)pmix_pointer_array_get_item(&pmix_server_globals.clients, i))) { + continue; + } + if (nptr == peer->info->nptr) { + /* remove this entry */ + pmix_pointer_array_set_item(&pmix_server_globals.clients, i, NULL); + PMIX_RELEASE(peer); + } + } + pmix_list_remove_item(&pmix_globals.nspaces, &nptr->super); + PMIX_RELEASE(nptr); break; } } @@ -620,8 +638,7 @@ PMIX_EXPORT void PMIx_server_deregister_nspace(const char nspace[]) PMIX_THREADSHIFT(cd, _deregister_nspace); } -static void _execute_collective(int sd, short args, void *cbdata) -{ + void pmix_server_execute_collective(int sd, short args, void *cbdata) { pmix_trkr_caddy_t *tcd = (pmix_trkr_caddy_t*)cbdata; pmix_server_trkr_t *trk = tcd->trk; char *data = NULL; @@ -759,7 +776,7 @@ static void _register_client(int sd, short args, void *cbdata) * we don't want to block someone * here, so kick any completed trackers into a * new event for processing */ - PMIX_EXECUTE_COLLECTIVE(tcd, trk, _execute_collective); + PMIX_EXECUTE_COLLECTIVE(tcd, trk, pmix_server_execute_collective); } } /* also check any pending local modex requests to see if @@ -803,8 +820,9 @@ PMIX_EXPORT pmix_status_t PMIx_server_register_client(const pmix_proc_t *proc, static void _deregister_client(int sd, short args, void *cbdata) { pmix_setup_caddy_t *cd = (pmix_setup_caddy_t*)cbdata; - pmix_rank_info_t *info; pmix_nspace_t *nptr, *tmp; + int i; + pmix_peer_t *peer; pmix_output_verbose(2, pmix_globals.debug_output, "pmix:server _deregister_client for nspace %s rank %d", @@ -822,15 +840,27 @@ static void _deregister_client(int sd, short args, void *cbdata) /* nothing to do */ goto cleanup; } - /* find an remove this client */ - PMIX_LIST_FOREACH(info, &nptr->server->ranks, pmix_rank_info_t) { - if (info->rank == cd->proc.rank) { - pmix_list_remove_item(&nptr->server->ranks, &info->super); - PMIX_RELEASE(info); - break; + /* find and remove this client from our array of local + * peers - remember that it can occur multiple times + * if the peer called fork/exec and its children called + * PMIx_Init! We have to rely on none of those children + * living beyond our child as we otherwise cannot + * track them */ + for (i=0; i < pmix_server_globals.clients.size; i++) { + if (NULL == (peer = (pmix_peer_t*)pmix_pointer_array_get_item(&pmix_server_globals.clients, i))) { + continue; + } + if (nptr != peer->info->nptr) { + continue; + } + if (cd->proc.rank == peer->info->rank) { + /* remove this entry */ + pmix_pointer_array_set_item(&pmix_server_globals.clients, i, NULL); + PMIX_RELEASE(peer); } } + cleanup: PMIX_RELEASE(cd); } @@ -2139,8 +2169,9 @@ void regevents_cbfunc (pmix_status_t status, void *cbdata) } } reply = PMIX_NEW(pmix_buffer_t); - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(reply, &status, 1, PMIX_INT))) + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(reply, &status, 1, PMIX_INT))) { PMIX_ERROR_LOG(rc); + } // send reply PMIX_SERVER_QUEUE_REPLY(cd->peer, cd->hdr.tag, reply); PMIX_RELEASE(cd); diff --git a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_listener.c b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_listener.c index 4abae391fa..2a01dd63a0 100644 --- a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_listener.c +++ b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_listener.c @@ -91,7 +91,7 @@ pmix_status_t pmix_start_listening(struct sockaddr_un *address) return PMIX_ERROR; } /* set the mode as required */ - if (0 != chmod(address->sun_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)) { + if (0 != chmod(address->sun_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH)) { pmix_output(0, "CANNOT CHMOD %s\n", address->sun_path); return PMIX_ERROR; } diff --git a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_ops.h b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_ops.h index 60ad913f0e..936dab1501 100644 --- a/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_ops.h +++ b/opal/mca/pmix/pmix114/pmix/src/server/pmix_server_ops.h @@ -234,6 +234,8 @@ void pmix_server_check_notifications(pmix_regevents_info_t *reginfo, void regevents_cbfunc (pmix_status_t status, void *cbdata); +void pmix_server_execute_collective(int sd, short args, void *cbdata); + extern pmix_server_module_t pmix_host_server; extern pmix_server_globals_t pmix_server_globals; diff --git a/opal/mca/pmix/pmix114/pmix/src/usock/usock_sendrecv.c b/opal/mca/pmix/pmix114/pmix/src/usock/usock_sendrecv.c index dfd0f113e6..3dbc594517 100644 --- a/opal/mca/pmix/pmix114/pmix/src/usock/usock_sendrecv.c +++ b/opal/mca/pmix/pmix114/pmix/src/usock/usock_sendrecv.c @@ -50,6 +50,10 @@ static uint32_t current_tag = 1; // 0 is reserved for system purposes static void lost_connection(pmix_peer_t *peer, pmix_status_t err) { + pmix_server_trkr_t *trk; + pmix_rank_info_t *rinfo, *rnext; + pmix_trkr_caddy_t *tcd; + /* stop all events */ if (peer->recv_ev_active) { event_del(&peer->recv_event); @@ -65,9 +69,42 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) } CLOSE_THE_SOCKET(peer->sd); if (pmix_globals.server) { - /* if I am a server, then we need to - * do some cleanup as the client has - * left us */ + /* if I am a server, then we need to ensure that + * we properly account for the loss of this client + * from any local collectives in which it was + * participating - note that the proc would not + * have been added to any collective tracker until + * after it successfully connected */ + PMIX_LIST_FOREACH(trk, &pmix_server_globals.collectives, pmix_server_trkr_t) { + /* see if this proc is participating in this tracker */ + PMIX_LIST_FOREACH_SAFE(rinfo, rnext, &trk->ranks, pmix_rank_info_t) { + if (0 != strncmp(rinfo->nptr->nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN)) { + continue; + } + if (rinfo->rank != peer->info->rank) { + continue; + } + /* it is - adjust the count */ + --trk->nlocal; + /* remove it from the list */ + pmix_list_remove_item(&trk->ranks, &rinfo->super); + PMIX_RELEASE(rinfo); + /* check for completion */ + if (pmix_list_get_size(&trk->local_cbs) == trk->nlocal) { + /* complete, so now we need to process it + * we don't want to block someone + * here, so kick any completed trackers into a + * new event for processing */ + PMIX_EXECUTE_COLLECTIVE(tcd, trk, pmix_server_execute_collective); + } + } + } + /* remove this proc from the list of ranks for this nspace */ + pmix_list_remove_item(&(peer->info->nptr->server->ranks), &(peer->info->super)); + PMIX_RELEASE(peer->info); + /* reduce the number of local procs */ + --peer->info->nptr->server->nlocalprocs; + /* do some cleanup as the client has left us */ pmix_pointer_array_set_item(&pmix_server_globals.clients, peer->index, NULL); PMIX_RELEASE(peer); diff --git a/opal/mca/pmix/pmix114/pmix/src/util/error.c b/opal/mca/pmix/pmix114/pmix/src/util/error.c index 9ac5f73cdc..8f182d2281 100644 --- a/opal/mca/pmix/pmix114/pmix/src/util/error.c +++ b/opal/mca/pmix/pmix114/pmix/src/util/error.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2007-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -157,9 +157,8 @@ void pmix_errhandler_invoke(pmix_status_t status, /* We need to parse thru each registered handler and determine * which one to call for the specific error */ int i, idflt; - size_t j, k; + size_t j; bool fired = false; - bool exact_match; pmix_error_reg_info_t *errreg, *errdflt=NULL; pmix_info_t *iptr; @@ -184,14 +183,12 @@ void pmix_errhandler_invoke(pmix_status_t status, } iptr[0].value.data.integer = i; /* match error name key first */ - exact_match = false; for (j = 0; j < errreg->ninfo; j++) { if ((0 == strcmp(errreg->info[j].key, PMIX_ERROR_NAME)) && (status == errreg->info[j].value.data.int32)) { iptr[0].value.data.integer = i; errreg->errhandler(status, procs, nprocs, iptr, ninfo+1); fired = true; - exact_match = true; break; } } diff --git a/opal/mca/pmix/pmix114/pmix/test/pmi_client.c b/opal/mca/pmix/pmix114/pmix/test/pmi_client.c index 51d8ffab40..c16e07ccce 100644 --- a/opal/mca/pmix/pmix114/pmix/test/pmi_client.c +++ b/opal/mca/pmix/pmix114/pmix/test/pmi_client.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. * $COPYRIGHT$ @@ -305,7 +305,7 @@ static int test_item5(void) const char **ptr = tkeys; if (_legacy || !_legacy) { - log_error("PMIx and SLURM/PMI1 do not set 'PMI_process_mapping' (Do not mark test as failed)\n"); + log_error("PMIx and SLURM/PMI1 do not set 'PMI_process_mapping' %s\n", "(Do not mark test as failed)"); return rc; }