1
1
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-08-20 14:00:41 -07:00
родитель 8483eb4bf7
Коммит 3eef3d1d8f
16 изменённых файлов: 410 добавлений и 42 удалений

Просмотреть файл

@ -21,7 +21,21 @@ example, a bug might be fixed in the master, and then moved to the
current release as well as the "stable" bug fix release branch.
3.0.0 -- TBD
3.0.1 -- 23 Aug 2018
----------------------
**** DEPRECATION WARNING: The pmix_info_array_t struct was
**** initially marked for deprecation in the v2.x series.
**** We failed to provide clear warning at that time. This
**** therefore serves as warning of intended removal of
**** pmix_info_array_t in the future v4 release series.
- Fixed memory corruption bug in event notification
system due to uninitialized variable
- Add numeric version field to pmix_version.h
- Transfer all cached data to client dstore upon first connect
- Implement missing job control and sensor APIs
3.0.0 -- 6 July 2018
------------------------------------
**** NOTE: This release implements the complete PMIX v3.0 Standard
**** and therefore includes a number of new APIs and features. These
@ -63,7 +77,15 @@ current release as well as the "stable" bug fix release branch.
- Fix several memory and file descriptor leaks
2.1.2 -- TBD
2.1.3 -- 23 Aug 2018
----------------------
- Fixed memory corruption bug in event notification
system due to uninitialized variable
- Add numeric version definition
- Transfer all cached data to client dstore upon first connect
2.1.2 -- 6 July 2018
----------------------
- Added PMIX_VERSION_RELEASE string to pmix_version.h
- Added PMIX_SPAWNED and PMIX_PARENT_ID keys to all procs

Просмотреть файл

@ -15,7 +15,7 @@
major=3
minor=0
release=0
release=1
# greek is used for alpha or beta release tags. If it is non-empty,
# it will be appended to the version number. It does not have to be
@ -30,7 +30,7 @@ greek=
# command, or with the date (if "git describe" fails) in the form of
# "date<date>".
repo_rev=gitffba520
repo_rev=gitbf30a5f
# If tarball_version is not empty, it is used as the version string in
# the tarball filename, regardless of all other versions listed in
@ -44,7 +44,7 @@ tarball_version=
# The date when this release was created
date="Jul 01, 2018"
date="Aug 20, 2018"
# The shared library version of each of PMIx's public libraries.
# These versions are maintained in accordance with the "Library
@ -75,6 +75,6 @@ date="Jul 01, 2018"
# Version numbers are described in the Libtool current:revision:age
# format.
libpmix_so_version=4:0:2
libpmix_so_version=4:1:2
libpmi_so_version=1:0:0
libpmi2_so_version=1:0:0

Просмотреть файл

@ -192,7 +192,7 @@
Summary: An extended/exascale implementation of PMI
Name: %{?_name:%{_name}}%{!?_name:pmix}
Version: 3.0.0
Version: 3.0.1
Release: 1%{?dist}
License: BSD
Group: Development/Libraries

Просмотреть файл

@ -781,6 +781,7 @@ typedef int pmix_status_t;
#define PMIX_LAUNCH_DIRECTIVE (PMIX_ERR_OP_BASE - 24)
#define PMIX_LAUNCHER_READY (PMIX_ERR_OP_BASE - 25)
#define PMIX_OPERATION_IN_PROGRESS (PMIX_ERR_OP_BASE - 26)
#define PMIX_OPERATION_SUCCEEDED (PMIX_ERR_OP_BASE - 27)
/* define a starting point for system error constants so

Просмотреть файл

@ -2,6 +2,7 @@
* Copyright (c) 2016 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2018 IBM Corporation. All rights reserved.
* Copyright (c) 2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -17,4 +18,6 @@
#define PMIX_VERSION_MAJOR @pmixmajor@
#define PMIX_VERSION_MINOR @pmixminor@
#define PMIX_VERSION_RELEASE @pmixrelease@
#define PMIX_NUMERIC_VERSION 0x00030001
#endif

Просмотреть файл

@ -85,7 +85,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
/* unpack any returned data */
cnt = 1;
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->ninfo, &cnt, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
if (PMIX_SUCCESS != rc && PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
PMIX_ERROR_LOG(rc);
goto complete;
}
@ -332,6 +332,12 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm
return PMIX_ERR_INIT;
}
/* sanity check */
if (NULL == monitor) {
PMIX_RELEASE_THREAD(&pmix_global_lock);
return PMIX_ERR_BAD_PARAM;
}
/* if we are the server, then we just issue the request and
* return the response */
if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) &&
@ -355,6 +361,19 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm
}
PMIX_RELEASE_THREAD(&pmix_global_lock);
/* if the monitor is PMIX_SEND_HEARTBEAT, then send it */
if (0 == strncmp(monitor->key, PMIX_SEND_HEARTBEAT, PMIX_MAX_KEYLEN)) {
msg = PMIX_NEW(pmix_buffer_t);
if (NULL == msg) {
return PMIX_ERR_NOMEM;
}
PMIX_PTL_SEND_ONEWAY(rc, pmix_client_globals.myserver, msg, PMIX_PTL_TAG_HEARTBEAT);
if (PMIX_SUCCESS != rc) {
PMIX_RELEASE(msg);
}
return rc;
}
/* if we are a client, then relay this request to the server */
msg = PMIX_NEW(pmix_buffer_t);
/* pack the cmd */

Просмотреть файл

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* Copyright (c) 2016-2018 IBM Corporation. All rights reserved.
* Copyright (c) 2016-2017 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2018 Research Organization for Information Science
@ -3170,7 +3170,7 @@ static pmix_status_t dstore_register_job_info(struct pmix_peer_t *pr,
char *msg;
pmix_status_t rc;
pmix_proc_t proc;
pmix_rank_info_t *rinfo;
pmix_rank_t rank;
pmix_output_verbose(2, pmix_gds_base_framework.framework_output,
"[%s:%d] gds:dstore:register_job_info for peer [%s:%d]",
@ -3187,8 +3187,8 @@ static pmix_status_t dstore_register_job_info(struct pmix_peer_t *pr,
return rc;
}
PMIX_LIST_FOREACH(rinfo, &ns->ranks, pmix_rank_info_t) {
proc.rank = rinfo->pname.rank;
for (rank=0; rank < ns->nprocs; rank++) {
proc.rank = rank;
rc = _store_job_info(&proc);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
@ -24,6 +24,7 @@ pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t erro
{
pmix_psensor_active_module_t *mod;
pmix_status_t rc;
bool didit = false;
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
"%s:%d sensor:base: starting sensors",
@ -36,9 +37,17 @@ pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t erro
if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
return rc;
}
didit = true;
}
}
/* if none of the components could do it, then report
* not supported upwards so the server knows to ask
* the host to try */
if (!didit) {
return PMIX_ERR_NOT_SUPPORTED;
}
return PMIX_SUCCESS;
}
@ -46,7 +55,7 @@ pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
char *id)
{
pmix_psensor_active_module_t *mod;
pmix_status_t rc;
pmix_status_t rc, ret = PMIX_SUCCESS;
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
"%s:%d sensor:base: stopping sensors",
@ -57,10 +66,14 @@ pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
if (NULL != mod->module->stop) {
rc = mod->module->stop(requestor, id);
if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
return rc;
if (PMIX_SUCCESS == ret) {
ret = rc;
}
/* need to continue to ensure that all
* sensors have been stopped */
}
}
}
return PMIX_SUCCESS;
return ret;
}

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -30,7 +30,7 @@
#include "src/util/output.h"
#include "src/util/show_help.h"
#include "src/include/pmix_globals.h"
#include "src/mca/ptl/ptl.h"
#include "src/mca/ptl/base/base.h"
#include "src/mca/psensor/base/base.h"
#include "psensor_heartbeat.h"
@ -168,6 +168,7 @@ static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error
{
pmix_heartbeat_trkr_t *ft;
size_t n;
pmix_ptl_posted_recv_t *rcv;
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
"[%s:%d] checking heartbeat monitoring for requestor %s:%d",
@ -202,6 +203,17 @@ static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error
return PMIX_ERR_BAD_PARAM;
}
/* if the recv hasn't been posted, do so now */
if (!mca_psensor_heartbeat_component.recv_active) {
/* setup to receive heartbeats */
rcv = PMIX_NEW(pmix_ptl_posted_recv_t);
rcv->tag = PMIX_PTL_TAG_HEARTBEAT;
rcv->cbfunc = pmix_psensor_heartbeat_recv_beats;
/* add it to the beginning of the list of recvs */
pmix_list_prepend(&pmix_ptl_globals.posted_recvs, &rcv->super);
mca_psensor_heartbeat_component.recv_active = true;
}
/* need to push into our event base to add this to our trackers */
pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
EV_WRITE, add_tracker, ft);
@ -241,7 +253,7 @@ static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id)
cd->requestor = requestor;
cd->id = strdup(id);
/* need to push into our event base to add this to our trackers */
/* need to push into our event base to remove this from our trackers */
pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
EV_WRITE, del_tracker, cd);
PMIX_POST_OBJECT(cd);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -28,6 +28,7 @@ BEGIN_C_DECLS
typedef struct {
pmix_psensor_base_component_t super;
bool recv_active;
pmix_list_t trackers;
} pmix_psensor_heartbeat_component_t;

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -50,14 +50,9 @@ pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = {
*/
static int heartbeat_open(void)
{
pmix_status_t rc;
PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t);
/* setup to receive heartbeats */
PMIX_PTL_RECV(rc, pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT);
return rc;
return PMIX_SUCCESS;
}
@ -74,12 +69,7 @@ static int heartbeat_query(pmix_mca_base_module_t **module, int *priority)
static int heartbeat_close(void)
{
pmix_status_t rc;
/* cancel our persistent recv */
PMIX_PTL_CANCEL(rc, pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT);
PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers);
return rc;
return PMIX_SUCCESS;
}

Просмотреть файл

@ -66,6 +66,7 @@
#include "src/mca/bfrops/base/base.h"
#include "src/mca/gds/base/base.h"
#include "src/mca/preg/preg.h"
#include "src/mca/psensor/base/base.h"
#include "src/mca/ptl/base/base.h"
#include "src/hwloc/hwloc-internal.h"
@ -397,6 +398,16 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
}
}
/* open the psensor framework */
if (PMIX_SUCCESS != (rc = pmix_mca_base_framework_open(&pmix_psensor_base_framework, 0))) {
PMIX_RELEASE_THREAD(&pmix_global_lock);
return rc;
}
if (PMIX_SUCCESS != (rc = pmix_psensor_base_select())) {
PMIX_RELEASE_THREAD(&pmix_global_lock);
return rc;
}
/* setup the wildcard recv for inbound messages from clients */
req = PMIX_NEW(pmix_ptl_posted_recv_t);
req->tag = UINT32_MAX;
@ -511,6 +522,8 @@ PMIX_EXPORT pmix_status_t PMIx_server_finalize(void)
if (NULL != pmix_server_globals.tmpdir) {
free(pmix_server_globals.tmpdir);
}
/* close the psensor framework */
(void)pmix_mca_base_framework_close(&pmix_psensor_base_framework);
/* close the pnet framework */
(void)pmix_mca_base_framework_close(&pmix_pnet_base_framework);
@ -3228,6 +3241,9 @@ void pmix_server_message_handler(struct pmix_peer_t *pr,
PMIX_ERROR_LOG(PMIX_ERR_NOMEM);
return;
}
if (PMIX_OPERATION_SUCCEEDED == ret) {
ret = PMIX_SUCCESS;
}
PMIX_BFROPS_PACK(rc, pr, reply, &ret, 1, PMIX_STATUS);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);

Просмотреть файл

@ -52,6 +52,7 @@
#include "src/class/pmix_list.h"
#include "src/mca/bfrops/bfrops.h"
#include "src/mca/plog/plog.h"
#include "src/mca/psensor/psensor.h"
#include "src/util/argv.h"
#include "src/util/error.h"
#include "src/util/output.h"
@ -2536,10 +2537,8 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
PMIX_LIST_DESTRUCT(&cachefiles);
if (cnt == (int)cd->ninfo) {
/* nothing more to do */
if (NULL != cbfunc) {
cbfunc(PMIX_SUCCESS, NULL, 0, cd, NULL, NULL);
}
return PMIX_SUCCESS;
rc = PMIX_SUCCESS;
goto exit;
}
}
@ -2575,9 +2574,6 @@ pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
pmix_output_verbose(2, pmix_server_globals.base_output,
"recvd monitor request from client");
if (NULL == pmix_host_server.monitor) {
return PMIX_ERR_NOT_SUPPORTED;
}
cd = PMIX_NEW(pmix_query_caddy_t);
if (NULL == cd) {
@ -2620,6 +2616,24 @@ pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
}
}
/* see if they are requesting one of the monitoring
* methods we internally support */
rc = pmix_psensor.start(peer, error, &monitor, cd->info, cd->ninfo);
if (PMIX_SUCCESS == rc) {
rc = PMIX_OPERATION_SUCCEEDED;
goto exit;
}
if (PMIX_ERR_NOT_SUPPORTED != rc) {
goto exit;
}
/* if we don't internally support it, see if
* our host does */
if (NULL == pmix_host_server.monitor) {
rc = PMIX_ERR_NOT_SUPPORTED;
goto exit;
}
/* setup the requesting peer name */
(void)strncpy(proc.nspace, peer->info->pname.nspace, PMIX_MAX_NSLEN);
proc.rank = peer->info->pname.rank;

Просмотреть файл

@ -25,7 +25,7 @@ headers = simptest.h
noinst_PROGRAMS = simptest simpclient simppub simpdyn simpft simpdmodex \
test_pmix simptool simpdie simplegacy simptimeout \
gwtest gwclient stability quietclient
gwtest gwclient stability quietclient simpjctrl
simptest_SOURCES = $(headers) \
simptest.c
@ -116,3 +116,9 @@ quietclient_SOURCES = $(headers) \
quietclient_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS)
quietclient_LDADD = \
$(top_builddir)/src/libpmix.la
simpjctrl_SOURCES = \
simpjctrl.c
simpjctrl_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS)
simpjctrl_LDADD = \
$(top_builddir)/src/libpmix.la

Просмотреть файл

@ -0,0 +1,231 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <signal.h>
#include <pmix.h>
#include "simptest.h"
static pmix_proc_t myproc;
/* Default event handler, passed down below when registering for
 * general events. Registering one is not technically required, but
 * it is usually good practice to catch any events that occur.
 *
 * The handler takes no action on the event itself: if the caller
 * supplied a completion callback, it is invoked with
 * PMIX_EVENT_ACTION_COMPLETE, no results, and the caller's cbdata. */
static void notification_fn(size_t evhdlr_registration_id,
                            pmix_status_t status,
                            const pmix_proc_t *source,
                            pmix_info_t info[], size_t ninfo,
                            pmix_info_t results[], size_t nresults,
                            pmix_event_notification_cbfunc_fn_t cbfunc,
                            void *cbdata)
{
    /* nothing to report back to if no completion callback was given */
    if (NULL == cbfunc) {
        return;
    }
    cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
}
/* Event handler registration is done asynchronously because it
 * may involve the PMIx server registering with the host RM for
 * external events. This callback receives the status of the
 * request (success or an error) plus a numerical index to the
 * registered event. The index is used later on to deregister
 * an event handler - if we don't explicitly deregister it, then the
 * PMIx server will do so when it sees us exit.
 *
 * cbdata is the mylock_t the registering thread is waiting on: the
 * status is recorded in it and the waiter is then woken up. */
static void evhandler_reg_callbk(pmix_status_t status,
                                 size_t evhandler_ref,
                                 void *cbdata)
{
    mylock_t *waiter = (mylock_t*)cbdata;

    /* hand the result to the waiting thread */
    waiter->status = status;

    if (PMIX_SUCCESS != status) {
        fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
                myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref);
    }

    DEBUG_WAKEUP_THREAD(waiter);
}
/* Completion callback used for the job-control and monitoring
 * requests issued from main(). It logs the returned status, lets
 * the provider of the returned info release it (when a release
 * function was supplied), stores the status in the mylock_t passed
 * as cbdata, and wakes the thread waiting on that lock. The
 * returned info/ninfo themselves are not examined. */
static void infocbfunc(pmix_status_t status,
                       pmix_info_t *info, size_t ninfo,
                       void *cbdata,
                       pmix_release_cbfunc_t release_fn,
                       void *release_cbdata)
{
    mylock_t *waiter = (mylock_t*)cbdata;

    fprintf(stderr, "Callback recvd with status %d\n", status);

    /* record the outcome for the waiting thread */
    waiter->status = status;

    /* release the caller */
    if (NULL != release_fn) {
        release_fn(release_cbdata);
    }

    DEBUG_WAKEUP_THREAD(waiter);
}
int main(int argc, char **argv)
{
int rc;
pmix_value_t value;
pmix_value_t *val = &value;
pmix_proc_t proc;
uint32_t nprocs, n;
pmix_info_t *info, *iptr;
bool flag;
mylock_t mylock;
pmix_data_array_t *dptr;
/* init us - note that the call to "init" includes the return of
* any job-related info provided by the RM. */
if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
exit(0);
}
fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);
/* register our default event handler - again, this isn't strictly
* required, but is generally good practice */
DEBUG_CONSTRUCT_LOCK(&mylock);
PMIx_Register_event_handler(NULL, 0, NULL, 0,
notification_fn, evhandler_reg_callbk, (void*)&mylock);
DEBUG_WAIT_THREAD(&mylock);
if (0 != mylock.status) {
fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
exit(mylock.status);
}
DEBUG_DESTRUCT_LOCK(&mylock);
/* job-related info is found in our nspace, assigned to the
* wildcard rank as it doesn't relate to a specific rank. Setup
* a name to retrieve such values */
PMIX_PROC_CONSTRUCT(&proc);
(void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
proc.rank = PMIX_RANK_WILDCARD;
/* get our universe size */
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
nprocs = val->data.uint32;
PMIX_VALUE_RELEASE(val);
fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs);
/* inform the RM that we are preemptible, and that our checkpoint methods are
* "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */
PMIX_INFO_CREATE(info, 2);
flag = true;
PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL);
/* can't use "load" to load a pmix_data_array_t */
(void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN);
info[1].value.type = PMIX_DATA_ARRAY;
dptr = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t));
info[1].value.data.darray = dptr;
dptr->type = PMIX_INFO;
dptr->size = 2;
PMIX_INFO_CREATE(dptr->array, dptr->size);
rc = SIGUSR2;
iptr = (pmix_info_t*)dptr->array;
PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT);
rc = PMIX_JCTRL_CHECKPOINT;
PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS);
/* since this is informational and not a requested operation, the target parameter
* doesn't mean anything and can be ignored */
DEBUG_CONSTRUCT_LOCK(&mylock);
if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&mylock))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
DEBUG_WAIT_THREAD(&mylock);
PMIX_INFO_FREE(info, 2);
if (0 != mylock.status) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, mylock.status);
exit(mylock.status);
}
DEBUG_DESTRUCT_LOCK(&mylock);
/* now request that this process be monitored using heartbeats */
PMIX_INFO_CREATE(iptr, 1);
PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER);
PMIX_INFO_CREATE(info, 3);
PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING);
n = 5; // require a heartbeat every 5 seconds
PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32);
n = 2; // two heartbeats can be missed before declaring us "stalled"
PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32);
/* make the request */
DEBUG_CONSTRUCT_LOCK(&mylock);
if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT,
info, 3, infocbfunc, (void*)&mylock))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
DEBUG_WAIT_THREAD(&mylock);
PMIX_INFO_FREE(iptr, 1);
PMIX_INFO_FREE(info, 3);
if (0 != mylock.status) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, mylock.status);
exit(mylock.status);
}
DEBUG_DESTRUCT_LOCK(&mylock);
/* send a heartbeat */
PMIx_Heartbeat();
/* call fence to synchronize with our peers - no need to
* collect any info as we didn't "put" anything */
PMIX_INFO_CREATE(info, 1);
flag = false;
PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
PMIX_INFO_FREE(info, 1);
done:
/* finalize us */
fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
} else {
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
}
fflush(stderr);
return(0);
}

Просмотреть файл

@ -107,6 +107,18 @@ static void log_fn(const pmix_proc_t *client,
const pmix_info_t data[], size_t ndata,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t alloc_fn(const pmix_proc_t *client,
pmix_alloc_directive_t directive,
const pmix_info_t data[], size_t ndata,
pmix_info_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t jctrl_fn(const pmix_proc_t *requestor,
const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t mon_fn(const pmix_proc_t *requestor,
const pmix_info_t *monitor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
static pmix_server_module_t mymodule = {
.client_connected = connected,
@ -125,7 +137,10 @@ static pmix_server_module_t mymodule = {
.notify_event = notify_event,
.query = query_fn,
.tool_connected = tool_connect_fn,
.log = log_fn
.log = log_fn,
.allocate = alloc_fn,
.job_control = jctrl_fn,
.monitor = mon_fn
};
typedef struct {
@ -1073,6 +1088,31 @@ static void log_fn(const pmix_proc_t *client,
}
}
/* Allocation-request entry of the test server module. This is a
 * stub: every argument is ignored, cbfunc is never invoked, and
 * PMIX_SUCCESS is returned unconditionally. */
static pmix_status_t alloc_fn(const pmix_proc_t *client,
                              pmix_alloc_directive_t directive,
                              const pmix_info_t data[], size_t ndata,
                              pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    /* deliberately unused */
    (void)client;
    (void)directive;
    (void)data;
    (void)ndata;
    (void)cbfunc;
    (void)cbdata;

    return PMIX_SUCCESS;
}
/* Job-control entry of the test server module. This is a stub:
 * every argument is ignored, cbfunc is never invoked, and
 * PMIX_OPERATION_SUCCEEDED is returned unconditionally. */
static pmix_status_t jctrl_fn(const pmix_proc_t *requestor,
                              const pmix_proc_t targets[], size_t ntargets,
                              const pmix_info_t directives[], size_t ndirs,
                              pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    /* deliberately unused */
    (void)requestor;
    (void)targets;
    (void)ntargets;
    (void)directives;
    (void)ndirs;
    (void)cbfunc;
    (void)cbdata;

    return PMIX_OPERATION_SUCCEEDED;
}
/* Monitoring entry of the test server module. This is a stub:
 * every argument is ignored, cbfunc is never invoked, and
 * PMIX_ERR_NOT_SUPPORTED is returned unconditionally. */
static pmix_status_t mon_fn(const pmix_proc_t *requestor,
                            const pmix_info_t *monitor, pmix_status_t error,
                            const pmix_info_t directives[], size_t ndirs,
                            pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    /* deliberately unused */
    (void)requestor;
    (void)monitor;
    (void)error;
    (void)directives;
    (void)ndirs;
    (void)cbfunc;
    (void)cbdata;

    return PMIX_ERR_NOT_SUPPORTED;
}
static void wait_signal_callback(int fd, short event, void *arg)
{
pmix_event_t *sig = (pmix_event_t*) arg;