Update to PMIx 3.0.1
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
8483eb4bf7
Коммит
3eef3d1d8f
@ -21,7 +21,21 @@ example, a bug might be fixed in the master, and then moved to the
|
||||
current release as well as the "stable" bug fix release branch.
|
||||
|
||||
|
||||
3.0.0 -- TBD
|
||||
3.0.1 -- 23 Aug 2018
|
||||
----------------------
|
||||
**** DEPRECATION WARNING: The pmix_info_array_t struct was
|
||||
**** initially marked for deprecation in the v2.x series.
|
||||
**** We failed to provide clear warning at that time. This
|
||||
**** therefore serves as warning of intended removal of
|
||||
**** pmix_info_array_t in the future v4 release series.
|
||||
- Fixed memory corruption bug in event notification
|
||||
system due to uninitialized variable
|
||||
- Add numeric version field to pmix_version.h
|
||||
- Transfer all cached data to client dstore upon first connect
|
||||
- Implement missing job control and sensor APIs
|
||||
|
||||
|
||||
3.0.0 -- 6 July 2018
|
||||
------------------------------------
|
||||
**** NOTE: This release implements the complete PMIX v3.0 Standard
|
||||
**** and therefore includes a number of new APIs and features. These
|
||||
@ -63,7 +77,15 @@ current release as well as the "stable" bug fix release branch.
|
||||
- Fix several memory and file descriptor leaks
|
||||
|
||||
|
||||
2.1.2 -- TBD
|
||||
2.1.3 -- 23 Aug 2018
|
||||
----------------------
|
||||
- Fixed memory corruption bug in event notification
|
||||
system due to uninitialized variable
|
||||
- Add numeric version definition
|
||||
- Transfer all cached data to client dstore upon first connect
|
||||
|
||||
|
||||
2.1.2 -- 6 July 2018
|
||||
----------------------
|
||||
- Added PMIX_VERSION_RELEASE string to pmix_version.h
|
||||
- Added PMIX_SPAWNED and PMIX_PARENT_ID keys to all procs
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
major=3
|
||||
minor=0
|
||||
release=0
|
||||
release=1
|
||||
|
||||
# greek is used for alpha or beta release tags. If it is non-empty,
|
||||
# it will be appended to the version number. It does not have to be
|
||||
@ -30,7 +30,7 @@ greek=
|
||||
# command, or with the date (if "git describe" fails) in the form of
|
||||
# "date<date>".
|
||||
|
||||
repo_rev=gitffba520
|
||||
repo_rev=gitbf30a5f
|
||||
|
||||
# If tarball_version is not empty, it is used as the version string in
|
||||
# the tarball filename, regardless of all other versions listed in
|
||||
@ -44,7 +44,7 @@ tarball_version=
|
||||
|
||||
# The date when this release was created
|
||||
|
||||
date="Jul 01, 2018"
|
||||
date="Aug 20, 2018"
|
||||
|
||||
# The shared library version of each of PMIx's public libraries.
|
||||
# These versions are maintained in accordance with the "Library
|
||||
@ -75,6 +75,6 @@ date="Jul 01, 2018"
|
||||
# Version numbers are described in the Libtool current:revision:age
|
||||
# format.
|
||||
|
||||
libpmix_so_version=4:0:2
|
||||
libpmix_so_version=4:1:2
|
||||
libpmi_so_version=1:0:0
|
||||
libpmi2_so_version=1:0:0
|
||||
|
@ -192,7 +192,7 @@
|
||||
|
||||
Summary: An extended/exascale implementation of PMI
|
||||
Name: %{?_name:%{_name}}%{!?_name:pmix}
|
||||
Version: 3.0.0
|
||||
Version: 3.0.1
|
||||
Release: 1%{?dist}
|
||||
License: BSD
|
||||
Group: Development/Libraries
|
||||
|
@ -781,6 +781,7 @@ typedef int pmix_status_t;
|
||||
#define PMIX_LAUNCH_DIRECTIVE (PMIX_ERR_OP_BASE - 24)
|
||||
#define PMIX_LAUNCHER_READY (PMIX_ERR_OP_BASE - 25)
|
||||
#define PMIX_OPERATION_IN_PROGRESS (PMIX_ERR_OP_BASE - 26)
|
||||
#define PMIX_OPERATION_SUCCEEDED (PMIX_ERR_OP_BASE - 27)
|
||||
|
||||
|
||||
/* define a starting point for system error constants so
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2016 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2018 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,4 +18,6 @@
|
||||
#define PMIX_VERSION_MAJOR @pmixmajor@
|
||||
#define PMIX_VERSION_MINOR @pmixminor@
|
||||
#define PMIX_VERSION_RELEASE @pmixrelease@
|
||||
|
||||
#define PMIX_NUMERIC_VERSION 0x00030001
|
||||
#endif
|
||||
|
@ -85,7 +85,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
|
||||
/* unpack any returned data */
|
||||
cnt = 1;
|
||||
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->ninfo, &cnt, PMIX_SIZE);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
if (PMIX_SUCCESS != rc && PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto complete;
|
||||
}
|
||||
@ -332,6 +332,12 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm
|
||||
return PMIX_ERR_INIT;
|
||||
}
|
||||
|
||||
/* sanity check */
|
||||
if (NULL == monitor) {
|
||||
PMIX_RELEASE_THREAD(&pmix_global_lock);
|
||||
return PMIX_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* if we are the server, then we just issue the request and
|
||||
* return the response */
|
||||
if (PMIX_PROC_IS_SERVER(pmix_globals.mypeer) &&
|
||||
@ -355,6 +361,19 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm
|
||||
}
|
||||
PMIX_RELEASE_THREAD(&pmix_global_lock);
|
||||
|
||||
/* if the monitor is PMIX_SEND_HEARTBEAT, then send it */
|
||||
if (0 == strncmp(monitor->key, PMIX_SEND_HEARTBEAT, PMIX_MAX_KEYLEN)) {
|
||||
msg = PMIX_NEW(pmix_buffer_t);
|
||||
if (NULL == msg) {
|
||||
return PMIX_ERR_NOMEM;
|
||||
}
|
||||
PMIX_PTL_SEND_ONEWAY(rc, pmix_client_globals.myserver, msg, PMIX_PTL_TAG_HEARTBEAT);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
PMIX_RELEASE(msg);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are a client, then relay this request to the server */
|
||||
msg = PMIX_NEW(pmix_buffer_t);
|
||||
/* pack the cmd */
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2016-2018 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2018 Research Organization for Information Science
|
||||
@ -3170,7 +3170,7 @@ static pmix_status_t dstore_register_job_info(struct pmix_peer_t *pr,
|
||||
char *msg;
|
||||
pmix_status_t rc;
|
||||
pmix_proc_t proc;
|
||||
pmix_rank_info_t *rinfo;
|
||||
pmix_rank_t rank;
|
||||
|
||||
pmix_output_verbose(2, pmix_gds_base_framework.framework_output,
|
||||
"[%s:%d] gds:dstore:register_job_info for peer [%s:%d]",
|
||||
@ -3187,8 +3187,8 @@ static pmix_status_t dstore_register_job_info(struct pmix_peer_t *pr,
|
||||
return rc;
|
||||
}
|
||||
|
||||
PMIX_LIST_FOREACH(rinfo, &ns->ranks, pmix_rank_info_t) {
|
||||
proc.rank = rinfo->pname.rank;
|
||||
for (rank=0; rank < ns->nprocs; rank++) {
|
||||
proc.rank = rank;
|
||||
rc = _store_job_info(&proc);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -24,6 +24,7 @@ pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t erro
|
||||
{
|
||||
pmix_psensor_active_module_t *mod;
|
||||
pmix_status_t rc;
|
||||
bool didit = false;
|
||||
|
||||
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
|
||||
"%s:%d sensor:base: starting sensors",
|
||||
@ -36,9 +37,17 @@ pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t erro
|
||||
if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
return rc;
|
||||
}
|
||||
didit = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* if none of the components could do it, then report
|
||||
* not supported upwards so the server knows to ask
|
||||
* the host to try */
|
||||
if (!didit) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
@ -46,7 +55,7 @@ pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
|
||||
char *id)
|
||||
{
|
||||
pmix_psensor_active_module_t *mod;
|
||||
pmix_status_t rc;
|
||||
pmix_status_t rc, ret = PMIX_SUCCESS;
|
||||
|
||||
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
|
||||
"%s:%d sensor:base: stopping sensors",
|
||||
@ -57,10 +66,14 @@ pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
|
||||
if (NULL != mod->module->stop) {
|
||||
rc = mod->module->stop(requestor, id);
|
||||
if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
return rc;
|
||||
if (PMIX_SUCCESS == ret) {
|
||||
ret = rc;
|
||||
}
|
||||
/* need to continue to ensure that all
|
||||
* sensors have been stopped */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,7 +30,7 @@
|
||||
#include "src/util/output.h"
|
||||
#include "src/util/show_help.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
#include "src/mca/ptl/ptl.h"
|
||||
#include "src/mca/ptl/base/base.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "psensor_heartbeat.h"
|
||||
@ -168,6 +168,7 @@ static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error
|
||||
{
|
||||
pmix_heartbeat_trkr_t *ft;
|
||||
size_t n;
|
||||
pmix_ptl_posted_recv_t *rcv;
|
||||
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] checking heartbeat monitoring for requestor %s:%d",
|
||||
@ -202,6 +203,17 @@ static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error
|
||||
return PMIX_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* if the recv hasn't been posted, do so now */
|
||||
if (!mca_psensor_heartbeat_component.recv_active) {
|
||||
/* setup to receive heartbeats */
|
||||
rcv = PMIX_NEW(pmix_ptl_posted_recv_t);
|
||||
rcv->tag = PMIX_PTL_TAG_HEARTBEAT;
|
||||
rcv->cbfunc = pmix_psensor_heartbeat_recv_beats;
|
||||
/* add it to the beginning of the list of recvs */
|
||||
pmix_list_prepend(&pmix_ptl_globals.posted_recvs, &rcv->super);
|
||||
mca_psensor_heartbeat_component.recv_active = true;
|
||||
}
|
||||
|
||||
/* need to push into our event base to add this to our trackers */
|
||||
pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
|
||||
EV_WRITE, add_tracker, ft);
|
||||
@ -241,7 +253,7 @@ static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id)
|
||||
cd->requestor = requestor;
|
||||
cd->id = strdup(id);
|
||||
|
||||
/* need to push into our event base to add this to our trackers */
|
||||
/* need to push into our event base to remove this from our trackers */
|
||||
pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
|
||||
EV_WRITE, del_tracker, cd);
|
||||
PMIX_POST_OBJECT(cd);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -28,6 +28,7 @@ BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
pmix_psensor_base_component_t super;
|
||||
bool recv_active;
|
||||
pmix_list_t trackers;
|
||||
} pmix_psensor_heartbeat_component_t;
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -50,14 +50,9 @@ pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = {
|
||||
*/
|
||||
static int heartbeat_open(void)
|
||||
{
|
||||
pmix_status_t rc;
|
||||
|
||||
PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t);
|
||||
|
||||
/* setup to receive heartbeats */
|
||||
PMIX_PTL_RECV(rc, pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT);
|
||||
|
||||
return rc;
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@ -74,12 +69,7 @@ static int heartbeat_query(pmix_mca_base_module_t **module, int *priority)
|
||||
|
||||
static int heartbeat_close(void)
|
||||
{
|
||||
pmix_status_t rc;
|
||||
|
||||
/* cancel our persistent recv */
|
||||
PMIX_PTL_CANCEL(rc, pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT);
|
||||
|
||||
PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers);
|
||||
|
||||
return rc;
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
@ -66,6 +66,7 @@
|
||||
#include "src/mca/bfrops/base/base.h"
|
||||
#include "src/mca/gds/base/base.h"
|
||||
#include "src/mca/preg/preg.h"
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "src/mca/ptl/base/base.h"
|
||||
#include "src/hwloc/hwloc-internal.h"
|
||||
|
||||
@ -397,6 +398,16 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
|
||||
}
|
||||
}
|
||||
|
||||
/* open the psensor framework */
|
||||
if (PMIX_SUCCESS != (rc = pmix_mca_base_framework_open(&pmix_psensor_base_framework, 0))) {
|
||||
PMIX_RELEASE_THREAD(&pmix_global_lock);
|
||||
return rc;
|
||||
}
|
||||
if (PMIX_SUCCESS != (rc = pmix_psensor_base_select())) {
|
||||
PMIX_RELEASE_THREAD(&pmix_global_lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* setup the wildcard recv for inbound messages from clients */
|
||||
req = PMIX_NEW(pmix_ptl_posted_recv_t);
|
||||
req->tag = UINT32_MAX;
|
||||
@ -511,6 +522,8 @@ PMIX_EXPORT pmix_status_t PMIx_server_finalize(void)
|
||||
if (NULL != pmix_server_globals.tmpdir) {
|
||||
free(pmix_server_globals.tmpdir);
|
||||
}
|
||||
/* close the psensor framework */
|
||||
(void)pmix_mca_base_framework_close(&pmix_psensor_base_framework);
|
||||
/* close the pnet framework */
|
||||
(void)pmix_mca_base_framework_close(&pmix_pnet_base_framework);
|
||||
|
||||
@ -3228,6 +3241,9 @@ void pmix_server_message_handler(struct pmix_peer_t *pr,
|
||||
PMIX_ERROR_LOG(PMIX_ERR_NOMEM);
|
||||
return;
|
||||
}
|
||||
if (PMIX_OPERATION_SUCCEEDED == ret) {
|
||||
ret = PMIX_SUCCESS;
|
||||
}
|
||||
PMIX_BFROPS_PACK(rc, pr, reply, &ret, 1, PMIX_STATUS);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
|
@ -52,6 +52,7 @@
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/mca/bfrops/bfrops.h"
|
||||
#include "src/mca/plog/plog.h"
|
||||
#include "src/mca/psensor/psensor.h"
|
||||
#include "src/util/argv.h"
|
||||
#include "src/util/error.h"
|
||||
#include "src/util/output.h"
|
||||
@ -2536,10 +2537,8 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
|
||||
PMIX_LIST_DESTRUCT(&cachefiles);
|
||||
if (cnt == (int)cd->ninfo) {
|
||||
/* nothing more to do */
|
||||
if (NULL != cbfunc) {
|
||||
cbfunc(PMIX_SUCCESS, NULL, 0, cd, NULL, NULL);
|
||||
}
|
||||
return PMIX_SUCCESS;
|
||||
rc = PMIX_SUCCESS;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2575,9 +2574,6 @@ pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
|
||||
pmix_output_verbose(2, pmix_server_globals.base_output,
|
||||
"recvd monitor request from client");
|
||||
|
||||
if (NULL == pmix_host_server.monitor) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
cd = PMIX_NEW(pmix_query_caddy_t);
|
||||
if (NULL == cd) {
|
||||
@ -2620,6 +2616,24 @@ pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
|
||||
}
|
||||
}
|
||||
|
||||
/* see if they are requesting one of the monitoring
|
||||
* methods we internally support */
|
||||
rc = pmix_psensor.start(peer, error, &monitor, cd->info, cd->ninfo);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
rc = PMIX_OPERATION_SUCCEEDED;
|
||||
goto exit;
|
||||
}
|
||||
if (PMIX_ERR_NOT_SUPPORTED != rc) {
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* if we don't internally support it, see if
|
||||
* our host does */
|
||||
if (NULL == pmix_host_server.monitor) {
|
||||
rc = PMIX_ERR_NOT_SUPPORTED;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* setup the requesting peer name */
|
||||
(void)strncpy(proc.nspace, peer->info->pname.nspace, PMIX_MAX_NSLEN);
|
||||
proc.rank = peer->info->pname.rank;
|
||||
|
@ -25,7 +25,7 @@ headers = simptest.h
|
||||
|
||||
noinst_PROGRAMS = simptest simpclient simppub simpdyn simpft simpdmodex \
|
||||
test_pmix simptool simpdie simplegacy simptimeout \
|
||||
gwtest gwclient stability quietclient
|
||||
gwtest gwclient stability quietclient simpjctrl
|
||||
|
||||
simptest_SOURCES = $(headers) \
|
||||
simptest.c
|
||||
@ -116,3 +116,9 @@ quietclient_SOURCES = $(headers) \
|
||||
quietclient_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS)
|
||||
quietclient_LDADD = \
|
||||
$(top_builddir)/src/libpmix.la
|
||||
|
||||
simpjctrl_SOURCES = \
|
||||
simpjctrl.c
|
||||
simpjctrl_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS)
|
||||
simpjctrl_LDADD = \
|
||||
$(top_builddir)/src/libpmix.la
|
||||
|
231
opal/mca/pmix/pmix3x/pmix/test/simple/simpjctrl.c
Обычный файл
231
opal/mca/pmix/pmix3x/pmix/test/simple/simpjctrl.c
Обычный файл
@ -0,0 +1,231 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include <pmix.h>
|
||||
#include "simptest.h"
|
||||
|
||||
static pmix_proc_t myproc;
|
||||
|
||||
/* this is the event notification function we pass down below
|
||||
* when registering for general events - i.e., the default
|
||||
* handler. We don't technically need to register one, but it
|
||||
* is usually good practice to catch any events that occur */
|
||||
static void notification_fn(size_t evhdlr_registration_id,
|
||||
pmix_status_t status,
|
||||
const pmix_proc_t *source,
|
||||
pmix_info_t info[], size_t ninfo,
|
||||
pmix_info_t results[], size_t nresults,
|
||||
pmix_event_notification_cbfunc_fn_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
if (NULL != cbfunc) {
|
||||
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
|
||||
}
|
||||
}
|
||||
|
||||
/* event handler registration is done asynchronously because it
|
||||
* may involve the PMIx server registering with the host RM for
|
||||
* external events. So we provide a callback function that returns
|
||||
* the status of the request (success or an error), plus a numerical index
|
||||
* to the registered event. The index is used later on to deregister
|
||||
* an event handler - if we don't explicitly deregister it, then the
|
||||
* PMIx server will do so when it sees us exit */
|
||||
/* Completion callback for event-handler registration.
 *
 * status        - result of the registration request
 * evhandler_ref - reference index assigned to the handler (only printed
 *                 here, on failure)
 * cbdata        - the mylock_t the registering thread is waiting on
 *
 * A failure is reported on stderr; in every case the status is stored in
 * the lock and the waiting thread is woken. */
static void evhandler_reg_callbk(pmix_status_t status,
                                 size_t evhandler_ref,
                                 void *cbdata)
{
    mylock_t *waiter = (mylock_t*)cbdata;
    const bool registration_failed = (PMIX_SUCCESS != status);

    if (registration_failed) {
        fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
                myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref);
    }

    /* hand the result to the waiting thread before releasing it */
    waiter->status = status;
    DEBUG_WAKEUP_THREAD(waiter);
}
|
||||
|
||||
static void infocbfunc(pmix_status_t status,
|
||||
pmix_info_t *info, size_t ninfo,
|
||||
void *cbdata,
|
||||
pmix_release_cbfunc_t release_fn,
|
||||
void *release_cbdata)
|
||||
{
|
||||
mylock_t *lk = (mylock_t*)cbdata;
|
||||
|
||||
fprintf(stderr, "Callback recvd with status %d\n", status);
|
||||
|
||||
/* release the caller */
|
||||
if (NULL != release_fn) {
|
||||
release_fn(release_cbdata);
|
||||
}
|
||||
|
||||
lk->status = status;
|
||||
DEBUG_WAKEUP_THREAD(lk);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int rc;
|
||||
pmix_value_t value;
|
||||
pmix_value_t *val = &value;
|
||||
pmix_proc_t proc;
|
||||
uint32_t nprocs, n;
|
||||
pmix_info_t *info, *iptr;
|
||||
bool flag;
|
||||
mylock_t mylock;
|
||||
pmix_data_array_t *dptr;
|
||||
|
||||
/* init us - note that the call to "init" includes the return of
|
||||
* any job-related info provided by the RM. */
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);
|
||||
|
||||
|
||||
/* register our default event handler - again, this isn't strictly
|
||||
* required, but is generally good practice */
|
||||
DEBUG_CONSTRUCT_LOCK(&mylock);
|
||||
PMIx_Register_event_handler(NULL, 0, NULL, 0,
|
||||
notification_fn, evhandler_reg_callbk, (void*)&mylock);
|
||||
DEBUG_WAIT_THREAD(&mylock);
|
||||
if (0 != mylock.status) {
|
||||
fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
|
||||
exit(mylock.status);
|
||||
}
|
||||
DEBUG_DESTRUCT_LOCK(&mylock);
|
||||
|
||||
/* job-related info is found in our nspace, assigned to the
|
||||
* wildcard rank as it doesn't relate to a specific rank. Setup
|
||||
* a name to retrieve such values */
|
||||
PMIX_PROC_CONSTRUCT(&proc);
|
||||
(void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
|
||||
proc.rank = PMIX_RANK_WILDCARD;
|
||||
|
||||
/* get our universe size */
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
goto done;
|
||||
}
|
||||
nprocs = val->data.uint32;
|
||||
PMIX_VALUE_RELEASE(val);
|
||||
fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs);
|
||||
|
||||
/* inform the RM that we are preemptible, and that our checkpoint methods are
|
||||
* "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */
|
||||
PMIX_INFO_CREATE(info, 2);
|
||||
flag = true;
|
||||
PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL);
|
||||
/* can't use "load" to load a pmix_data_array_t */
|
||||
(void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN);
|
||||
info[1].value.type = PMIX_DATA_ARRAY;
|
||||
dptr = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t));
|
||||
info[1].value.data.darray = dptr;
|
||||
dptr->type = PMIX_INFO;
|
||||
dptr->size = 2;
|
||||
PMIX_INFO_CREATE(dptr->array, dptr->size);
|
||||
rc = SIGUSR2;
|
||||
iptr = (pmix_info_t*)dptr->array;
|
||||
PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT);
|
||||
rc = PMIX_JCTRL_CHECKPOINT;
|
||||
PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS);
|
||||
|
||||
/* since this is informational and not a requested operation, the target parameter
|
||||
* doesn't mean anything and can be ignored */
|
||||
DEBUG_CONSTRUCT_LOCK(&mylock);
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&mylock))) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
goto done;
|
||||
}
|
||||
DEBUG_WAIT_THREAD(&mylock);
|
||||
PMIX_INFO_FREE(info, 2);
|
||||
if (0 != mylock.status) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, mylock.status);
|
||||
exit(mylock.status);
|
||||
}
|
||||
DEBUG_DESTRUCT_LOCK(&mylock);
|
||||
|
||||
/* now request that this process be monitored using heartbeats */
|
||||
PMIX_INFO_CREATE(iptr, 1);
|
||||
PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER);
|
||||
|
||||
PMIX_INFO_CREATE(info, 3);
|
||||
PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING);
|
||||
n = 5; // require a heartbeat every 5 seconds
|
||||
PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32);
|
||||
n = 2; // two heartbeats can be missed before declaring us "stalled"
|
||||
PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32);
|
||||
|
||||
/* make the request */
|
||||
DEBUG_CONSTRUCT_LOCK(&mylock);
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT,
|
||||
info, 3, infocbfunc, (void*)&mylock))) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
goto done;
|
||||
}
|
||||
DEBUG_WAIT_THREAD(&mylock);
|
||||
PMIX_INFO_FREE(iptr, 1);
|
||||
PMIX_INFO_FREE(info, 3);
|
||||
if (0 != mylock.status) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, mylock.status);
|
||||
exit(mylock.status);
|
||||
}
|
||||
DEBUG_DESTRUCT_LOCK(&mylock);
|
||||
|
||||
/* send a heartbeat */
|
||||
PMIx_Heartbeat();
|
||||
|
||||
/* call fence to synchronize with our peers - no need to
|
||||
* collect any info as we didn't "put" anything */
|
||||
PMIX_INFO_CREATE(info, 1);
|
||||
flag = false;
|
||||
PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) {
|
||||
fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
goto done;
|
||||
}
|
||||
PMIX_INFO_FREE(info, 1);
|
||||
|
||||
|
||||
done:
|
||||
/* finalize us */
|
||||
fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
|
||||
if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
|
||||
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
|
||||
} else {
|
||||
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
|
||||
}
|
||||
fflush(stderr);
|
||||
return(0);
|
||||
}
|
@ -107,6 +107,18 @@ static void log_fn(const pmix_proc_t *client,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t alloc_fn(const pmix_proc_t *client,
|
||||
pmix_alloc_directive_t directive,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t jctrl_fn(const pmix_proc_t *requestor,
|
||||
const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t mon_fn(const pmix_proc_t *requestor,
|
||||
const pmix_info_t *monitor, pmix_status_t error,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
static pmix_server_module_t mymodule = {
|
||||
.client_connected = connected,
|
||||
@ -125,7 +137,10 @@ static pmix_server_module_t mymodule = {
|
||||
.notify_event = notify_event,
|
||||
.query = query_fn,
|
||||
.tool_connected = tool_connect_fn,
|
||||
.log = log_fn
|
||||
.log = log_fn,
|
||||
.allocate = alloc_fn,
|
||||
.job_control = jctrl_fn,
|
||||
.monitor = mon_fn
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@ -1073,6 +1088,31 @@ static void log_fn(const pmix_proc_t *client,
|
||||
}
|
||||
}
|
||||
|
||||
static pmix_status_t alloc_fn(const pmix_proc_t *client,
|
||||
pmix_alloc_directive_t directive,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
static pmix_status_t jctrl_fn(const pmix_proc_t *requestor,
|
||||
const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
return PMIX_OPERATION_SUCCEEDED;
|
||||
}
|
||||
|
||||
static pmix_status_t mon_fn(const pmix_proc_t *requestor,
|
||||
const pmix_info_t *monitor, pmix_status_t error,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
static void wait_signal_callback(int fd, short event, void *arg)
|
||||
{
|
||||
pmix_event_t *sig = (pmix_event_t*) arg;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user