1
1
Ralph Castain c696e04c5e Since PMIx is moving to release v3.0, embed the new release candidate in opal/pmix framework. Move the pmix2x code over to the ext2x component. Create a new ext3x component
Remove some build product. Tell PMIx that we don't need a new nspace generated when OMPI calls connect
Add missing Makefile

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-10-09 13:51:08 -07:00

230 строки
8.8 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <signal.h>
#include <pmix.h>
static pmix_proc_t myproc;
/* Default event notification handler, passed to
 * PMIx_Register_event_handler() in main() when registering for general
 * events. Registering one is not technically required, but it is
 * usually good practice to catch any events that occur.
 *
 * The handler does not inspect the event itself (registration id,
 * status, source, info and results are all ignored). Its only action
 * is to invoke the supplied completion callback, when one is given,
 * with PMIX_EVENT_ACTION_COMPLETE, no results, and the caller's cbdata. */
static void notification_fn(size_t evhdlr_registration_id,
                            pmix_status_t status,
                            const pmix_proc_t *source,
                            pmix_info_t info[], size_t ninfo,
                            pmix_info_t results[], size_t nresults,
                            pmix_event_notification_cbfunc_fn_t cbfunc,
                            void *cbdata)
{
    /* nothing to report back to if no completion callback was given */
    if (NULL == cbfunc) {
        return;
    }

    cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
}
/* Completion callback for event handler registration.
 *
 * Registration is done asynchronously because it may involve the PMIx
 * server registering with the host RM for external events. This
 * callback therefore receives the status of the request (success or an
 * error) plus a numerical index for the registered handler. The index
 * is what would be used later on to deregister the handler - if we
 * don't explicitly deregister it, then the PMIx server will do so when
 * it sees us exit.
 *
 * status        - result of the registration request
 * evhandler_ref - index assigned to the registered handler
 * cbdata        - pointer to the caller's "volatile int" completion
 *                 flag; the status is stored there so a waiting caller
 *                 can observe completion */
static void evhandler_reg_callbk(pmix_status_t status,
                                 size_t evhandler_ref,
                                 void *cbdata)
{
    volatile int *completion = (volatile int*)cbdata;

    if (PMIX_SUCCESS == status) {
        /* publish the result; nothing to report */
        *completion = status;
        return;
    }

    fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
            myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref);
    *completion = status;
}
/* Completion callback shared by the non-blocking job-control and
 * process-monitor requests issued from main().
 *
 * status         - result of the request
 * info, ninfo    - data returned with the result (not examined here)
 * cbdata         - pointer to the caller's "volatile int" completion
 *                  flag, which receives the status
 * release_fn     - optional function to call once we are done with the
 *                  returned data
 * release_cbdata - argument handed back to release_fn */
static void infocbfunc(pmix_status_t status,
                       pmix_info_t *info, size_t ninfo,
                       void *cbdata,
                       pmix_release_cbfunc_t release_fn,
                       void *release_cbdata)
{
    /* we keep nothing from the returned data, so hand it straight back
     * before publishing the status */
    if (NULL != release_fn) {
        release_fn(release_cbdata);
    }

    *(volatile int*)cbdata = status;
}
int main(int argc, char **argv)
{
int rc;
pmix_value_t value;
pmix_value_t *val = &value;
pmix_proc_t proc;
uint32_t nprocs, n;
pmix_info_t *info, *iptr;
bool flag;
volatile int active;
pmix_data_array_t *dptr;
/* init us - note that the call to "init" includes the return of
* any job-related info provided by the RM. */
if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
exit(0);
}
fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);
/* register our default event handler - again, this isn't strictly
* required, but is generally good practice */
active = -1;
PMIx_Register_event_handler(NULL, 0, NULL, 0,
notification_fn, evhandler_reg_callbk, (void*)&active);
while (-1 == active) {
sleep(1);
}
if (0 != active) {
fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
exit(active);
}
/* job-related info is found in our nspace, assigned to the
* wildcard rank as it doesn't relate to a specific rank. Setup
* a name to retrieve such values */
PMIX_PROC_CONSTRUCT(&proc);
(void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
proc.rank = PMIX_RANK_WILDCARD;
/* get our universe size */
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
nprocs = val->data.uint32;
PMIX_VALUE_RELEASE(val);
fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs);
/* inform the RM that we are preemptible, and that our checkpoint methods are
* "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */
PMIX_INFO_CREATE(info, 2);
flag = true;
PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL);
/* can't use "load" to load a pmix_data_array_t */
(void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN);
info[1].value.type = PMIX_DATA_ARRAY;
dptr = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t));
info[1].value.data.darray = dptr;
dptr->type = PMIX_INFO;
dptr->size = 2;
PMIX_INFO_CREATE(dptr->array, dptr->size);
rc = SIGUSR2;
iptr = (pmix_info_t*)dptr->array;
PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT);
rc = PMIX_JCTRL_CHECKPOINT;
PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS);
/* since this is informational and not a requested operation, the target parameter
* doesn't mean anything and can be ignored */
active = -1;
if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&active))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
while (-1 == active) {
sleep(1);
}
PMIX_INFO_FREE(info, 2);
if (0 != active) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
exit(active);
}
/* now request that this process be monitored using heartbeats */
PMIX_INFO_CREATE(iptr, 1);
PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER);
PMIX_INFO_CREATE(info, 3);
PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING);
n = 5; // require a heartbeat every 5 seconds
PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32);
n = 2; // two heartbeats can be missed before declaring us "stalled"
PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32);
/* make the request */
active = -1;
if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT,
info, 3, infocbfunc, (void*)&active))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
while (-1 == active) {
sleep(1);
}
PMIX_INFO_FREE(iptr, 1);
PMIX_INFO_FREE(info, 3);
if (0 != active) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
exit(active);
}
/* send a heartbeat */
PMIx_Heartbeat();
/* call fence to synchronize with our peers - no need to
* collect any info as we didn't "put" anything */
PMIX_INFO_CREATE(info, 1);
flag = false;
PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
goto done;
}
PMIX_INFO_FREE(info, 1);
done:
/* finalize us */
fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
} else {
fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
}
fflush(stderr);
return(0);
}