6509f60929
Samples are taken after MPI_Init, and then again after MPI_Barrier. This allows the user to see memory consumption caused by add_procs, as well as any modex contribution from forming connections if pmix_base_async_modex is given. Using the probe simply involves executing it via mpirun, with however many copies you want per node. Example: $ mpirun -npernode 2 ./mpi_memprobe Sampling memory usage after MPI_Init Data for node rhc001 Daemon: 12.483398 Client: 6.514648 Data for node rhc002 Daemon: 11.865234 Client: 4.643555 Sampling memory usage after MPI_Barrier Data for node rhc001 Daemon: 12.520508 Client: 6.576660 Data for node rhc002 Daemon: 11.879883 Client: 4.703125 Note that the client value on node rhc001 is larger - this is where rank=0 is housed, and apparently it gets a larger footprint for some reason. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
246 строки
7.2 KiB
C
246 строки
7.2 KiB
C
/* -*- C -*-
|
|
*
|
|
* $HEADER$
|
|
*
|
|
* The most basic of MPI applications
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include <stdio.h>
|
|
#include "mpi.h"
|
|
#include "opal/mca/pmix/pmix.h"
|
|
#include "orte/runtime/runtime.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
static int rank, size;
|
|
static volatile bool wait_for_release = true;
|
|
#define MEMPROBE_RELEASE 12345
|
|
|
|
static void _release_fn(int status,
|
|
const opal_process_name_t *source,
|
|
opal_list_t *info, opal_list_t *results,
|
|
opal_pmix_notification_complete_fn_t cbfunc,
|
|
void *cbdata)
|
|
{
|
|
/* must let the notifier know we are done */
|
|
if (NULL != cbfunc) {
|
|
cbfunc(OPAL_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
|
|
}
|
|
/* flag that the debugger is complete so we can exit */
|
|
wait_for_release = false;
|
|
}
|
|
|
|
static void _register_fn(int status,
|
|
size_t evhandler_ref,
|
|
void *cbdata)
|
|
{
|
|
volatile int *active = (volatile int*)cbdata;
|
|
|
|
if (0 != status) {
|
|
fprintf(stderr, "Client EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
|
|
status, (unsigned long)evhandler_ref);
|
|
}
|
|
*active = status;
|
|
}
|
|
|
|
static void qcbfunc(int status,
|
|
opal_list_t *info,
|
|
void *cbdata,
|
|
opal_pmix_release_cbfunc_t release_fn,
|
|
void *release_cbdata)
|
|
{
|
|
opal_list_t *results = (opal_list_t*)cbdata;
|
|
opal_value_t *kv;
|
|
|
|
if (NULL != info) {
|
|
while (NULL != (kv = (opal_value_t*)opal_list_remove_first(info))) {
|
|
opal_list_append(results, &kv->super);
|
|
}
|
|
}
|
|
if (NULL != release_fn) {
|
|
release_fn(release_cbdata);
|
|
}
|
|
wait_for_release = false;
|
|
}
|
|
|
|
static void notifycbfunc(int status, void *cbdata)
|
|
{
|
|
volatile int *active = (volatile int*)cbdata;
|
|
*active = status;
|
|
}
|
|
|
|
static void sample(void)
|
|
{
|
|
opal_value_t *kv, *ival;
|
|
opal_pmix_query_t *q;
|
|
opal_list_t query, response, *lt;
|
|
volatile int active;
|
|
char **answer = NULL, *tmp, *msg;
|
|
|
|
OBJ_CONSTRUCT(&query, opal_list_t);
|
|
OBJ_CONSTRUCT(&response, opal_list_t);
|
|
q = OBJ_NEW(opal_pmix_query_t);
|
|
opal_list_append(&query, &q->super);
|
|
opal_argv_append_nosize(&q->keys, OPAL_PMIX_QUERY_MEMORY_USAGE);
|
|
/* qualify that we just want local avg, min/max values reported */
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_QUERY_LOCAL_ONLY);
|
|
kv->type = OPAL_BOOL;
|
|
kv->data.flag = true;
|
|
opal_list_append(&q->qualifiers, &kv->super);
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_AVG);
|
|
kv->type = OPAL_BOOL;
|
|
kv->data.flag = true;
|
|
opal_list_append(&q->qualifiers, &kv->super);
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_MINMAX);
|
|
kv->type = OPAL_BOOL;
|
|
kv->data.flag = true;
|
|
opal_list_append(&q->qualifiers, &kv->super);
|
|
/* issue the request */
|
|
wait_for_release = true;
|
|
opal_pmix.query(&query, qcbfunc, (void*)&response);
|
|
/* wait for the query to complete */
|
|
while (wait_for_release) {
|
|
usleep(10);
|
|
}
|
|
wait_for_release = true;
|
|
/* log my own results as a single string so the output
|
|
* doesn't get garbled on the other end */
|
|
asprintf(&tmp, "Data for node %s", orte_process_info.nodename);
|
|
opal_argv_append_nosize(&answer, tmp);
|
|
free(tmp);
|
|
OPAL_LIST_FOREACH(kv, &response, opal_value_t) {
|
|
lt = (opal_list_t*)kv->data.ptr;
|
|
OPAL_LIST_FOREACH(ival, lt, opal_value_t) {
|
|
if (0 == strcmp(ival->key, OPAL_PMIX_DAEMON_MEMORY)) {
|
|
asprintf(&tmp, "\tDaemon: %f", ival->data.fval);
|
|
opal_argv_append_nosize(&answer, tmp);
|
|
free(tmp);
|
|
} else if (0 == strcmp(ival->key, OPAL_PMIX_CLIENT_AVG_MEMORY)) {
|
|
asprintf(&tmp, "\tClient: %f", ival->data.fval);
|
|
opal_argv_append_nosize(&answer, tmp);
|
|
free(tmp);
|
|
} else {
|
|
fprintf(stderr, "\tUnknown key: %s", ival->key);
|
|
}
|
|
}
|
|
}
|
|
opal_argv_append_nosize(&answer, "\n");
|
|
OPAL_LIST_DESTRUCT(&response);
|
|
|
|
/* construct the log output */
|
|
OBJ_CONSTRUCT(&response, opal_list_t);
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_LOG_STDOUT);
|
|
kv->type = OPAL_STRING;
|
|
kv->data.string = opal_argv_join(answer, '\n');
|
|
opal_list_append(&response, &kv->super);
|
|
opal_argv_free(answer);
|
|
active = -1;
|
|
opal_pmix.log(&response, notifycbfunc, (void*)&active);
|
|
while (-1 == active) {
|
|
usleep(10);
|
|
}
|
|
OPAL_LIST_DESTRUCT(&response);
|
|
|
|
|
|
if (0 == rank) {
|
|
/* send the notification to release the other procs */
|
|
wait_for_release = true;
|
|
OBJ_CONSTRUCT(&response, opal_list_t);
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
|
|
kv->type = OPAL_BOOL;
|
|
kv->data.flag = true;
|
|
opal_list_append(&response, &kv->super);
|
|
active = -1;
|
|
if (OPAL_SUCCESS != opal_pmix.notify_event(MEMPROBE_RELEASE, NULL,
|
|
OPAL_PMIX_RANGE_GLOBAL, &response,
|
|
notifycbfunc, (void*)&active)) {
|
|
fprintf(stderr, "Notify event failed\n");
|
|
exit(1);
|
|
}
|
|
while (-1 == active) {
|
|
usleep(10);
|
|
}
|
|
OPAL_LIST_DESTRUCT(&response);
|
|
}
|
|
|
|
/* now wait for notification */
|
|
while (wait_for_release) {
|
|
usleep(10);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
opal_list_t *codes;
|
|
opal_value_t *kv;
|
|
volatile int active;
|
|
|
|
MPI_Init(&argc, &argv);
|
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
|
|
|
if (0 == rank) {
|
|
fprintf(stderr, "Sampling memory usage after MPI_Init\n");
|
|
}
|
|
|
|
/* everyone registers their event handler */
|
|
codes = OBJ_NEW(opal_list_t);
|
|
kv = OBJ_NEW(opal_value_t);
|
|
kv->key = strdup("errorcode");
|
|
kv->type = OPAL_INT;
|
|
kv->data.integer = MEMPROBE_RELEASE;
|
|
opal_list_append(codes, &kv->super);
|
|
|
|
active = -1;
|
|
opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, (void*)&active);
|
|
while (-1 == active) {
|
|
usleep(10);
|
|
}
|
|
|
|
/* if I am the local leader (i.e., local_rank=0), then I ask
|
|
* my daemon to report the local memory usage, and send it
|
|
* to rank=0 */
|
|
if (0 == orte_process_info.my_local_rank) {
|
|
sample();
|
|
} else {
|
|
/* now wait for notification */
|
|
while (wait_for_release) {
|
|
usleep(10);
|
|
}
|
|
}
|
|
wait_for_release = true;
|
|
|
|
/* perform a barrier so some communication will occur, thus
|
|
* requiring exchange of endpoint info */
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
if (0 == rank) {
|
|
fprintf(stderr, "\n\nSampling memory usage after MPI_Barrier\n");
|
|
}
|
|
|
|
if (0 == orte_process_info.my_local_rank) {
|
|
if (0 != rank) {
|
|
/* wait a little */
|
|
usleep(1000);
|
|
}
|
|
sample();
|
|
} else {
|
|
/* wait again while memory is sampled */
|
|
while (wait_for_release) {
|
|
usleep(10);
|
|
}
|
|
}
|
|
|
|
MPI_Finalize();
|
|
return 0;
|
|
}
|