openmpi/contrib/scaling/mpi_memprobe.c
Ralph Castain 6509f60929

Complete the memprobe support. This provides a new scaling tool called "mpi_memprobe" that samples the memory footprint of the local daemon and the client procs, and then reports the results. The output contains the footprint of the daemon on each node, plus the average footprint of the client procs on that node.
Samples are taken after MPI_Init, and then again after MPI_Barrier. This allows the user to see memory consumption caused by add_procs, as well as any modex contribution from forming connections if pmix_base_async_modex is given.

Using the probe simply involves executing it via mpirun, with however many copies you want per node. Example:

$ mpirun -npernode 2 ./mpi_memprobe
Sampling memory usage after MPI_Init
Data for node rhc001
	Daemon: 12.483398
	Client: 6.514648

Data for node rhc002
	Daemon: 11.865234
	Client: 4.643555

Sampling memory usage after MPI_Barrier
Data for node rhc001
	Daemon: 12.520508
	Client: 6.576660

Data for node rhc002
	Daemon: 11.879883
	Client: 4.703125

Note that the client value on node rhc001 is larger: this is where rank=0 resides, and it apparently picks up a larger footprint, for reasons not yet understood.
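To capture the modex contribution in the second sample, enable the async modex when launching. A hypothetical invocation (the parameter name comes from the description above; the exact MCA syntax may differ across Open MPI versions):

$ mpirun -npernode 2 --mca pmix_base_async_modex 1 ./mpi_memprobe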

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-01-05 10:32:17 -08:00

246 lines
7.2 KiB
C

/* -*- C -*-
 *
 * $HEADER$
 *
 * The most basic of MPI applications
 */
#include "orte_config.h"
#include <stdio.h>
#include "mpi.h"
#include "opal/mca/pmix/pmix.h"
#include "orte/runtime/runtime.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
static int rank, size;
static volatile bool wait_for_release = true;

#define MEMPROBE_RELEASE 12345
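/*
 * Release protocol: every process registers a handler for the custom
 * MEMPROBE_RELEASE event code. After the local leaders have sampled and
 * logged their data, rank 0 broadcasts a MEMPROBE_RELEASE notification;
 * _release_fn then clears wait_for_release, letting each process fall
 * out of its spin loop.
 */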
static void _release_fn(int status,
                        const opal_process_name_t *source,
                        opal_list_t *info, opal_list_t *results,
                        opal_pmix_notification_complete_fn_t cbfunc,
                        void *cbdata)
{
    /* must let the notifier know we are done */
    if (NULL != cbfunc) {
        cbfunc(OPAL_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
    }
    /* flag that the release notification arrived so we can exit */
    wait_for_release = false;
}
static void _register_fn(int status,
                         size_t evhandler_ref,
                         void *cbdata)
{
    volatile int *active = (volatile int*)cbdata;

    if (0 != status) {
        fprintf(stderr, "Client EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
                status, (unsigned long)evhandler_ref);
    }
    *active = status;
}
/* callback for the memory-usage query: transfer the returned
 * values to the caller's list, then release the spin loop */
static void qcbfunc(int status,
                    opal_list_t *info,
                    void *cbdata,
                    opal_pmix_release_cbfunc_t release_fn,
                    void *release_cbdata)
{
    opal_list_t *results = (opal_list_t*)cbdata;
    opal_value_t *kv;

    if (NULL != info) {
        while (NULL != (kv = (opal_value_t*)opal_list_remove_first(info))) {
            opal_list_append(results, &kv->super);
        }
    }
    if (NULL != release_fn) {
        release_fn(release_cbdata);
    }
    wait_for_release = false;
}
/* generic completion callback: report the status back to the waiting caller */
static void notifycbfunc(int status, void *cbdata)
{
    volatile int *active = (volatile int*)cbdata;
    *active = status;
}
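/*
 * sample() runs on the local leader of each node: it queries the local
 * daemon for memory usage (node-local average and min/max), pushes the
 * result up the PMIx log channel as a single string so output from
 * different nodes isn't interleaved, and, on rank 0 only, fires the
 * MEMPROBE_RELEASE notification that frees the waiting procs.
 */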
static void sample(void)
{
    opal_value_t *kv, *ival;
    opal_pmix_query_t *q;
    opal_list_t query, response, *lt;
    volatile int active;
    char **answer = NULL, *tmp;

    OBJ_CONSTRUCT(&query, opal_list_t);
    OBJ_CONSTRUCT(&response, opal_list_t);
    q = OBJ_NEW(opal_pmix_query_t);
    opal_list_append(&query, &q->super);
    opal_argv_append_nosize(&q->keys, OPAL_PMIX_QUERY_MEMORY_USAGE);

    /* qualify that we just want local avg and min/max values reported */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_QUERY_LOCAL_ONLY);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(&q->qualifiers, &kv->super);

    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_QUERY_REPORT_AVG);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(&q->qualifiers, &kv->super);

    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_QUERY_REPORT_MINMAX);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(&q->qualifiers, &kv->super);

    /* issue the request */
    wait_for_release = true;
    opal_pmix.query(&query, qcbfunc, (void*)&response);

    /* wait for the query to complete */
    while (wait_for_release) {
        usleep(10);
    }
    wait_for_release = true;

    /* log my own results as a single string so the output
     * doesn't get garbled on the other end */
    asprintf(&tmp, "Data for node %s", orte_process_info.nodename);
    opal_argv_append_nosize(&answer, tmp);
    free(tmp);
    OPAL_LIST_FOREACH(kv, &response, opal_value_t) {
        lt = (opal_list_t*)kv->data.ptr;
        OPAL_LIST_FOREACH(ival, lt, opal_value_t) {
            if (0 == strcmp(ival->key, OPAL_PMIX_DAEMON_MEMORY)) {
                asprintf(&tmp, "\tDaemon: %f", ival->data.fval);
                opal_argv_append_nosize(&answer, tmp);
                free(tmp);
            } else if (0 == strcmp(ival->key, OPAL_PMIX_CLIENT_AVG_MEMORY)) {
                asprintf(&tmp, "\tClient: %f", ival->data.fval);
                opal_argv_append_nosize(&answer, tmp);
                free(tmp);
            } else {
                fprintf(stderr, "\tUnknown key: %s\n", ival->key);
            }
        }
    }
    opal_argv_append_nosize(&answer, "\n");
    OPAL_LIST_DESTRUCT(&response);

    /* construct the log output */
    OBJ_CONSTRUCT(&response, opal_list_t);
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOG_STDOUT);
    kv->type = OPAL_STRING;
    kv->data.string = opal_argv_join(answer, '\n');
    opal_list_append(&response, &kv->super);
    opal_argv_free(answer);

    active = -1;
    opal_pmix.log(&response, notifycbfunc, (void*)&active);
    while (-1 == active) {
        usleep(10);
    }
    OPAL_LIST_DESTRUCT(&response);

    if (0 == rank) {
        /* send the notification to release the other procs */
        wait_for_release = true;
        OBJ_CONSTRUCT(&response, opal_list_t);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
        kv->type = OPAL_BOOL;
        kv->data.flag = true;
        opal_list_append(&response, &kv->super);
        active = -1;
        if (OPAL_SUCCESS != opal_pmix.notify_event(MEMPROBE_RELEASE, NULL,
                                                   OPAL_PMIX_RANGE_GLOBAL, &response,
                                                   notifycbfunc, (void*)&active)) {
            fprintf(stderr, "Notify event failed\n");
            exit(1);
        }
        while (-1 == active) {
            usleep(10);
        }
        OPAL_LIST_DESTRUCT(&response);
    }

    /* now wait for the release notification */
    while (wait_for_release) {
        usleep(10);
    }
}
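/*
 * main() flow: register the release handler, sample after MPI_Init
 * (captures the add_procs footprint), barrier to force exchange of
 * endpoint info, then sample again (captures the connection/modex
 * contribution).
 */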
int main(int argc, char* argv[])
{
    opal_list_t *codes;
    opal_value_t *kv;
    volatile int active;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (0 == rank) {
        fprintf(stderr, "Sampling memory usage after MPI_Init\n");
    }

    /* everyone registers their event handler */
    codes = OBJ_NEW(opal_list_t);
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup("errorcode");
    kv->type = OPAL_INT;
    kv->data.integer = MEMPROBE_RELEASE;
    opal_list_append(codes, &kv->super);

    active = -1;
    opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, (void*)&active);
    while (-1 == active) {
        usleep(10);
    }

    /* if I am the local leader (i.e., local_rank=0), then I ask
     * my daemon to report the local memory usage, and send it
     * to rank=0 */
    if (0 == orte_process_info.my_local_rank) {
        sample();
    } else {
        /* now wait for notification */
        while (wait_for_release) {
            usleep(10);
        }
    }
    wait_for_release = true;

    /* perform a barrier so some communication will occur, thus
     * requiring exchange of endpoint info */
    MPI_Barrier(MPI_COMM_WORLD);

    if (0 == rank) {
        fprintf(stderr, "\n\nSampling memory usage after MPI_Barrier\n");
    }
    if (0 == orte_process_info.my_local_rank) {
        if (0 != rank) {
            /* wait a little */
            usleep(1000);
        }
        sample();
    } else {
        /* wait again while memory is sampled */
        while (wait_for_release) {
            usleep(10);
        }
    }

    MPI_Finalize();
    return 0;
}