1
1
openmpi/orte/tools/orte-top/orte-top.c
2015-06-23 20:59:57 -07:00

1015 строки
30 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "opal/dss/dss.h"
#include "opal/mca/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hnp_contact.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/runtime/orte_quit.h"
/*
* Local variables & functions
*
* All state below is file-local and shared between main(), the send/receive
* callbacks and the printing routines.
*/
/* Callback registered in main() for SIGTERM and SIGINT. */
static void abort_exit_callback(int fd, short flags, void *arg);
/* Signal events registered in main() via opal_event_signal_set(). */
static opal_event_t term_handler;
static opal_event_t int_handler;
/* List filled by orte_list_local_hnps() while looking up the HNP by pid. */
static opal_list_t hnp_list;
/* Reset to false by send_cmd(); not read anywhere in the visible code. */
static bool all_recvd;
/* Replies expected for the current request. send_cmd() sets it to INT_MAX
 * and recv_stats() overwrites it from the message sent by vpid 0. */
static int32_t num_replies;
/* Replies received so far for the current request. */
static int32_t num_recvd;
/* Packed request built once in main(); send_cmd() copies it for each send. */
static opal_buffer_t cmdbuf;
/* Output stream: the --log-file file if given, otherwise stdout. */
static FILE *fp = NULL;
/* Command-line option targets (see cmd_line_opts). */
static bool help;
static char *hnppidstr;
static char *hnpuristr;
static char *ranks;
/* Contact record of the mpirun (HNP) being queried. */
static orte_hnp_contact_t *target_hnp;
/* Seconds between requests; values <= 0 mean "query once and exit". */
static int update_rate;
static bool timestamp;
static char *logfile;
static bool bynode;
/* opal_pstats_t records received for the current request. */
static opal_list_t recvd_stats;
/* Sample time string unpacked in recv_stats(); freed by pretty_print(). */
static char *sample_time;
/* True when the column headers must be (re)printed before the next sample. */
static bool need_header = true;
/* Data lines printed since the last header. */
static int num_lines=0;
/* Set once the first complete round has been received; after that
 * recv_stats() stops updating the *field widths below. */
static bool fields_set = false;
/* Widest value observed per column, measured in recv_stats(). */
static int nodefield = 0;
static int rankfield = 0;
static int pidfield = 0;
static int cmdfield = 0;
/* Never measured; presumably 6 because both time formats used by
 * print_ranks() ("%5.1fH" and "mmm:ss") are six characters wide. */
static int timefield = 6;
static int prifield = 0;
static int thrfield = 0;
static int vsizefield = 0;
static int rssfield = 0;
static int pkvfield = 0;
static int pfield = 0;
/* flag what fields were actually found */
static bool pri_found = false;
static bool thr_found = false;
static bool vsize_found = false;
static bool rss_found = false;
static bool pkv_found = false;
static bool p_found = false;
/* Once more than this many data lines have been printed since the last
 * header, pretty_print() requests a fresh header. */
#define MAX_LINES 20
/*
 * Command-line options understood by orte-top, handed to
 * opal_cmd_line_create() in main().
 *
 * NOTE(review): the field meanings below are inferred from the values used
 * here - confirm against opal_cmd_line_init_t in opal/util/cmd_line.h.
 * Each entry appears to hold: an unused leading field (NULL), a short
 * one-character name, two name strings, the number of arguments the option
 * takes (0 for flags, 1 for valued options), the variable that receives the
 * parsed value, that variable's type, and the help text.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
/* -h / --help: print usage and exit */
{ NULL,
'h', NULL, "help",
0,
&help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
/* pid of the target mpirun; main() also accepts "file:<name>" */
{ NULL,
'\0', "pid", "pid",
1,
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
"The pid of the mpirun that you wish to query/monitor" },
/* contact uri of the target mpirun; main() also accepts "file:<name>" */
{ NULL,
'\0', "uri", "uri",
1,
&hnpuristr, OPAL_CMD_LINE_TYPE_STRING,
"The uri of the mpirun that you wish to query/monitor" },
/* comma-separated ranks and ranges, parsed in main() */
{ NULL,
'\0', "rank", "rank",
1,
&ranks, OPAL_CMD_LINE_TYPE_STRING,
"Rank whose resource usage is to be displayed/monitored" },
/* repeat the query every N seconds */
{ NULL,
'\0', "update-rate", "update-rate",
1,
&update_rate, OPAL_CMD_LINE_TYPE_INT,
"Number of seconds between updates" },
/* print the sample time with each sample */
{ NULL,
'\0', "timestamp", "timestamp",
0,
&timestamp, OPAL_CMD_LINE_TYPE_BOOL,
"Time stamp each sample" },
/* write the output to a file instead of stdout */
{ NULL,
'\0', "log-file", "log-file",
1,
&logfile, OPAL_CMD_LINE_TYPE_STRING,
"Output file for returned statistics" },
/* group the output by node */
{ NULL,
'\0', "bynode", "bynode",
0,
&bynode, OPAL_CMD_LINE_TYPE_BOOL,
"Group statistics by node, sorted by rank within each node" },
/* End of list */
{ NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/* Receive callback posted on ORTE_RML_TAG_TOOL: unpacks the returned
 * statistics and prints them once all expected replies have arrived. */
static void recv_stats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
/* Prints the statistics collected in recvd_stats to fp. */
static void pretty_print(void);
/* Computes the column widths and prints the table header to fp. */
static void print_headers(void);
static void send_cmd(int fd, short dummy, void *arg)
{
int ret;
opal_buffer_t *buf;
all_recvd = false;
num_replies = INT_MAX;
num_recvd = 0;
buf = OBJ_NEW(opal_buffer_t);
opal_dss.copy_payload(buf, &cmdbuf);
if (0 > (ret = orte_rml.send_buffer_nb(&(target_hnp->name), buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
orte_quit(0,0,NULL);
return;
}
}
int
main(int argc, char *argv[])
{
int ret;
opal_cmd_line_t cmd_line;
opal_list_item_t* item = NULL;
orte_daemon_cmd_flag_t command;
pid_t hnppid;
orte_process_name_t proc;
char **r1=NULL, **r2;
int i;
orte_vpid_t vstart, vend;
int vint;
/***************
* Initialize
***************/
/*
* Make sure to init util before parse_args
* to ensure installdirs is setup properly
* before calling mca_base_open();
*/
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
return ret;
}
/* initialize the globals */
help = false;
hnppidstr = NULL;
ranks = NULL;
target_hnp = NULL;
update_rate = -1;
timestamp = false;
logfile = NULL;
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, false, argc, argv);
if (OPAL_SUCCESS != ret) {
if (OPAL_ERR_SILENT != ret) {
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
opal_strerror(ret));
}
return 1;
}
/**
* Now start parsing our specific arguments
*/
if (help) {
char *str, *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orte-top.txt", "orte-top:usage",
true, "orte-top", args);
if (NULL != str) {
printf("%s", str);
free(str);
}
free(args);
/* If we show the help message, that should be all we do */
return 0;
}
/* we are never allowed to operate as a distributed tool,
* so insist on the ess/tool component */
opal_setenv("OMPI_MCA_ess", "tool", true, &environ);
/***************************
* We need all of OPAL and the TOOL portion of ORTE
***************************/
if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_TOOL)) {
orte_finalize();
return 1;
}
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/** setup callbacks for abort signals - from this point
* forward, we need to abort in a manner that allows us
* to cleanup
*/
opal_event_signal_set(orte_event_base, &term_handler, SIGTERM,
abort_exit_callback, &term_handler);
opal_event_signal_add(&term_handler, NULL);
opal_event_signal_set(orte_event_base, &int_handler, SIGINT,
abort_exit_callback, &int_handler);
opal_event_signal_add(&int_handler, NULL);
/*
* Must specify the mpirun pid
*/
if (NULL != hnppidstr) {
if (0 == strncmp(hnppidstr, "file", strlen("file")) ||
0 == strncmp(hnppidstr, "FILE", strlen("FILE"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(hnppidstr, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
orte_finalize();
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
orte_finalize();
exit(1);
}
/* open the file and extract the pid */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* convert the pid */
hnppid = strtoul(input, NULL, 10);
} else {
/* should just be the pid itself */
hnppid = strtoul(hnppidstr, NULL, 10);
}
/*
* Get the list of available hnp's and setup contact info
* to them in the RML
*/
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
orte_finalize();
exit(1);
}
/*
* For each hnp in the listing
*/
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
orte_hnp_contact_t *hnp = (orte_hnp_contact_t*)item;
if (hnppid == hnp->pid) {
/* this is the one we want */
target_hnp = hnp;
/* let it continue to run so we deconstruct the list */
continue;
}
OBJ_RELEASE(hnp);
}
OBJ_DESTRUCT(&hnp_list);
/* if we get here without finding the one we wanted, then abort */
if (NULL == target_hnp) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
orte_finalize();
exit(1);
}
} else if (NULL != hnpuristr) {
if (0 == strncmp(hnpuristr, "file", strlen("file")) ||
0 == strncmp(hnpuristr, "FILE", strlen("FILE"))) {
char input[1024], *filename;
FILE *fp;
/* it is a file - get the filename */
filename = strchr(hnpuristr, ':');
if (NULL == filename) {
/* filename is not correctly formatted */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
++filename; /* space past the : */
if (0 >= strlen(filename)) {
/* they forgot to give us the name! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
orte_finalize();
exit(1);
}
/* open the file and extract the uri */
fp = fopen(filename, "r");
if (NULL == fp) { /* can't find or read file! */
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
orte_finalize();
exit(1);
}
if (NULL == fgets(input, 1024, fp)) {
/* something malformed about file */
fclose(fp);
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
orte_finalize();
exit(1);
}
fclose(fp);
input[strlen(input)-1] = '\0'; /* remove newline */
/* construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(input);
} else {
/* should just be the uri itself - construct the target hnp info */
target_hnp = OBJ_NEW(orte_hnp_contact_t);
target_hnp->rml_uri = strdup(hnpuristr);
}
/* set the info in our contact table */
orte_rml.set_contact_info(target_hnp->rml_uri);
/* extract the name */
if (ORTE_SUCCESS != orte_rml_base_parse_uris(target_hnp->rml_uri, &target_hnp->name, NULL)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
}
} else {
orte_show_help("help-orte-top.txt", "orte-top:no-contact-given", true);
orte_finalize();
exit(1);
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(&target_hnp->name);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
orte_finalize();
exit(1);
}
} else {
fp = stdout;
}
/* setup a non-blocking recv to get answers - we don't know how
* many daemons are going to send replies, so we just have to
* accept whatever comes back
*/
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
/* setup the command to get the resource usage */
OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
command = ORTE_DAEMON_TOP_CMD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
proc.jobid = ORTE_PROC_MY_NAME->jobid+1; /* only support initial launch at this time */
/* parse the rank list - this can be a comma-separated list of ranks,
* each element being either a single rank or a range. We also allow
* for a -1 to indicate all ranks. If not rank is given, we assume -1
*/
if (NULL == ranks) {
/* take all ranks */
proc.vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
goto SEND;
}
/* split on commas */
r1 = opal_argv_split(ranks, ',');
/* for each resulting element, check for range */
for (i=0; i < opal_argv_count(r1); i++) {
r2 = opal_argv_split(r1[i], '-');
if (1 < opal_argv_count(r2)) {
/* given range - get start and end */
vstart = strtol(r2[0], NULL, 10);
vend = strtol(r2[1], NULL, 10);
} else {
/* check for wildcard - have to do this here because
* the -1 would have been caught in the split
*/
vint = strtol(r1[i], NULL, 10);
if (-1 == vint) {
proc.vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
opal_argv_free(r2);
goto SEND;
}
vstart = strtol(r2[0], NULL, 10);
vend = vstart + 1;
}
for (proc.vpid = vstart; proc.vpid < vend; proc.vpid++) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
}
opal_argv_free(r2);
}
SEND:
if (NULL != r1) {
opal_argv_free(r1);
}
send_cmd(0, 0, NULL);
/* now wait until the termination event fires */
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
/***************
* Cleanup
***************/
cleanup:
/* Remove the TERM and INT signal handlers */
opal_event_signal_del(&term_handler);
opal_event_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&recvd_stats);
OBJ_DESTRUCT(&cmdbuf);
if (NULL != fp && fp != stdout) {
fclose(fp);
}
orte_finalize();
return ret;
}
/*
 * Signal callback registered in main() for SIGTERM and SIGINT.
 *
 * Unregisters both signal events, releases the statistics collected so
 * far, closes the log file, records a non-zero exit status and asks ORTE
 * to quit. The (fd, ign, arg) parameters are dictated by the event
 * callback signature and are not used.
 */
static void abort_exit_callback(int fd, short ign, void *arg)
{
    opal_list_item_t *item;

    /* Remove the TERM and INT signal handlers. They were initialized with
     * opal_event_signal_set() and never OBJ_CONSTRUCT'ed, so deleting
     * them - exactly what main()'s own cleanup does - is all the
     * teardown they get; they must not be passed to OBJ_DESTRUCT. */
    opal_event_signal_del(&term_handler);
    opal_event_signal_del(&int_handler);

    while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&recvd_stats);
    OBJ_DESTRUCT(&cmdbuf);

    if (NULL != fp && fp != stdout) {
        fclose(fp);
        /* forget the closed stream so that no later cleanup closes it a
         * second time */
        fp = NULL;
    }

    /* NOTE(review): if orte_quit() returns and main() then falls through
     * to its cleanup label, recvd_stats and cmdbuf are destructed twice -
     * confirm against orte_quit() and main()'s event loop. */
    ORTE_UPDATE_EXIT_STATUS(1);
    orte_quit(0,0,NULL);
}
/* Widen *field so that it can hold a value "width" characters wide.
 * A negative width (snprintf reporting an error) leaves it unchanged. */
static void grow_field(int *field, int width)
{
    if (*field < width) {
        *field = width;
    }
}

/*
 * Receive callback for ORTE_RML_TAG_TOOL messages.
 *
 * A message from vpid 0 starts with the number of replies to expect and
 * the sample time string. Every message then carries a sequence of
 * (process name, opal_pstats_t) pairs, which are appended to recvd_stats.
 * Until the first complete round has been printed, each record also
 * updates the column widths and the "field present" flags used by the
 * printing routines.
 *
 * Once the expected number of replies has arrived the results are printed;
 * then either the next request is scheduled (update_rate > 0) or the tool
 * finalizes and exits. The receive is reposted before returning.
 *
 * status, tag and cbdata are not used.
 */
static void recv_stats(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer, orte_rml_tag_t tag,
                       void* cbdata)
{
    int32_t n;
    opal_pstats_t *stats;
    orte_process_name_t proc;
    int ret;

    /* if the sender is the HNP we contacted, this message
     * contains info on the number of responses we should get
     */
    if (sender->vpid == 0) {
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_replies, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sample_time, &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    n = 1;
    while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &stats, &n, OPAL_PSTAT))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        /* if field sizes are not yet set, do so now */
        if (!fields_set) {
            /* snprintf(NULL, 0, ...) returns the number of characters the
             * formatted value needs without allocating anything, so no
             * allocation can fail here */
            grow_field(&nodefield, (int)strlen(stats->node));
            grow_field(&rankfield, snprintf(NULL, 0, "%d", stats->rank));
            grow_field(&pidfield, snprintf(NULL, 0, "%lu", (unsigned long)stats->pid));
            grow_field(&cmdfield, (int)strlen(stats->cmd));
            if (0 <= stats->priority) {
                pri_found = true;
                grow_field(&prifield, snprintf(NULL, 0, "%d", stats->priority));
            }
            if (0 <= stats->num_threads) {
                thr_found = true;
                grow_field(&thrfield, snprintf(NULL, 0, "%d", stats->num_threads));
            }
            if (0 < stats->vsize) {
                vsize_found = true;
                grow_field(&vsizefield, snprintf(NULL, 0, "%8.2f", stats->vsize));
            }
            if (0 < stats->rss) {
                rss_found = true;
                grow_field(&rssfield, snprintf(NULL, 0, "%8.2f", stats->rss));
            }
            if (0 < stats->peak_vsize) {
                pkv_found = true;
                grow_field(&pkvfield, snprintf(NULL, 0, "%8.2f", stats->peak_vsize));
            }
            if (0 <= stats->processor) {
                p_found = true;
                grow_field(&pfield, snprintf(NULL, 0, "%d", stats->processor));
            }
        }
        /* add it to the list */
        opal_list_append(&recvd_stats, &stats->super);
    }

cleanup:
    /* check for completion - a message that failed to unpack still
     * counts as received */
    num_recvd++;
    if (num_replies <= num_recvd) {
        /* flag that field sizes are set */
        fields_set = true;
        /* pretty-print what we got */
        pretty_print();
        /* see if we want to do it again */
        if (0 < update_rate) {
            ORTE_TIMER_EVENT(update_rate, 0, send_cmd, ORTE_SYS_PRI);
        } else {
            orte_finalize();
            exit(0);
        }
    }

    /* repost the receive */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
                            ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
}
/* static values needed for printing
 *
 * Column widths actually used for output. print_headers() sets each one
 * to the larger of the column title and the widest value measured in
 * recv_stats(); print_ranks() then pads every value to these widths.
 * The widths of optional columns stay 0 unless the column was found.
 */
static int lennode = 0;
static int lenrank = 0;
static int lenpid = 0;
static int lencmd = 0;
static int lenstate = 0;
static int lentime = 0;
static int lenpri = 0;
static int lenthr = 0;
static int lenvsize = 0;
static int lenrss = 0;
static int lenpkv = 0;
/* never assigned in the visible code; only summed into the rule length */
static int lensh = 0;
static int lenp = 0;
/*
 * Print one table line per entry of statlist, in ascending rank order,
 * and consume the list: every entry is removed and released once printed.
 *
 * The entry with the lowest rank is selected repeatedly (entries with
 * equal ranks keep their list order). Columns are padded to the widths
 * computed by print_headers(); optional columns are printed only when the
 * matching *_found flag is set. num_lines is incremented per line.
 *
 * statlist: list of opal_pstats_t; empty on return.
 */
static void print_ranks(opal_list_t *statlist)
{
    opal_list_item_t *item;
    opal_pstats_t *stats, *pstats;
    char pretty_time[10];
    int i;

    /* sort the results by rank */
    while (0 < opal_list_get_size(statlist)) {
        /* seed the search with the first entry so that a candidate
         * always exists, whatever rank values the entries carry */
        pstats = NULL;
        for (item = opal_list_get_first(statlist);
             item != opal_list_get_end(statlist);
             item = opal_list_get_next(item)) {
            stats = (opal_pstats_t*)item;
            if (NULL == pstats || stats->rank < pstats->rank) {
                pstats = stats;
            }
        }

        /* hours with one decimal once the time reaches an hour,
         * otherwise minutes:seconds */
        memset(pretty_time, 0, sizeof(pretty_time));
        if (pstats->time.tv_sec >= 3600) {
            snprintf(pretty_time, sizeof(pretty_time), "%5.1fH",
                     (double)pstats->time.tv_sec / (double)(3600));
        } else {
            snprintf(pretty_time, sizeof(pretty_time), "%3lu:%02lu",
                     (unsigned long)pstats->time.tv_sec/60,
                     (unsigned long)pstats->time.tv_sec % 60);
        }

        if (bynode) {
            /* print blanks in the nodename field - pretty_print() has
             * already printed the node name on a line of its own */
            for (i=0; i < lennode; i++) {
                fprintf(fp, " ");
            }
            fprintf(fp, " | ");
            /* print fields */
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
        } else {
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
            fprintf(fp, "%*s | ", lennode, pstats->node);
        }
        fprintf(fp, "%*s | ", lencmd, pstats->cmd);
        fprintf(fp, "%*lu | ", lenpid, (unsigned long)pstats->pid);
        fprintf(fp, "%*c | ", lenstate, pstats->state[0]);
        fprintf(fp, "%*s | ", lentime, pretty_time);
        if (pri_found) {
            fprintf(fp, "%*d | ", lenpri, pstats->priority);
        }
        if (thr_found) {
            fprintf(fp, "%*d | ", lenthr, pstats->num_threads);
        }
        if (vsize_found) {
            fprintf(fp, "%*lu | ", lenvsize, (unsigned long)pstats->vsize);
        }
        if (rss_found) {
            /* must use the RSS column width so the value lines up with
             * the "RSS" header */
            fprintf(fp, "%*lu | ", lenrss, (unsigned long)pstats->rss);
        }
        if (pkv_found) {
            fprintf(fp, "%*lu | ", lenpkv, (unsigned long)pstats->peak_vsize);
        }
        if (p_found) {
            fprintf(fp, "%*d | ", lenp, pstats->processor);
        }
        fprintf(fp, "\n");
        num_lines++;

        opal_list_remove_item(statlist, &pstats->super);
        OBJ_RELEASE(pstats);
    }
}
/*
 * Print everything currently held in recvd_stats to fp and empty the list.
 *
 * Prints the column headers when they are due, then the optional
 * timestamp line, then the statistics: either one block per node (node
 * name on its own line followed by that node's ranks) when --bynode was
 * given, or a single table sorted by rank. sample_time is freed and reset.
 * Once more than MAX_LINES data lines have been printed since the last
 * header, a new header is requested for the next call.
 */
static void pretty_print(void)
{
    opal_list_item_t *item, *next;
    opal_pstats_t *stats;
    opal_list_t tmplist;
    const char *node;

    if (need_header) {
        print_headers();
        need_header = false;
    }

    if (timestamp) {
        if (!bynode) {
            /* the flat layout sets the timestamp off with blank lines */
            fprintf(fp, "\n\n");
        }
        /* sample_time is only set by a successfully unpacked message from
         * vpid 0, so it may still be NULL; never hand NULL to %s */
        fprintf(fp, "TIMESTAMP: %s\n",
                (NULL != sample_time) ? sample_time : "unknown");
    }
    free(sample_time);
    sample_time = NULL;

    if (bynode) {
        /* sort the results by node and then rank */
        while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
            OBJ_CONSTRUCT(&tmplist, opal_list_t);
            stats = (opal_pstats_t*)item;
            /* this entry stays alive in tmplist until print_ranks()
             * releases it, so its node name can be used in place */
            node = stats->node;
            opal_list_append(&tmplist, &stats->super);

            /* cycle through the rest of the list looking
             * for matching nodes
             */
            item = opal_list_get_first(&recvd_stats);
            while (item != opal_list_get_end(&recvd_stats)) {
                stats = (opal_pstats_t*)item;
                /* fetch the successor first: item may be unlinked below */
                next = opal_list_get_next(item);
                if (0 == strcmp(stats->node, node)) {
                    opal_list_remove_item(&recvd_stats, item);
                    opal_list_append(&tmplist, &stats->super);
                }
                item = next;
            }

            fprintf(fp, "%*s\n", lennode, node);
            /* node must not be used past this call: print_ranks()
             * releases the entry that owns it */
            print_ranks(&tmplist);
            OBJ_DESTRUCT(&tmplist);
        }
    } else {
        print_ranks(&recvd_stats);
    }

    /* provide some separation between iterations */
    fprintf(fp, "\n");

    /* if we have printed more than MAX_LINES since the last header,
     * flag that we need to print the header next time
     */
    if (MAX_LINES < num_lines) {
        need_header = true;
        num_lines = 0;
        fprintf(fp, "\n\n");
    }
}
static void print_headers(void)
{
int num_fields = 0;
int i;
int linelen;
lennode = strlen("Nodename");
if (nodefield > lennode) {
lennode = nodefield;
}
num_fields++;
lenrank = strlen("Rank");
if (rankfield > lenrank) {
lenrank = rankfield;
}
num_fields++;
lenpid = strlen("Pid");
if (pidfield > lenpid) {
lenpid = pidfield;
}
num_fields++;
lencmd = strlen("Command");
if (cmdfield > lencmd) {
lencmd = cmdfield;
}
num_fields++;
lenstate = strlen("State");
num_fields++;
lentime = strlen("Time");
if (timefield > lentime) {
lentime = timefield;
}
num_fields++;
if (pri_found) {
lenpri = strlen("Pri");
if (prifield > lenpri) {
lenpri = prifield;
}
num_fields++;
}
if (thr_found) {
lenthr = strlen("#threads");
if (thrfield > lenthr) {
lenthr = thrfield;
}
num_fields++;
}
if (vsize_found) {
lenvsize = strlen("Vsize");
if (vsizefield > lenvsize) {
lenvsize = vsizefield;
}
num_fields++;
}
if (rss_found) {
lenrss = strlen("RSS");
if (rssfield > lenrss) {
lenrss = rssfield;
}
num_fields++;
}
if (pkv_found) {
lenpkv = strlen("Peak Vsize");
if (pkvfield > lenpkv) {
lenpkv = pkvfield;
}
num_fields++;
}
if (p_found) {
lenp = strlen("Processor");
if (pfield > lenp) {
lenp = pfield;
}
num_fields++;
}
linelen = lennode + lenrank + lenpid + lencmd + lenstate + lentime + lenpri + lenthr + lenvsize + lenrss + lenpkv + lensh + lenp;
/* add spacing */
linelen += num_fields * 3;
/* print the rip line */
for(i = 0; i < linelen; ++i) {
fprintf(fp, "=");
}
fprintf(fp, "\n");
/* print the header */
if (bynode) {
fprintf(fp, "%*s | ", lennode , "Nodename");
fprintf(fp, "%*s | ", lenrank , "Rank");
} else {
fprintf(fp, "%*s | ", lenrank , "Rank");
fprintf(fp, "%*s | ", lennode , "Nodename");
}
fprintf(fp, "%*s | ", lencmd , "Command");
fprintf(fp, "%*s | ", lenpid , "Pid");
fprintf(fp, "%*s | ", lenstate , "State");
fprintf(fp, "%*s | ", lentime , "Time");
if (pri_found) {
fprintf(fp, "%*s | ", lenpri , "Pri");
}
if (thr_found) {
fprintf(fp, "%*s | ", lenthr , "#threads");
}
if (vsize_found) {
fprintf(fp, "%*s | ", lenvsize , "Vsize");
}
if (rss_found) {
fprintf(fp, "%*s | ", lenrss , "RSS");
}
if (pkv_found) {
fprintf(fp, "%*s | ", lenpkv , "Peak Vsize");
}
if (p_found) {
fprintf(fp, "%*s | ", lenp , "Processor");
}
fprintf(fp, "\n");
/* print the separator */
for(i = 0; i < linelen; ++i) {
fprintf(fp, "-");
}
fprintf(fp, "\n");
}