42ecffb6d0
Take another shot at untangling the spaghetti orterun: fix for command line parsing orte-submit calls opal_init_util () before parsing out MCA command line options (-mca, -am, etc). This prevents mpirun from setting opal MCA variables for some frameworks as well as the MCA base. This is because when a framework is opened all of its variables are set to read-only. Eventually we want to lift this restriction on some MCA variables but since -mca is affected we must parse out the MCA command line options before opal_init_util(). This commit fixes the bug by adding a new option to opal_cmd_line_parse (ignore unknown option) so orte-submit can pre-parse the command line for MCA options. Signed-off-by: Nathan Hjelm <hjelmn@me.com> Minor cleanups to avoid releasing/recreating the cmd line
1016 строки
30 KiB
C
1016 строки
30 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
#ifdef HAVE_NETDB_H
|
|
#include <netdb.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
#include <sys/param.h>
|
|
#endif
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <signal.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/dss/dss.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/runtime/opal.h"
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/odls/odls_types.h"
|
|
#include "orte/mca/routed/routed.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/util/hnp_contact.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/runtime/orte_wait.h"
|
|
#include "orte/mca/rml/base/rml_contact.h"
|
|
#include "orte/runtime/orte_quit.h"
|
|
|
|
/*
|
|
* Local variables & functions
|
|
*/
|
|
static void abort_exit_callback(int fd, short flags, void *arg);
|
|
static opal_event_t term_handler;
|
|
static opal_event_t int_handler;
|
|
static opal_list_t hnp_list;
|
|
static bool all_recvd;
|
|
static int32_t num_replies;
|
|
static int32_t num_recvd;
|
|
static opal_buffer_t cmdbuf;
|
|
static FILE *fp = NULL;
|
|
static bool help;
|
|
static char *hnppidstr;
|
|
static char *hnpuristr;
|
|
static char *ranks;
|
|
static orte_hnp_contact_t *target_hnp;
|
|
static int update_rate;
|
|
static bool timestamp;
|
|
static char *logfile;
|
|
static bool bynode;
|
|
static opal_list_t recvd_stats;
|
|
static char *sample_time;
|
|
/* True when the column header must be printed before the next sample. */
static bool need_header = true;
/* Data lines printed since the header was last shown. */
static int num_lines=0;
/* Set once the first complete sample has fixed the column widths. */
static bool fields_set = false;

/* Widest value seen for each column while sizing the first sample. */
static int nodefield = 0;
static int rankfield = 0;
static int pidfield = 0;
static int cmdfield = 0;
static int timefield = 6;
static int prifield = 0;
static int thrfield = 0;
static int vsizefield = 0;
static int rssfield = 0;
static int pkvfield = 0;
static int pfield = 0;

/* flag what fields were actually found */
static bool pri_found = false;
static bool thr_found = false;
static bool vsize_found = false;
static bool rss_found = false;
static bool pkv_found = false;
static bool p_found = false;

/* Number of data lines after which the header is printed again. */
#define MAX_LINES 20
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
{ NULL,
|
|
'h', NULL, "help",
|
|
0,
|
|
&help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"This help message" },
|
|
|
|
{ NULL,
|
|
'\0', "pid", "pid",
|
|
1,
|
|
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
|
|
"The pid of the mpirun that you wish to query/monitor" },
|
|
|
|
{ NULL,
|
|
'\0', "uri", "uri",
|
|
1,
|
|
&hnpuristr, OPAL_CMD_LINE_TYPE_STRING,
|
|
"The uri of the mpirun that you wish to query/monitor" },
|
|
|
|
{ NULL,
|
|
'\0', "rank", "rank",
|
|
1,
|
|
&ranks, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Rank whose resource usage is to be displayed/monitored" },
|
|
|
|
{ NULL,
|
|
'\0', "update-rate", "update-rate",
|
|
1,
|
|
&update_rate, OPAL_CMD_LINE_TYPE_INT,
|
|
"Number of seconds between updates" },
|
|
|
|
{ NULL,
|
|
'\0', "timestamp", "timestamp",
|
|
0,
|
|
×tamp, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Time stamp each sample" },
|
|
|
|
{ NULL,
|
|
'\0', "log-file", "log-file",
|
|
1,
|
|
&logfile, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Output file for returned statistics" },
|
|
|
|
{ NULL,
|
|
'\0', "bynode", "bynode",
|
|
0,
|
|
&bynode, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Group statistics by node, sorted by rank within each node" },
|
|
|
|
/* End of list */
|
|
{ NULL,
|
|
'\0', NULL, NULL,
|
|
0,
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
NULL }
|
|
};
|
|
|
|
|
|
static void recv_stats(int status, orte_process_name_t* sender,
|
|
opal_buffer_t *buffer, orte_rml_tag_t tag,
|
|
void* cbdata);
|
|
|
|
static void pretty_print(void);
|
|
static void print_headers(void);
|
|
|
|
static void send_cmd(int fd, short dummy, void *arg)
|
|
{
|
|
int ret;
|
|
opal_buffer_t *buf;
|
|
|
|
all_recvd = false;
|
|
num_replies = INT_MAX;
|
|
num_recvd = 0;
|
|
buf = OBJ_NEW(opal_buffer_t);
|
|
opal_dss.copy_payload(buf, &cmdbuf);
|
|
if (0 > (ret = orte_rml.send_buffer_nb(&(target_hnp->name), buf,
|
|
ORTE_RML_TAG_DAEMON,
|
|
orte_rml_send_callback, NULL))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
OBJ_RELEASE(buf);
|
|
orte_quit(0,0,NULL);
|
|
return;
|
|
}
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int ret;
|
|
opal_cmd_line_t cmd_line;
|
|
opal_list_item_t* item = NULL;
|
|
orte_daemon_cmd_flag_t command;
|
|
pid_t hnppid;
|
|
orte_process_name_t proc;
|
|
char **r1=NULL, **r2;
|
|
int i;
|
|
orte_vpid_t vstart, vend;
|
|
int vint;
|
|
|
|
/***************
|
|
* Initialize
|
|
***************/
|
|
|
|
/*
|
|
* Make sure to init util before parse_args
|
|
* to ensure installdirs is setup properly
|
|
* before calling mca_base_open();
|
|
*/
|
|
if( ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
|
|
return ret;
|
|
}
|
|
|
|
/* initialize the globals */
|
|
help = false;
|
|
hnppidstr = NULL;
|
|
ranks = NULL;
|
|
target_hnp = NULL;
|
|
update_rate = -1;
|
|
timestamp = false;
|
|
logfile = NULL;
|
|
|
|
/* Parse the command line options */
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
mca_base_open();
|
|
mca_base_cmd_line_setup(&cmd_line);
|
|
ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
|
|
if (OPAL_SUCCESS != ret) {
|
|
if (OPAL_ERR_SILENT != ret) {
|
|
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
|
|
opal_strerror(ret));
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* Now start parsing our specific arguments
|
|
*/
|
|
if (help) {
|
|
char *str, *args = NULL;
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
str = opal_show_help_string("help-orte-top.txt", "orte-top:usage",
|
|
true, "orte-top", args);
|
|
if (NULL != str) {
|
|
printf("%s", str);
|
|
free(str);
|
|
}
|
|
free(args);
|
|
/* If we show the help message, that should be all we do */
|
|
return 0;
|
|
}
|
|
|
|
/* we are never allowed to operate as a distributed tool,
|
|
* so insist on the ess/tool component */
|
|
opal_setenv("OMPI_MCA_ess", "tool", true, &environ);
|
|
|
|
/***************************
|
|
* We need all of OPAL and the TOOL portion of ORTE
|
|
***************************/
|
|
if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_TOOL)) {
|
|
orte_finalize();
|
|
return 1;
|
|
}
|
|
|
|
/* setup the list for recvd stats */
|
|
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
|
|
|
|
/** setup callbacks for abort signals - from this point
|
|
* forward, we need to abort in a manner that allows us
|
|
* to cleanup
|
|
*/
|
|
opal_event_signal_set(orte_event_base, &term_handler, SIGTERM,
|
|
abort_exit_callback, &term_handler);
|
|
opal_event_signal_add(&term_handler, NULL);
|
|
opal_event_signal_set(orte_event_base, &int_handler, SIGINT,
|
|
abort_exit_callback, &int_handler);
|
|
opal_event_signal_add(&int_handler, NULL);
|
|
|
|
/*
|
|
* Must specify the mpirun pid
|
|
*/
|
|
if (NULL != hnppidstr) {
|
|
if (0 == strncmp(hnppidstr, "file", strlen("file")) ||
|
|
0 == strncmp(hnppidstr, "FILE", strlen("FILE"))) {
|
|
char input[1024], *filename;
|
|
FILE *fp;
|
|
|
|
/* it is a file - get the filename */
|
|
filename = strchr(hnppidstr, ':');
|
|
if (NULL == filename) {
|
|
/* filename is not correctly formatted */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
++filename; /* space past the : */
|
|
|
|
if (0 >= strlen(filename)) {
|
|
/* they forgot to give us the name! */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "pid", hnppidstr);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
|
|
/* open the file and extract the pid */
|
|
fp = fopen(filename, "r");
|
|
if (NULL == fp) { /* can't find or read file! */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
if (NULL == fgets(input, 1024, fp)) {
|
|
/* something malformed about file */
|
|
fclose(fp);
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
fclose(fp);
|
|
input[strlen(input)-1] = '\0'; /* remove newline */
|
|
/* convert the pid */
|
|
hnppid = strtoul(input, NULL, 10);
|
|
} else {
|
|
/* should just be the pid itself */
|
|
hnppid = strtoul(hnppidstr, NULL, 10);
|
|
}
|
|
/*
|
|
* Get the list of available hnp's and setup contact info
|
|
* to them in the RML
|
|
*/
|
|
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
|
|
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
|
|
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
|
|
/*
|
|
* For each hnp in the listing
|
|
*/
|
|
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
|
|
orte_hnp_contact_t *hnp = (orte_hnp_contact_t*)item;
|
|
if (hnppid == hnp->pid) {
|
|
/* this is the one we want */
|
|
target_hnp = hnp;
|
|
/* let it continue to run so we deconstruct the list */
|
|
continue;
|
|
}
|
|
OBJ_RELEASE(hnp);
|
|
}
|
|
OBJ_DESTRUCT(&hnp_list);
|
|
|
|
/* if we get here without finding the one we wanted, then abort */
|
|
if (NULL == target_hnp) {
|
|
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
} else if (NULL != hnpuristr) {
|
|
if (0 == strncmp(hnpuristr, "file", strlen("file")) ||
|
|
0 == strncmp(hnpuristr, "FILE", strlen("FILE"))) {
|
|
char input[1024], *filename;
|
|
FILE *fp;
|
|
|
|
/* it is a file - get the filename */
|
|
filename = strchr(hnpuristr, ':');
|
|
if (NULL == filename) {
|
|
/* filename is not correctly formatted */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
++filename; /* space past the : */
|
|
|
|
if (0 >= strlen(filename)) {
|
|
/* they forgot to give us the name! */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", hnpuristr);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
|
|
/* open the file and extract the uri */
|
|
fp = fopen(filename, "r");
|
|
if (NULL == fp) { /* can't find or read file! */
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, filename);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
if (NULL == fgets(input, 1024, fp)) {
|
|
/* something malformed about file */
|
|
fclose(fp);
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, filename);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
fclose(fp);
|
|
input[strlen(input)-1] = '\0'; /* remove newline */
|
|
/* construct the target hnp info */
|
|
target_hnp = OBJ_NEW(orte_hnp_contact_t);
|
|
target_hnp->rml_uri = strdup(input);
|
|
} else {
|
|
/* should just be the uri itself - construct the target hnp info */
|
|
target_hnp = OBJ_NEW(orte_hnp_contact_t);
|
|
target_hnp->rml_uri = strdup(hnpuristr);
|
|
}
|
|
/* set the info in our contact table */
|
|
orte_rml.set_contact_info(target_hnp->rml_uri);
|
|
/* extract the name */
|
|
if (ORTE_SUCCESS != orte_rml_base_parse_uris(target_hnp->rml_uri, &target_hnp->name, NULL)) {
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
/* set the route to be direct */
|
|
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
|
|
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
} else {
|
|
orte_show_help("help-orte-top.txt", "orte-top:no-contact-given", true);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
|
|
/* set the target hnp as our lifeline so we will terminate if it exits */
|
|
orte_routed.set_lifeline(&target_hnp->name);
|
|
|
|
/* if an output file was specified, open it */
|
|
if (NULL != logfile) {
|
|
fp = fopen(logfile, "w");
|
|
if (NULL == fp) {
|
|
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
|
|
orte_finalize();
|
|
exit(1);
|
|
}
|
|
} else {
|
|
fp = stdout;
|
|
}
|
|
|
|
/* setup a non-blocking recv to get answers - we don't know how
|
|
* many daemons are going to send replies, so we just have to
|
|
* accept whatever comes back
|
|
*/
|
|
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
|
|
ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
|
|
|
|
|
|
/* setup the command to get the resource usage */
|
|
OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
|
|
command = ORTE_DAEMON_TOP_CMD;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
|
|
proc.jobid = ORTE_PROC_MY_NAME->jobid+1; /* only support initial launch at this time */
|
|
|
|
/* parse the rank list - this can be a comma-separated list of ranks,
|
|
* each element being either a single rank or a range. We also allow
|
|
* for a -1 to indicate all ranks. If not rank is given, we assume -1
|
|
*/
|
|
if (NULL == ranks) {
|
|
/* take all ranks */
|
|
proc.vpid = ORTE_VPID_WILDCARD;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
goto SEND;
|
|
}
|
|
|
|
/* split on commas */
|
|
r1 = opal_argv_split(ranks, ',');
|
|
/* for each resulting element, check for range */
|
|
for (i=0; i < opal_argv_count(r1); i++) {
|
|
r2 = opal_argv_split(r1[i], '-');
|
|
if (1 < opal_argv_count(r2)) {
|
|
/* given range - get start and end */
|
|
vstart = strtol(r2[0], NULL, 10);
|
|
vend = strtol(r2[1], NULL, 10);
|
|
} else {
|
|
/* check for wildcard - have to do this here because
|
|
* the -1 would have been caught in the split
|
|
*/
|
|
vint = strtol(r1[i], NULL, 10);
|
|
if (-1 == vint) {
|
|
proc.vpid = ORTE_VPID_WILDCARD;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
opal_argv_free(r2);
|
|
goto SEND;
|
|
}
|
|
vstart = strtol(r2[0], NULL, 10);
|
|
vend = vstart + 1;
|
|
}
|
|
for (proc.vpid = vstart; proc.vpid < vend; proc.vpid++) {
|
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
}
|
|
opal_argv_free(r2);
|
|
}
|
|
|
|
SEND:
|
|
if (NULL != r1) {
|
|
opal_argv_free(r1);
|
|
}
|
|
send_cmd(0, 0, NULL);
|
|
|
|
/* now wait until the termination event fires */
|
|
while (orte_event_base_active) {
|
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
|
}
|
|
|
|
/***************
|
|
* Cleanup
|
|
***************/
|
|
cleanup:
|
|
/* Remove the TERM and INT signal handlers */
|
|
opal_event_signal_del(&term_handler);
|
|
opal_event_signal_del(&int_handler);
|
|
|
|
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&recvd_stats);
|
|
OBJ_DESTRUCT(&cmdbuf);
|
|
if (NULL != fp && fp != stdout) {
|
|
fclose(fp);
|
|
}
|
|
orte_finalize();
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void abort_exit_callback(int fd, short ign, void *arg)
|
|
{
|
|
opal_list_item_t *item;
|
|
|
|
/* Remove the TERM and INT signal handlers */
|
|
opal_event_signal_del(&term_handler);
|
|
OBJ_DESTRUCT(&term_handler);
|
|
opal_event_signal_del(&int_handler);
|
|
OBJ_DESTRUCT(&int_handler);
|
|
|
|
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&recvd_stats);
|
|
OBJ_DESTRUCT(&cmdbuf);
|
|
if (NULL != fp && fp != stdout) {
|
|
fclose(fp);
|
|
}
|
|
ORTE_UPDATE_EXIT_STATUS(1);
|
|
orte_quit(0,0,NULL);
|
|
}
|
|
|
|
static void recv_stats(int status, orte_process_name_t* sender,
|
|
opal_buffer_t *buffer, orte_rml_tag_t tag,
|
|
void* cbdata)
|
|
{
|
|
int32_t n;
|
|
opal_pstats_t *stats;
|
|
orte_process_name_t proc;
|
|
int ret;
|
|
|
|
/* if the sender is the HNP we contacted, this message
|
|
* contains info on the number of responses we should get
|
|
*/
|
|
if (sender->vpid == 0) {
|
|
n = 1;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_replies, &n, OPAL_INT32))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
n = 1;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sample_time, &n, OPAL_STRING))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
n = 1;
|
|
while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
|
|
n = 1;
|
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &stats, &n, OPAL_PSTAT))) {
|
|
ORTE_ERROR_LOG(ret);
|
|
goto cleanup;
|
|
}
|
|
/* if field sizes are not yet set, do so now */
|
|
if (!fields_set) {
|
|
int tmp;
|
|
char *ctmp;
|
|
|
|
tmp = strlen(stats->node);
|
|
if (nodefield < tmp) {
|
|
nodefield = tmp;
|
|
}
|
|
|
|
asprintf(&ctmp, "%d", stats->rank);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (rankfield < tmp) {
|
|
rankfield = tmp;
|
|
}
|
|
|
|
asprintf(&ctmp, "%lu", (unsigned long)stats->pid);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (pidfield < tmp) {
|
|
pidfield = tmp;
|
|
}
|
|
|
|
tmp = strlen(stats->cmd);
|
|
if (cmdfield < tmp) {
|
|
cmdfield = tmp;
|
|
}
|
|
|
|
if (0 <= stats->priority) {
|
|
pri_found = true;
|
|
asprintf(&ctmp, "%d", stats->priority);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (prifield < tmp) {
|
|
prifield = tmp;
|
|
}
|
|
}
|
|
|
|
if (0 <= stats->num_threads) {
|
|
thr_found = true;
|
|
asprintf(&ctmp, "%d", stats->num_threads);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (thrfield < tmp) {
|
|
thrfield = tmp;
|
|
}
|
|
}
|
|
|
|
if (0 < stats->vsize) {
|
|
vsize_found = true;
|
|
asprintf(&ctmp, "%8.2f", stats->vsize);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (vsizefield < tmp) {
|
|
vsizefield = tmp;
|
|
}
|
|
}
|
|
|
|
if (0 < stats->rss) {
|
|
rss_found = true;
|
|
asprintf(&ctmp, "%8.2f", stats->rss);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (rssfield < tmp) {
|
|
rssfield = tmp;
|
|
}
|
|
}
|
|
|
|
if (0 < stats->peak_vsize) {
|
|
pkv_found = true;
|
|
asprintf(&ctmp, "%8.2f", stats->peak_vsize);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (pkvfield < tmp) {
|
|
pkvfield = tmp;
|
|
}
|
|
}
|
|
|
|
if (0 <= stats->processor) {
|
|
p_found = true;
|
|
asprintf(&ctmp, "%d", stats->processor);
|
|
tmp = strlen(ctmp);
|
|
free(ctmp);
|
|
if (pfield < tmp) {
|
|
pfield = tmp;
|
|
}
|
|
}
|
|
}
|
|
/* add it to the list */
|
|
opal_list_append(&recvd_stats, &stats->super);
|
|
}
|
|
|
|
cleanup:
|
|
/* check for completion */
|
|
num_recvd++;
|
|
if (num_replies <= num_recvd) {
|
|
/* flag that field sizes are set */
|
|
fields_set = true;
|
|
|
|
/* pretty-print what we got */
|
|
pretty_print();
|
|
|
|
/* see if we want to do it again */
|
|
if (0 < update_rate) {
|
|
ORTE_TIMER_EVENT(update_rate, 0, send_cmd, ORTE_SYS_PRI);
|
|
} else {
|
|
orte_finalize();
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
/* repost the receive */
|
|
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
|
|
ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
|
|
}
|
|
|
|
/* static values needed for printing: the width of each output column.
 * print_headers() assigns all of them except lensh, which stays 0;
 * print_ranks() reads them.
 */
static int lennode = 0;
static int lenrank = 0;
static int lenpid = 0;
static int lencmd = 0;
static int lenstate = 0;
static int lentime = 0;
static int lenpri = 0;
static int lenthr = 0;
static int lenvsize = 0;
static int lenrss = 0;
static int lenpkv = 0;
static int lensh = 0;
static int lenp = 0;
|
|
|
|
/*
 * Print one table row per entry of statlist, in ascending rank order.
 *
 * The list is consumed: on each pass the entry with the smallest rank is
 * selected, printed, removed from the list and released. Column widths
 * come from the len* variables set by print_headers(); the optional
 * columns are printed only when the matching *_found flag is set. When
 * bynode is set the node name column is left blank. num_lines is
 * incremented for every row.
 */
static void print_ranks(opal_list_t *statlist)
{
    opal_list_item_t *item;
    opal_pstats_t *stats, *pstats;
    int32_t minrank;
    char pretty_time[10];
    int i;

    /* sort the results by rank */
    while (0 < opal_list_get_size(statlist)) {
        minrank = INT32_MAX;
        pstats = NULL;
        for (item = opal_list_get_first(statlist);
             item != opal_list_get_end(statlist);
             item = opal_list_get_next(item)) {
            stats = (opal_pstats_t*)item;
            if (stats->rank < minrank) {
                pstats = stats;
                minrank = stats->rank;
            }
        }
        if (NULL == pstats) {
            /* no entry compared below INT32_MAX, so the remaining ranks
             * are all equal - take them in list order */
            pstats = (opal_pstats_t*)opal_list_get_first(statlist);
        }

        /* show hours once the time reaches one hour, else minutes:seconds */
        memset(pretty_time, 0, sizeof(pretty_time));
        if (pstats->time.tv_sec >= 3600) {
            snprintf(pretty_time, sizeof(pretty_time), "%5.1fH",
                     (double)pstats->time.tv_sec / (double)(3600));
        } else {
            snprintf(pretty_time, sizeof(pretty_time), "%3lu:%02lu",
                     (unsigned long)pstats->time.tv_sec/60,
                     (unsigned long)pstats->time.tv_sec % 60);
        }

        if (bynode) {
            /* print blanks in the nodename field */
            for (i=0; i < lennode; i++) {
                fprintf(fp, " ");
            }
            fprintf(fp, " | ");
            /* print fields */
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
        } else {
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
            fprintf(fp, "%*s | ", lennode, pstats->node);
        }
        fprintf(fp, "%*s | ", lencmd, pstats->cmd);
        fprintf(fp, "%*lu | ", lenpid, (unsigned long)pstats->pid);
        fprintf(fp, "%*c | ", lenstate, pstats->state[0]);
        fprintf(fp, "%*s | ", lentime, pretty_time);
        if (pri_found) {
            fprintf(fp, "%*d | ", lenpri, pstats->priority);
        }
        if (thr_found) {
            fprintf(fp, "%*d | ", lenthr, pstats->num_threads);
        }
        if (vsize_found) {
            fprintf(fp, "%*lu | ", lenvsize, (unsigned long)pstats->vsize);
        }
        if (rss_found) {
            /* use the RSS column's own width so the row lines up with
             * the header printed by print_headers() */
            fprintf(fp, "%*lu | ", lenrss, (unsigned long)pstats->rss);
        }
        if (pkv_found) {
            fprintf(fp, "%*lu | ", lenpkv, (unsigned long)pstats->peak_vsize);
        }
        if (p_found) {
            fprintf(fp, "%*d | ", lenp, pstats->processor);
        }
        fprintf(fp, "\n");
        num_lines++;
        opal_list_remove_item(statlist, &pstats->super);
        OBJ_RELEASE(pstats);
    }
}
|
|
|
|
static void pretty_print(void)
|
|
{
|
|
opal_list_item_t *item, *next;
|
|
opal_pstats_t *stats;
|
|
opal_list_t tmplist;
|
|
char *node;
|
|
|
|
if (bynode) {
|
|
if (need_header) {
|
|
print_headers();
|
|
need_header = false;
|
|
}
|
|
if (timestamp) {
|
|
fprintf(fp, "TIMESTAMP: %s\n", sample_time);
|
|
}
|
|
if (NULL != sample_time) {
|
|
free(sample_time);
|
|
sample_time = NULL;
|
|
}
|
|
/* sort the results by node and then rank */
|
|
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
|
|
OBJ_CONSTRUCT(&tmplist, opal_list_t);
|
|
stats = (opal_pstats_t*)item;
|
|
node = strdup(stats->node);
|
|
opal_list_append(&tmplist, &stats->super);
|
|
/* cycle through the rest of the list looking
|
|
* for matching nodes
|
|
*/
|
|
item = opal_list_get_first(&recvd_stats);
|
|
while (item != opal_list_get_end(&recvd_stats)) {
|
|
stats = (opal_pstats_t*)item;
|
|
next = opal_list_get_next(item);
|
|
if (0 == strcmp(stats->node, node)) {
|
|
opal_list_remove_item(&recvd_stats, item);
|
|
opal_list_append(&tmplist, &stats->super);
|
|
}
|
|
item = next;
|
|
}
|
|
fprintf(fp, "%*s\n", lennode, node);
|
|
free(node);
|
|
print_ranks(&tmplist);
|
|
OBJ_DESTRUCT(&tmplist);
|
|
}
|
|
} else {
|
|
if (need_header) {
|
|
print_headers();
|
|
need_header = false;
|
|
}
|
|
if (timestamp) {
|
|
fprintf(fp, "\n\nTIMESTAMP: %s\n", sample_time);
|
|
}
|
|
if (NULL != sample_time) {
|
|
free(sample_time);
|
|
sample_time = NULL;
|
|
}
|
|
print_ranks(&recvd_stats);
|
|
}
|
|
|
|
/* provide some separation between iterations */
|
|
fprintf(fp, "\n");
|
|
|
|
/* if we have printed more than MAX_LINES since the last header,
|
|
* flag that we need to print the header next time
|
|
*/
|
|
if (MAX_LINES < num_lines) {
|
|
need_header = true;
|
|
num_lines = 0;
|
|
fprintf(fp, "\n\n");
|
|
}
|
|
}
|
|
|
|
static void print_headers(void)
|
|
{
|
|
int num_fields = 0;
|
|
int i;
|
|
int linelen;
|
|
|
|
lennode = strlen("Nodename");
|
|
if (nodefield > lennode) {
|
|
lennode = nodefield;
|
|
}
|
|
num_fields++;
|
|
|
|
lenrank = strlen("Rank");
|
|
if (rankfield > lenrank) {
|
|
lenrank = rankfield;
|
|
}
|
|
num_fields++;
|
|
|
|
lenpid = strlen("Pid");
|
|
if (pidfield > lenpid) {
|
|
lenpid = pidfield;
|
|
}
|
|
num_fields++;
|
|
|
|
lencmd = strlen("Command");
|
|
if (cmdfield > lencmd) {
|
|
lencmd = cmdfield;
|
|
}
|
|
num_fields++;
|
|
|
|
lenstate = strlen("State");
|
|
num_fields++;
|
|
|
|
lentime = strlen("Time");
|
|
if (timefield > lentime) {
|
|
lentime = timefield;
|
|
}
|
|
num_fields++;
|
|
|
|
if (pri_found) {
|
|
lenpri = strlen("Pri");
|
|
if (prifield > lenpri) {
|
|
lenpri = prifield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
if (thr_found) {
|
|
lenthr = strlen("#threads");
|
|
if (thrfield > lenthr) {
|
|
lenthr = thrfield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
if (vsize_found) {
|
|
lenvsize = strlen("Vsize");
|
|
if (vsizefield > lenvsize) {
|
|
lenvsize = vsizefield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
if (rss_found) {
|
|
lenrss = strlen("RSS");
|
|
if (rssfield > lenrss) {
|
|
lenrss = rssfield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
if (pkv_found) {
|
|
lenpkv = strlen("Peak Vsize");
|
|
if (pkvfield > lenpkv) {
|
|
lenpkv = pkvfield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
if (p_found) {
|
|
lenp = strlen("Processor");
|
|
if (pfield > lenp) {
|
|
lenp = pfield;
|
|
}
|
|
num_fields++;
|
|
}
|
|
|
|
linelen = lennode + lenrank + lenpid + lencmd + lenstate + lentime + lenpri + lenthr + lenvsize + lenrss + lenpkv + lensh + lenp;
|
|
/* add spacing */
|
|
linelen += num_fields * 3;
|
|
|
|
/* print the rip line */
|
|
for(i = 0; i < linelen; ++i) {
|
|
fprintf(fp, "=");
|
|
}
|
|
fprintf(fp, "\n");
|
|
|
|
/* print the header */
|
|
if (bynode) {
|
|
fprintf(fp, "%*s | ", lennode , "Nodename");
|
|
fprintf(fp, "%*s | ", lenrank , "Rank");
|
|
} else {
|
|
fprintf(fp, "%*s | ", lenrank , "Rank");
|
|
fprintf(fp, "%*s | ", lennode , "Nodename");
|
|
}
|
|
fprintf(fp, "%*s | ", lencmd , "Command");
|
|
fprintf(fp, "%*s | ", lenpid , "Pid");
|
|
fprintf(fp, "%*s | ", lenstate , "State");
|
|
fprintf(fp, "%*s | ", lentime , "Time");
|
|
if (pri_found) {
|
|
fprintf(fp, "%*s | ", lenpri , "Pri");
|
|
}
|
|
if (thr_found) {
|
|
fprintf(fp, "%*s | ", lenthr , "#threads");
|
|
}
|
|
if (vsize_found) {
|
|
fprintf(fp, "%*s | ", lenvsize , "Vsize");
|
|
}
|
|
if (rss_found) {
|
|
fprintf(fp, "%*s | ", lenrss , "RSS");
|
|
}
|
|
if (pkv_found) {
|
|
fprintf(fp, "%*s | ", lenpkv , "Peak Vsize");
|
|
}
|
|
if (p_found) {
|
|
fprintf(fp, "%*s | ", lenp , "Processor");
|
|
}
|
|
fprintf(fp, "\n");
|
|
|
|
/* print the separator */
|
|
for(i = 0; i < linelen; ++i) {
|
|
fprintf(fp, "-");
|
|
}
|
|
fprintf(fp, "\n");
|
|
|
|
}
|