1
1
Josh Hursey 5a812c8211 Fix orte-ps which George broke in r10718 by extending the orte_session_dir_get_name()
so that it does not return an error when no universe is passed to it.

Also put back in the 'Slots In Use' column as it is now working properly
per Ralph's recent ras commits. Still not sure what 'Slots Alloc' is meant
to represent, so left that as #if 0'd out for the moment.

This commit was SVN r10739.

The following SVN revision numbers were found above:
  r10718 --> open-mpi/ompi@47eef2e002
2006-07-11 16:54:07 +00:00

1540 строки
46 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_LIBGEN_H
#include <libgen.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <sys/types.h>
#include "orte/orte_constants.h"
#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_path.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "opal/util/os_path.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/ras/ras_types.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/ras/base/ras_base_node.h"
#include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h"
extern char **environ;
/*******************
* Universe/job/vpid information Objects
*******************/
/**
 * Record describing a single process (vpid) of a job.
 *
 * Instances are list items; gather_vpid_info() appends them to the
 * vpid_list of the owning orte_ps_job_info_t.
 */
struct orte_ps_vpid_info_t {
/** This is an object, so it must have a super */
opal_list_item_t super;
/** General VPID Information */
/* Value stored under ORTE_PROC_RANK_KEY */
size_t rank;
/* Value stored under ORTE_PROC_PID_KEY */
pid_t pid;
/* cellid/jobid/vpid triple stored under ORTE_PROC_NAME_KEY */
orte_process_name_t name;
/* strdup'd copy of the ORTE_NODE_NAME_KEY value; freed by the destructor */
char * node;
/* Value stored under ORTE_PROC_STATE_KEY */
orte_proc_state_t state;
/* Value stored under ORTE_PROC_APP_CONTEXT_KEY; matched against
 * orte_app_context_t::idx when printing the process name */
size_t app_context_idx;
};
typedef struct orte_ps_vpid_info_t orte_ps_vpid_info_t;
OBJ_CLASS_DECLARATION(orte_ps_vpid_info_t);
/* Constructor / destructor registered with the class instance below */
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj);
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj);
OBJ_CLASS_INSTANCE(orte_ps_vpid_info_t,
opal_list_item_t,
orte_ps_vpid_info_construct,
orte_ps_vpid_info_destruct);
/**
 * Record describing one job found on the job-info segment.
 *
 * Instances are list items; gather_active_jobs() appends them to the
 * job_list of the owning orte_ps_universe_info_t.
 */
struct orte_ps_job_info_t {
/** This is an object, so it must have a super */
opal_list_item_t super;
/** General Job Information */
/* Job id extracted from the segment name token */
orte_jobid_t id;
/* Value stored under ORTE_JOB_STATE_KEY */
orte_job_state_t state;
/* Counters copied from the ORTE_PROC_NUM_* keys of the job-globals container */
size_t num_init;
size_t num_launched;
size_t num_running;
size_t num_finalized;
size_t num_terminated;
size_t num_aborted;
/* Value stored under ORTE_JOB_SLOTS_KEY */
size_t slots;
/* Values stored under ORTE_JOB_VPID_START_KEY / ORTE_JOB_VPID_RANGE_KEY;
 * gather_vpid_info() iterates over [vpid_start, vpid_start + vpid_range) */
orte_vpid_t vpid_start;
orte_vpid_t vpid_range;
/* Array (and its length) filled in by orte_rmgr_base_get_app_context() */
orte_app_context_t **app_context;
size_t num_app_context;
/** List of vpids */
opal_list_t vpid_list;
};
typedef struct orte_ps_job_info_t orte_ps_job_info_t;
OBJ_CLASS_DECLARATION(orte_ps_job_info_t);
/* Constructor / destructor registered with the class instance below */
void orte_ps_job_info_construct(orte_ps_job_info_t *obj);
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj);
OBJ_CLASS_INSTANCE(orte_ps_job_info_t,
opal_list_item_t,
orte_ps_job_info_construct,
orte_ps_job_info_destruct);
/**
 * Everything orte-ps collects about one universe: a copy of its
 * descriptor, the jobs found in it and the nodes returned by the
 * RAS node query.
 */
struct orte_ps_universe_info_t {
/** This is an object, so it must have a super */
opal_list_item_t super;
/** Universe information */
orte_universe_t universe_info;
/** List of Jobs */
opal_list_t job_list;
/** List of nodes on orte-node segment */
opal_list_t nodes;
};
typedef struct orte_ps_universe_info_t orte_ps_universe_info_t;
OBJ_CLASS_DECLARATION(orte_ps_universe_info_t);
/* Constructor / destructor registered with the class instance below */
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj);
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj);
OBJ_CLASS_INSTANCE(orte_ps_universe_info_t,
opal_list_item_t,
orte_ps_universe_info_construct,
orte_ps_universe_info_destruct);
/******************
* Local Functions
******************/
/* Start-up: bring up OPAL and the parts of ORTE needed before connecting */
static int orte_ps_init(void);
/* Parse the command line into orte_ps_globals */
static int parse_args(int argc, char *argv[]);
/* Attach this process to the given universe */
static int connect_to_universe(orte_universe_t universe_info);
/* Data collection: gather_information() runs the four gather_* steps in order */
static int gather_information(orte_ps_universe_info_t* universe);
static int gather_active_jobs(orte_ps_universe_info_t* universe);
static int gather_nodes(orte_ps_universe_info_t* universe);
static int gather_job_info(orte_ps_universe_info_t* universe);
static int gather_vpid_info(orte_ps_universe_info_t* universe);
/* Table output on stdout */
static int pretty_print(orte_ps_universe_info_t* universe);
static int pretty_print_nodes(opal_list_t *nodes);
static int pretty_print_jobs(opal_list_t *jobs);
static int pretty_print_vpids(orte_ps_job_info_t *job);
/* State -> text helpers; each returns a strdup'd string owned by the caller */
static char *pretty_univ_state(orte_universe_state_t state);
static char *pretty_node_state(orte_node_state_t state);
static char *pretty_job_state(orte_job_state_t state);
static char *pretty_vpid_state(orte_proc_state_t state);
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
/**
 * Destination storage for the parsed command line options
 * (see cmd_line_opts) plus the 'attached' run-time flag.
 * parse_args() resets it to its defaults before parsing.
 */
typedef struct {
/* -h/--help: print the usage message */
bool help;
/* -v/--verbose: print progress messages */
bool verbose;
/* --universe argument */
char *universe;
/* -j/--jobid argument; -1 when not given */
int jobid;
/* -p/--vpid argument; -1 when not given */
int vpid;
/* -d/--dump: dump the GPR after printing */
bool gpr_dump;
/* Set by connect_to_universe() once ORTE has been initialized;
 * main() only calls orte_finalize() when this is true */
bool attached;
/* -n/--nodes: also print the node table */
bool nodes;
} orte_ps_globals_t;
orte_ps_globals_t orte_ps_globals;
/**
 * Command line option table handed to opal_cmd_line_create().
 *
 * Each entry names the short/long option, the number of parameters it
 * takes, and the orte_ps_globals field (with its type) that receives
 * the parsed value.  The table ends with an all-NULL entry.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
    { NULL, NULL, NULL,
      'h', NULL, "help",
      0,
      &orte_ps_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL,
      'v', NULL, "verbose",
      0,
      &orte_ps_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be Verbose" },

    { NULL, NULL, NULL,
      'd', NULL, "dump",
      0,
      &orte_ps_globals.gpr_dump, OPAL_CMD_LINE_TYPE_BOOL,
      "Dump the state of the GPR" },

    { NULL, NULL, NULL,
      '\0', NULL, "universe",
      1,
      &orte_ps_globals.universe, OPAL_CMD_LINE_TYPE_STRING,
      "Specify a universe" },

    { NULL, NULL, NULL,
      'j', NULL, "jobid",
      1,
      &orte_ps_globals.jobid, OPAL_CMD_LINE_TYPE_INT,
      "Specify a specific jobid" },

    { NULL, NULL, NULL,
      'p', NULL, "vpid",
      1,
      &orte_ps_globals.vpid, OPAL_CMD_LINE_TYPE_INT,
      "Specify a specific vpid. Must specify a --jobid as well" },

    /* 'nodes' is a parameterless flag stored in a bool, so it must be
     * declared as BOOL like the other flags; declaring it INT would
     * make the parser write an int through a bool-sized destination. */
    { NULL, NULL, NULL,
      'n', NULL, "nodes",
      0,
      &orte_ps_globals.nodes, OPAL_CMD_LINE_TYPE_BOOL,
      "Print Node Information" },

    /* End of list */
    { NULL, NULL, NULL,
      '\0', NULL, NULL,
      0,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      NULL }
};
/* Duplicate a string, passing NULL through unchanged. */
static char *orte_ps_main_dup_or_null(const char *src)
{
    return (NULL != src) ? strdup(src) : NULL;
}

/**
 * Entry point of orte-ps.
 *
 * Parses the command line, initializes OPAL/ORTE, searches for
 * universes, then connects to the FIRST universe found, gathers its
 * job/node/process information and prints it (optionally followed by
 * a GPR dump).
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
int
main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t universe_list;
    opal_list_item_t* item = NULL;
    opal_list_t universe_search_result;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
        return ret;
    }

    /*
     * Neither list has been constructed yet and we are not attached to
     * any universe, so there is nothing for the cleanup section to do:
     * return directly instead of walking unconstructed lists.
     */
    if (ORTE_SUCCESS != (ret = orte_ps_init())) {
        return ret;
    }

    OBJ_CONSTRUCT(&universe_list, opal_list_t);
    OBJ_CONSTRUCT(&universe_search_result, opal_list_t);

    /*
     * Get the directory listing
     */
    if( orte_ps_globals.verbose ) {
        printf("orte_ps: Acquiring universe list...\n");
    }

    if (ORTE_SUCCESS != (ret = orte_universe_search(&universe_search_result) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * For each universe in the listing
     */
    for(item  = opal_list_get_first(&universe_search_result);
        item != opal_list_get_end(&universe_search_result);
        item  = opal_list_get_next(item) ) {
        orte_ps_universe_info_t *univ;
        orte_universe_t *tmp_univ;

        /*
         * Copy over the universe information.  Strings are duplicated
         * so the copy does not alias the search result entry.
         */
        univ = OBJ_NEW(orte_ps_universe_info_t);
        tmp_univ = (orte_universe_t *) item;

        univ->universe_info.state             = tmp_univ->state;
        univ->universe_info.persistence       = tmp_univ->persistence;
        univ->universe_info.console           = tmp_univ->console;
        univ->universe_info.console_connected = tmp_univ->console_connected;

        univ->universe_info.name       = orte_ps_main_dup_or_null(tmp_univ->name);
        univ->universe_info.host       = orte_ps_main_dup_or_null(tmp_univ->host);
        univ->universe_info.uid        = orte_ps_main_dup_or_null(tmp_univ->uid);
        univ->universe_info.scope      = orte_ps_main_dup_or_null(tmp_univ->scope);
        univ->universe_info.seed_uri   = orte_ps_main_dup_or_null(tmp_univ->seed_uri);
        univ->universe_info.scriptfile = orte_ps_main_dup_or_null(tmp_univ->scriptfile);

        /* From here on the list owns 'univ'; cleanup releases it */
        opal_list_append(&universe_list, &(univ->super));

        /*
         * Connect to the universe
         */
        if( orte_ps_globals.verbose ) {
            printf("orte_ps: Connecting to universe: %s\n", univ->universe_info.name);
        }
        if( ORTE_SUCCESS != (ret = connect_to_universe(univ->universe_info)) ) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Gather the information
         */
        if( orte_ps_globals.verbose ) {
            printf("orte_ps: Gathering Universe Information\n");
        }
        if( ORTE_SUCCESS != (ret = gather_information(univ)) ) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Print the General Universe information
         */
        if(ORTE_SUCCESS != (ret = pretty_print(univ)) ) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * If we were asked to dump the GPR then do so
         */
        if( orte_ps_globals.gpr_dump) {
            if( ORTE_SUCCESS != (ret = orte_gpr.dump_all() ) ) {
                exit_status = ret;
                goto cleanup;
            }
        }

        /*
         * Since connecting and disconnecting from a universe is
         * not well defined, only allow connection to the first
         * universe found.
         */
        break;
    }

    /***************
     * Cleanup
     ***************/
 cleanup:
    while (NULL != (item = opal_list_remove_first(&universe_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&universe_list);

    while (NULL != (item = opal_list_remove_first(&universe_search_result))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&universe_search_result);

    /*
     * Only finalize if we are attached to a specific universe
     */
    if(orte_ps_globals.attached) {
        if (OPAL_SUCCESS != (ret = orte_finalize())) {
            return ret;
        }
    }

    return exit_status;
}
/**
 * Parse the command line into orte_ps_globals.
 *
 * MCA parameters found on the command line are exported into the
 * environment before our own options are examined.
 *
 * @param argc  argument count as passed to main()
 * @param argv  argument vector as passed to main()
 * @return ORTE_SUCCESS, or ORTE_ERROR when parsing failed, help was
 *         requested, or a vpid was given without a jobid.
 */
static int parse_args(int argc, char *argv[]) {
    int idx, parse_rc, count;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    /* Defaults, in field order: help, verbose, universe, jobid, vpid,
     * gpr_dump, attached, nodes */
    orte_ps_globals_t defaults = { false, false, NULL, -1, -1,
                                   false, false, false };

    /* Parse the command line options */
    orte_ps_globals = defaults;
    opal_cmd_line_create(&cmd_line, cmd_line_opts);
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    parse_rc = opal_cmd_line_parse(&cmd_line, true, argc, argv);

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    count = opal_argv_count(app_env);
    for (idx = 0; idx < count; ++idx) {
        putenv(app_env[idx]);
    }

    count = opal_argv_count(global_env);
    for (idx = 0; idx < count; ++idx) {
        putenv(global_env[idx]);
    }

    /**
     * Now start parsing our specific arguments
     */
    if (OPAL_SUCCESS != parse_rc || orte_ps_globals.help) {
        char *usage = opal_cmd_line_get_usage_msg(&cmd_line);

        opal_show_help("help-orte-ps.txt", "usage", true, usage);
        free(usage);
        return ORTE_ERROR;
    }

    /*
     * If they specify a vpid, they must specify a jobid
     */
    if (0 <= orte_ps_globals.vpid && 0 > orte_ps_globals.jobid) {
        opal_show_help("help-orte-ps.txt", "vpid-usage", true,
                       orte_ps_globals.vpid);
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
/**
 * Bring up OPAL and the pieces of ORTE that are needed before we can
 * connect to a universe.
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int orte_ps_init(void) {
    int rc;

    /*
     * We are trying to attach to another process' GPR so we need to
     * attach no matter if it is identified as private or not.
     */
    opal_setenv(mca_base_param_env_var("universe_console"),
                "1",
                true, &environ);

    /* We need all of OPAL */
    rc = opal_init();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* And ORTE, but need to do a bit of a dance first */

    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Register all MCA Params */
    rc = orte_register_params(true);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Ensure the system_info structure is instantiated and initialized */
    rc = orte_sys_info();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* Ensure the process info structure is instantiated and initialized */
    rc = orte_proc_info();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
/* Column width: the longer of the value and its header text. */
static int pretty_print_univ_width(const char *value, const char *header)
{
    size_t value_len  = strlen(value);
    size_t header_len = strlen(header);

    return (int) (value_len < header_len ? header_len : value_len);
}

/**
 * Print the one-row universe summary table, then the node table (only
 * when --nodes was given) and the job tables.
 *
 * String fields of the universe that are NULL are printed as empty
 * columns.
 *
 * @param universe  universe record to print
 * @return ORTE_SUCCESS
 */
static int pretty_print(orte_ps_universe_info_t* universe) {
    int i, line_len;
    int len_name, len_host, len_uid, len_scope, len_per, len_state;
    /* main() stores NULL for any string missing from the search
     * result, so substitute "" before measuring or printing */
    const char *name  = (NULL != universe->universe_info.name)  ? universe->universe_info.name  : "";
    const char *host  = (NULL != universe->universe_info.host)  ? universe->universe_info.host  : "";
    const char *uid   = (NULL != universe->universe_info.uid)   ? universe->universe_info.uid   : "";
    const char *scope = (NULL != universe->universe_info.scope) ? universe->universe_info.scope : "";
    /* pretty_univ_state() returns a strdup'd string: fetch it once and
     * free it at the end instead of leaking one copy per use */
    char *state_buf   = pretty_univ_state(universe->universe_info.state);
    const char *state = (NULL != state_buf) ? state_buf : "Unknown";

    /*
     * Calculate segment lengths
     */
    len_name  = pretty_print_univ_width(name,  "Universe Name");
    len_host  = pretty_print_univ_width(host,  "Hostname");
    len_uid   = pretty_print_univ_width(uid,   "UID");
    len_per   = (int) strlen("Persistent");
    len_scope = pretty_print_univ_width(scope, "Scope");
    len_state = pretty_print_univ_width(state, "State");

    /* Six columns, each followed by a three character separator */
    line_len = (len_name  + 3 +
                len_host  + 3 +
                len_uid   + 3 +
                len_per   + 3 +
                len_scope + 3 +
                len_state + 3);

    /*
     * Print header
     */
    printf("%*s | ", len_name , "Universe Name");
    printf("%*s | ", len_host , "Hostname");
    printf("%*s | ", len_uid  , "UID");
    printf("%*s | ", len_per  , "Persistent");
    printf("%*s | ", len_scope, "Scope");
    printf("%*s |" , len_state, "State");
    printf("\n");

    for(i = 0; i < line_len; ++i) {
        printf("-");
    }
    printf("\n");

    /*
     * Print Info
     */
    printf("%*s | ", len_name,  name);
    printf("%*s | ", len_host,  host);
    printf("%*s | ", len_uid,   uid);
    printf("%*s | ", len_per,   universe->universe_info.persistence ? "true" : "false");
    printf("%*s | ", len_scope, scope);
    printf("%*s |",  len_state, state);
    printf("\n");

    printf("\n");

    free(state_buf);

    /*
     * Print Node Information
     */
    if( orte_ps_globals.nodes ) {
        pretty_print_nodes(&universe->nodes);
    }

    /*
     * Print Job Information
     */
    pretty_print_jobs(&universe->job_list);

    return ORTE_SUCCESS;
}
/**
 * Print one table row per node in the list.
 *
 * Column widths are the larger of the header text and the widest
 * value in the list.  A NULL node name or architecture is printed as
 * an empty column.
 *
 * @param nodes  list whose items are orte_ras_node_t
 * @return ORTE_SUCCESS
 */
static int pretty_print_nodes(opal_list_t *nodes) {
    opal_list_item_t* node_item = NULL;
    int i, line_len;
    int len_name    = 0,
        len_arch    = 0,
        len_cell    = 0,
        len_state   = 0,
        len_slots   = 0,
        len_slots_i = 0,
        len_slots_a = 0,
        len_slots_m = 0;

    /*
     * Calculate segment lengths
     */
    len_name    = (int) strlen("Node Name");
    len_arch    = (int) strlen("Arch");
    len_cell    = (int) strlen("Cell ID");
    len_state   = (int) strlen("State");
    len_slots   = (int) strlen("Slots");
    len_slots_i = (int) strlen("Slots In Use");
#if 0
    len_slots_a = (int) strlen("Slots Alloc");
#else
    /* Column is not printed: cancel the separator counted for it below */
    len_slots_a = -3;
#endif
    len_slots_m = (int) strlen("Slots Max");

    for(node_item  = opal_list_get_first(nodes);
        node_item != opal_list_get_end(nodes);
        node_item  = opal_list_get_next(node_item) ) {
        orte_ras_node_t *node = (orte_ras_node_t *)node_item;
        /* strdup'd by pretty_node_state(); released below */
        char *state = pretty_node_state(node->node_state);

        if( NULL != node->node_name &&
            (int)strlen(node->node_name) > len_name) {
            len_name = (int) strlen(node->node_name);
        }

        if( NULL != node->node_arch &&
            (int)strlen(node->node_arch) > len_arch) {
            len_arch = (int) strlen(node->node_arch);
        }

        if( NULL != state &&
            (int)strlen(state) > len_state ) {
            len_state = (int) strlen(state);
        }

        free(state);
    }

    /*
     * The 'Slots Alloc' column stays disabled (see the #if 0 blocks);
     * every other column is displayed.
     */
    line_len = (len_name    + 3 +
                len_arch    + 3 +
                len_cell    + 3 +
                len_state   + 3 +
                len_slots   + 3 +
                len_slots_i + 3 +
                len_slots_a + 3 +
                len_slots_m + 3);

    /*
     * Print the header
     */
    printf("%*s | ", len_name,    "Node Name");
    printf("%*s | ", len_arch,    "Arch");
    printf("%*s | ", len_cell,    "Cell ID");
    printf("%*s | ", len_state,   "State");
    printf("%*s | ", len_slots,   "Slots");
    printf("%*s | ", len_slots_m, "Slots Max");
    printf("%*s | ", len_slots_i, "Slots In Use");
#if 0
    printf("%*s | ", len_slots_a, "Slots Alloc");
#endif
    printf("\n");

    for(i = 0; i < line_len; ++i) {
        printf("-");
    }
    printf("\n");

    /*
     * Print Info
     */
    for(node_item  = opal_list_get_first(nodes);
        node_item != opal_list_get_end(nodes);
        node_item  = opal_list_get_next(node_item) ) {
        orte_ras_node_t *node = (orte_ras_node_t *)node_item;
        char *state = pretty_node_state(node->node_state);

        /* The width pass tolerates a NULL name, so printing must too */
        printf("%*s | ", len_name,  (NULL == node->node_name ? "" : node->node_name));
        printf("%*s | ", len_arch,  (NULL == node->node_arch ? "" : node->node_arch));
        printf("%*d | ", len_cell,  node->node_cellid);
        printf("%*s | ", len_state, (NULL == state ? "Unknown" : state));
        /* The slot counts are cast to unsigned, so print them with %u */
        printf("%*u | ", len_slots,   (unsigned int)node->node_slots);
        printf("%*u | ", len_slots_m, (unsigned int)node->node_slots_max);
        printf("%*u | ", len_slots_i, (unsigned int)node->node_slots_inuse);
#if 0
        printf("%*u | ", len_slots_a, (unsigned int)node->node_slots_alloc);
#endif
        printf("\n");

        free(state);
    }

    return ORTE_SUCCESS;
}
/**
 * Print a one-row table for every job in the list, each followed by
 * the table of its processes (except for job id 0, the HNP).
 *
 * @param jobs  list whose items are orte_ps_job_info_t
 * @return ORTE_SUCCESS
 */
static int pretty_print_jobs(opal_list_t *jobs) {
    opal_list_item_t* job_item = NULL;
    int len_jobid, len_state, len_slots, len_vpid_s, len_vpid_r;
    int i, line_len;

    for(job_item  = opal_list_get_first(jobs);
        job_item != opal_list_get_end(jobs);
        job_item  = opal_list_get_next(job_item) ) {
        orte_ps_job_info_t *job = (orte_ps_job_info_t *)job_item;
        /* pretty_job_state() returns a strdup'd string: fetch it once
         * per job and free it after the row has been printed */
        char *state_buf   = pretty_job_state(job->state);
        const char *state = (NULL != state_buf) ? state_buf : "Unknown";

        /*
         * Calculate segment lengths
         */
        len_jobid  = 6;
        len_state  = (int) (strlen(state) < strlen("State") ?
                            strlen("State") :
                            strlen(state));
        len_slots  = 6;
        len_vpid_s = (int) strlen("VPID Start");
        len_vpid_r = (int) strlen("VPID Range");

        /* Five columns, each followed by a three character separator */
        line_len = (len_jobid  + 3 +
                    len_state  + 3 +
                    len_slots  + 3 +
                    len_vpid_s + 3 +
                    len_vpid_r + 3);

        /*
         * Print Header
         */
        printf("\n");
        printf("%*s | ", len_jobid  , "JobID");
        printf("%*s | ", len_state  , "State");
        printf("%*s | ", len_slots  , "Slots");
        printf("%*s | ", len_vpid_s , "VPID Start");
        printf("%*s | ", len_vpid_r , "VPID Range");
        printf("\n");

        for(i = 0; i < line_len; ++i) {
            printf("-");
        }
        printf("\n");

        /*
         * Print Info
         */
        printf("%*d | ", len_jobid , job->id);
        printf("%*s | ", len_state , state);
        /* The slot count is cast to unsigned, so print it with %u */
        printf("%*u | ", len_slots , (unsigned int)job->slots);
        printf("%*d | ", len_vpid_s, job->vpid_start);
        printf("%*d | ", len_vpid_r, job->vpid_range);
        printf("\n");

        free(state_buf);

        /*
         * Pretty print all VPID's in job
         */
        if(0 != job->id) { /* No vpids for the HNP */
            pretty_print_vpids(job);
        }
    }

    return ORTE_SUCCESS;
}
static int pretty_print_vpids(orte_ps_job_info_t *job) {
opal_list_item_t* vpid_item = NULL;
int len_o_proc_name = 0,
len_proc_name = 0,
len_rank = 0,
len_pid = 0,
len_state = 0,
len_node = 0,
len_ckpt_s = 0,
len_ckpt_r = 0,
len_ckpt_l = 0;
int i, line_len;
/*
* Caculate segment lengths
*/
len_o_proc_name = strlen("ORTE Name");
len_proc_name = strlen("Process Name");
len_rank = 6;
len_pid = 6;
len_state = 0;
len_node = 0;
len_ckpt_s = 0;
len_ckpt_r = 0;
len_ckpt_l = 0;
for(vpid_item = opal_list_get_first(&(job->vpid_list));
vpid_item != opal_list_get_end(&(job->vpid_list));
vpid_item = opal_list_get_next(vpid_item) ) {
orte_ps_vpid_info_t *vpid;
char *proc_name = NULL;
vpid = (orte_ps_vpid_info_t *)vpid_item;
/*
* Find my app context
*/
for( i = 0; i < (int)job->num_app_context; ++i) {
if( job->app_context[i]->idx == vpid->app_context_idx ) {
if( (int)strlen(job->app_context[i]->app) > len_proc_name)
len_proc_name = strlen(job->app_context[i]->app);
break;
}
}
asprintf(&proc_name, "%d.%d.%d", vpid->name.cellid, vpid->name.jobid, vpid->name.vpid);
if( (int)strlen(proc_name) > len_o_proc_name )
len_o_proc_name = strlen(proc_name);
if( (int)strlen(vpid->node) > len_node)
len_node = strlen(vpid->node);
if( (int)strlen(pretty_vpid_state(vpid->state)) > len_state)
len_state = strlen(pretty_vpid_state(vpid->state));
if( NULL != proc_name) {
free(proc_name);
proc_name = NULL;
}
}
line_len = (len_o_proc_name + 3 +
len_proc_name + 3 +
len_rank + 3 +
len_pid + 3 +
len_state + 3 +
len_node + 3 +
len_ckpt_s + 3 +
len_ckpt_r + 3 +
len_ckpt_l - 6
);
/*
* Print Header
*/
printf("\t");
printf("%*s | ", len_proc_name , "Process Name");
printf("%*s | ", len_o_proc_name , "ORTE Name");
printf("%*s | ", len_rank , "Rank");
printf("%*s | ", len_pid , "PID");
printf("%*s | ", len_node , "Node");
printf("%*s | ", len_state , "State");
printf("\n");
printf("\t");
for(i = 0; i < line_len; ++i) {
printf("-");
}
printf("\n");
/*
* Print Info
*/
for(vpid_item = opal_list_get_first(&(job->vpid_list));
vpid_item != opal_list_get_end(&(job->vpid_list));
vpid_item = opal_list_get_next(vpid_item) ) {
orte_ps_vpid_info_t *vpid;
char *proc_name = NULL;
vpid = (orte_ps_vpid_info_t *)vpid_item;
printf("\t");
asprintf(&proc_name, "%d.%d.%d", vpid->name.cellid, vpid->name.jobid, vpid->name.vpid);
for( i = 0; i < (int)job->num_app_context; ++i) {
if( job->app_context[i]->idx == vpid->app_context_idx ) {
printf("%*s | ", len_proc_name, job->app_context[i]->app);
break;
}
}
printf("%*s | ", len_o_proc_name, proc_name);
printf("%*d | ", len_rank , (uint)vpid->rank);
printf("%*d | ", len_pid , vpid->pid);
printf("%*s | ", len_node , vpid->node);
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
printf("\n");
if( NULL != proc_name) {
free(proc_name);
proc_name = NULL;
}
}
return ORTE_SUCCESS;
}
/**
 * Attach this process to the given universe.
 *
 * Exports "<uid>@<host>:<name>" as the 'universe' MCA parameter, then
 * either initializes ORTE (first connection) or restarts it against
 * the universe's seed URI (when already attached).  On success
 * orte_ps_globals.attached is set.
 *
 * @param universe_info  descriptor of the universe to attach to
 * @return ORTE_SUCCESS, ORTE_ERROR when the parameter string could not
 *         be built, or the error code from ORTE init/restart.
 */
static int connect_to_universe(orte_universe_t universe_info) {
    int ret, exit_status = ORTE_SUCCESS;
    char * univ_mca_param = NULL;

    /*
     * Construct the MCA parameter.  The buffer contents are
     * unspecified when asprintf() fails, so bail out before anything
     * reads or frees it.
     */
    if( 0 > asprintf(&univ_mca_param, "%s@%s:%s",
                     universe_info.uid,
                     universe_info.host,
                     universe_info.name) ) {
        return ORTE_ERROR;
    }

#if 0
    /*
     * Disconnect from the current universe
     */
    if(orte_ps_globals.attached) {
        if (OPAL_SUCCESS != (ret = orte_system_finalize())) {
            return ret;
        }
    }
#endif

    /*
     * Set the environment universe information
     * NOTE(review): the string returned by mca_base_param_env_var() may
     * be caller-owned -- confirm whether it should be freed here.
     */
    opal_setenv(mca_base_param_env_var("universe"),
                univ_mca_param,
                true, &environ);

    /*
     * Restart ORTE in the requested universe
     */
    if(!orte_ps_globals.attached) {
        if (ORTE_SUCCESS != (ret = orte_system_init(true)) ) {
            exit_status = ret;
            goto cleanup;
        }
    }
    else {
        if( ORTE_SUCCESS != (ret = orte_restart(orte_process_info.my_name, universe_info.seed_uri)) ) {
            printf("orte_restart: FAILED (%d)\n", ret);
            exit_status = ret;
            goto cleanup;
        }
    }

    orte_ps_globals.attached = true;

 cleanup:
    free(univ_mca_param);

    return exit_status;
}
/**
 * Run the four collection steps in order: active jobs, nodes, per-job
 * details, per-process details.  Stops at the first failure.
 *
 * @param universe  record to fill in
 * @return ORTE_SUCCESS, or the error code of the step that failed.
 */
static int gather_information(orte_ps_universe_info_t* universe) {
    int rc;

    rc = gather_active_jobs(universe);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    rc = gather_nodes(universe);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    rc = gather_job_info(universe);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    rc = gather_vpid_info(universe);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    return ORTE_SUCCESS;
}
/**
 * Read the job-info segment of the registry and append one
 * orte_ps_job_info_t (id and state) per entry to universe->job_list.
 * When --jobid was given, only that job is kept.
 *
 * Everything obtained from the registry, and any job record that is
 * not handed over to the list, is released before returning.
 *
 * @param universe  record whose job_list is filled in
 * @return ORTE_SUCCESS, ORTE_ERROR when the segment name could not be
 *         allocated, or the error code from the registry / DSS call.
 */
static int gather_active_jobs(orte_ps_universe_info_t* universe) {
    int ret, exit_status = ORTE_SUCCESS;
    char *segment = NULL;
    orte_gpr_value_t** values = NULL;
    orte_ps_job_info_t *job = NULL;
    size_t i, j, num_values = 0;

    /**********************
     * Job Info segment
     **********************/
    segment = strdup(ORTE_JOBINFO_SEGMENT);
    if( NULL == segment ) {
        return ORTE_ERROR;
    }

    if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                            segment,
                                            NULL,
                                            NULL,
                                            &num_values,
                                            &values ) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Parse the structure returned
     */
    for(i = 0; i < num_values; ++i) {
        orte_gpr_value_t* value = values[i];

        job = OBJ_NEW(orte_ps_job_info_t);

        orte_schema.extract_jobid_from_segment_name(&(job->id), value->tokens[0]);

        /*
         * If the user specified a jobid, then
         * only access the info for that jobid
         */
        if( 0 <= orte_ps_globals.jobid &&
            (int)job->id != orte_ps_globals.jobid ) {
            /* Not handed to the list, so drop our reference */
            OBJ_RELEASE(job);
            continue;
        }

        for( j = 0; j < value->cnt; ++j) {
            orte_gpr_keyval_t* keyval = value->keyvals[j];
            orte_job_state_t *job_state;

            if( 0 == strncmp(keyval->key, ORTE_JOB_STATE_KEY, strlen(ORTE_JOB_STATE_KEY)) ) {
                if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &job_state, keyval->value, ORTE_JOB_STATE))) {
                    exit_status = ret;
                    OBJ_RELEASE(job);
                    goto cleanup;
                }
                job->state = *job_state;
            }
        }

        /* The list owns the record from here on */
        opal_list_append(&universe->job_list, &(job->super));
    }

 cleanup:
    /* The caller of orte_gpr.get() owns the returned values */
    if( NULL != values ) {
        for(i = 0; i < num_values; ++i) {
            if( NULL != values[i] ) {
                OBJ_RELEASE(values[i]);
            }
        }
        free(values);
    }
    free(segment);

    return exit_status;
}
/**
 * Fill universe->nodes through the RAS base node query.
 *
 * @param universe  record whose node list is filled in
 * @return the return code of orte_ras_base_node_query().
 */
static int gather_nodes(orte_ps_universe_info_t* universe) {
    /* The query's code is passed through unchanged, success or failure */
    return orte_ras_base_node_query(&(universe->nodes));
}
static int gather_job_info(orte_ps_universe_info_t* universe) {
int ret, exit_status = ORTE_SUCCESS;
char *segment = NULL, *tokens[2];
orte_gpr_value_t** values = NULL;
size_t i, j, num_values = 0;
opal_list_item_t* item = NULL;
/*
* For each job in the universe
*/
for(item = opal_list_get_first(&(universe->job_list));
item != opal_list_get_end(&(universe->job_list));
item = opal_list_get_next(item) ) {
orte_ps_job_info_t *job;
job = (orte_ps_job_info_t *)item;
/*
* Get the App Context(s)
*/
orte_rmgr_base_get_app_context(job->id,
&job->app_context,
&job->num_app_context);
/*
* Access the job segment
*/
orte_schema.get_job_segment_name(&segment, job->id);
/*
* Here we are just focused on the orte-job-globals container
*/
tokens[0] = strdup(ORTE_JOB_GLOBALS);
tokens[1] = NULL;
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
tokens,
NULL,
&num_values,
&values ) ) ) {
exit_status = ret;
goto cleanup;
}
/*
* Parse the structure returned
*/
for(i = 0; i < num_values; ++i) {
orte_gpr_value_t* value = values[i];
for( j = 0; j < value->cnt; ++j) {
orte_gpr_keyval_t* keyval = value->keyvals[j];
size_t *tmp_num;
orte_vpid_t *tmp_vpid;
if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_AT_INIT, strlen(ORTE_PROC_NUM_AT_INIT)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_init = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_LAUNCHED, strlen(ORTE_PROC_NUM_LAUNCHED)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_launched = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_RUNNING, strlen(ORTE_PROC_NUM_RUNNING)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_running = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_FINALIZED, strlen(ORTE_PROC_NUM_FINALIZED)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_finalized = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_TERMINATED, strlen(ORTE_PROC_NUM_TERMINATED)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_terminated = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_ABORTED, strlen(ORTE_PROC_NUM_ABORTED)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->num_aborted = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_JOB_SLOTS_KEY, strlen(ORTE_JOB_SLOTS_KEY)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
job->slots = *tmp_num;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_START_KEY, strlen(ORTE_JOB_VPID_START_KEY)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
exit_status = ret;
goto cleanup;
}
job->vpid_start = *tmp_vpid;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_RANGE_KEY, strlen(ORTE_JOB_VPID_RANGE_KEY)) ) {
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
exit_status = ret;
goto cleanup;
}
job->vpid_range = *tmp_vpid;
continue;
}
}
}
}
cleanup:
return exit_status;
}
static int gather_vpid_info(orte_ps_universe_info_t* universe) {
int ret, exit_status = ORTE_SUCCESS;
char *segment = NULL;
orte_gpr_value_t** values = NULL;
size_t i, j, num_values = 0;
opal_list_item_t* job_item = NULL;
size_t v = 0;
/*
* For each Job in the universe
*/
for(job_item = opal_list_get_first(&(universe->job_list));
job_item != opal_list_get_end(&(universe->job_list));
job_item = opal_list_get_next(job_item) ) {
orte_ps_job_info_t *job;
job = (orte_ps_job_info_t *)job_item;
/*
* Skip getting the vpid's for the HNP, since the information is not complete
*/
if( 0 == job->id) {
continue;
}
/*
* For each vpid in the job
*/
for(v = job->vpid_start; v < (job->vpid_start + job->vpid_range); ++v) {
orte_ps_vpid_info_t *vpid = NULL;
orte_process_name_t proc;
char **tokens = NULL;
size_t num_tokens = 0;
/*
* If the user specified a vpid, then just get that one
*/
if( 0 <= orte_ps_globals.vpid) {
/*
* Check to make sure it is a valid vpid
*/
if( (int)(job->vpid_start + job->vpid_range) <= orte_ps_globals.vpid) {
opal_show_help("help-orte-ps.txt", "invalid-vpid", true,
orte_ps_globals.vpid,
orte_ps_globals.jobid );
return ORTE_ERROR;
}
if( (int)v != orte_ps_globals.vpid ) {
continue;
}
}
vpid = OBJ_NEW(orte_ps_vpid_info_t);
/*
* Access the job segment
*/
orte_schema.get_job_segment_name(&segment, job->id);
/*
* Access the vpid container
*/
proc.cellid = 0;
proc.jobid = job->id;
proc.vpid = v;
orte_schema.get_proc_tokens(&tokens,
&num_tokens,
&proc);
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
tokens,
NULL,
&num_values,
&values ) ) ) {
exit_status = ret;
goto cleanup;
}
/*
* Parse the structure returned
*/
for(i = 0; i < num_values; ++i) {
orte_gpr_value_t* value = values[i];
for( j = 0; j < value->cnt; ++j) {
orte_gpr_keyval_t* keyval = value->keyvals[j];
if( 0 == strncmp(keyval->key, ORTE_PROC_RANK_KEY, strlen(ORTE_PROC_RANK_KEY)) ) {
size_t *tmp_size;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
vpid->rank = *tmp_size;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY, strlen(ORTE_PROC_APP_CONTEXT_KEY)) ) {
size_t *tmp_size;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_SIZE))) {
exit_status = ret;
goto cleanup;
}
vpid->app_context_idx = *tmp_size;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_PID_KEY, strlen(ORTE_PROC_PID_KEY)) ) {
pid_t *tmp_pid;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_pid, keyval->value, ORTE_PID))) {
exit_status = ret;
goto cleanup;
}
vpid->pid = *tmp_pid;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_NAME_KEY, strlen(ORTE_PROC_NAME_KEY)) ) {
orte_process_name_t *tmp_proc;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_proc, keyval->value, ORTE_NAME))) {
exit_status = ret;
goto cleanup;
}
vpid->name.cellid = tmp_proc->cellid;
vpid->name.jobid = tmp_proc->jobid;
vpid->name.vpid = tmp_proc->vpid;
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_NODE_NAME_KEY, strlen(ORTE_NODE_NAME_KEY)) ) {
char *tmp_node = NULL;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_node, keyval->value, ORTE_STRING))) {
exit_status = ret;
goto cleanup;
}
vpid->node = strdup(tmp_node);
continue;
}
else if( 0 == strncmp(keyval->key, ORTE_PROC_STATE_KEY, strlen(ORTE_PROC_STATE_KEY)) ) {
orte_proc_state_t *tmp_state;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_state, keyval->value, ORTE_PROC_STATE))) {
exit_status = ret;
goto cleanup;
}
vpid->state = *tmp_state;
continue;
}
}
}
opal_list_append(&job->vpid_list, &(vpid->super));
}
}
cleanup:
return exit_status;
}
/************************
* Object handling
************************/
/**
 * Constructor: give every field a defined value.
 *
 * gather_vpid_info() only assigns a field when the matching registry
 * key is present, and pretty_print_vpids() prints all of them, so
 * nothing may be left indeterminate.  The zeros are deterministic
 * placeholders only.
 */
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj) {
    obj->rank            = 0;
    obj->pid             = 0;
    obj->name.cellid     = 0;
    obj->name.jobid      = 0;
    obj->name.vpid       = 0;
    obj->node            = NULL;
    obj->state           = 0;
    obj->app_context_idx = 0;
}
/**
 * Destructor: release the node name copy, if one was stored.
 */
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj) {
    char *node_name = obj->node;

    if( NULL == node_name ) {
        return;
    }
    free(node_name);
}
/**
 * Constructor: build the embedded vpid list and give every other
 * field a defined value.
 *
 * The gather_* functions only assign a field when the matching
 * registry key is present; vpid_start/vpid_range bound a loop in
 * gather_vpid_info() and the rest is printed, so nothing may be left
 * indeterminate.  The zeros are deterministic placeholders only.
 */
void orte_ps_job_info_construct(orte_ps_job_info_t *obj) {
    OBJ_CONSTRUCT(&obj->vpid_list, opal_list_t);

    obj->id             = 0;
    obj->state          = 0;
    obj->num_init       = 0;
    obj->num_launched   = 0;
    obj->num_running    = 0;
    obj->num_finalized  = 0;
    obj->num_terminated = 0;
    obj->num_aborted    = 0;
    obj->slots          = 0;
    obj->vpid_start     = 0;
    obj->vpid_range     = 0;

    obj->app_context     = NULL;
    obj->num_app_context = 0;
}
/**
 * Destructor: release the app contexts, the array that holds them and
 * every record on the vpid list.
 *
 * The app contexts are reference counted objects, so they are released
 * through OBJ_RELEASE (which runs their destructor) rather than free();
 * the pointer array itself is plain allocated memory.
 */
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj) {
    opal_list_item_t* item = NULL;
    size_t i;

    if( NULL != obj->app_context ) {
        for(i = 0; i < obj->num_app_context; ++i) {
            if( NULL != obj->app_context[i] ) {
                OBJ_RELEASE(obj->app_context[i]);
            }
        }
        free(obj->app_context);
        obj->app_context = NULL;
    }
    obj->num_app_context = 0;

    while (NULL != (item = opal_list_remove_first(&obj->vpid_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&obj->vpid_list);
}
/**
 * Constructor: build the three embedded objects (the universe
 * descriptor and the job and node lists).
 */
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj) {
    /* Embedded universe descriptor */
    OBJ_CONSTRUCT(&obj->universe_info, orte_universe_t);

    /* Embedded lists, filled in later by the gather_* functions */
    OBJ_CONSTRUCT(&obj->job_list, opal_list_t);
    OBJ_CONSTRUCT(&obj->nodes,    opal_list_t);
}
/**
 * Destructor: drain and destruct the job list, then the node list,
 * then destruct the embedded universe descriptor.
 */
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj) {
    opal_list_item_t* entry;

    /* Jobs */
    for(entry = opal_list_remove_first(&obj->job_list);
        NULL != entry;
        entry = opal_list_remove_first(&obj->job_list)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&obj->job_list);

    /* Nodes */
    for(entry = opal_list_remove_first(&obj->nodes);
        NULL != entry;
        entry = opal_list_remove_first(&obj->nodes)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&obj->nodes);

    /* Universe descriptor */
    OBJ_DESTRUCT(&obj->universe_info);
}
/**
 * Map a job state to its display text.
 *
 * @param state  job state to describe
 * @return a strdup'd string ("Unknown" for any state not listed);
 *         the caller owns it and must free() it.
 */
static char *pretty_job_state(orte_job_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_JOB_STATE_INIT:       label = "Init";       break;
    case ORTE_JOB_STATE_LAUNCHED:   label = "Launched";   break;
    case ORTE_JOB_STATE_AT_STG1:    label = "Stage 1";    break;
    case ORTE_JOB_STATE_AT_STG2:    label = "Stage 2";    break;
    case ORTE_JOB_STATE_RUNNING:    label = "Running";    break;
    case ORTE_JOB_STATE_AT_STG3:    label = "Stage 3";    break;
    case ORTE_JOB_STATE_FINALIZED:  label = "Finalized";  break;
    case ORTE_JOB_STATE_TERMINATED: label = "Terminated"; break;
    case ORTE_JOB_STATE_ABORTED:    label = "Aborted";    break;
    default:
        break;
    }

    return strdup(label);
}
/**
 * Map a process state to its display text.
 *
 * @param state  process state to describe
 * @return a strdup'd string ("Unknown" for any state not listed);
 *         the caller owns it and must free() it.
 */
static char *pretty_vpid_state(orte_proc_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_PROC_STATE_INIT:       label = "Init";       break;
    case ORTE_PROC_STATE_LAUNCHED:   label = "Launched";   break;
    case ORTE_PROC_STATE_AT_STG1:    label = "Stage 1";    break;
    case ORTE_PROC_STATE_AT_STG2:    label = "Stage 2";    break;
    case ORTE_PROC_STATE_RUNNING:    label = "Running";    break;
    case ORTE_PROC_STATE_AT_STG3:    label = "Stage 3";    break;
    case ORTE_PROC_STATE_FINALIZED:  label = "Finalized";  break;
    case ORTE_PROC_STATE_TERMINATED: label = "Terminated"; break;
    case ORTE_PROC_STATE_ABORTED:    label = "Aborted";    break;
    default:
        break;
    }

    return strdup(label);
}
/**
 * Map a universe state to its display text.
 *
 * @param state  universe state to describe
 * @return a strdup'd string ("Unknown" for any state not listed);
 *         the caller owns it and must free() it.
 */
static char *pretty_univ_state(orte_universe_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_UNIVERSE_STATE_PRE_INIT: label = "Pre-Init";     break;
    case ORTE_UNIVERSE_STATE_INIT:     label = "Initializing"; break;
    case ORTE_UNIVERSE_STATE_RUNNING:  label = "Running";      break;
    case ORTE_UNIVERSE_STATE_FINALIZE: label = "Finalized";    break;
    default:
        break;
    }

    return strdup(label);
}
/**
 * Map a node state to its display text.
 *
 * @param state  node state to describe
 * @return a strdup'd string ("Unknown" for ORTE_NODE_STATE_UNKNOWN and
 *         for any state not listed); the caller owns it and must
 *         free() it.
 */
static char *pretty_node_state(orte_node_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_NODE_STATE_DOWN:   label = "Down";   break;
    case ORTE_NODE_STATE_UP:     label = "Up";     break;
    case ORTE_NODE_STATE_REBOOT: label = "Reboot"; break;
    case ORTE_NODE_STATE_UNKNOWN:
    default:
        break;
    }

    return strdup(label);
}