523b6dcbe8
function. This commit was SVN r10716.
1727 строки
54 KiB
C
1727 строки
54 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
/*
|
|
* JJH Temp workaround until this symbol is exported
|
|
*/
|
|
#define OPAL_ENABLE_FT 0
|
|
|
|
#include <stdio.h>
|
|
#include <errno.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#ifdef HAVE_STDLIB_H
|
|
#include <stdlib.h>
|
|
#endif /* HAVE_STDLIB_H */
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif
|
|
#ifdef HAVE_LIBGEN_H
|
|
#include <libgen.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
#include <sys/wait.h>
|
|
#endif
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif /* HAVE_STRING_H */
|
|
#include <sys/types.h>
|
|
|
|
#include "orte/orte_constants.h"
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "orte/util/univ_info.h"
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/util/universe_setup_file_io.h"
|
|
#include "orte/mca/gpr/gpr.h"
|
|
#if OPAL_ENABLE_FT == 1
|
|
#include "orte/mca/snapc/snapc.h"
|
|
#include "orte/mca/snapc/base/base.h"
|
|
#endif
|
|
#include "orte/mca/rmgr/base/base.h"
|
|
#include "orte/mca/ras/ras.h"
|
|
#include "orte/mca/ras/ras_types.h"
|
|
#include "orte/mca/ras/base/base.h"
|
|
#include "orte/mca/ras/base/ras_base_node.h"
|
|
|
|
#include "opal/runtime/opal.h"
|
|
#if OPAL_ENABLE_FT == 1
|
|
#include "opal/runtime/opal_cr.h"
|
|
#endif
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
|
|
extern char **environ;
|
|
|
|
/*******************
|
|
* Universe/job/vpid information Objects
|
|
*******************/
|
|
struct orte_ps_vpid_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** General VPID Information */
|
|
size_t rank;
|
|
pid_t pid;
|
|
orte_process_name_t name;
|
|
char * node;
|
|
orte_proc_state_t state;
|
|
|
|
size_t app_context_idx;
|
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
size_t ckpt_state;
|
|
char *ckpt_ref;
|
|
char *ckpt_loc;
|
|
#endif
|
|
};
|
|
typedef struct orte_ps_vpid_info_t orte_ps_vpid_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_vpid_info_t);
|
|
|
|
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj);
|
|
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_vpid_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_vpid_info_construct,
|
|
orte_ps_vpid_info_destruct);
|
|
|
|
struct orte_ps_job_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** General Job Information */
|
|
orte_jobid_t id;
|
|
orte_job_state_t state;
|
|
|
|
size_t num_init;
|
|
size_t num_launched;
|
|
size_t num_running;
|
|
size_t num_finalized;
|
|
size_t num_terminated;
|
|
size_t num_aborted;
|
|
size_t slots;
|
|
orte_vpid_t vpid_start;
|
|
orte_vpid_t vpid_range;
|
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
size_t ckpt_state;
|
|
char *ckpt_ref;
|
|
char *ckpt_loc;
|
|
#endif
|
|
|
|
orte_app_context_t **app_context;
|
|
size_t num_app_context;
|
|
|
|
/** List of vpids */
|
|
opal_list_t vpid_list;
|
|
};
|
|
typedef struct orte_ps_job_info_t orte_ps_job_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_job_info_t);
|
|
|
|
void orte_ps_job_info_construct(orte_ps_job_info_t *obj);
|
|
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_job_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_job_info_construct,
|
|
orte_ps_job_info_destruct);
|
|
|
|
|
|
struct orte_ps_universe_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** Universe information */
|
|
orte_universe_t universe_info;
|
|
|
|
/** List of Jobs */
|
|
opal_list_t job_list;
|
|
|
|
/** List of nodes on orte-node segment */
|
|
opal_list_t nodes;
|
|
};
|
|
typedef struct orte_ps_universe_info_t orte_ps_universe_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_universe_info_t);
|
|
|
|
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj);
|
|
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_universe_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_universe_info_construct,
|
|
orte_ps_universe_info_destruct);
|
|
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
static int orte_ps_init(void);
|
|
static int parse_args(int argc, char *argv[]);
|
|
|
|
static int connect_to_universe(orte_universe_t universe_info);
|
|
|
|
static int gather_information(orte_ps_universe_info_t* universe);
|
|
static int gather_active_jobs(orte_ps_universe_info_t* universe);
|
|
static int gather_nodes(orte_ps_universe_info_t* universe);
|
|
static int gather_job_info(orte_ps_universe_info_t* universe);
|
|
static int gather_vpid_info(orte_ps_universe_info_t* universe);
|
|
|
|
static int pretty_print(orte_ps_universe_info_t* universe);
|
|
static int pretty_print_nodes(opal_list_t *nodes);
|
|
static int pretty_print_jobs(opal_list_t *jobs);
|
|
static int pretty_print_vpids(orte_ps_job_info_t *job);
|
|
|
|
static char *pretty_univ_state(orte_universe_state_t state);
|
|
static char *pretty_node_state(orte_node_state_t state);
|
|
static char *pretty_job_state(orte_job_state_t state);
|
|
static char *pretty_vpid_state(orte_proc_state_t state);
|
|
|
|
/*****************************************
|
|
* Global Vars for Command line Arguments
|
|
*****************************************/
|
|
/**
 * Values of the command line options, plus the attach state of the tool.
 *
 * NOTE: parse_args() resets this structure with a positional
 * initializer, so the field order here must not change.
 */
typedef struct {
    bool  help;      /* -h / --help                                   */
    bool  verbose;   /* -v / --verbose                                */
    char *universe;  /* --universe <name>                             */
    int   jobid;     /* -j / --jobid <id>; negative when not given    */
    int   vpid;      /* -p / --vpid <id>;  negative when not given    */
    bool  gpr_dump;  /* -d / --dump                                   */
    bool  attached;  /* set once connect_to_universe() has succeeded  */
    bool  nodes;     /* -n / --nodes                                  */
} orte_ps_globals_t;

/* The one instance, filled in by parse_args() */
orte_ps_globals_t orte_ps_globals;
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
{ NULL, NULL, NULL,
|
|
'h', NULL, "help",
|
|
0,
|
|
&orte_ps_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"This help message" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'v', NULL, "verbose",
|
|
0,
|
|
&orte_ps_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Be Verbose" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'd', NULL, "dump",
|
|
0,
|
|
&orte_ps_globals.gpr_dump, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Dump the state of the GPR" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'\0', NULL, "universe",
|
|
1,
|
|
&orte_ps_globals.universe, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Specify a universe" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'j', NULL, "jobid",
|
|
1,
|
|
&orte_ps_globals.jobid, OPAL_CMD_LINE_TYPE_INT,
|
|
"Specify a specific jobid" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'p', NULL, "vpid",
|
|
1,
|
|
&orte_ps_globals.vpid, OPAL_CMD_LINE_TYPE_INT,
|
|
"Specify a specific vpid. Must specify a --jobid as well" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'n', NULL, "nodes",
|
|
0,
|
|
&orte_ps_globals.nodes, OPAL_CMD_LINE_TYPE_INT,
|
|
"Print Node Information" },
|
|
|
|
/* End of list */
|
|
{ NULL, NULL, NULL,
|
|
'\0', NULL, NULL,
|
|
0,
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
NULL }
|
|
};
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
opal_list_t universe_list;
|
|
opal_list_item_t* item = NULL;
|
|
opal_list_t universe_search_result;
|
|
|
|
/***************
|
|
* Initialize
|
|
***************/
|
|
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
return ret;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_ps_init())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
OBJ_CONSTRUCT(&universe_list, opal_list_t);
|
|
OBJ_CONSTRUCT(&universe_search_result, opal_list_t);
|
|
|
|
/*
|
|
* Get the directory listing
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Acquiring universe list...\n");
|
|
}
|
|
if (ORTE_SUCCESS != (ret = orte_universe_search(&universe_search_result) ) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* For each universe in the listing
|
|
*/
|
|
for(item = opal_list_get_first(&universe_search_result);
|
|
item != opal_list_get_end(&universe_search_result);
|
|
item = opal_list_get_next(item) ) {
|
|
orte_ps_universe_info_t *univ;
|
|
orte_universe_t *tmp_univ;
|
|
|
|
/*
|
|
* Copy over the universe information
|
|
*/
|
|
univ = OBJ_NEW(orte_ps_universe_info_t);
|
|
tmp_univ = (orte_universe_t *) item;
|
|
univ->universe_info.state = tmp_univ->state;
|
|
univ->universe_info.persistence = tmp_univ->persistence;
|
|
univ->universe_info.console = tmp_univ->console;
|
|
univ->universe_info.console_connected = tmp_univ->console_connected;
|
|
if( NULL != tmp_univ->name )
|
|
univ->universe_info.name = strdup(tmp_univ->name);
|
|
else
|
|
univ->universe_info.name = NULL;
|
|
if( NULL != tmp_univ->host )
|
|
univ->universe_info.host = strdup(tmp_univ->host);
|
|
else
|
|
univ->universe_info.host = NULL;
|
|
if( NULL != tmp_univ->uid )
|
|
univ->universe_info.uid = strdup(tmp_univ->uid);
|
|
else
|
|
univ->universe_info.uid = NULL;
|
|
if( NULL != tmp_univ->scope )
|
|
univ->universe_info.scope = strdup(tmp_univ->scope);
|
|
else
|
|
univ->universe_info.scope = NULL;
|
|
if( NULL != tmp_univ->seed_uri)
|
|
univ->universe_info.seed_uri = strdup(tmp_univ->seed_uri);
|
|
else
|
|
univ->universe_info.seed_uri = NULL;
|
|
if( NULL != tmp_univ->scriptfile )
|
|
univ->universe_info.scriptfile = strdup(tmp_univ->scriptfile);
|
|
else
|
|
univ->universe_info.scriptfile = NULL;
|
|
|
|
opal_list_append(&universe_list, &(univ->super));
|
|
|
|
/*
|
|
* Connect to the universe
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Connecting to universe: %s\n", univ->universe_info.name);
|
|
}
|
|
if( ORTE_SUCCESS != (ret = connect_to_universe(univ->universe_info)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Gather the information
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Gathering Universe Information\n");
|
|
}
|
|
if( ORTE_SUCCESS != (ret = gather_information(univ)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Print the General Universe information
|
|
*/
|
|
if(ORTE_SUCCESS != (ret = pretty_print(univ)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* If we were asked to dump the GPR then do so
|
|
*/
|
|
if( orte_ps_globals.gpr_dump) {
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.dump_all() ) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Since connecting and disconnecting from a universe is
|
|
* not well defined, only allow connection to the first
|
|
* universe found.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
/***************
|
|
* Cleanup
|
|
***************/
|
|
cleanup:
|
|
while (NULL != (item = opal_list_remove_first(&universe_list))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
while (NULL != (item = opal_list_remove_first(&universe_search_result))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
|
|
/*
|
|
* Only finalize if we are attached to a specific universe
|
|
*/
|
|
if(orte_ps_globals.attached) {
|
|
if (OPAL_SUCCESS != (ret = orte_finalize())) {
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int parse_args(int argc, char *argv[]) {
|
|
int i, ret, len;
|
|
opal_cmd_line_t cmd_line;
|
|
char **app_env = NULL, **global_env = NULL;
|
|
orte_ps_globals_t tmp = { false,
|
|
false,
|
|
NULL,
|
|
-1,
|
|
-1,
|
|
false,
|
|
false,
|
|
false};
|
|
|
|
/* Parse the command line options */
|
|
|
|
orte_ps_globals = tmp;
|
|
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
mca_base_open();
|
|
mca_base_cmd_line_setup(&cmd_line);
|
|
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
|
|
|
/**
|
|
* Put all of the MCA arguments in the environment
|
|
*/
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
len = opal_argv_count(app_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(app_env[i]);
|
|
}
|
|
|
|
len = opal_argv_count(global_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(global_env[i]);
|
|
}
|
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
opal_setenv(mca_base_param_env_var("crs_base_is_tool"),
|
|
"1",
|
|
true, &environ);
|
|
#endif
|
|
|
|
/**
|
|
* Now start parsing our specific arguments
|
|
*/
|
|
if (OPAL_SUCCESS != ret ||
|
|
orte_ps_globals.help) {
|
|
char *args = NULL;
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
opal_show_help("help-orte-ps.txt", "usage", true,
|
|
args);
|
|
free(args);
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
/*
|
|
* If they specify a vpid, they must specify a jobid
|
|
*/
|
|
if( 0 <= orte_ps_globals.vpid) {
|
|
if( 0 > orte_ps_globals.jobid) {
|
|
opal_show_help("help-orte-ps.txt", "vpid-usage", true,
|
|
orte_ps_globals.vpid);
|
|
return ORTE_ERROR;
|
|
}
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int orte_ps_init(void) {
|
|
int exit_status = ORTE_SUCCESS, ret;
|
|
|
|
/*
|
|
* We are trying to attach to another process' GPR so we need to
|
|
* attach no matter if it is identified as private or not.
|
|
*/
|
|
opal_setenv(mca_base_param_env_var("universe_console"),
|
|
"1",
|
|
true, &environ);
|
|
|
|
#if OPAL_ENABLE_FT == 1
|
|
/* Disable the checkpoint notification routine for this
|
|
* tool. As we will never need to checkpoint this tool.
|
|
* Note: This must happen before opal_init().
|
|
*/
|
|
opal_cr_is_enabled(false);
|
|
|
|
/* Select the none component, since we don't actually use a checkpointer */
|
|
opal_setenv(mca_base_param_env_var("crs"),
|
|
"none",
|
|
true, &environ);
|
|
#endif
|
|
|
|
/***************************
|
|
* We need all of OPAL
|
|
***************************/
|
|
if (ORTE_SUCCESS != (ret = opal_init())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/***************************
|
|
* And ORTE, but need to do a bit of a dance first
|
|
***************************/
|
|
/* register handler for errnum -> string converstion */
|
|
opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);
|
|
|
|
/* Register all MCA Params */
|
|
if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the system_info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the process info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
/**
 * Print the one-row universe table, then the node table (only when
 * --nodes was given) and the job tables.
 *
 * The name, host, uid and scope strings of a universe may be NULL
 * (main() stores NULL when the search result had none); they are
 * printed as empty strings instead of being passed to strlen()/%s.
 *
 * @param universe  fully gathered universe record
 * @return ORTE_SUCCESS always
 */
static int pretty_print(orte_ps_universe_info_t* universe) {
    const char *name  = universe->universe_info.name;
    const char *host  = universe->universe_info.host;
    const char *uid   = universe->universe_info.uid;
    const char *scope = universe->universe_info.scope;
    const char *state = pretty_univ_state(universe->universe_info.state);
    int i, line_len;
    int len_name, len_host, len_uid, len_scope, len_per, len_state;

    if (NULL == name)  { name  = ""; }
    if (NULL == host)  { host  = ""; }
    if (NULL == uid)   { uid   = ""; }
    if (NULL == scope) { scope = ""; }
    if (NULL == state) { state = ""; }

    /*
     * Calculate segment lengths: each column is as wide as the longer
     * of its header and its value.
     */
    len_name = (int) strlen("Universe Name");
    if ((int) strlen(name) > len_name) {
        len_name = (int) strlen(name);
    }
    len_host = (int) strlen("Hostname");
    if ((int) strlen(host) > len_host) {
        len_host = (int) strlen(host);
    }
    len_uid = (int) strlen("UID");
    if ((int) strlen(uid) > len_uid) {
        len_uid = (int) strlen(uid);
    }
    len_per = (int) strlen("Persistent");
    len_scope = (int) strlen("Scope");
    if ((int) strlen(scope) > len_scope) {
        len_scope = (int) strlen(scope);
    }
    len_state = (int) strlen("State");
    if ((int) strlen(state) > len_state) {
        len_state = (int) strlen(state);
    }

    line_len = (len_name  + 3 +
                len_host  + 3 +
                len_uid   + 3 +
                len_per   + 3 +
                len_scope + 3 +
                len_state) + 3;

    /*
     * Print header
     */
    printf("%*s | ", len_name , "Universe Name");
    printf("%*s | ", len_host , "Hostname");
    printf("%*s | ", len_uid  , "UID");
    printf("%*s | ", len_per  , "Persistent");
    printf("%*s | ", len_scope, "Scope");
    printf("%*s |" , len_state, "State");
    printf("\n");

    for (i = 0; i < line_len; ++i) {
        printf("-");
    }
    printf("\n");

    /*
     * Print Info
     */
    printf("%*s | ", len_name, name);
    printf("%*s | ", len_host, host);
    printf("%*s | ", len_uid,  uid);
    printf("%*s | ", len_per,
           universe->universe_info.persistence ? "true" : "false");
    printf("%*s | ", len_scope, scope);
    printf("%*s |",  len_state, state);
    printf("\n");

    printf("\n");

    /*
     * Print Node Information
     */
    if (orte_ps_globals.nodes) {
        pretty_print_nodes(&universe->nodes);
    }

    /*
     * Print Job Information
     */
    pretty_print_jobs(&universe->job_list);

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Print one table row per node on the given list.
 *
 * Column widths are the maximum of the header width and the widest
 * value. Node name and architecture may be NULL and are then printed
 * as empty strings.
 *
 * JJH Since node_slots_inuse and node_slots_alloc are not used properly
 * JJH they are not displayed to the user.
 *
 * @param nodes  list whose items are orte_ras_node_t
 * @return ORTE_SUCCESS always
 */
static int pretty_print_nodes(opal_list_t *nodes) {
    opal_list_item_t *node_item = NULL;
    orte_ras_node_t *node = NULL;
    const char *state = NULL;
    int i, line_len;
    int len_name, len_arch, len_cell, len_state, len_slots, len_slots_m;

    /*
     * Calculate segment lengths
     */
    len_name    = (int) strlen("Node Name");
    len_arch    = (int) strlen("Arch");
    len_cell    = (int) strlen("Cell ID");
    len_state   = (int) strlen("State");
    len_slots   = (int) strlen("Slots");
    len_slots_m = (int) strlen("Slots Max");

    for (node_item  = opal_list_get_first(nodes);
         node_item != opal_list_get_end(nodes);
         node_item  = opal_list_get_next(node_item)) {
        node = (orte_ras_node_t *) node_item;

        if (NULL != node->node_name &&
            (int) strlen(node->node_name) > len_name) {
            len_name = (int) strlen(node->node_name);
        }

        if (NULL != node->node_arch &&
            (int) strlen(node->node_arch) > len_arch) {
            len_arch = (int) strlen(node->node_arch);
        }

        state = pretty_node_state(node->node_state);
        if ((int) strlen(state) > len_state) {
            len_state = (int) strlen(state);
        }
    }

    line_len = (len_name    + 3 +
                len_arch    + 3 +
                len_cell    + 3 +
                len_state   + 3 +
                len_slots   + 3 +
                len_slots_m + 3);

    /*
     * Print the header
     */
    printf("%*s | ", len_name,    "Node Name");
    printf("%*s | ", len_arch,    "Arch");
    printf("%*s | ", len_cell,    "Cell ID");
    printf("%*s | ", len_state,   "State");
    printf("%*s | ", len_slots,   "Slots");
    printf("%*s | ", len_slots_m, "Slots Max");
    printf("\n");

    for (i = 0; i < line_len; ++i) {
        printf("-");
    }
    printf("\n");

    /*
     * Print Info
     */
    for (node_item  = opal_list_get_first(nodes);
         node_item != opal_list_get_end(nodes);
         node_item  = opal_list_get_next(node_item)) {
        node = (orte_ras_node_t *) node_item;

        /* The width pass above tolerates a NULL name; so must %s here */
        printf("%*s | ", len_name, (NULL == node->node_name ?
                                    "" :
                                    node->node_name));
        printf("%*s | ", len_arch, (NULL == node->node_arch ?
                                    "" :
                                    node->node_arch));
        /* %d needs an int argument, whatever the field's own type is */
        printf("%*d | ", len_cell,    (int) node->node_cellid);
        printf("%*s | ", len_state,   pretty_node_state(node->node_state));
        printf("%*d | ", len_slots,   (int) node->node_slots);
        printf("%*d | ", len_slots_m, (int) node->node_slots_max);
        printf("\n");
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Print every job on the list: a header and one row for the job
 * itself, followed by the table of its processes. Job id 0 (the HNP)
 * has no process table.
 *
 * Column widths are recomputed for each job, so each job gets its own
 * header.
 *
 * @param jobs  list whose items are orte_ps_job_info_t
 * @return ORTE_SUCCESS always
 */
static int pretty_print_jobs(opal_list_t *jobs) {
    opal_list_item_t *job_item = NULL;
    orte_ps_job_info_t *job = NULL;
    const char *state = NULL;
    int len_jobid  = 0,
        len_state  = 0,
        len_slots  = 0,
        len_vpid_s = 0,
        len_vpid_r = 0,
        len_ckpt_s = 0,
        len_ckpt_r = 0,
        len_ckpt_l = 0;
    int i, line_len;

    for (job_item  = opal_list_get_first(jobs);
         job_item != opal_list_get_end(jobs);
         job_item  = opal_list_get_next(job_item)) {
        job = (orte_ps_job_info_t *) job_item;
        state = pretty_job_state(job->state);

        /*
         * Calculate segment lengths
         */
        len_jobid = 6;
        len_state = (int) strlen("State");
        if ((int) strlen(state) > len_state) {
            len_state = (int) strlen(state);
        }
        len_slots  = 6;
        len_vpid_s = (int) strlen("VPID Start");
        len_vpid_r = (int) strlen("VPID Range");
#if OPAL_ENABLE_FT == 1
        len_ckpt_s = (int) (strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) < strlen("Ckpt State") ?
                            strlen("Ckpt State") :
                            strlen(orte_snapc_ckpt_state_str(job->ckpt_state)) );
        len_ckpt_r = (int) (NULL == job->ckpt_ref ? strlen("Ckpt Ref") :
                            (strlen(job->ckpt_ref) < strlen("Ckpt Ref") ?
                             strlen("Ckpt Ref") :
                             strlen(job->ckpt_ref) ) );
        len_ckpt_l = (int) (NULL == job->ckpt_loc ? strlen("Ckpt Loc") :
                            (strlen(job->ckpt_loc) < strlen("Ckpt Loc") ?
                             strlen("Ckpt Loc") :
                             strlen(job->ckpt_loc) ) );
#else
        len_ckpt_s = 0;
        len_ckpt_r = 0;
        len_ckpt_l = 0;
#endif

        /* Without FT the three checkpoint columns are not printed; the
         * "- 6" takes back two of the three separators added for them. */
        line_len = (len_jobid  + 3 +
                    len_state  + 3 +
                    len_slots  + 3 +
                    len_vpid_s + 3 +
                    len_vpid_r + 3 +
                    len_ckpt_s + 3 +
                    len_ckpt_r + 3 +
                    len_ckpt_l
#if OPAL_ENABLE_FT != 1
                    - 6
#endif
                    );

        /*
         * Print Header
         */
        printf("\n");
        printf("%*s | ", len_jobid  , "JobID");
        printf("%*s | ", len_state  , "State");
        printf("%*s | ", len_slots  , "Slots");
        printf("%*s | ", len_vpid_s , "VPID Start");
        printf("%*s | ", len_vpid_r , "VPID Range");
#if OPAL_ENABLE_FT == 1
        printf("%*s | ", len_ckpt_s , "Ckpt State");
        printf("%*s | ", len_ckpt_r , "Ckpt Ref");
        printf("%*s |",  len_ckpt_l , "Ckpt Loc");
#endif
        printf("\n");

        for (i = 0; i < line_len; ++i) {
            printf("-");
        }
        printf("\n");

        /*
         * Print Info (%d needs int arguments, hence the casts)
         */
        printf("%*d | ", len_jobid , (int) job->id);
        printf("%*s | ", len_state , state);
        printf("%*d | ", len_slots , (int) job->slots);
        printf("%*d | ", len_vpid_s, (int) job->vpid_start);
        printf("%*d | ", len_vpid_r, (int) job->vpid_range);
#if OPAL_ENABLE_FT == 1
        printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(job->ckpt_state));
        printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_ref ?
                                      "" :
                                      job->ckpt_ref) );
        printf("%*s |",  len_ckpt_l, (NULL == job->ckpt_loc ?
                                      "" :
                                      job->ckpt_loc) );
#endif
        printf("\n");

        /*
         * Pretty print all VPID's in job
         */
        if (0 == job->id) { /* No vpids for the HNP */
            continue;
        }

        pretty_print_vpids(job);
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Print the (tab-indented) process table of one job.
 *
 * A first pass over job->vpid_list sizes the columns, a second pass
 * prints the rows. Every column is at least as wide as its header.
 * Strings that may be NULL (node, application name) are printed as
 * empty strings, and the "Process Name" cell is always emitted so rows
 * stay aligned even when no application context matches.
 *
 * @param job  job whose vpid_list holds orte_ps_vpid_info_t items
 * @return ORTE_SUCCESS, or ORTE_ERROR if an ORTE name string could not
 *         be allocated
 */
static int pretty_print_vpids(orte_ps_job_info_t *job) {
    opal_list_item_t *vpid_item = NULL;
    orte_ps_vpid_info_t *vpid = NULL;
    const char *app = NULL, *node = NULL, *state = NULL;
    char *proc_name = NULL;
    int len_o_proc_name = 0,
        len_proc_name   = 0,
        len_rank        = 0,
        len_pid         = 0,
        len_state       = 0,
        len_node        = 0,
        len_ckpt_s      = 0,
        len_ckpt_r      = 0,
        len_ckpt_l      = 0;
    int i, line_len;

    /*
     * Calculate segment lengths, starting from the header widths
     */
    len_o_proc_name = (int) strlen("ORTE Name");
    len_proc_name   = (int) strlen("Process Name");
    len_rank        = 6;
    len_pid         = 6;
    len_state       = (int) strlen("State");
    len_node        = (int) strlen("Node");
#if OPAL_ENABLE_FT == 1
    len_ckpt_s = (int) strlen("Ckpt State");
    len_ckpt_r = (int) strlen("Ckpt Ref");
    len_ckpt_l = (int) strlen("Ckpt Loc");
#else
    len_ckpt_s = 0;
    len_ckpt_r = 0;
    len_ckpt_l = 0;
#endif

    for (vpid_item  = opal_list_get_first(&(job->vpid_list));
         vpid_item != opal_list_get_end(&(job->vpid_list));
         vpid_item  = opal_list_get_next(vpid_item)) {
        vpid = (orte_ps_vpid_info_t *) vpid_item;

        /*
         * Find my app context
         */
        for (i = 0; i < (int) job->num_app_context; ++i) {
            if (job->app_context[i]->idx == vpid->app_context_idx) {
                if (NULL != job->app_context[i]->app &&
                    (int) strlen(job->app_context[i]->app) > len_proc_name) {
                    len_proc_name = (int) strlen(job->app_context[i]->app);
                }
                break;
            }
        }

        /* On failure asprintf leaves its output undefined; do not use it */
        proc_name = NULL;
        if (0 > asprintf(&proc_name, "%d.%d.%d",
                         (int) vpid->name.cellid,
                         (int) vpid->name.jobid,
                         (int) vpid->name.vpid)) {
            return ORTE_ERROR;
        }
        if ((int) strlen(proc_name) > len_o_proc_name) {
            len_o_proc_name = (int) strlen(proc_name);
        }
        free(proc_name);
        proc_name = NULL;

        if (NULL != vpid->node &&
            (int) strlen(vpid->node) > len_node) {
            len_node = (int) strlen(vpid->node);
        }

        state = pretty_vpid_state(vpid->state);
        if ((int) strlen(state) > len_state) {
            len_state = (int) strlen(state);
        }

#if OPAL_ENABLE_FT == 1
        if ((int) strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state)) > len_ckpt_s) {
            len_ckpt_s = (int) strlen(orte_snapc_ckpt_state_str(vpid->ckpt_state));
        }

        if (NULL != vpid->ckpt_ref &&
            (int) strlen(vpid->ckpt_ref) > len_ckpt_r) {
            len_ckpt_r = (int) strlen(vpid->ckpt_ref);
        }

        if (NULL != vpid->ckpt_loc &&
            (int) strlen(vpid->ckpt_loc) > len_ckpt_l) {
            len_ckpt_l = (int) strlen(vpid->ckpt_loc);
        }
#endif
    }

    /* Without FT the three checkpoint columns are not printed; the
     * "- 6" takes back two of the three separators added for them. */
    line_len = (len_o_proc_name + 3 +
                len_proc_name   + 3 +
                len_rank        + 3 +
                len_pid         + 3 +
                len_state       + 3 +
                len_node        + 3 +
                len_ckpt_s      + 3 +
                len_ckpt_r      + 3 +
                len_ckpt_l
#if OPAL_ENABLE_FT != 1
                - 6
#endif
                );

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name   , "Process Name");
    printf("%*s | ", len_o_proc_name , "ORTE Name");
    printf("%*s | ", len_rank        , "Rank");
    printf("%*s | ", len_pid         , "PID");
    printf("%*s | ", len_node        , "Node");
    printf("%*s | ", len_state       , "State");
#if OPAL_ENABLE_FT == 1
    printf("%*s | ", len_ckpt_s      , "Ckpt State");
    printf("%*s | ", len_ckpt_r      , "Ckpt Ref");
    printf("%*s |",  len_ckpt_l      , "Ckpt Loc");
#endif
    printf("\n");

    printf("\t");
    for (i = 0; i < line_len; ++i) {
        printf("-");
    }
    printf("\n");

    /*
     * Print Info
     */
    for (vpid_item  = opal_list_get_first(&(job->vpid_list));
         vpid_item != opal_list_get_end(&(job->vpid_list));
         vpid_item  = opal_list_get_next(vpid_item)) {
        vpid = (orte_ps_vpid_info_t *) vpid_item;

        /* Build the ORTE name before printing anything of this row */
        proc_name = NULL;
        if (0 > asprintf(&proc_name, "%d.%d.%d",
                         (int) vpid->name.cellid,
                         (int) vpid->name.jobid,
                         (int) vpid->name.vpid)) {
            return ORTE_ERROR;
        }

        /* Application name of the matching context, "" if none */
        app = "";
        for (i = 0; i < (int) job->num_app_context; ++i) {
            if (job->app_context[i]->idx == vpid->app_context_idx) {
                if (NULL != job->app_context[i]->app) {
                    app = job->app_context[i]->app;
                }
                break;
            }
        }
        node = (NULL != vpid->node) ? vpid->node : "";

        printf("\t");
        printf("%*s | ", len_proc_name  , app);
        printf("%*s | ", len_o_proc_name, proc_name);
        printf("%*d | ", len_rank       , (int) vpid->rank);
        printf("%*d | ", len_pid        , (int) vpid->pid);
        printf("%*s | ", len_node       , node);
        printf("%*s | ", len_state      , pretty_vpid_state(vpid->state));

#if OPAL_ENABLE_FT == 1
        printf("%*s | ", len_ckpt_s, orte_snapc_ckpt_state_str(vpid->ckpt_state));
        printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_ref ?
                                      "" :
                                      vpid->ckpt_ref));
        printf("%*s |",  len_ckpt_l, (NULL == vpid->ckpt_loc ?
                                      "" :
                                      vpid->ckpt_loc));
#endif
        printf("\n");

        free(proc_name);
        proc_name = NULL;
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Attach this tool to the given universe.
 *
 * Builds the "uid@host:name" universe string, exports it as the
 * "universe" MCA parameter and then either initializes ORTE (first
 * connection) or restarts it against the universe's seed URI (already
 * attached). On success orte_ps_globals.attached is set.
 *
 * NOTE: disconnecting from a previously attached universe is not done
 * here.
 *
 * @param universe_info  universe to connect to
 * @return ORTE_SUCCESS, ORTE_ERROR if the universe string could not be
 *         allocated, or the error code returned by ORTE.
 */
static int connect_to_universe(orte_universe_t universe_info) {
    int ret, exit_status = ORTE_SUCCESS;
    char *univ_mca_param = NULL;

    /*
     * Construct the MCA parameter. On failure asprintf leaves its
     * output undefined, so reset the pointer and bail out.
     */
    if (0 > asprintf(&univ_mca_param, "%s@%s:%s",
                     universe_info.uid,
                     universe_info.host,
                     universe_info.name)) {
        univ_mca_param = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Set the environment universe information
     */
    opal_setenv(mca_base_param_env_var("universe"),
                univ_mca_param,
                true, &environ);

    /*
     * Restart ORTE in the requested universe
     */
    if (!orte_ps_globals.attached) {
        if (ORTE_SUCCESS != (ret = orte_system_init(true))) {
            exit_status = ret;
            goto cleanup;
        }
    }
    else {
        if (ORTE_SUCCESS != (ret = orte_restart(orte_process_info.my_name,
                                                universe_info.seed_uri))) {
            printf("orte_restart: FAILED (%d)\n", ret);
            exit_status = ret;
            goto cleanup;
        }
    }

    orte_ps_globals.attached = true;

 cleanup:
    free(univ_mca_param);

    return exit_status;
}
|
|
|
|
/**
 * Fill a universe record by running the gather steps in order:
 * active jobs, nodes, per-job details, per-process details.
 * Stops at the first step that fails.
 *
 * @param universe  record to fill
 * @return ORTE_SUCCESS, or the error code of the failing step
 */
static int gather_information(orte_ps_universe_info_t* universe) {
    int rc;

    rc = gather_active_jobs(universe);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = gather_nodes(universe);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = gather_job_info(universe);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = gather_vpid_info(universe);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return ORTE_SUCCESS;
}
|
|
|
|
/**
 * Read the job-info segment of the registry and append one
 * orte_ps_job_info_t (id and state) per job to universe->job_list.
 *
 * When a jobid was given on the command line, only that job is kept.
 * All temporary storage (segment name, skipped job records, the value
 * array returned by the registry) is released before returning.
 *
 * @param universe  record whose job_list is filled
 * @return ORTE_SUCCESS, ORTE_ERROR if the segment name could not be
 *         allocated, or the error code of the failing registry / DSS call
 */
static int gather_active_jobs(orte_ps_universe_info_t* universe) {
    int ret, exit_status = ORTE_SUCCESS;
    char *segment = NULL;
    orte_gpr_value_t **values = NULL;
    orte_ps_job_info_t *job = NULL;
    size_t i, j, num_values = 0;

    /**********************
     * Job Info segment
     **********************/
    segment = strdup(ORTE_JOBINFO_SEGMENT);
    if (NULL == segment) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                            segment,
                                            NULL,
                                            NULL,
                                            &num_values,
                                            &values))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Parse the structure returned
     */
    for (i = 0; i < num_values; ++i) {
        orte_gpr_value_t *value = values[i];

        job = OBJ_NEW(orte_ps_job_info_t);
        orte_schema.extract_jobid_from_segment_name(&(job->id), value->tokens[0]);

        /*
         * If the user specified a jobid, then
         * only access the info for that jobid
         */
        if (0 <= orte_ps_globals.jobid &&
            (int) job->id != orte_ps_globals.jobid) {
            /* Not on any list yet: drop it here or it is lost */
            OBJ_RELEASE(job);
            continue;
        }

        for (j = 0; j < value->cnt; ++j) {
            orte_gpr_keyval_t *keyval = value->keyvals[j];
            orte_job_state_t *job_state;

            if (0 == strncmp(keyval->key, ORTE_JOB_STATE_KEY, strlen(ORTE_JOB_STATE_KEY))) {
                if (ORTE_SUCCESS != (ret = orte_dss.get((void **) &job_state, keyval->value, ORTE_JOB_STATE))) {
                    OBJ_RELEASE(job);
                    exit_status = ret;
                    goto cleanup;
                }

                /* Copied by value, so it stays valid after the
                 * registry values are released below */
                job->state = *job_state;
            }
        }

        /* Ownership moves to the universe's job list */
        opal_list_append(&universe->job_list, &(job->super));
    }

 cleanup:
    /* Release what the registry returned; everything kept from it
     * (job id and state) was copied by value */
    if (NULL != values) {
        for (i = 0; i < num_values; ++i) {
            if (NULL != values[i]) {
                OBJ_RELEASE(values[i]);
            }
        }
        free(values);
    }
    free(segment);

    return exit_status;
}
|
|
|
|
/**
 * Fill universe->nodes by querying the RAS node segment.
 *
 * @param universe  record whose node list is filled
 * @return whatever orte_ras_base_node_query() returns
 */
static int gather_nodes(orte_ps_universe_info_t* universe) {
    return orte_ras_base_node_query(&(universe->nodes));
}
|
|
|
|
static int gather_job_info(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *segment = NULL, *tokens[2];
|
|
orte_gpr_value_t** values = NULL;
|
|
size_t i, j, num_values = 0;
|
|
opal_list_item_t* item = NULL;
|
|
|
|
/*
|
|
* For each job in the universe
|
|
*/
|
|
for(item = opal_list_get_first(&(universe->job_list));
|
|
item != opal_list_get_end(&(universe->job_list));
|
|
item = opal_list_get_next(item) ) {
|
|
orte_ps_job_info_t *job;
|
|
job = (orte_ps_job_info_t *)item;
|
|
|
|
/*
|
|
* Get the App Context(s)
|
|
*/
|
|
orte_rmgr_base_get_app_context(job->id,
|
|
&job->app_context,
|
|
&job->num_app_context);
|
|
/*
|
|
* Access the job segment
|
|
*/
|
|
orte_schema.get_job_segment_name(&segment, job->id);
|
|
|
|
/*
|
|
* Here we are just focused on the orte-job-globals container
|
|
*/
|
|
tokens[0] = strdup(ORTE_JOB_GLOBALS);
|
|
tokens[1] = NULL;
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
|
segment,
|
|
tokens,
|
|
NULL,
|
|
&num_values,
|
|
&values ) ) ) {
|
|
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Parse the structure returned
|
|
*/
|
|
for(i = 0; i < num_values; ++i) {
|
|
orte_gpr_value_t* value = values[i];
|
|
|
|
for( j = 0; j < value->cnt; ++j) {
|
|
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
|
size_t *tmp_num;
|
|
orte_vpid_t *tmp_vpid;
|
|
|
|
if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_AT_INIT, strlen(ORTE_PROC_NUM_AT_INIT)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_init = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_LAUNCHED, strlen(ORTE_PROC_NUM_LAUNCHED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_launched = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_RUNNING, strlen(ORTE_PROC_NUM_RUNNING)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_running = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_FINALIZED, strlen(ORTE_PROC_NUM_FINALIZED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_finalized = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_TERMINATED, strlen(ORTE_PROC_NUM_TERMINATED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_terminated = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_ABORTED, strlen(ORTE_PROC_NUM_ABORTED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_aborted = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_SLOTS_KEY, strlen(ORTE_JOB_SLOTS_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->slots = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_START_KEY, strlen(ORTE_JOB_VPID_START_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->vpid_start = *tmp_vpid;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_RANGE_KEY, strlen(ORTE_JOB_VPID_RANGE_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->vpid_range = *tmp_vpid;
|
|
continue;
|
|
}
|
|
#if OPAL_ENABLE_FT == 1
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_CKPT_STATE_KEY, strlen(ORTE_JOB_CKPT_STATE_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->ckpt_state = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_CKPT_SNAPSHOT_REF_KEY, strlen(ORTE_JOB_CKPT_SNAPSHOT_REF_KEY)) ) {
|
|
char *tmp_str = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_str, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->ckpt_ref = strdup(tmp_str);
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_CKPT_SNAPSHOT_LOC_KEY, strlen(ORTE_JOB_CKPT_SNAPSHOT_LOC_KEY)) ) {
|
|
char *tmp_str = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_str, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->ckpt_loc = strdup(tmp_str);
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_vpid_info(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *segment = NULL;
|
|
orte_gpr_value_t** values = NULL;
|
|
size_t i, j, num_values = 0;
|
|
opal_list_item_t* job_item = NULL;
|
|
size_t v = 0;
|
|
|
|
/*
|
|
* For each Job in the universe
|
|
*/
|
|
for(job_item = opal_list_get_first(&(universe->job_list));
|
|
job_item != opal_list_get_end(&(universe->job_list));
|
|
job_item = opal_list_get_next(job_item) ) {
|
|
orte_ps_job_info_t *job;
|
|
job = (orte_ps_job_info_t *)job_item;
|
|
|
|
/*
|
|
* Skip getting the vpid's for the HNP, since the information is not complete
|
|
*/
|
|
if( 0 == job->id) {
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* For each vpid in the job
|
|
*/
|
|
for(v = job->vpid_start; v < (job->vpid_start + job->vpid_range); ++v) {
|
|
orte_ps_vpid_info_t *vpid = NULL;
|
|
orte_process_name_t proc;
|
|
char **tokens = NULL;
|
|
size_t num_tokens = 0;
|
|
|
|
/*
|
|
* If the user specified a vpid, then just get that one
|
|
*/
|
|
if( 0 <= orte_ps_globals.vpid) {
|
|
/*
|
|
* Check to make sure it is a valid vpid
|
|
*/
|
|
if( (int)(job->vpid_start + job->vpid_range) <= orte_ps_globals.vpid) {
|
|
opal_show_help("help-orte-ps.txt", "invalid-vpid", true,
|
|
orte_ps_globals.vpid,
|
|
orte_ps_globals.jobid );
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
if( (int)v != orte_ps_globals.vpid ) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
vpid = OBJ_NEW(orte_ps_vpid_info_t);
|
|
|
|
/*
|
|
* Access the job segment
|
|
*/
|
|
orte_schema.get_job_segment_name(&segment, job->id);
|
|
|
|
/*
|
|
* Access the vpid container
|
|
*/
|
|
proc.cellid = 0;
|
|
proc.jobid = job->id;
|
|
proc.vpid = v;
|
|
|
|
orte_schema.get_proc_tokens(&tokens,
|
|
&num_tokens,
|
|
&proc);
|
|
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
|
segment,
|
|
tokens,
|
|
NULL,
|
|
&num_values,
|
|
&values ) ) ) {
|
|
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Parse the structure returned
|
|
*/
|
|
for(i = 0; i < num_values; ++i) {
|
|
orte_gpr_value_t* value = values[i];
|
|
|
|
for( j = 0; j < value->cnt; ++j) {
|
|
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
|
|
|
if( 0 == strncmp(keyval->key, ORTE_PROC_RANK_KEY, strlen(ORTE_PROC_RANK_KEY)) ) {
|
|
size_t *tmp_size;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->rank = *tmp_size;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY, strlen(ORTE_PROC_APP_CONTEXT_KEY)) ) {
|
|
size_t *tmp_size;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->app_context_idx = *tmp_size;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_PID_KEY, strlen(ORTE_PROC_PID_KEY)) ) {
|
|
pid_t *tmp_pid;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_pid, keyval->value, ORTE_PID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->pid = *tmp_pid;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NAME_KEY, strlen(ORTE_PROC_NAME_KEY)) ) {
|
|
orte_process_name_t *tmp_proc;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_proc, keyval->value, ORTE_NAME))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->name.cellid = tmp_proc->cellid;
|
|
vpid->name.jobid = tmp_proc->jobid;
|
|
vpid->name.vpid = tmp_proc->vpid;
|
|
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_NODE_NAME_KEY, strlen(ORTE_NODE_NAME_KEY)) ) {
|
|
char *tmp_node = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_node, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->node = strdup(tmp_node);
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_STATE_KEY, strlen(ORTE_PROC_STATE_KEY)) ) {
|
|
orte_proc_state_t *tmp_state;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_state, keyval->value, ORTE_PROC_STATE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->state = *tmp_state;
|
|
continue;
|
|
}
|
|
#if OPAL_ENABLE_FT == 1
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_CKPT_STATE_KEY, strlen(ORTE_PROC_CKPT_STATE_KEY)) ) {
|
|
size_t *tmp_state;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_state, keyval->value, ORTE_SIZE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->ckpt_state = *tmp_state;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_CKPT_SNAPSHOT_REF_KEY, strlen(ORTE_PROC_CKPT_SNAPSHOT_REF_KEY)) ) {
|
|
char *tmp_str = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_str, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->ckpt_ref = strdup(tmp_str);
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_CKPT_SNAPSHOT_LOC_KEY, strlen(ORTE_PROC_CKPT_SNAPSHOT_LOC_KEY)) ) {
|
|
char *tmp_str = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_str, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->ckpt_loc = strdup(tmp_str);
|
|
continue;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
opal_list_append(&job->vpid_list, &(vpid->super));
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
/************************
 * Object handling
 ************************/
|
|
/**
 * Constructor for orte_ps_vpid_info_t.
 *
 * Besides clearing the owned string pointers, every scalar field that
 * gather_vpid_info() may fill from the registry is given a defined value,
 * so a process whose container lacks a key is reported with zeros rather
 * than with indeterminate data.
 * NOTE(review): assumes the object memory is not zero-filled before the
 * constructor runs - confirm against the OPAL object system.
 *
 * @param obj  Object being constructed.
 */
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj) {
    obj->node = NULL;

    obj->rank            = 0;
    obj->app_context_idx = 0;
    obj->pid             = 0;
    obj->state           = 0;

    obj->name.cellid = 0;
    obj->name.jobid  = 0;
    obj->name.vpid   = 0;

#if OPAL_ENABLE_FT == 1
    obj->ckpt_state = 0;
    obj->ckpt_ref   = NULL;
    obj->ckpt_loc   = NULL;
#endif
}
|
|
|
|
/**
 * Destructor for orte_ps_vpid_info_t: frees the strings the record owns
 * (the node name and, when OPAL_ENABLE_FT is set, the checkpoint
 * reference and location).
 *
 * @param obj  Object being destroyed.
 */
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj) {
    /* free(NULL) is a no-op, so unset strings need no special casing */
    free(obj->node);

#if OPAL_ENABLE_FT == 1
    free(obj->ckpt_ref);
    free(obj->ckpt_loc);
#endif
}
|
|
|
|
/**
 * Constructor for orte_ps_job_info_t.
 *
 * Sets up the embedded vpid list, clears the owned pointers and gives
 * every scalar field that the gather_* routines may fill from the registry
 * a defined value. vpid_start and vpid_range in particular are used as loop
 * bounds by gather_vpid_info(), so they must not be left indeterminate
 * when the registry lacks the corresponding keys.
 * NOTE(review): assumes the object memory is not zero-filled before the
 * constructor runs - confirm against the OPAL object system.
 *
 * @param obj  Object being constructed.
 */
void orte_ps_job_info_construct(orte_ps_job_info_t *obj) {
    OBJ_CONSTRUCT(&obj->vpid_list, opal_list_t);

    obj->id    = 0;
    obj->state = 0;
    obj->slots = 0;

    obj->num_init       = 0;
    obj->num_launched   = 0;
    obj->num_running    = 0;
    obj->num_finalized  = 0;
    obj->num_terminated = 0;
    obj->num_aborted    = 0;

    obj->vpid_start = 0;
    obj->vpid_range = 0;

#if OPAL_ENABLE_FT == 1
    obj->ckpt_state = 0;
    obj->ckpt_ref   = NULL;
    obj->ckpt_loc   = NULL;
#endif

    obj->app_context     = NULL;
    obj->num_app_context = 0;
}
|
|
|
|
/**
 * Destructor for orte_ps_job_info_t.
 *
 * Frees the checkpoint strings (when OPAL_ENABLE_FT is set), the
 * application contexts together with the array that holds them, and
 * releases every entry of the vpid list before destructing the list.
 *
 * @param obj  Object being destroyed.
 */
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj) {
    opal_list_item_t* item = NULL;
    size_t i;

#if OPAL_ENABLE_FT == 1
    free(obj->ckpt_ref);
    obj->ckpt_ref = NULL;
    free(obj->ckpt_loc);
    obj->ckpt_loc = NULL;
#endif

    /* Guard the array: the count alone says nothing about the pointer */
    if( NULL != obj->app_context ) {
        /* NOTE(review): entries are released with free() as before; if they
         * are OPAL objects OBJ_RELEASE would be needed to run their
         * destructors - confirm against orte_rmgr_base_get_app_context() */
        for(i = 0; i < obj->num_app_context; ++i) {
            free(obj->app_context[i]);
        }

        /* NOTE(review): assumes the array itself is heap memory handed over
         * by orte_rmgr_base_get_app_context() - confirm */
        free(obj->app_context);
        obj->app_context = NULL;
    }
    obj->num_app_context = 0;

    while (NULL != (item = opal_list_remove_first(&obj->vpid_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&obj->vpid_list);
}
|
|
|
|
/**
 * Constructor for orte_ps_universe_info_t: constructs the embedded job
 * list, node list and universe descriptor in place.
 *
 * @param obj  Object being constructed.
 */
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj)
{
    /* Lists of jobs and nodes found in this universe */
    OBJ_CONSTRUCT(&(obj->job_list), opal_list_t);
    OBJ_CONSTRUCT(&(obj->nodes),    opal_list_t);

    /* Descriptor of the universe itself */
    OBJ_CONSTRUCT(&(obj->universe_info), orte_universe_t);
}
|
|
|
|
/**
 * Destructor for orte_ps_universe_info_t: releases every entry of the job
 * and node lists, then destructs both lists and the embedded universe
 * descriptor.
 *
 * @param obj  Object being destroyed.
 */
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj)
{
    opal_list_item_t *entry;

    /* Drain the job list, dropping our reference to each job */
    for (entry = opal_list_remove_first(&obj->job_list);
         NULL != entry;
         entry = opal_list_remove_first(&obj->job_list)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&obj->job_list);

    /* Same for the node list */
    for (entry = opal_list_remove_first(&obj->nodes);
         NULL != entry;
         entry = opal_list_remove_first(&obj->nodes)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&obj->nodes);

    OBJ_DESTRUCT(&obj->universe_info);
}
|
|
|
|
/**
 * Translate a job state into a human-readable label.
 *
 * @param state  Job state to describe.
 *
 * @return A strdup()'d string ("Unknown" for states not listed below);
 *         the caller is responsible for freeing it.
 */
static char *pretty_job_state(orte_job_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_JOB_STATE_INIT:       label = "Init";       break;
    case ORTE_JOB_STATE_LAUNCHED:   label = "Launched";   break;
    case ORTE_JOB_STATE_AT_STG1:    label = "Stage 1";    break;
    case ORTE_JOB_STATE_AT_STG2:    label = "Stage 2";    break;
    case ORTE_JOB_STATE_RUNNING:    label = "Running";    break;
    case ORTE_JOB_STATE_AT_STG3:    label = "Stage 3";    break;
    case ORTE_JOB_STATE_FINALIZED:  label = "Finalized";  break;
    case ORTE_JOB_STATE_TERMINATED: label = "Terminated"; break;
    case ORTE_JOB_STATE_ABORTED:    label = "Aborted";    break;
    default:
        break;
    }

    return strdup(label);
}
|
|
|
|
/**
 * Translate a process state into a human-readable label.
 *
 * @param state  Process state to describe.
 *
 * @return A strdup()'d string ("Unknown" for states not listed below);
 *         the caller is responsible for freeing it.
 */
static char *pretty_vpid_state(orte_proc_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_PROC_STATE_INIT:       label = "Init";       break;
    case ORTE_PROC_STATE_LAUNCHED:   label = "Launched";   break;
    case ORTE_PROC_STATE_AT_STG1:    label = "Stage 1";    break;
    case ORTE_PROC_STATE_AT_STG2:    label = "Stage 2";    break;
    case ORTE_PROC_STATE_RUNNING:    label = "Running";    break;
    case ORTE_PROC_STATE_AT_STG3:    label = "Stage 3";    break;
    case ORTE_PROC_STATE_FINALIZED:  label = "Finalized";  break;
    case ORTE_PROC_STATE_TERMINATED: label = "Terminated"; break;
    case ORTE_PROC_STATE_ABORTED:    label = "Aborted";    break;
    default:
        break;
    }

    return strdup(label);
}
|
|
|
|
/**
 * Translate a universe state into a human-readable label.
 *
 * @param state  Universe state to describe.
 *
 * @return A strdup()'d string ("Unknown" for states not listed below);
 *         the caller is responsible for freeing it.
 */
static char *pretty_univ_state(orte_universe_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_UNIVERSE_STATE_PRE_INIT: label = "Pre-Init";     break;
    case ORTE_UNIVERSE_STATE_INIT:     label = "Initializing"; break;
    case ORTE_UNIVERSE_STATE_RUNNING:  label = "Running";      break;
    case ORTE_UNIVERSE_STATE_FINALIZE: label = "Finalized";    break;
    default:
        break;
    }

    return strdup(label);
}
|
|
|
|
/**
 * Translate a node state into a human-readable label.
 *
 * @param state  Node state to describe.
 *
 * @return A strdup()'d string; ORTE_NODE_STATE_UNKNOWN and any state not
 *         listed below yield "Unknown". The caller is responsible for
 *         freeing it.
 */
static char *pretty_node_state(orte_node_state_t state) {
    const char *label = "Unknown";

    switch(state) {
    case ORTE_NODE_STATE_DOWN:   label = "Down";   break;
    case ORTE_NODE_STATE_UP:     label = "Up";     break;
    case ORTE_NODE_STATE_REBOOT: label = "Reboot"; break;
    case ORTE_NODE_STATE_UNKNOWN:
    default:
        break;
    }

    return strdup(label);
}
|