5dfd54c778
Clean up the remainder of the size_t references in the runtime itself. Convert to orte_std_cntr_t wherever it makes sense (only avoid those places where the actual memory size is referenced). Remove the obsolete oob barrier function (we actually obsoleted it a long time ago - just never bothered to clean it up). I have done my best to go through all the components and catch everything, even if I couldn't test compile them since I wasn't on that type of system. Still, I cannot guarantee that problems won't show up when you test this on specific systems. Usually, these will just show as "warning: comparison between signed and unsigned" notes which are easily fixed (just change a size_t to orte_std_cntr_t). In some places, people didn't use size_t, but instead used some other variant (e.g., I found several places with uint32_t). I tried to catch all of them, but... Once we get all the instances caught and fixed, this should once and for all resolve many of the heterogeneity problems. This commit was SVN r11204.
1542 строки
47 KiB
C
1542 строки
47 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include <stdio.h>
|
|
#include <errno.h>
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#ifdef HAVE_STDLIB_H
|
|
#include <stdlib.h>
|
|
#endif /* HAVE_STDLIB_H */
|
|
#ifdef HAVE_SYS_STAT_H
|
|
#include <sys/stat.h>
|
|
#endif
|
|
#ifdef HAVE_LIBGEN_H
|
|
#include <libgen.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
#include <sys/types.h>
|
|
#endif
|
|
#ifdef HAVE_SYS_WAIT_H
|
|
#include <sys/wait.h>
|
|
#endif
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif /* HAVE_STRING_H */
|
|
#include <sys/types.h>
|
|
|
|
#include "orte/orte_constants.h"
|
|
|
|
#include "opal/util/cmd_line.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "orte/util/univ_info.h"
|
|
#include "orte/util/sys_info.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/util/universe_setup_file_io.h"
|
|
#include "orte/mca/gpr/gpr.h"
|
|
#include "orte/mca/rmgr/base/base.h"
|
|
#include "orte/mca/ras/ras.h"
|
|
#include "orte/mca/ras/ras_types.h"
|
|
#include "orte/mca/ras/base/base.h"
|
|
#include "orte/mca/ras/base/ras_base_node.h"
|
|
|
|
#include "opal/runtime/opal.h"
|
|
#include "orte/runtime/runtime.h"
|
|
|
|
|
|
extern char **environ;
|
|
|
|
/*******************
|
|
* Universe/job/vpid information Objects
|
|
*******************/
|
|
struct orte_ps_vpid_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** General VPID Information */
|
|
orte_std_cntr_t rank;
|
|
pid_t pid;
|
|
orte_process_name_t name;
|
|
char * node;
|
|
orte_proc_state_t state;
|
|
|
|
orte_std_cntr_t app_context_idx;
|
|
|
|
};
|
|
typedef struct orte_ps_vpid_info_t orte_ps_vpid_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_vpid_info_t);
|
|
|
|
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj);
|
|
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_vpid_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_vpid_info_construct,
|
|
orte_ps_vpid_info_destruct);
|
|
|
|
struct orte_ps_job_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** General Job Information */
|
|
orte_jobid_t id;
|
|
orte_job_state_t state;
|
|
|
|
orte_std_cntr_t num_init;
|
|
orte_std_cntr_t num_launched;
|
|
orte_std_cntr_t num_running;
|
|
orte_std_cntr_t num_finalized;
|
|
orte_std_cntr_t num_terminated;
|
|
orte_std_cntr_t num_aborted;
|
|
orte_std_cntr_t slots;
|
|
orte_vpid_t vpid_start;
|
|
orte_vpid_t vpid_range;
|
|
|
|
orte_app_context_t **app_context;
|
|
orte_std_cntr_t num_app_context;
|
|
|
|
/** List of vpids */
|
|
opal_list_t vpid_list;
|
|
};
|
|
typedef struct orte_ps_job_info_t orte_ps_job_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_job_info_t);
|
|
|
|
void orte_ps_job_info_construct(orte_ps_job_info_t *obj);
|
|
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_job_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_job_info_construct,
|
|
orte_ps_job_info_destruct);
|
|
|
|
|
|
struct orte_ps_universe_info_t {
|
|
/** This is an object, so it must have a super */
|
|
opal_list_item_t super;
|
|
|
|
/** Universe information */
|
|
orte_universe_t universe_info;
|
|
|
|
/** List of Jobs */
|
|
opal_list_t job_list;
|
|
|
|
/** List of nodes on orte-node segment */
|
|
opal_list_t nodes;
|
|
};
|
|
typedef struct orte_ps_universe_info_t orte_ps_universe_info_t;
|
|
|
|
OBJ_CLASS_DECLARATION(orte_ps_universe_info_t);
|
|
|
|
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj);
|
|
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj);
|
|
|
|
OBJ_CLASS_INSTANCE(orte_ps_universe_info_t,
|
|
opal_list_item_t,
|
|
orte_ps_universe_info_construct,
|
|
orte_ps_universe_info_destruct);
|
|
|
|
|
|
/******************
|
|
* Local Functions
|
|
******************/
|
|
static int orte_ps_init(void);
|
|
static int parse_args(int argc, char *argv[]);
|
|
|
|
static int connect_to_universe(orte_universe_t universe_info);
|
|
|
|
static int gather_information(orte_ps_universe_info_t* universe);
|
|
static int gather_active_jobs(orte_ps_universe_info_t* universe);
|
|
static int gather_nodes(orte_ps_universe_info_t* universe);
|
|
static int gather_job_info(orte_ps_universe_info_t* universe);
|
|
static int gather_vpid_info(orte_ps_universe_info_t* universe);
|
|
|
|
static int pretty_print(orte_ps_universe_info_t* universe);
|
|
static int pretty_print_nodes(opal_list_t *nodes);
|
|
static int pretty_print_jobs(opal_list_t *jobs);
|
|
static int pretty_print_vpids(orte_ps_job_info_t *job);
|
|
|
|
static char *pretty_univ_state(orte_universe_state_t state);
|
|
static char *pretty_node_state(orte_node_state_t state);
|
|
static char *pretty_job_state(orte_job_state_t state);
|
|
static char *pretty_vpid_state(orte_proc_state_t state);
|
|
|
|
/*****************************************
|
|
* Global Vars for Command line Arguments
|
|
*****************************************/
|
|
typedef struct {
|
|
bool help;
|
|
bool verbose;
|
|
char *universe;
|
|
int jobid;
|
|
int vpid;
|
|
bool gpr_dump;
|
|
bool attached;
|
|
bool nodes;
|
|
} orte_ps_globals_t;
|
|
|
|
orte_ps_globals_t orte_ps_globals;
|
|
|
|
opal_cmd_line_init_t cmd_line_opts[] = {
|
|
{ NULL, NULL, NULL,
|
|
'h', NULL, "help",
|
|
0,
|
|
&orte_ps_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"This help message" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'v', NULL, "verbose",
|
|
0,
|
|
&orte_ps_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Be Verbose" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'd', NULL, "dump",
|
|
0,
|
|
&orte_ps_globals.gpr_dump, OPAL_CMD_LINE_TYPE_BOOL,
|
|
"Dump the state of the GPR" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'\0', NULL, "universe",
|
|
1,
|
|
&orte_ps_globals.universe, OPAL_CMD_LINE_TYPE_STRING,
|
|
"Specify a universe" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'j', NULL, "jobid",
|
|
1,
|
|
&orte_ps_globals.jobid, OPAL_CMD_LINE_TYPE_INT,
|
|
"Specify a specific jobid" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'p', NULL, "vpid",
|
|
1,
|
|
&orte_ps_globals.vpid, OPAL_CMD_LINE_TYPE_INT,
|
|
"Specify a specific vpid. Must specify a --jobid as well" },
|
|
|
|
{ NULL, NULL, NULL,
|
|
'n', NULL, "nodes",
|
|
0,
|
|
&orte_ps_globals.nodes, OPAL_CMD_LINE_TYPE_INT,
|
|
"Print Node Information" },
|
|
|
|
/* End of list */
|
|
{ NULL, NULL, NULL,
|
|
'\0', NULL, NULL,
|
|
0,
|
|
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
|
NULL }
|
|
};
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
opal_list_t universe_list;
|
|
opal_list_item_t* item = NULL;
|
|
opal_list_t universe_search_result;
|
|
|
|
/***************
|
|
* Initialize
|
|
***************/
|
|
if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
|
|
return ret;
|
|
}
|
|
|
|
OBJ_CONSTRUCT(&universe_list, opal_list_t);
|
|
OBJ_CONSTRUCT(&universe_search_result, opal_list_t);
|
|
orte_ps_globals.attached = false;
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_ps_init())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Get the directory listing
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Acquiring universe list...\n");
|
|
}
|
|
if (ORTE_SUCCESS != (ret = orte_universe_search(&universe_search_result) ) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* For each universe in the listing
|
|
*/
|
|
for(item = opal_list_get_first(&universe_search_result);
|
|
item != opal_list_get_end(&universe_search_result);
|
|
item = opal_list_get_next(item) ) {
|
|
orte_ps_universe_info_t *univ;
|
|
orte_universe_t *tmp_univ;
|
|
|
|
/*
|
|
* Copy over the universe information
|
|
*/
|
|
univ = OBJ_NEW(orte_ps_universe_info_t);
|
|
tmp_univ = (orte_universe_t *) item;
|
|
univ->universe_info.state = tmp_univ->state;
|
|
univ->universe_info.persistence = tmp_univ->persistence;
|
|
univ->universe_info.console = tmp_univ->console;
|
|
univ->universe_info.console_connected = tmp_univ->console_connected;
|
|
if( NULL != tmp_univ->name )
|
|
univ->universe_info.name = strdup(tmp_univ->name);
|
|
else
|
|
univ->universe_info.name = NULL;
|
|
if( NULL != tmp_univ->host )
|
|
univ->universe_info.host = strdup(tmp_univ->host);
|
|
else
|
|
univ->universe_info.host = NULL;
|
|
if( NULL != tmp_univ->uid )
|
|
univ->universe_info.uid = strdup(tmp_univ->uid);
|
|
else
|
|
univ->universe_info.uid = NULL;
|
|
if( NULL != tmp_univ->scope )
|
|
univ->universe_info.scope = strdup(tmp_univ->scope);
|
|
else
|
|
univ->universe_info.scope = NULL;
|
|
if( NULL != tmp_univ->seed_uri)
|
|
univ->universe_info.seed_uri = strdup(tmp_univ->seed_uri);
|
|
else
|
|
univ->universe_info.seed_uri = NULL;
|
|
if( NULL != tmp_univ->scriptfile )
|
|
univ->universe_info.scriptfile = strdup(tmp_univ->scriptfile);
|
|
else
|
|
univ->universe_info.scriptfile = NULL;
|
|
|
|
opal_list_append(&universe_list, &(univ->super));
|
|
|
|
/*
|
|
* Connect to the universe
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Connecting to universe: %s\n", univ->universe_info.name);
|
|
}
|
|
if( ORTE_SUCCESS != (ret = connect_to_universe(univ->universe_info)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Gather the information
|
|
*/
|
|
if( orte_ps_globals.verbose ) {
|
|
printf("orte_ps: Gathering Universe Information\n");
|
|
}
|
|
if( ORTE_SUCCESS != (ret = gather_information(univ)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Print the General Universe information
|
|
*/
|
|
if(ORTE_SUCCESS != (ret = pretty_print(univ)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* If we were asked to dump the GPR then do so
|
|
*/
|
|
if( orte_ps_globals.gpr_dump) {
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.dump_all() ) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Since connecting and disconnecting from a universe is
|
|
* not well defined, only allow connection to the first
|
|
* universe found.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
/***************
|
|
* Cleanup
|
|
***************/
|
|
cleanup:
|
|
while (NULL != (item = opal_list_remove_first(&universe_list))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
while (NULL != (item = opal_list_remove_first(&universe_search_result))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
|
|
/*
|
|
* Only finalize if we are attached to a specific universe
|
|
*/
|
|
if(orte_ps_globals.attached) {
|
|
if (OPAL_SUCCESS != (ret = orte_finalize())) {
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int parse_args(int argc, char *argv[]) {
|
|
int i, ret, len;
|
|
opal_cmd_line_t cmd_line;
|
|
char **app_env = NULL, **global_env = NULL;
|
|
orte_ps_globals_t tmp = { false,
|
|
false,
|
|
NULL,
|
|
-1,
|
|
-1,
|
|
false,
|
|
false,
|
|
false};
|
|
|
|
/* Parse the command line options */
|
|
|
|
orte_ps_globals = tmp;
|
|
|
|
opal_cmd_line_create(&cmd_line, cmd_line_opts);
|
|
|
|
mca_base_open();
|
|
mca_base_cmd_line_setup(&cmd_line);
|
|
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
|
|
|
|
/**
|
|
* Put all of the MCA arguments in the environment
|
|
*/
|
|
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
|
|
|
|
len = opal_argv_count(app_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(app_env[i]);
|
|
}
|
|
|
|
len = opal_argv_count(global_env);
|
|
for(i = 0; i < len; ++i) {
|
|
putenv(global_env[i]);
|
|
}
|
|
|
|
/**
|
|
* Now start parsing our specific arguments
|
|
*/
|
|
if (OPAL_SUCCESS != ret ||
|
|
orte_ps_globals.help) {
|
|
char *args = NULL;
|
|
args = opal_cmd_line_get_usage_msg(&cmd_line);
|
|
opal_show_help("help-orte-ps.txt", "usage", true,
|
|
args);
|
|
free(args);
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
/*
|
|
* If they specify a vpid, they must specify a jobid
|
|
*/
|
|
if( 0 <= orte_ps_globals.vpid) {
|
|
if( 0 > orte_ps_globals.jobid) {
|
|
opal_show_help("help-orte-ps.txt", "vpid-usage", true,
|
|
orte_ps_globals.vpid);
|
|
return ORTE_ERROR;
|
|
}
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int orte_ps_init(void) {
|
|
int exit_status = ORTE_SUCCESS, ret;
|
|
|
|
/*
|
|
* We are trying to attach to another process' GPR so we need to
|
|
* attach no matter if it is identified as private or not.
|
|
*/
|
|
opal_setenv(mca_base_param_env_var("universe_console"),
|
|
"1",
|
|
true, &environ);
|
|
|
|
/***************************
|
|
* We need all of OPAL
|
|
***************************/
|
|
if (ORTE_SUCCESS != (ret = opal_init())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/***************************
|
|
* And ORTE, but need to do a bit of a dance first
|
|
***************************/
|
|
/* register handler for errnum -> string converstion */
|
|
opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);
|
|
|
|
/* Register all MCA Params */
|
|
if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the system_info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_sys_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/* Ensure the process info structure is instantiated and initialized */
|
|
if (ORTE_SUCCESS != (ret = orte_proc_info())) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int pretty_print(orte_ps_universe_info_t* universe) {
|
|
int i, line_len;
|
|
int len_name = 0,
|
|
len_host = 0,
|
|
len_uid = 0,
|
|
len_scope = 0,
|
|
len_per = 0,
|
|
len_state = 0;
|
|
|
|
/*
|
|
* Calculate segment lengths
|
|
*/
|
|
len_name = (int) (strlen(universe->universe_info.name) < strlen("Universe Name") ?
|
|
strlen("Universe Name") :
|
|
strlen(universe->universe_info.name) );
|
|
len_host = (int) (strlen(universe->universe_info.host) < strlen("Hostname") ?
|
|
strlen("Hostname") :
|
|
strlen(universe->universe_info.host));
|
|
len_uid = (int) (strlen(universe->universe_info.uid) < strlen("UID") ?
|
|
strlen("UID") :
|
|
strlen(universe->universe_info.uid));
|
|
len_per = (int) strlen("Persistent");
|
|
len_scope = (int) (strlen(universe->universe_info.scope) < strlen("Scope") ?
|
|
strlen("Scope") :
|
|
strlen(universe->universe_info.scope));
|
|
len_state = (int) (strlen(pretty_univ_state(universe->universe_info.state)) < strlen("State") ?
|
|
strlen("State") :
|
|
strlen(pretty_univ_state(universe->universe_info.state)) );
|
|
|
|
line_len = (len_name + 3 +
|
|
len_host + 3 +
|
|
len_uid + 3 +
|
|
len_per + 3 +
|
|
len_scope + 3 +
|
|
len_state) + 3 ;
|
|
|
|
/*
|
|
* Print header
|
|
*/
|
|
printf("%*s | ", len_name , "Universe Name");
|
|
printf("%*s | ", len_host , "Hostname");
|
|
printf("%*s | ", len_uid , "UID");
|
|
printf("%*s | ", len_per , "Persistent");
|
|
printf("%*s | ", len_scope, "Scope");
|
|
printf("%*s |" , len_state, "State");
|
|
printf("\n");
|
|
|
|
for(i = 0; i < line_len; ++i) {
|
|
printf("-");
|
|
}
|
|
printf("\n");
|
|
|
|
/*
|
|
* Print Info
|
|
*/
|
|
printf("%*s | ", len_name, universe->universe_info.name);
|
|
printf("%*s | ", len_host, universe->universe_info.host);
|
|
printf("%*s | ", len_uid, universe->universe_info.uid);
|
|
if(universe->universe_info.persistence)
|
|
printf("%*s | ", len_per, "true");
|
|
else
|
|
printf("%*s | ", len_per, "false");
|
|
printf("%*s | ", len_scope, universe->universe_info.scope);
|
|
printf("%*s |", len_state, pretty_univ_state(universe->universe_info.state));
|
|
printf("\n");
|
|
|
|
printf("\n");
|
|
|
|
/*
|
|
* Print Node Information
|
|
*/
|
|
if( orte_ps_globals.nodes )
|
|
pretty_print_nodes(&universe->nodes);
|
|
|
|
/*
|
|
* Print Job Information
|
|
*/
|
|
pretty_print_jobs(&universe->job_list);
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int pretty_print_nodes(opal_list_t *nodes) {
|
|
opal_list_item_t* node_item = NULL;
|
|
int i, line_len;
|
|
int len_name = 0,
|
|
len_arch = 0,
|
|
len_cell = 0,
|
|
len_state = 0,
|
|
len_slots = 0,
|
|
len_slots_i = 0,
|
|
len_slots_a = 0,
|
|
len_slots_m = 0;
|
|
|
|
/*
|
|
* Caculate segment lengths
|
|
*/
|
|
len_name = (int) strlen("Node Name");
|
|
len_arch = (int) strlen("Arch");
|
|
len_cell = (int) strlen("Cell ID");
|
|
len_state = (int) strlen("State");
|
|
len_slots = (int) strlen("Slots");
|
|
len_slots_i = (int) strlen("Slots In Use");
|
|
#if 0
|
|
len_slots_a = (int) strlen("Slots Alloc");
|
|
#else
|
|
len_slots_a = -3;
|
|
#endif
|
|
len_slots_m = (int) strlen("Slots Max");
|
|
for(node_item = opal_list_get_first(nodes);
|
|
node_item != opal_list_get_end(nodes);
|
|
node_item = opal_list_get_next(node_item) ) {
|
|
orte_ras_node_t *node;
|
|
node = (orte_ras_node_t *)node_item;
|
|
|
|
if( NULL != node->node_name &&
|
|
(int)strlen(node->node_name) > len_name)
|
|
len_name = (int) strlen(node->node_name);
|
|
|
|
if( NULL != node->node_arch &&
|
|
(int)strlen(node->node_arch) > len_arch)
|
|
len_arch = (int) strlen(node->node_arch);
|
|
|
|
if( (int)strlen(pretty_node_state(node->node_state)) > len_state )
|
|
len_state = (int)strlen(pretty_node_state(node->node_state));
|
|
}
|
|
|
|
/*
|
|
* JJH Since node_slots_inuse and node_slots_alloc are not used properly
|
|
* JJH do not display them to the user.
|
|
*/
|
|
line_len = (len_name + 3 +
|
|
len_arch + 3 +
|
|
len_cell + 3 +
|
|
len_state + 3 +
|
|
len_slots + 3 +
|
|
len_slots_i + 3 +
|
|
len_slots_a + 3 +
|
|
len_slots_m + 3);
|
|
|
|
/*
|
|
* Print the header
|
|
*/
|
|
printf("%*s | ", len_name, "Node Name");
|
|
printf("%*s | ", len_arch, "Arch");
|
|
printf("%*s | ", len_cell, "Cell ID");
|
|
printf("%*s | ", len_state, "State");
|
|
printf("%*s | ", len_slots, "Slots");
|
|
printf("%*s | ", len_slots_m, "Slots Max");
|
|
printf("%*s | ", len_slots_i, "Slots In Use");
|
|
#if 0
|
|
printf("%*s | ", len_slots_a, "Slots Alloc");
|
|
#endif
|
|
printf("\n");
|
|
|
|
for(i = 0; i < line_len; ++i) {
|
|
printf("-");
|
|
}
|
|
printf("\n");
|
|
|
|
/*
|
|
* Print Info
|
|
*/
|
|
for(node_item = opal_list_get_first(nodes);
|
|
node_item != opal_list_get_end(nodes);
|
|
node_item = opal_list_get_next(node_item) ) {
|
|
orte_ras_node_t *node;
|
|
node = (orte_ras_node_t *)node_item;
|
|
|
|
printf("%*s | ", len_name, node->node_name);
|
|
printf("%*s | ", len_arch, (NULL == node->node_arch ?
|
|
"" :
|
|
node->node_arch));
|
|
printf("%*d | ", len_cell, node->node_cellid);
|
|
printf("%*s | ", len_state, pretty_node_state(node->node_state));
|
|
printf("%*d | ", len_slots, (uint)node->node_slots);
|
|
printf("%*d | ", len_slots_m, (uint)node->node_slots_max);
|
|
printf("%*d | ", len_slots_i, (uint)node->node_slots_inuse);
|
|
#if 0
|
|
printf("%*d | ", len_slots_a, (uint)node->node_slots_alloc);
|
|
#endif
|
|
|
|
printf("\n");
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int pretty_print_jobs(opal_list_t *jobs) {
|
|
opal_list_item_t* job_item = NULL;
|
|
int len_jobid = 0,
|
|
len_state = 0,
|
|
len_slots = 0,
|
|
len_vpid_s = 0,
|
|
len_vpid_r = 0,
|
|
len_ckpt_s = 0,
|
|
len_ckpt_r = 0,
|
|
len_ckpt_l = 0;
|
|
int i, line_len;
|
|
|
|
for(job_item = opal_list_get_first(jobs);
|
|
job_item != opal_list_get_end(jobs);
|
|
job_item = opal_list_get_next(job_item) ) {
|
|
orte_ps_job_info_t *job;
|
|
job = (orte_ps_job_info_t *)job_item;
|
|
|
|
/*
|
|
* Caculate segment lengths
|
|
*/
|
|
len_jobid = 6;
|
|
len_state = (int) (strlen(pretty_job_state(job->state)) < strlen("State") ?
|
|
strlen("State") :
|
|
strlen(pretty_job_state(job->state)));
|
|
len_slots = 6;
|
|
len_vpid_s = (int) strlen("VPID Start");
|
|
len_vpid_r = (int) strlen("VPID Range");
|
|
len_ckpt_s = 0;
|
|
len_ckpt_r = 0;
|
|
len_ckpt_l = 0;
|
|
|
|
line_len = (len_jobid + 3 +
|
|
len_state + 3 +
|
|
len_slots + 3 +
|
|
len_vpid_s + 3 +
|
|
len_vpid_r + 3 +
|
|
len_ckpt_s + 3 +
|
|
len_ckpt_r + 3 +
|
|
len_ckpt_l - 6
|
|
);
|
|
/*
|
|
* Print Header
|
|
*/
|
|
printf("\n");
|
|
printf("%*s | ", len_jobid , "JobID");
|
|
printf("%*s | ", len_state , "State");
|
|
printf("%*s | ", len_slots , "Slots");
|
|
printf("%*s | ", len_vpid_s , "VPID Start");
|
|
printf("%*s | ", len_vpid_r , "VPID Range");
|
|
printf("\n");
|
|
|
|
for(i = 0; i < line_len; ++i) {
|
|
printf("-");
|
|
}
|
|
printf("\n");
|
|
|
|
/*
|
|
* Print Info
|
|
*/
|
|
printf("%*d | ", len_jobid , job->id);
|
|
printf("%*s | ", len_state , pretty_job_state(job->state));
|
|
printf("%*d | ", len_slots , (uint)job->slots);
|
|
printf("%*d | ", len_vpid_s, job->vpid_start);
|
|
printf("%*d | ", len_vpid_r, job->vpid_range);
|
|
|
|
printf("\n");
|
|
|
|
/*
|
|
* Pretty print all VPID's in job
|
|
*/
|
|
if(0 == job->id) { /* No vpids for the HNP */
|
|
continue;
|
|
}
|
|
|
|
pretty_print_vpids(job);
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int pretty_print_vpids(orte_ps_job_info_t *job) {
|
|
opal_list_item_t* vpid_item = NULL;
|
|
int len_o_proc_name = 0,
|
|
len_proc_name = 0,
|
|
len_rank = 0,
|
|
len_pid = 0,
|
|
len_state = 0,
|
|
len_node = 0,
|
|
len_ckpt_s = 0,
|
|
len_ckpt_r = 0,
|
|
len_ckpt_l = 0;
|
|
int i, line_len;
|
|
|
|
/*
|
|
* Caculate segment lengths
|
|
*/
|
|
len_o_proc_name = strlen("ORTE Name");
|
|
len_proc_name = strlen("Process Name");
|
|
len_rank = 6;
|
|
len_pid = 6;
|
|
len_state = 0;
|
|
len_node = 0;
|
|
len_ckpt_s = 0;
|
|
len_ckpt_r = 0;
|
|
len_ckpt_l = 0;
|
|
|
|
for(vpid_item = opal_list_get_first(&(job->vpid_list));
|
|
vpid_item != opal_list_get_end(&(job->vpid_list));
|
|
vpid_item = opal_list_get_next(vpid_item) ) {
|
|
orte_ps_vpid_info_t *vpid;
|
|
char *proc_name = NULL;
|
|
vpid = (orte_ps_vpid_info_t *)vpid_item;
|
|
|
|
/*
|
|
* Find my app context
|
|
*/
|
|
for( i = 0; i < (int)job->num_app_context; ++i) {
|
|
if( job->app_context[i]->idx == vpid->app_context_idx ) {
|
|
if( (int)strlen(job->app_context[i]->app) > len_proc_name)
|
|
len_proc_name = strlen(job->app_context[i]->app);
|
|
break;
|
|
}
|
|
}
|
|
|
|
asprintf(&proc_name, "%d.%d.%d", vpid->name.cellid, vpid->name.jobid, vpid->name.vpid);
|
|
if( (int)strlen(proc_name) > len_o_proc_name )
|
|
len_o_proc_name = strlen(proc_name);
|
|
|
|
if( (int)strlen(vpid->node) > len_node)
|
|
len_node = strlen(vpid->node);
|
|
|
|
if( (int)strlen(pretty_vpid_state(vpid->state)) > len_state)
|
|
len_state = strlen(pretty_vpid_state(vpid->state));
|
|
|
|
if( NULL != proc_name) {
|
|
free(proc_name);
|
|
proc_name = NULL;
|
|
}
|
|
}
|
|
|
|
line_len = (len_o_proc_name + 3 +
|
|
len_proc_name + 3 +
|
|
len_rank + 3 +
|
|
len_pid + 3 +
|
|
len_state + 3 +
|
|
len_node + 3 +
|
|
len_ckpt_s + 3 +
|
|
len_ckpt_r + 3 +
|
|
len_ckpt_l - 6
|
|
);
|
|
|
|
/*
|
|
* Print Header
|
|
*/
|
|
printf("\t");
|
|
printf("%*s | ", len_proc_name , "Process Name");
|
|
printf("%*s | ", len_o_proc_name , "ORTE Name");
|
|
printf("%*s | ", len_rank , "Rank");
|
|
printf("%*s | ", len_pid , "PID");
|
|
printf("%*s | ", len_node , "Node");
|
|
printf("%*s | ", len_state , "State");
|
|
printf("\n");
|
|
|
|
printf("\t");
|
|
for(i = 0; i < line_len; ++i) {
|
|
printf("-");
|
|
}
|
|
printf("\n");
|
|
|
|
/*
|
|
* Print Info
|
|
*/
|
|
for(vpid_item = opal_list_get_first(&(job->vpid_list));
|
|
vpid_item != opal_list_get_end(&(job->vpid_list));
|
|
vpid_item = opal_list_get_next(vpid_item) ) {
|
|
orte_ps_vpid_info_t *vpid;
|
|
char *proc_name = NULL;
|
|
vpid = (orte_ps_vpid_info_t *)vpid_item;
|
|
|
|
printf("\t");
|
|
|
|
asprintf(&proc_name, "%d.%d.%d", vpid->name.cellid, vpid->name.jobid, vpid->name.vpid);
|
|
|
|
for( i = 0; i < (int)job->num_app_context; ++i) {
|
|
if( job->app_context[i]->idx == vpid->app_context_idx ) {
|
|
printf("%*s | ", len_proc_name, job->app_context[i]->app);
|
|
break;
|
|
}
|
|
}
|
|
|
|
printf("%*s | ", len_o_proc_name, proc_name);
|
|
printf("%*d | ", len_rank , (uint)vpid->rank);
|
|
printf("%*d | ", len_pid , vpid->pid);
|
|
printf("%*s | ", len_node , vpid->node);
|
|
printf("%*s | ", len_state , pretty_vpid_state(vpid->state));
|
|
printf("\n");
|
|
|
|
if( NULL != proc_name) {
|
|
free(proc_name);
|
|
proc_name = NULL;
|
|
}
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int connect_to_universe(orte_universe_t universe_info) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char * univ_mca_param = NULL;
|
|
|
|
/*
|
|
* Construct the MCA parameter
|
|
*/
|
|
asprintf(&univ_mca_param, "%s@%s:%s",
|
|
universe_info.uid,
|
|
universe_info.host,
|
|
universe_info.name);
|
|
#if 0
|
|
/*
|
|
* Disconnect from the current universe
|
|
*/
|
|
if(orte_ps_globals.attached) {
|
|
if (OPAL_SUCCESS != (ret = orte_system_finalize())) {
|
|
return ret;
|
|
}
|
|
}
|
|
#endif
|
|
/*
|
|
* Set the environment universe information
|
|
*/
|
|
opal_setenv(mca_base_param_env_var("universe"),
|
|
univ_mca_param,
|
|
true, &environ);
|
|
|
|
/*
|
|
* Restart ORTE in the requested universe
|
|
*/
|
|
if(!orte_ps_globals.attached) {
|
|
if (ORTE_SUCCESS != (ret = orte_system_init(true)) ) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
else {
|
|
if( ORTE_SUCCESS != (ret = orte_restart(orte_process_info.my_name, universe_info.seed_uri)) ) {
|
|
printf("orte_restart: FAILED (%d)\n", ret);
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
orte_ps_globals.attached = true;
|
|
|
|
cleanup:
|
|
if( NULL != univ_mca_param)
|
|
free(univ_mca_param);
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_information(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
if( ORTE_SUCCESS != (ret = gather_active_jobs(universe) )) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = gather_nodes(universe) )) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = gather_job_info(universe) )) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
if( ORTE_SUCCESS != (ret = gather_vpid_info(universe) )) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_active_jobs(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *segment = NULL;
|
|
orte_gpr_value_t** values = NULL;
|
|
orte_std_cntr_t i, j, num_values = 0;
|
|
|
|
/**********************
|
|
* Job Info segment
|
|
**********************/
|
|
segment = strdup(ORTE_JOBINFO_SEGMENT);
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
|
segment,
|
|
NULL,
|
|
NULL,
|
|
&num_values,
|
|
&values ) ) ) {
|
|
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Parse the structure returned
|
|
*/
|
|
for(i = 0; i < num_values; ++i) {
|
|
orte_gpr_value_t* value = values[i];
|
|
orte_ps_job_info_t *job = NULL;
|
|
|
|
job = OBJ_NEW(orte_ps_job_info_t);
|
|
orte_schema.extract_jobid_from_segment_name(&(job->id), value->tokens[0]);
|
|
|
|
/*
|
|
* If the user specified a jobid, then
|
|
* only access the info for that jobid
|
|
*/
|
|
if( 0 <= orte_ps_globals.jobid ) {
|
|
if( (int)job->id != orte_ps_globals.jobid) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
for( j = 0; j < value->cnt; ++j) {
|
|
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
|
orte_job_state_t *job_state;
|
|
|
|
if( 0 == strncmp(keyval->key, ORTE_JOB_STATE_KEY, strlen(ORTE_JOB_STATE_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &job_state, keyval->value, ORTE_JOB_STATE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
job->state = *job_state;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
opal_list_append(&universe->job_list, &(job->super));
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_nodes(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_ras_base_node_query(&(universe->nodes)))) {
|
|
exit_status = ret;
|
|
}
|
|
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_job_info(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *segment = NULL, *tokens[2];
|
|
orte_gpr_value_t** values = NULL;
|
|
orte_std_cntr_t i, j, num_values = 0;
|
|
opal_list_item_t* item = NULL;
|
|
|
|
/*
|
|
* For each job in the universe
|
|
*/
|
|
for(item = opal_list_get_first(&(universe->job_list));
|
|
item != opal_list_get_end(&(universe->job_list));
|
|
item = opal_list_get_next(item) ) {
|
|
orte_ps_job_info_t *job;
|
|
job = (orte_ps_job_info_t *)item;
|
|
|
|
/*
|
|
* Get the App Context(s)
|
|
*/
|
|
orte_rmgr_base_get_app_context(job->id,
|
|
&job->app_context,
|
|
&job->num_app_context);
|
|
/*
|
|
* Access the job segment
|
|
*/
|
|
orte_schema.get_job_segment_name(&segment, job->id);
|
|
|
|
/*
|
|
* Here we are just focused on the orte-job-globals container
|
|
*/
|
|
tokens[0] = strdup(ORTE_JOB_GLOBALS);
|
|
tokens[1] = NULL;
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
|
segment,
|
|
tokens,
|
|
NULL,
|
|
&num_values,
|
|
&values ) ) ) {
|
|
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Parse the structure returned
|
|
*/
|
|
for(i = 0; i < num_values; ++i) {
|
|
orte_gpr_value_t* value = values[i];
|
|
|
|
for( j = 0; j < value->cnt; ++j) {
|
|
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
|
orte_std_cntr_t *tmp_num;
|
|
orte_vpid_t *tmp_vpid;
|
|
|
|
if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_AT_INIT, strlen(ORTE_PROC_NUM_AT_INIT)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_init = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_LAUNCHED, strlen(ORTE_PROC_NUM_LAUNCHED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_launched = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_RUNNING, strlen(ORTE_PROC_NUM_RUNNING)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_running = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_FINALIZED, strlen(ORTE_PROC_NUM_FINALIZED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_finalized = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_TERMINATED, strlen(ORTE_PROC_NUM_TERMINATED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_terminated = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NUM_ABORTED, strlen(ORTE_PROC_NUM_ABORTED)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->num_aborted = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_SLOTS_KEY, strlen(ORTE_JOB_SLOTS_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_num, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->slots = *tmp_num;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_START_KEY, strlen(ORTE_JOB_VPID_START_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->vpid_start = *tmp_vpid;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_JOB_VPID_RANGE_KEY, strlen(ORTE_JOB_VPID_RANGE_KEY)) ) {
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_vpid, keyval->value, ORTE_VPID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
job->vpid_range = *tmp_vpid;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
static int gather_vpid_info(orte_ps_universe_info_t* universe) {
|
|
int ret, exit_status = ORTE_SUCCESS;
|
|
char *segment = NULL;
|
|
orte_gpr_value_t** values = NULL;
|
|
orte_std_cntr_t i, j, num_values = 0;
|
|
opal_list_item_t* job_item = NULL;
|
|
orte_vpid_t v = 0;
|
|
|
|
/*
|
|
* For each Job in the universe
|
|
*/
|
|
for(job_item = opal_list_get_first(&(universe->job_list));
|
|
job_item != opal_list_get_end(&(universe->job_list));
|
|
job_item = opal_list_get_next(job_item) ) {
|
|
orte_ps_job_info_t *job;
|
|
job = (orte_ps_job_info_t *)job_item;
|
|
|
|
/*
|
|
* Skip getting the vpid's for the HNP, since the information is not complete
|
|
*/
|
|
if( 0 == job->id) {
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* For each vpid in the job
|
|
*/
|
|
for(v = job->vpid_start; v < (job->vpid_start + job->vpid_range); ++v) {
|
|
orte_ps_vpid_info_t *vpid = NULL;
|
|
orte_process_name_t proc;
|
|
char **tokens = NULL;
|
|
orte_std_cntr_t num_tokens = 0;
|
|
|
|
/*
|
|
* If the user specified a vpid, then just get that one
|
|
*/
|
|
if( 0 <= orte_ps_globals.vpid) {
|
|
/*
|
|
* Check to make sure it is a valid vpid
|
|
*/
|
|
if( (int)(job->vpid_start + job->vpid_range) <= orte_ps_globals.vpid) {
|
|
opal_show_help("help-orte-ps.txt", "invalid-vpid", true,
|
|
orte_ps_globals.vpid,
|
|
orte_ps_globals.jobid );
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
if( (int)v != orte_ps_globals.vpid ) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
vpid = OBJ_NEW(orte_ps_vpid_info_t);
|
|
|
|
/*
|
|
* Access the job segment
|
|
*/
|
|
orte_schema.get_job_segment_name(&segment, job->id);
|
|
|
|
/*
|
|
* Access the vpid container
|
|
*/
|
|
proc.cellid = 0;
|
|
proc.jobid = job->id;
|
|
proc.vpid = v;
|
|
|
|
orte_schema.get_proc_tokens(&tokens,
|
|
&num_tokens,
|
|
&proc);
|
|
|
|
|
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
|
segment,
|
|
tokens,
|
|
NULL,
|
|
&num_values,
|
|
&values ) ) ) {
|
|
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
/*
|
|
* Parse the structure returned
|
|
*/
|
|
for(i = 0; i < num_values; ++i) {
|
|
orte_gpr_value_t* value = values[i];
|
|
|
|
for( j = 0; j < value->cnt; ++j) {
|
|
orte_gpr_keyval_t* keyval = value->keyvals[j];
|
|
|
|
if( 0 == strncmp(keyval->key, ORTE_PROC_RANK_KEY, strlen(ORTE_PROC_RANK_KEY)) ) {
|
|
orte_std_cntr_t *tmp_size;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->rank = *tmp_size;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_APP_CONTEXT_KEY, strlen(ORTE_PROC_APP_CONTEXT_KEY)) ) {
|
|
orte_std_cntr_t *tmp_size;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_size, keyval->value, ORTE_STD_CNTR))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->app_context_idx = *tmp_size;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_PID_KEY, strlen(ORTE_PROC_PID_KEY)) ) {
|
|
pid_t *tmp_pid;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_pid, keyval->value, ORTE_PID))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->pid = *tmp_pid;
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_NAME_KEY, strlen(ORTE_PROC_NAME_KEY)) ) {
|
|
orte_process_name_t *tmp_proc;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_proc, keyval->value, ORTE_NAME))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->name.cellid = tmp_proc->cellid;
|
|
vpid->name.jobid = tmp_proc->jobid;
|
|
vpid->name.vpid = tmp_proc->vpid;
|
|
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_NODE_NAME_KEY, strlen(ORTE_NODE_NAME_KEY)) ) {
|
|
char *tmp_node = NULL;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_node, keyval->value, ORTE_STRING))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->node = strdup(tmp_node);
|
|
continue;
|
|
}
|
|
else if( 0 == strncmp(keyval->key, ORTE_PROC_STATE_KEY, strlen(ORTE_PROC_STATE_KEY)) ) {
|
|
orte_proc_state_t *tmp_state;
|
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_state, keyval->value, ORTE_PROC_STATE))) {
|
|
exit_status = ret;
|
|
goto cleanup;
|
|
}
|
|
|
|
vpid->state = *tmp_state;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
opal_list_append(&job->vpid_list, &(vpid->super));
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
return exit_status;
|
|
}
|
|
|
|
/************************
|
|
* Object handling
|
|
************************/
|
|
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj) {
|
|
obj->node = NULL;
|
|
|
|
}
|
|
|
|
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj) {
|
|
if( NULL != obj->node)
|
|
free(obj->node);
|
|
|
|
}
|
|
|
|
void orte_ps_job_info_construct(orte_ps_job_info_t *obj) {
|
|
OBJ_CONSTRUCT(&obj->vpid_list, opal_list_t);
|
|
|
|
obj->app_context = NULL;
|
|
obj->num_app_context = 0;
|
|
}
|
|
|
|
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj) {
|
|
opal_list_item_t* item = NULL;
|
|
orte_std_cntr_t i;
|
|
|
|
for(i = 0; i < obj->num_app_context; ++i) {
|
|
free(obj->app_context[i]);
|
|
}
|
|
obj->num_app_context = 0;
|
|
|
|
while (NULL != (item = opal_list_remove_first(&obj->vpid_list))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&obj->vpid_list);
|
|
}
|
|
|
|
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj) {
|
|
OBJ_CONSTRUCT(&obj->job_list, opal_list_t);
|
|
OBJ_CONSTRUCT(&obj->nodes, opal_list_t);
|
|
OBJ_CONSTRUCT(&obj->universe_info, orte_universe_t);
|
|
}
|
|
|
|
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj) {
|
|
opal_list_item_t* item = NULL;
|
|
|
|
while (NULL != (item = opal_list_remove_first(&obj->job_list))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&obj->job_list);
|
|
|
|
while (NULL != (item = opal_list_remove_first(&obj->nodes))) {
|
|
OBJ_RELEASE(item);
|
|
}
|
|
OBJ_DESTRUCT(&obj->nodes);
|
|
|
|
OBJ_DESTRUCT(&obj->universe_info);
|
|
}
|
|
|
|
static char *pretty_job_state(orte_job_state_t state) {
|
|
switch(state) {
|
|
case ORTE_JOB_STATE_INIT:
|
|
return strdup("Init");
|
|
break;
|
|
case ORTE_JOB_STATE_LAUNCHED:
|
|
return strdup("Launched");
|
|
break;
|
|
case ORTE_JOB_STATE_AT_STG1:
|
|
return strdup("Stage 1");
|
|
break;
|
|
case ORTE_JOB_STATE_AT_STG2:
|
|
return strdup("Stage 2");
|
|
break;
|
|
case ORTE_JOB_STATE_RUNNING:
|
|
return strdup("Running");
|
|
break;
|
|
case ORTE_JOB_STATE_AT_STG3:
|
|
return strdup("Stage 3");
|
|
break;
|
|
case ORTE_JOB_STATE_FINALIZED:
|
|
return strdup("Finalized");
|
|
break;
|
|
case ORTE_JOB_STATE_TERMINATED:
|
|
return strdup("Terminated");
|
|
break;
|
|
case ORTE_JOB_STATE_ABORTED:
|
|
return strdup("Aborted");
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return strdup("Unknown");
|
|
}
|
|
|
|
static char *pretty_vpid_state(orte_proc_state_t state) {
|
|
switch(state) {
|
|
case ORTE_PROC_STATE_INIT:
|
|
return strdup("Init");
|
|
break;
|
|
case ORTE_PROC_STATE_LAUNCHED:
|
|
return strdup("Launched");
|
|
break;
|
|
case ORTE_PROC_STATE_AT_STG1:
|
|
return strdup("Stage 1");
|
|
break;
|
|
case ORTE_PROC_STATE_AT_STG2:
|
|
return strdup("Stage 2");
|
|
break;
|
|
case ORTE_PROC_STATE_RUNNING:
|
|
return strdup("Running");
|
|
break;
|
|
case ORTE_PROC_STATE_AT_STG3:
|
|
return strdup("Stage 3");
|
|
break;
|
|
case ORTE_PROC_STATE_FINALIZED:
|
|
return strdup("Finalized");
|
|
break;
|
|
case ORTE_PROC_STATE_TERMINATED:
|
|
return strdup("Terminated");
|
|
break;
|
|
case ORTE_PROC_STATE_ABORTED:
|
|
return strdup("Aborted");
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return strdup("Unknown");
|
|
}
|
|
|
|
static char *pretty_univ_state(orte_universe_state_t state) {
|
|
switch(state) {
|
|
case ORTE_UNIVERSE_STATE_PRE_INIT:
|
|
return strdup("Pre-Init");
|
|
break;
|
|
case ORTE_UNIVERSE_STATE_INIT:
|
|
return strdup("Initializing");
|
|
break;
|
|
case ORTE_UNIVERSE_STATE_RUNNING:
|
|
return strdup("Running");
|
|
break;
|
|
case ORTE_UNIVERSE_STATE_FINALIZE:
|
|
return strdup("Finalized");
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return strdup("Unknown");
|
|
}
|
|
|
|
static char *pretty_node_state(orte_node_state_t state) {
|
|
switch(state) {
|
|
case ORTE_NODE_STATE_DOWN:
|
|
return strdup("Down");
|
|
break;
|
|
case ORTE_NODE_STATE_UP:
|
|
return strdup("Up");
|
|
break;
|
|
case ORTE_NODE_STATE_REBOOT:
|
|
return strdup("Reboot");
|
|
break;
|
|
case ORTE_NODE_STATE_UNKNOWN:
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return strdup("Unknown");
|
|
}
|