/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/*
 * orte-ps: connect to the first universe found on this machine, collect
 * its job, process (vpid) and node records from the registry (GPR) and
 * print them as text tables.
 */

#include "orte_config.h"

/*
 * NOTE(review): the names of the system headers were missing from the text
 * under review.  The guarded ones are reconstructed from their HAVE_*
 * guards; the three unguarded ones (stdio.h, errno.h, dirent.h) are a best
 * guess based on what the code uses -- confirm against the original file.
 */
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif  /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_LIBGEN_H
#include <libgen.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif  /* HAVE_STRING_H */
#include <dirent.h>

#include "orte/orte_constants.h"

#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/os_path.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/util/univ_info.h"
#include "orte/util/sys_info.h"
#include "orte/util/proc_info.h"
#include "opal/util/os_path.h"
#include "orte/util/session_dir.h"
#include "orte/util/universe_setup_file_io.h"

#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/ras/ras_types.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/ras/base/ras_base_node.h"

#include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h"

extern char **environ;

/*******************
 * Universe/job/vpid information Objects
 *******************/

/** One process (vpid) of a job, as read from the job's registry segment. */
struct orte_ps_vpid_info_t {
    /** This is an object, so it must have a super */
    opal_list_item_t super;

    /** General VPID Information */
    size_t rank;
    pid_t  pid;
    orte_process_name_t name;
    char  *node;                /* owned by this object; NULL until read */
    orte_proc_state_t state;
    size_t app_context_idx;
};
typedef struct orte_ps_vpid_info_t orte_ps_vpid_info_t;

OBJ_CLASS_DECLARATION(orte_ps_vpid_info_t);
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj);
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj);

OBJ_CLASS_INSTANCE(orte_ps_vpid_info_t,
                   opal_list_item_t,
                   orte_ps_vpid_info_construct,
                   orte_ps_vpid_info_destruct);

/** One job of a universe, together with the list of its processes. */
struct orte_ps_job_info_t {
    /** This is an object, so it must have a super */
    opal_list_item_t super;

    /** General Job Information */
    orte_jobid_t id;
    orte_job_state_t state;
    size_t num_init;
    size_t num_launched;
    size_t num_running;
    size_t num_finalized;
    size_t num_terminated;
    size_t num_aborted;
    size_t slots;
    orte_vpid_t vpid_start;
    orte_vpid_t vpid_range;

    orte_app_context_t **app_context;
    size_t num_app_context;

    /** List of vpids */
    opal_list_t vpid_list;
};
typedef struct orte_ps_job_info_t orte_ps_job_info_t;

OBJ_CLASS_DECLARATION(orte_ps_job_info_t);
void orte_ps_job_info_construct(orte_ps_job_info_t *obj);
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj);

OBJ_CLASS_INSTANCE(orte_ps_job_info_t,
                   opal_list_item_t,
                   orte_ps_job_info_construct,
                   orte_ps_job_info_destruct);

/** One universe, together with the jobs and nodes gathered from it. */
struct orte_ps_universe_info_t {
    /** This is an object, so it must have a super */
    opal_list_item_t super;

    /** Universe information */
    orte_universe_t universe_info;

    /** List of Jobs */
    opal_list_t job_list;

    /** List of nodes on orte-node segment */
    opal_list_t nodes;
};
typedef struct orte_ps_universe_info_t orte_ps_universe_info_t;

OBJ_CLASS_DECLARATION(orte_ps_universe_info_t);
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj);
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj);

OBJ_CLASS_INSTANCE(orte_ps_universe_info_t,
                   opal_list_item_t,
                   orte_ps_universe_info_construct,
                   orte_ps_universe_info_destruct);

/******************
 * Local Functions
 ******************/
static int orte_ps_init(void);
static int parse_args(int argc, char *argv[]);

static int connect_to_universe(orte_universe_t universe_info);

static int gather_information(orte_ps_universe_info_t* universe);
static int gather_active_jobs(orte_ps_universe_info_t* universe);
static int gather_nodes(orte_ps_universe_info_t* universe);
static int gather_job_info(orte_ps_universe_info_t* universe);
static int gather_vpid_info(orte_ps_universe_info_t* universe);

static int pretty_print(orte_ps_universe_info_t* universe);
static int pretty_print_nodes(opal_list_t *nodes);
static int pretty_print_jobs(opal_list_t *jobs);
static int pretty_print_vpids(orte_ps_job_info_t *job);

/*
 * The pretty_*_state() functions return pointers to string literals: the
 * caller must neither modify nor free the result.
 */
static const char *pretty_univ_state(orte_universe_state_t state);
static const char *pretty_node_state(orte_node_state_t state);
static const char *pretty_job_state(orte_job_state_t state);
static const char *pretty_vpid_state(orte_proc_state_t state);

/* Small private helpers */
static const char *str_or_empty(const char *str);
static char *dup_or_null(const char *str);
static void copy_universe_info(orte_universe_t *dst, const orte_universe_t *src);
static int  widen(int current, const char *value);
static void print_rule(int width);
static void format_proc_name(char *buf, size_t buf_len,
                             const orte_process_name_t *name);
static const char *app_name_for_vpid(const orte_ps_job_info_t *job,
                                     const orte_ps_vpid_info_t *vpid);
static bool key_matches(const char *key, const char *wanted);
static int  read_size_value(orte_gpr_keyval_t *keyval, size_t *dest);
static int  read_vpid_value(orte_gpr_keyval_t *keyval, orte_vpid_t *dest);
static void release_gpr_values(orte_gpr_value_t **values, size_t num_values);
static void free_tokens(char **tokens, size_t num_tokens);

/* Layout constants shared by the table printers */
enum {
    ORTE_PS_COL_SEP       = 3,   /* width of the " | " column separator   */
    ORTE_PS_NUM_COL_WIDTH = 6,   /* fixed width used for numeric columns  */
    ORTE_PS_PROC_NAME_LEN = 64   /* room for "cellid.jobid.vpid" + NUL    */
};

/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
typedef struct {
    bool help;
    bool verbose;
    char *universe;
    int  jobid;
    int  vpid;
    bool gpr_dump;
    bool attached;   /* set once connect_to_universe() has succeeded */
    bool nodes;
} orte_ps_globals_t;

orte_ps_globals_t orte_ps_globals;

opal_cmd_line_init_t cmd_line_opts[] = {
    { NULL, NULL, NULL,
      'h', NULL, "help",
      0,
      &orte_ps_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
      "This help message" },

    { NULL, NULL, NULL,
      'v', NULL, "verbose",
      0,
      &orte_ps_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
      "Be Verbose" },

    { NULL, NULL, NULL,
      'd', NULL, "dump",
      0,
      &orte_ps_globals.gpr_dump, OPAL_CMD_LINE_TYPE_BOOL,
      "Dump the state of the GPR" },

    { NULL, NULL, NULL,
      '\0', NULL, "universe",
      1,
      &orte_ps_globals.universe, OPAL_CMD_LINE_TYPE_STRING,
      "Specify a universe" },

    { NULL, NULL, NULL,
      'j', NULL, "jobid",
      1,
      &orte_ps_globals.jobid, OPAL_CMD_LINE_TYPE_INT,
      "Specify a specific jobid" },

    { NULL, NULL, NULL,
      'p', NULL, "vpid",
      1,
      &orte_ps_globals.vpid, OPAL_CMD_LINE_TYPE_INT,
      "Specify a specific vpid. Must specify a --jobid as well" },

    /* A flag without parameters whose destination is a bool: it must be
     * declared BOOL so the parser does not store an int into it. */
    { NULL, NULL, NULL,
      'n', NULL, "nodes",
      0,
      &orte_ps_globals.nodes, OPAL_CMD_LINE_TYPE_BOOL,
      "Print Node Information" },

    /* End of list */
    { NULL, NULL, NULL,
      '\0', NULL, NULL,
      0,
      NULL, OPAL_CMD_LINE_TYPE_NULL,
      NULL }
};

/**
 * Program entry point.
 *
 * Parses the command line, initializes OPAL and the parts of ORTE needed
 * before attaching, searches for universes, then connects to the first
 * one found, gathers its information and prints it (optionally followed
 * by a GPR dump).
 *
 * @return ORTE_SUCCESS on success, otherwise the first error code met.
 */
int
main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t universe_list;
    opal_list_t universe_search_result;
    opal_list_item_t *item = NULL;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
        return ret;
    }

    /* Nothing has been constructed or attached yet, so there is nothing
     * for the cleanup section to undo: return directly. */
    if (ORTE_SUCCESS != (ret = orte_ps_init())) {
        return ret;
    }

    OBJ_CONSTRUCT(&universe_list, opal_list_t);
    OBJ_CONSTRUCT(&universe_search_result, opal_list_t);

    /*
     * Get the directory listing
     */
    if (orte_ps_globals.verbose) {
        printf("orte_ps: Acquiring universe list...\n");
    }

    if (ORTE_SUCCESS != (ret = orte_universe_search(&universe_search_result))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * For each universe in the listing
     */
    for (item  = opal_list_get_first(&universe_search_result);
         item != opal_list_get_end(&universe_search_result);
         item  = opal_list_get_next(item)) {
        orte_ps_universe_info_t *univ;

        /*
         * Copy over the universe information.  The copy is put on
         * universe_list right away so the cleanup section releases it.
         */
        univ = OBJ_NEW(orte_ps_universe_info_t);
        if (NULL == univ) {
            exit_status = ORTE_ERROR;
            goto cleanup;
        }
        copy_universe_info(&univ->universe_info, (orte_universe_t *) item);
        opal_list_append(&universe_list, &(univ->super));

        /*
         * Connect to the universe
         */
        if (orte_ps_globals.verbose) {
            printf("orte_ps: Connecting to universe: %s\n",
                   str_or_empty(univ->universe_info.name));
        }
        if (ORTE_SUCCESS != (ret = connect_to_universe(univ->universe_info))) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Gather the information
         */
        if (orte_ps_globals.verbose) {
            printf("orte_ps: Gathering Universe Information\n");
        }
        if (ORTE_SUCCESS != (ret = gather_information(univ))) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Print the General Universe information
         */
        if (ORTE_SUCCESS != (ret = pretty_print(univ))) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * If we were asked to dump the GPR then do so
         */
        if (orte_ps_globals.gpr_dump) {
            if (ORTE_SUCCESS != (ret = orte_gpr.dump_all())) {
                exit_status = ret;
                goto cleanup;
            }
        }

        /*
         * Since connecting and disconnecting from a universe is
         * not well defined, only allow connection to the first
         * universe found.
         */
        break;
    }

    /***************
     * Cleanup
     ***************/
 cleanup:
    while (NULL != (item = opal_list_remove_first(&universe_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&universe_list);

    while (NULL != (item = opal_list_remove_first(&universe_search_result))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&universe_search_result);

    /*
     * Only finalize if we are attached to a specific universe
     */
    if (orte_ps_globals.attached) {
        if (OPAL_SUCCESS != (ret = orte_finalize())) {
            return ret;
        }
    }

    return exit_status;
}

/**
 * Parse the command line into orte_ps_globals and export any MCA
 * arguments into the process environment.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR when parsing failed, --help was
 *         requested (usage is printed), or --vpid was given without
 *         --jobid.
 */
static int parse_args(int argc, char *argv[])
{
    int i, ret, count;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;
    /* Defaults: -1 means "no jobid / vpid requested" */
    orte_ps_globals_t defaults = { false, false, NULL,
                                   -1, -1,
                                   false, false, false };

    orte_ps_globals = defaults;

    /* Parse the command line options */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);

    /**
     * Put all of the MCA arguments in the environment.  putenv() keeps
     * the pointer it is given, so these strings are deliberately never
     * freed.
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    count = opal_argv_count(app_env);
    for (i = 0; i < count; ++i) {
        putenv(app_env[i]);
    }
    count = opal_argv_count(global_env);
    for (i = 0; i < count; ++i) {
        putenv(global_env[i]);
    }

    /**
     * Now start parsing our specific arguments
     */
    if (OPAL_SUCCESS != ret || orte_ps_globals.help) {
        char *usage = opal_cmd_line_get_usage_msg(&cmd_line);

        opal_show_help("help-orte-ps.txt", "usage", true, usage);
        free(usage);
        return ORTE_ERROR;
    }

    /*
     * If they specify a vpid, they must specify a jobid
     */
    if (0 <= orte_ps_globals.vpid && 0 > orte_ps_globals.jobid) {
        opal_show_help("help-orte-ps.txt", "vpid-usage", true,
                       orte_ps_globals.vpid);
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}

/**
 * Initialize OPAL and the pieces of ORTE that are needed before a
 * universe is chosen (error strings, MCA parameters, system and process
 * info).
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int orte_ps_init(void)
{
    int ret;

    /*
     * We are trying to attach to another process' GPR so we need to
     * attach no matter if it is identified as private or not.
     */
    opal_setenv(mca_base_param_env_var("universe_console"),
                "1",
                true, &environ);

    /***************************
     * We need all of OPAL
     ***************************/
    if (ORTE_SUCCESS != (ret = opal_init())) {
        return ret;
    }

    /***************************
     * And ORTE, but need to do a bit of a dance first
     ***************************/
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params(true))) {
        return ret;
    }

    /* Ensure the system_info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_sys_info())) {
        return ret;
    }

    /* Ensure the process info structure is instantiated and initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        return ret;
    }

    return ORTE_SUCCESS;
}

/************************
 * Small helpers
 ************************/

/** Return str, or "" when str is NULL, so it is always safe for "%s". */
static const char *str_or_empty(const char *str)
{
    return (NULL == str) ? "" : str;
}

/** strdup() that maps NULL to NULL instead of dereferencing it. */
static char *dup_or_null(const char *str)
{
    return (NULL == str) ? NULL : strdup(str);
}

/** Copy the fields of a universe record, duplicating every string. */
static void copy_universe_info(orte_universe_t *dst, const orte_universe_t *src)
{
    dst->state             = src->state;
    dst->persistence       = src->persistence;
    dst->console           = src->console;
    dst->console_connected = src->console_connected;

    dst->name       = dup_or_null(src->name);
    dst->host       = dup_or_null(src->host);
    dst->uid        = dup_or_null(src->uid);
    dst->scope      = dup_or_null(src->scope);
    dst->seed_uri   = dup_or_null(src->seed_uri);
    dst->scriptfile = dup_or_null(src->scriptfile);
}

/** Return the larger of current and strlen(value); NULL counts as "". */
static int widen(int current, const char *value)
{
    int len = (int) strlen(str_or_empty(value));

    return (len > current) ? len : current;
}

/** Print a horizontal rule of width dashes followed by a newline. */
static void print_rule(int width)
{
    int i;

    for (i = 0; i < width; ++i) {
        printf("-");
    }
    printf("\n");
}

/** Format a process name as "cellid.jobid.vpid" into buf. */
static void format_proc_name(char *buf, size_t buf_len,
                             const orte_process_name_t *name)
{
    /* The casts make the arguments match "%d" whatever the width of the
     * underlying id types is. */
    snprintf(buf, buf_len, "%d.%d.%d",
             (int) name->cellid, (int) name->jobid, (int) name->vpid);
}

/**
 * Return the executable name of the app context a vpid belongs to, or
 * "" when the job has no matching context (or the context has no name).
 */
static const char *app_name_for_vpid(const orte_ps_job_info_t *job,
                                     const orte_ps_vpid_info_t *vpid)
{
    size_t i;

    for (i = 0; i < job->num_app_context; ++i) {
        if (job->app_context[i]->idx == vpid->app_context_idx) {
            return str_or_empty(job->app_context[i]->app);
        }
    }
    return "";
}

/**
 * Registry keys are compared as prefixes: key matches when it starts
 * with wanted.
 */
static bool key_matches(const char *key, const char *wanted)
{
    if (NULL == key) {
        return false;
    }
    return 0 == strncmp(key, wanted, strlen(wanted));
}

/** Read an ORTE_SIZE value out of a keyval and copy it into *dest. */
static int read_size_value(orte_gpr_keyval_t *keyval, size_t *dest)
{
    int ret;
    size_t *tmp = NULL;

    if (ORTE_SUCCESS != (ret = orte_dss.get((void **) &tmp,
                                            keyval->value, ORTE_SIZE))) {
        return ret;
    }
    *dest = *tmp;
    return ORTE_SUCCESS;
}

/** Read an ORTE_VPID value out of a keyval and copy it into *dest. */
static int read_vpid_value(orte_gpr_keyval_t *keyval, orte_vpid_t *dest)
{
    int ret;
    orte_vpid_t *tmp = NULL;

    if (ORTE_SUCCESS != (ret = orte_dss.get((void **) &tmp,
                                            keyval->value, ORTE_VPID))) {
        return ret;
    }
    *dest = *tmp;
    return ORTE_SUCCESS;
}

/**
 * Release an array of values returned by orte_gpr.get(): every element,
 * then the array itself.  Safe to call with values == NULL.
 */
static void release_gpr_values(orte_gpr_value_t **values, size_t num_values)
{
    size_t i;

    if (NULL == values) {
        return;
    }
    for (i = 0; i < num_values; ++i) {
        if (NULL != values[i]) {
            OBJ_RELEASE(values[i]);
        }
    }
    free(values);
}

/**
 * Free a token array produced by orte_schema.get_proc_tokens().
 * NOTE(review): assumes the array and each of its num_tokens strings are
 * heap-allocated and owned by the caller -- confirm against the schema
 * implementation.
 */
static void free_tokens(char **tokens, size_t num_tokens)
{
    size_t i;

    if (NULL == tokens) {
        return;
    }
    for (i = 0; i < num_tokens; ++i) {
        free(tokens[i]);
    }
    free(tokens);
}

/************************
 * Printing
 ************************/

/**
 * Print the one-row universe table, then (if --nodes was given) the node
 * table, then every job with its processes.
 *
 * Missing (NULL) universe strings are printed as empty columns.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print(orte_ps_universe_info_t* universe)
{
    const orte_universe_t *info = &universe->universe_info;
    const char *name    = str_or_empty(info->name);
    const char *host    = str_or_empty(info->host);
    const char *uid     = str_or_empty(info->uid);
    const char *scope   = str_or_empty(info->scope);
    const char *state   = pretty_univ_state(info->state);
    const char *persist = info->persistence ? "true" : "false";
    int len_name, len_host, len_uid, len_per, len_scope, len_state;
    int line_len;

    /*
     * Calculate segment lengths: each column is as wide as the longer of
     * its header and its value.
     */
    len_name  = widen((int) strlen("Universe Name"), name);
    len_host  = widen((int) strlen("Hostname"), host);
    len_uid   = widen((int) strlen("UID"), uid);
    len_per   = (int) strlen("Persistent");
    len_scope = widen((int) strlen("Scope"), scope);
    len_state = widen((int) strlen("State"), state);

    line_len = (len_name  + ORTE_PS_COL_SEP +
                len_host  + ORTE_PS_COL_SEP +
                len_uid   + ORTE_PS_COL_SEP +
                len_per   + ORTE_PS_COL_SEP +
                len_scope + ORTE_PS_COL_SEP +
                len_state) + 3;

    /*
     * Print header
     */
    printf("%*s | ", len_name , "Universe Name");
    printf("%*s | ", len_host , "Hostname");
    printf("%*s | ", len_uid  , "UID");
    printf("%*s | ", len_per  , "Persistent");
    printf("%*s | ", len_scope, "Scope");
    printf("%*s |" , len_state, "State");
    printf("\n");

    print_rule(line_len);

    /*
     * Print Info
     */
    printf("%*s | ", len_name , name);
    printf("%*s | ", len_host , host);
    printf("%*s | ", len_uid  , uid);
    printf("%*s | ", len_per  , persist);
    printf("%*s | ", len_scope, scope);
    printf("%*s |" , len_state, state);
    printf("\n");

    printf("\n");

    /*
     * Print Node Information
     */
    if (orte_ps_globals.nodes) {
        pretty_print_nodes(&universe->nodes);
    }

    /*
     * Print Job Information
     */
    pretty_print_jobs(&universe->job_list);

    return ORTE_SUCCESS;
}

/**
 * Print the table of nodes: name, architecture, cell id, state and slot
 * counts.  Missing (NULL) names or architectures print as empty columns.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_nodes(opal_list_t *nodes)
{
    opal_list_item_t *item = NULL;
    int line_len;
    int len_name, len_arch, len_cell, len_state,
        len_slots, len_slots_i, len_slots_a, len_slots_m;

    /*
     * Calculate segment lengths
     */
    len_name    = (int) strlen("Node Name");
    len_arch    = (int) strlen("Arch");
    len_cell    = (int) strlen("Cell ID");
    len_state   = (int) strlen("State");
    len_slots   = (int) strlen("Slots");
    len_slots_i = (int) strlen("Slots In Use");
#if 0
    len_slots_a = (int) strlen("Slots Alloc");
#else
    /* The "Slots Alloc" column is not printed; -3 cancels the separator
     * that line_len below still counts for it. */
    len_slots_a = -3;
#endif
    len_slots_m = (int) strlen("Slots Max");

    for (item  = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item  = opal_list_get_next(item)) {
        orte_ras_node_t *node = (orte_ras_node_t *) item;

        len_name  = widen(len_name,  node->node_name);
        len_arch  = widen(len_arch,  node->node_arch);
        len_state = widen(len_state, pretty_node_state(node->node_state));
    }

    /*
     * JJH Since node_slots_inuse and node_slots_alloc are not used properly
     * JJH do not display them to the user.
     */
    line_len = (len_name    + ORTE_PS_COL_SEP +
                len_arch    + ORTE_PS_COL_SEP +
                len_cell    + ORTE_PS_COL_SEP +
                len_state   + ORTE_PS_COL_SEP +
                len_slots   + ORTE_PS_COL_SEP +
                len_slots_i + ORTE_PS_COL_SEP +
                len_slots_a + ORTE_PS_COL_SEP +
                len_slots_m + ORTE_PS_COL_SEP);

    /*
     * Print the header
     */
    printf("%*s | ", len_name,    "Node Name");
    printf("%*s | ", len_arch,    "Arch");
    printf("%*s | ", len_cell,    "Cell ID");
    printf("%*s | ", len_state,   "State");
    printf("%*s | ", len_slots,   "Slots");
    printf("%*s | ", len_slots_m, "Slots Max");
    printf("%*s | ", len_slots_i, "Slots In Use");
#if 0
    printf("%*s | ", len_slots_a, "Slots Alloc");
#endif
    printf("\n");

    print_rule(line_len);

    /*
     * Print Info
     */
    for (item  = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item  = opal_list_get_next(item)) {
        orte_ras_node_t *node = (orte_ras_node_t *) item;

        /* Numeric fields are cast to int so they match "%d" exactly. */
        printf("%*s | ", len_name,    str_or_empty(node->node_name));
        printf("%*s | ", len_arch,    str_or_empty(node->node_arch));
        printf("%*d | ", len_cell,    (int) node->node_cellid);
        printf("%*s | ", len_state,   pretty_node_state(node->node_state));
        printf("%*d | ", len_slots,   (int) node->node_slots);
        printf("%*d | ", len_slots_m, (int) node->node_slots_max);
        printf("%*d | ", len_slots_i, (int) node->node_slots_inuse);
#if 0
        printf("%*d | ", len_slots_a, (int) node->node_slots_alloc);
#endif
        printf("\n");
    }

    return ORTE_SUCCESS;
}

/**
 * Print every job as its own small table (header, rule, one row),
 * followed by the table of its processes.  Job 0 (the HNP) is printed
 * without processes.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_jobs(opal_list_t *jobs)
{
    opal_list_item_t *item = NULL;

    for (item  = opal_list_get_first(jobs);
         item != opal_list_get_end(jobs);
         item  = opal_list_get_next(item)) {
        orte_ps_job_info_t *job = (orte_ps_job_info_t *) item;
        const char *state = pretty_job_state(job->state);
        int len_jobid, len_state, len_slots, len_vpid_s, len_vpid_r;
        int line_len;

        /*
         * Calculate segment lengths
         */
        len_jobid  = ORTE_PS_NUM_COL_WIDTH;
        len_state  = widen((int) strlen("State"), state);
        len_slots  = ORTE_PS_NUM_COL_WIDTH;
        len_vpid_s = (int) strlen("VPID Start");
        len_vpid_r = (int) strlen("VPID Range");

        /* Five columns, each followed by a separator */
        line_len = (len_jobid  + ORTE_PS_COL_SEP +
                    len_state  + ORTE_PS_COL_SEP +
                    len_slots  + ORTE_PS_COL_SEP +
                    len_vpid_s + ORTE_PS_COL_SEP +
                    len_vpid_r + ORTE_PS_COL_SEP);

        /*
         * Print Header
         */
        printf("\n");
        printf("%*s | ", len_jobid  , "JobID");
        printf("%*s | ", len_state  , "State");
        printf("%*s | ", len_slots  , "Slots");
        printf("%*s | ", len_vpid_s , "VPID Start");
        printf("%*s | ", len_vpid_r , "VPID Range");
        printf("\n");

        print_rule(line_len);

        /*
         * Print Info (numeric fields cast to int to match "%d")
         */
        printf("%*d | ", len_jobid , (int) job->id);
        printf("%*s | ", len_state , state);
        printf("%*d | ", len_slots , (int) job->slots);
        printf("%*d | ", len_vpid_s, (int) job->vpid_start);
        printf("%*d | ", len_vpid_r, (int) job->vpid_range);
        printf("\n");

        /*
         * Pretty print all VPID's in job
         */
        if (0 == job->id) {
            /* No vpids for the HNP */
            continue;
        }
        pretty_print_vpids(job);
    }

    return ORTE_SUCCESS;
}

/**
 * Print the tab-indented table of the processes of one job: application
 * name, ORTE name, rank, pid, node and state.
 *
 * A process whose app context or node is unknown gets an empty column so
 * that the remaining columns stay aligned.
 *
 * @return ORTE_SUCCESS always.
 */
static int pretty_print_vpids(orte_ps_job_info_t *job)
{
    opal_list_item_t *item = NULL;
    char orte_name[ORTE_PS_PROC_NAME_LEN];
    int len_o_proc_name, len_proc_name, len_rank, len_pid,
        len_state, len_node;
    int line_len;

    /*
     * Calculate segment lengths.  Every text column starts at the width
     * of its header and grows to fit the widest value.
     */
    len_o_proc_name = (int) strlen("ORTE Name");
    len_proc_name   = (int) strlen("Process Name");
    len_rank        = ORTE_PS_NUM_COL_WIDTH;
    len_pid         = ORTE_PS_NUM_COL_WIDTH;
    len_state       = (int) strlen("State");
    len_node        = (int) strlen("Node");

    for (item  = opal_list_get_first(&(job->vpid_list));
         item != opal_list_get_end(&(job->vpid_list));
         item  = opal_list_get_next(item)) {
        orte_ps_vpid_info_t *vpid = (orte_ps_vpid_info_t *) item;

        format_proc_name(orte_name, sizeof(orte_name), &vpid->name);

        len_proc_name   = widen(len_proc_name,   app_name_for_vpid(job, vpid));
        len_o_proc_name = widen(len_o_proc_name, orte_name);
        len_node        = widen(len_node,        vpid->node);
        len_state       = widen(len_state,       pretty_vpid_state(vpid->state));
    }

    /* Six columns, each followed by a separator */
    line_len = (len_o_proc_name + ORTE_PS_COL_SEP +
                len_proc_name   + ORTE_PS_COL_SEP +
                len_rank        + ORTE_PS_COL_SEP +
                len_pid         + ORTE_PS_COL_SEP +
                len_state       + ORTE_PS_COL_SEP +
                len_node        + ORTE_PS_COL_SEP);

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name   , "Process Name");
    printf("%*s | ", len_o_proc_name , "ORTE Name");
    printf("%*s | ", len_rank        , "Rank");
    printf("%*s | ", len_pid         , "PID");
    printf("%*s | ", len_node        , "Node");
    printf("%*s | ", len_state       , "State");
    printf("\n");

    printf("\t");
    print_rule(line_len);

    /*
     * Print Info
     */
    for (item  = opal_list_get_first(&(job->vpid_list));
         item != opal_list_get_end(&(job->vpid_list));
         item  = opal_list_get_next(item)) {
        orte_ps_vpid_info_t *vpid = (orte_ps_vpid_info_t *) item;

        format_proc_name(orte_name, sizeof(orte_name), &vpid->name);

        printf("\t");
        printf("%*s | ", len_proc_name   , app_name_for_vpid(job, vpid));
        printf("%*s | ", len_o_proc_name , orte_name);
        printf("%*d | ", len_rank        , (int) vpid->rank);
        printf("%*d | ", len_pid         , (int) vpid->pid);
        printf("%*s | ", len_node        , str_or_empty(vpid->node));
        printf("%*s | ", len_state       , pretty_vpid_state(vpid->state));
        printf("\n");
    }

    return ORTE_SUCCESS;
}

/************************
 * Connecting and gathering
 ************************/

/**
 * Attach this process to the given universe.
 *
 * Exports "uid@host:name" as the "universe" MCA parameter, then either
 * initializes ORTE (first connection) or restarts it against the
 * universe's seed URI (already attached).  On success
 * orte_ps_globals.attached is set.  Disconnecting from a previously
 * attached universe is intentionally not attempted here.
 *
 * @return ORTE_SUCCESS, ORTE_ERROR if the parameter string could not be
 *         built, or the error code of the init/restart call.
 */
static int connect_to_universe(orte_universe_t universe_info)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *univ_mca_param = NULL;

    /*
     * Construct the MCA parameter
     */
    if (0 > asprintf(&univ_mca_param, "%s@%s:%s",
                     str_or_empty(universe_info.uid),
                     str_or_empty(universe_info.host),
                     str_or_empty(universe_info.name))) {
        /* The buffer content is undefined when asprintf() fails */
        return ORTE_ERROR;
    }

    /*
     * Set the environment universe information
     */
    opal_setenv(mca_base_param_env_var("universe"),
                univ_mca_param,
                true, &environ);

    /*
     * Restart ORTE in the requested universe
     */
    if (!orte_ps_globals.attached) {
        if (ORTE_SUCCESS != (ret = orte_system_init(true))) {
            exit_status = ret;
            goto cleanup;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_restart(orte_process_info.my_name,
                                                universe_info.seed_uri))) {
            printf("orte_restart: FAILED (%d)\n", ret);
            exit_status = ret;
            goto cleanup;
        }
    }

    orte_ps_globals.attached = true;

 cleanup:
    free(univ_mca_param);

    return exit_status;
}

/**
 * Gather everything that is printed for a universe: active jobs, nodes,
 * per-job details and per-process details, in that order.
 *
 * @return ORTE_SUCCESS, or the error code of the first step that failed
 *         (later steps are then skipped).
 */
static int gather_information(orte_ps_universe_info_t* universe)
{
    int ret;

    if (ORTE_SUCCESS != (ret = gather_active_jobs(universe))) {
        return ret;
    }
    if (ORTE_SUCCESS != (ret = gather_nodes(universe))) {
        return ret;
    }
    if (ORTE_SUCCESS != (ret = gather_job_info(universe))) {
        return ret;
    }
    if (ORTE_SUCCESS != (ret = gather_vpid_info(universe))) {
        return ret;
    }

    return ORTE_SUCCESS;
}

/**
 * Read the job-info segment and append one orte_ps_job_info_t per job to
 * universe->job_list.  When --jobid was given, only that job is kept.
 *
 * @return ORTE_SUCCESS, or the error code of the failing registry /
 *         unpack call.  Jobs appended before a failure stay on the list.
 */
static int gather_active_jobs(orte_ps_universe_info_t* universe)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_gpr_value_t **values = NULL;
    orte_ps_job_info_t *job = NULL;   /* job being built; not yet on the list */
    size_t i, j, num_values = 0;

    /**********************
     * Job Info segment
     **********************/
    if (ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                            ORTE_JOBINFO_SEGMENT,
                                            NULL,
                                            NULL,
                                            &num_values,
                                            &values))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Parse the structure returned
     */
    for (i = 0; i < num_values; ++i) {
        orte_gpr_value_t *value = values[i];

        job = OBJ_NEW(orte_ps_job_info_t);
        if (NULL == job) {
            exit_status = ORTE_ERROR;
            goto cleanup;
        }

        /* NOTE(review): the result of this call was never checked; it is
         * left as best-effort here. */
        orte_schema.extract_jobid_from_segment_name(&(job->id), value->tokens[0]);

        /*
         * If the user specified a jobid, then
         * only access the info for that jobid
         */
        if (0 <= orte_ps_globals.jobid &&
            (int) job->id != orte_ps_globals.jobid) {
            OBJ_RELEASE(job);
            job = NULL;
            continue;
        }

        for (j = 0; j < value->cnt; ++j) {
            orte_gpr_keyval_t *keyval = value->keyvals[j];
            orte_job_state_t *job_state = NULL;

            if (!key_matches(keyval->key, ORTE_JOB_STATE_KEY)) {
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_dss.get((void **) &job_state,
                                                    keyval->value,
                                                    ORTE_JOB_STATE))) {
                exit_status = ret;
                goto cleanup;
            }
            job->state = *job_state;
        }

        /* The list now owns the job */
        opal_list_append(&universe->job_list, &(job->super));
        job = NULL;
    }

 cleanup:
    if (NULL != job) {
        OBJ_RELEASE(job);
    }
    release_gpr_values(values, num_values);

    return exit_status;
}

/**
 * Fill universe->nodes with the nodes known to the resource allocator.
 *
 * @return the result of orte_ras_base_node_query().
 */
static int gather_nodes(orte_ps_universe_info_t* universe)
{
    return orte_ras_base_node_query(&(universe->nodes));
}

/**
 * For every job on universe->job_list, fetch its app contexts and read
 * the counters, slot count and vpid range from the job-globals container
 * of the job's segment.
 *
 * @return ORTE_SUCCESS, or the error code of the failing registry /
 *         unpack call.
 */
static int gather_job_info(orte_ps_universe_info_t* universe)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *segment = NULL;
    char *tokens[2];
    orte_gpr_value_t **values = NULL;
    size_t i, j, num_values = 0;
    opal_list_item_t *item = NULL;

    /*
     * Here we are just focused on the orte-job-globals container
     */
    tokens[0] = ORTE_JOB_GLOBALS;
    tokens[1] = NULL;

    /*
     * For each job in the universe
     */
    for (item  = opal_list_get_first(&(universe->job_list));
         item != opal_list_get_end(&(universe->job_list));
         item  = opal_list_get_next(item)) {
        orte_ps_job_info_t *job = (orte_ps_job_info_t *) item;

        /*
         * Get the App Context(s).  Best-effort: a job without app
         * contexts is still listed, only without process names.
         */
        orte_rmgr_base_get_app_context(job->id,
                                       &job->app_context,
                                       &job->num_app_context);

        /*
         * Access the job segment
         */
        orte_schema.get_job_segment_name(&segment, job->id);

        if (ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                                segment,
                                                tokens,
                                                NULL,
                                                &num_values,
                                                &values))) {
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Parse the structure returned.  Keys are tested in a fixed
         * order because the comparison is a prefix match.
         */
        for (i = 0; i < num_values; ++i) {
            orte_gpr_value_t *value = values[i];

            for (j = 0; j < value->cnt; ++j) {
                orte_gpr_keyval_t *keyval = value->keyvals[j];
                const char *key = keyval->key;

                if (key_matches(key, ORTE_PROC_NUM_AT_INIT)) {
                    ret = read_size_value(keyval, &job->num_init);
                } else if (key_matches(key, ORTE_PROC_NUM_LAUNCHED)) {
                    ret = read_size_value(keyval, &job->num_launched);
                } else if (key_matches(key, ORTE_PROC_NUM_RUNNING)) {
                    ret = read_size_value(keyval, &job->num_running);
                } else if (key_matches(key, ORTE_PROC_NUM_FINALIZED)) {
                    ret = read_size_value(keyval, &job->num_finalized);
                } else if (key_matches(key, ORTE_PROC_NUM_TERMINATED)) {
                    ret = read_size_value(keyval, &job->num_terminated);
                } else if (key_matches(key, ORTE_PROC_NUM_ABORTED)) {
                    ret = read_size_value(keyval, &job->num_aborted);
                } else if (key_matches(key, ORTE_JOB_SLOTS_KEY)) {
                    ret = read_size_value(keyval, &job->slots);
                } else if (key_matches(key, ORTE_JOB_VPID_START_KEY)) {
                    ret = read_vpid_value(keyval, &job->vpid_start);
                } else if (key_matches(key, ORTE_JOB_VPID_RANGE_KEY)) {
                    ret = read_vpid_value(keyval, &job->vpid_range);
                } else {
                    continue;   /* not a key we display */
                }

                if (ORTE_SUCCESS != ret) {
                    exit_status = ret;
                    goto cleanup;
                }
            }
        }

        /* Done with this job's registry data */
        release_gpr_values(values, num_values);
        values = NULL;
        num_values = 0;
        free(segment);
        segment = NULL;
    }

 cleanup:
    release_gpr_values(values, num_values);
    free(segment);

    return exit_status;
}

/**
 * For every job except job 0 (the HNP), read the registry container of
 * each vpid in [vpid_start, vpid_start + vpid_range) and append an
 * orte_ps_vpid_info_t to the job's vpid_list.  When --vpid was given,
 * only that vpid is read.
 *
 * @return ORTE_SUCCESS; ORTE_ERROR when the requested vpid lies beyond
 *         the job's range (a help message is shown) or an object could
 *         not be allocated; otherwise the error code of the failing
 *         registry / unpack call.
 */
static int gather_vpid_info(orte_ps_universe_info_t* universe)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *segment = NULL;
    char **tokens = NULL;
    size_t num_tokens = 0;
    orte_gpr_value_t **values = NULL;
    size_t num_values = 0;
    orte_ps_vpid_info_t *vpid = NULL;   /* being built; not yet on a list */
    opal_list_item_t *job_item = NULL;
    size_t i, j, v;

    /*
     * For each Job in the universe
     */
    for (job_item  = opal_list_get_first(&(universe->job_list));
         job_item != opal_list_get_end(&(universe->job_list));
         job_item  = opal_list_get_next(job_item)) {
        orte_ps_job_info_t *job = (orte_ps_job_info_t *) job_item;

        /*
         * Skip getting the vpid's for the HNP, since the information is
         * not complete
         */
        if (0 == job->id) {
            continue;
        }

        /*
         * For each vpid in the job
         */
        for (v = job->vpid_start; v < (job->vpid_start + job->vpid_range); ++v) {
            orte_process_name_t proc;

            /*
             * If the user specified a vpid, then just get that one
             */
            if (0 <= orte_ps_globals.vpid) {
                /*
                 * Check to make sure it is a valid vpid
                 */
                if ((int)(job->vpid_start + job->vpid_range) <= orte_ps_globals.vpid) {
                    opal_show_help("help-orte-ps.txt", "invalid-vpid", true,
                                   orte_ps_globals.vpid,
                                   orte_ps_globals.jobid);
                    exit_status = ORTE_ERROR;
                    goto cleanup;
                }
                if ((int) v != orte_ps_globals.vpid) {
                    continue;
                }
            }

            vpid = OBJ_NEW(orte_ps_vpid_info_t);
            if (NULL == vpid) {
                exit_status = ORTE_ERROR;
                goto cleanup;
            }

            /*
             * Access the job segment
             */
            orte_schema.get_job_segment_name(&segment, job->id);

            /*
             * Access the vpid container
             */
            proc.cellid = 0;
            proc.jobid  = job->id;
            proc.vpid   = v;
            orte_schema.get_proc_tokens(&tokens, &num_tokens, &proc);

            if (ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                                    segment,
                                                    tokens,
                                                    NULL,
                                                    &num_values,
                                                    &values))) {
                exit_status = ret;
                goto cleanup;
            }

            /*
             * Parse the structure returned.  Keys are tested in a fixed
             * order because the comparison is a prefix match.
             */
            for (i = 0; i < num_values; ++i) {
                orte_gpr_value_t *value = values[i];

                for (j = 0; j < value->cnt; ++j) {
                    orte_gpr_keyval_t *keyval = value->keyvals[j];
                    const char *key = keyval->key;

                    if (key_matches(key, ORTE_PROC_RANK_KEY)) {
                        ret = read_size_value(keyval, &vpid->rank);
                    } else if (key_matches(key, ORTE_PROC_APP_CONTEXT_KEY)) {
                        ret = read_size_value(keyval, &vpid->app_context_idx);
                    } else if (key_matches(key, ORTE_PROC_PID_KEY)) {
                        pid_t *tmp_pid = NULL;

                        ret = orte_dss.get((void **) &tmp_pid,
                                           keyval->value, ORTE_PID);
                        if (ORTE_SUCCESS == ret) {
                            vpid->pid = *tmp_pid;
                        }
                    } else if (key_matches(key, ORTE_PROC_NAME_KEY)) {
                        orte_process_name_t *tmp_proc = NULL;

                        ret = orte_dss.get((void **) &tmp_proc,
                                           keyval->value, ORTE_NAME);
                        if (ORTE_SUCCESS == ret) {
                            vpid->name.cellid = tmp_proc->cellid;
                            vpid->name.jobid  = tmp_proc->jobid;
                            vpid->name.vpid   = tmp_proc->vpid;
                        }
                    } else if (key_matches(key, ORTE_NODE_NAME_KEY)) {
                        char *tmp_node = NULL;

                        ret = orte_dss.get((void **) &tmp_node,
                                           keyval->value, ORTE_STRING);
                        if (ORTE_SUCCESS == ret) {
                            /* Keep our own copy; drop one read earlier */
                            free(vpid->node);
                            vpid->node = dup_or_null(tmp_node);
                        }
                    } else if (key_matches(key, ORTE_PROC_STATE_KEY)) {
                        orte_proc_state_t *tmp_state = NULL;

                        ret = orte_dss.get((void **) &tmp_state,
                                           keyval->value, ORTE_PROC_STATE);
                        if (ORTE_SUCCESS == ret) {
                            vpid->state = *tmp_state;
                        }
                    } else {
                        continue;   /* not a key we display */
                    }

                    if (ORTE_SUCCESS != ret) {
                        exit_status = ret;
                        goto cleanup;
                    }
                }
            }

            /* The job's list now owns the vpid */
            opal_list_append(&job->vpid_list, &(vpid->super));
            vpid = NULL;

            /* Done with this vpid's registry data */
            release_gpr_values(values, num_values);
            values = NULL;
            num_values = 0;
            free_tokens(tokens, num_tokens);
            tokens = NULL;
            num_tokens = 0;
            free(segment);
            segment = NULL;
        }
    }

 cleanup:
    if (NULL != vpid) {
        OBJ_RELEASE(vpid);
    }
    release_gpr_values(values, num_values);
    free_tokens(tokens, num_tokens);
    free(segment);

    return exit_status;
}

/************************
 * Object handling
 ************************/

/**
 * Constructor: give every field a defined value, so that a process whose
 * registry container lacks a key is printed with zeros / empty strings
 * instead of indeterminate memory.
 */
void orte_ps_vpid_info_construct(orte_ps_vpid_info_t *obj)
{
    obj->rank            = 0;
    obj->pid             = 0;
    obj->name.cellid     = 0;
    obj->name.jobid      = 0;
    obj->name.vpid       = 0;
    obj->node            = NULL;
    obj->state           = (orte_proc_state_t) 0;
    obj->app_context_idx = 0;
}

/** Destructor: free the node name owned by the object. */
void orte_ps_vpid_info_destruct( orte_ps_vpid_info_t *obj)
{
    free(obj->node);
    obj->node = NULL;
}

/**
 * Constructor: build the empty vpid list and give every field a defined
 * value.  In particular vpid_start / vpid_range bound the loop in
 * gather_vpid_info() and must not be left indeterminate.
 */
void orte_ps_job_info_construct(orte_ps_job_info_t *obj)
{
    OBJ_CONSTRUCT(&obj->vpid_list, opal_list_t);

    obj->id             = 0;
    obj->state          = (orte_job_state_t) 0;
    obj->num_init       = 0;
    obj->num_launched   = 0;
    obj->num_running    = 0;
    obj->num_finalized  = 0;
    obj->num_terminated = 0;
    obj->num_aborted    = 0;
    obj->slots          = 0;
    obj->vpid_start     = 0;
    obj->vpid_range     = 0;

    obj->app_context     = NULL;
    obj->num_app_context = 0;
}

/**
 * Destructor: release the app contexts, the array holding them, and
 * every vpid on the list.
 *
 * NOTE(review): the app contexts are treated as reference-counted OPAL
 * objects in a heap-allocated array, as handed out by
 * orte_rmgr_base_get_app_context() -- confirm against that function.
 */
void orte_ps_job_info_destruct( orte_ps_job_info_t *obj)
{
    opal_list_item_t *item = NULL;
    size_t i;

    if (NULL != obj->app_context) {
        for (i = 0; i < obj->num_app_context; ++i) {
            if (NULL != obj->app_context[i]) {
                OBJ_RELEASE(obj->app_context[i]);
            }
        }
        free(obj->app_context);
        obj->app_context = NULL;
    }
    obj->num_app_context = 0;

    while (NULL != (item = opal_list_remove_first(&obj->vpid_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&obj->vpid_list);
}

/** Constructor: build the empty lists and the embedded universe record. */
void orte_ps_universe_info_construct(orte_ps_universe_info_t *obj)
{
    OBJ_CONSTRUCT(&obj->job_list,      opal_list_t);
    OBJ_CONSTRUCT(&obj->nodes,         opal_list_t);
    OBJ_CONSTRUCT(&obj->universe_info, orte_universe_t);
}

/** Destructor: release all jobs and nodes, then the embedded members. */
void orte_ps_universe_info_destruct( orte_ps_universe_info_t *obj)
{
    opal_list_item_t *item = NULL;

    while (NULL != (item = opal_list_remove_first(&obj->job_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&obj->job_list);

    while (NULL != (item = opal_list_remove_first(&obj->nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&obj->nodes);

    OBJ_DESTRUCT(&obj->universe_info);
}

/************************
 * State names
 ************************/

/** Map a job state to its display name ("Unknown" for anything else). */
static const char *pretty_job_state(orte_job_state_t state)
{
    switch (state) {
    case ORTE_JOB_STATE_INIT:       return "Init";
    case ORTE_JOB_STATE_LAUNCHED:   return "Launched";
    case ORTE_JOB_STATE_AT_STG1:    return "Stage 1";
    case ORTE_JOB_STATE_AT_STG2:    return "Stage 2";
    case ORTE_JOB_STATE_RUNNING:    return "Running";
    case ORTE_JOB_STATE_AT_STG3:    return "Stage 3";
    case ORTE_JOB_STATE_FINALIZED:  return "Finalized";
    case ORTE_JOB_STATE_TERMINATED: return "Terminated";
    case ORTE_JOB_STATE_ABORTED:    return "Aborted";
    default:
        break;
    }

    return "Unknown";
}

/** Map a process state to its display name ("Unknown" for anything else). */
static const char *pretty_vpid_state(orte_proc_state_t state)
{
    switch (state) {
    case ORTE_PROC_STATE_INIT:       return "Init";
    case ORTE_PROC_STATE_LAUNCHED:   return "Launched";
    case ORTE_PROC_STATE_AT_STG1:    return "Stage 1";
    case ORTE_PROC_STATE_AT_STG2:    return "Stage 2";
    case ORTE_PROC_STATE_RUNNING:    return "Running";
    case ORTE_PROC_STATE_AT_STG3:    return "Stage 3";
    case ORTE_PROC_STATE_FINALIZED:  return "Finalized";
    case ORTE_PROC_STATE_TERMINATED: return "Terminated";
    case ORTE_PROC_STATE_ABORTED:    return "Aborted";
    default:
        break;
    }

    return "Unknown";
}

/** Map a universe state to its display name ("Unknown" for anything else). */
static const char *pretty_univ_state(orte_universe_state_t state)
{
    switch (state) {
    case ORTE_UNIVERSE_STATE_PRE_INIT: return "Pre-Init";
    case ORTE_UNIVERSE_STATE_INIT:     return "Initializing";
    case ORTE_UNIVERSE_STATE_RUNNING:  return "Running";
    case ORTE_UNIVERSE_STATE_FINALIZE: return "Finalized";
    default:
        break;
    }

    return "Unknown";
}

/** Map a node state to its display name ("Unknown" for anything else). */
static const char *pretty_node_state(orte_node_state_t state)
{
    switch (state) {
    case ORTE_NODE_STATE_DOWN:   return "Down";
    case ORTE_NODE_STATE_UP:     return "Up";
    case ORTE_NODE_STATE_REBOOT: return "Reboot";
    case ORTE_NODE_STATE_UNKNOWN:
    default:
        break;
    }

    return "Unknown";
}