1
1

Repair orte-ps by updating some of the interface code. Add ability to recover from attempting to contact non-responsive HNPs due to stale session directories. Implement the -j option. Turn "off" the -p option as it doesn't work and will take a little while to actually implement it (if anyone really cares).

This commit was SVN r21245.
Этот коммит содержится в:
Ralph Castain 2009-05-15 13:21:18 +00:00
родитель 17761f60b9
Коммит 484a6f58f2
6 изменённых файлов: 322 добавлений и 120 удалений

Просмотреть файл

@ -687,10 +687,10 @@ static int process_commands(orte_process_name_t* sender,
* back 0 procs so the tool won't hang
*/
if (!ORTE_PROC_IS_HNP) {
orte_std_cntr_t zero=0;
int32_t zero=0;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
@ -702,8 +702,8 @@ static int process_commands(orte_process_name_t* sender,
}
} else {
/* if we are the HNP, process the request */
orte_std_cntr_t i, num_jobs=0;
orte_job_t **jobs=NULL, *jobdat;
int32_t i, num_jobs;
orte_job_t *jobdat;
/* unpack the jobid */
n = 1;
@ -717,9 +717,10 @@ static int process_commands(orte_process_name_t* sender,
/* if they asked for a specific job, then just get that info */
if (ORTE_JOBID_WILDCARD != job) {
job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
if (NULL != (jobdat = orte_get_job_data_object(job))) {
num_jobs = 1;
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
@ -729,22 +730,36 @@ static int process_commands(orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
} else {
/* if we get here, then send a zero answer */
num_jobs = 0;
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
} else {
/* since the job array is no longer
* left-justified and may have holes, we have
* to count the number of jobs
*/
jobs = (orte_job_t**)orte_job_data->addr;
for (i=0; i < orte_job_data->size; i++) {
if (NULL != orte_job_data->addr[i]) {
num_jobs = 0;
for (i=1; i < orte_job_data->size; i++) {
if (NULL != opal_pointer_array_get_item(orte_job_data, i)) {
num_jobs++;
}
}
/* now pack the, one at a time */
for (i=0; i < orte_job_data->size; i++) {
if (NULL != orte_job_data->addr[i]) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobs[i], 1, ORTE_JOB))) {
/* pack the number of jobs */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* now pack the data, one at a time */
for (i=1; i < orte_job_data->size; i++) {
if (NULL != (jobdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
@ -770,10 +785,10 @@ static int process_commands(orte_process_name_t* sender,
* back 0 nodes so the tool won't hang
*/
if (!ORTE_PROC_IS_HNP) {
orte_std_cntr_t zero=0;
int32_t zero=0;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
@ -785,8 +800,8 @@ static int process_commands(orte_process_name_t* sender,
OBJ_RELEASE(answer);
} else {
/* if we are the HNP, process the request */
orte_std_cntr_t i, num_nodes=0;
orte_node_t **nodes;
int32_t i, num_nodes;
orte_node_t *node;
char *nid;
/* unpack the nodename */
@ -802,37 +817,50 @@ static int process_commands(orte_process_name_t* sender,
/* if they asked for a specific node, then just get that info */
if (NULL != nid) {
/* find this node */
nodes = (orte_node_t**)orte_node_pool->addr;
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == nodes[i]) break; /* stop when we get past the end of data */
if (0 == strcmp(nid, nodes[i]->name)) {
nodes = &nodes[i];
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (0 == strcmp(nid, node->name)) {
num_nodes = 1;
break;
}
}
} else {
/* count number of nodes */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == orte_node_pool->addr[i]) break;
num_nodes++;
}
nodes = (orte_node_t**)orte_node_pool->addr;
}
/* pack the answer */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 < num_nodes) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, nodes, num_nodes, ORTE_NODE))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
} else {
/* count number of nodes */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL != opal_pointer_array_get_item(orte_node_pool, i)) {
num_nodes++;
}
}
/* pack the answer */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* pack each node separately */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
}
}
/* send the info */
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_TOOL, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
@ -851,19 +879,14 @@ static int process_commands(orte_process_name_t* sender,
* back 0 procs so the tool won't hang
*/
if (!ORTE_PROC_IS_HNP) {
orte_std_cntr_t zero=0;
int32_t zero=0;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* callback function will release buffer */
#if 0
if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0,
send_callback, NULL)) {
#endif
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_TOOL, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
@ -871,9 +894,9 @@ static int process_commands(orte_process_name_t* sender,
} else {
/* if we are the HNP, process the request */
orte_job_t *jdata;
orte_proc_t **procs=NULL;
orte_vpid_t num_procs=0, vpid;
orte_std_cntr_t i;
orte_proc_t *proc;
orte_vpid_t vpid;
int32_t i, num_procs;
/* setup the answer */
answer = OBJ_NEW(opal_buffer_t);
@ -886,57 +909,70 @@ static int process_commands(orte_process_name_t* sender,
}
/* look up job data object */
job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
if (NULL == (jdata = orte_get_job_data_object(job))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto PACK_ANSWER;
goto CLEANUP;
}
/* unpack the vpid */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
goto PACK_ANSWER;
goto CLEANUP;
}
/* if they asked for a specific proc, then just get that info */
if (ORTE_VPID_WILDCARD != vpid) {
/* find this proc */
procs = (orte_proc_t**)jdata->procs->addr;
for (i=0; i < jdata->procs->size; i++) {
if (NULL == procs[i]) break; /* stop when we get past the end of data */
if (vpid == procs[i]->name.vpid) {
procs = &procs[i];
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
if (vpid == proc->name.vpid) {
num_procs = 1;
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
break;
}
}
} else {
procs = (orte_proc_t**)jdata->procs->addr;
num_procs = jdata->num_procs;
}
PACK_ANSWER:
/* pack number of procs */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
goto SEND_ANSWER;
}
if (0 < num_procs) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, procs, jdata->num_procs, ORTE_PROC))) {
/* count number of procs */
num_procs = 0;
for (i=0; i < jdata->procs->size; i++) {
if (NULL != opal_pointer_array_get_item(jdata->procs, i)) {
num_procs++;
}
}
/* pack the answer */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
goto SEND_ANSWER;
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* pack each proc separately */
for (i=0; i < jdata->procs->size; i++) {
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
}
}
SEND_ANSWER:
/* callback function will release buffer */
#if 0
if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0,
send_callback, NULL)) {
#endif
/* send the info */
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_TOOL, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(answer);
}
break;

Просмотреть файл

@ -148,6 +148,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
int rc;
int32_t i, j;
orte_job_t **jobs;
orte_proc_t *proc;
/* array of pointers to orte_job_t objects - need to pack the objects a set of fields at a time */
jobs = (orte_job_t**) src;
@ -200,7 +201,25 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
return rc;
}
/* do not pack the proc data */
/* pack the number of procs */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->num_procs)), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < jobs[i]->num_procs) {
for (j=0; j < jobs[i]->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) {
continue;
}
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)&proc, 1, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* if the map is NULL, then we cannot pack it as there is
* nothing to pack. However, we have to flag whether or not

Просмотреть файл

@ -152,7 +152,9 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int rc;
int32_t i, j, n;
orte_job_t **jobs;
orte_proc_t *proc;
orte_vpid_t np;
/* unpack into array of orte_job_t objects */
jobs = (orte_job_t**) dest;
for (i=0; i < *num_vals; i++) {
@ -218,7 +220,23 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
return rc;
}
/* no proc data to unpack */
/* unpack the number of procs */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(void*)(&(jobs[i]->num_procs)), &n, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (np=0; np < jobs[i]->num_procs; np++) {
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
(void**)&proc, &n, ORTE_PROC))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_pointer_array_set_item(jobs[i]->procs, proc->name.vpid, proc);
}
/* if the map is NULL, then we didn't pack it as there was
* nothing to pack. Instead, we packed a flag to indicate whether or not

Просмотреть файл

@ -31,3 +31,11 @@ Error: You specified a vpid (%d) without also specifying a jobid.
[invalid-vpid]
Error: The specified vpid (%d) is not valid for job %d.
[stale-hnp]
An attempt was made to obtain ps information from a non-responsive
HNP:
HNP name: %s
You may want to clean up stale session directories in your temporary
directory (e.g., $TMPDIR).

Просмотреть файл

@ -185,13 +185,13 @@ opal_cmd_line_init_t cmd_line_opts[] = {
1,
&orte_ps_globals.jobid, OPAL_CMD_LINE_TYPE_INT,
"Specify a specific jobid" },
#if 0
{ NULL, NULL, NULL,
'p', NULL, "vpid",
1,
&orte_ps_globals.vpid, OPAL_CMD_LINE_TYPE_INT,
"Specify a specific vpid. Must specify a --jobid as well" },
#endif
{ NULL, NULL, NULL,
'n', NULL, "nodes",
0,
@ -251,7 +251,14 @@ main(int argc, char *argv[])
hnpinfo.hnp->pid);
if( ORTE_SUCCESS != (ret = gather_information(&hnpinfo)) ) {
exit_status = ret;
/* this could be due to a stale session directory - if so,
* just skip this entry, but don't abort
*/
if (ORTE_ERR_SILENT == ret) {
orte_show_help("help-orte-ps.txt", "stale-hnp", true,
ORTE_NAME_PRINT(&(hnpinfo.hnp->name)));
continue;
}
goto cleanup;
}
@ -276,13 +283,13 @@ main(int argc, char *argv[])
static int parse_args(int argc, char *argv[]) {
int ret;
opal_cmd_line_t cmd_line;
orte_ps_globals_t tmp = { false,
false,
-1,
-1,
false,
false,
-1};
orte_ps_globals_t tmp = { false, /* help */
false, /* verbose */
ORTE_JOBID_WILDCARD, /* jobid */
ORTE_VPID_WILDCARD, /* vpid */
false, /* nodes */
false, /* daemons */
-1}; /* output */
orte_ps_globals = tmp;
@ -309,14 +316,15 @@ static int parse_args(int argc, char *argv[]) {
/*
* If they specify a vpid, they must specify a jobid
*/
if( 0 <= orte_ps_globals.vpid) {
if( 0 > orte_ps_globals.jobid) {
#if 0
if( ORTE_VPID_WILDCARD != orte_ps_globals.vpid) {
if( ORTE_JOBID_WILDCARD == orte_ps_globals.jobid) {
orte_show_help("help-orte-ps.txt", "vpid-usage", true,
orte_ps_globals.vpid);
return ORTE_ERROR;
}
}
#endif
return ORTE_SUCCESS;
}
@ -794,7 +802,7 @@ static int gather_information(orte_ps_mpirun_info_t *hnpinfo) {
static int gather_active_jobs(orte_ps_mpirun_info_t *hnpinfo) {
int ret;
if (ORTE_SUCCESS != (ret = orte_util_comm_query_job_info(&(hnpinfo->hnp->name), ORTE_JOBID_WILDCARD,
if (ORTE_SUCCESS != (ret = orte_util_comm_query_job_info(&(hnpinfo->hnp->name), orte_ps_globals.jobid,
&hnpinfo->num_jobs, &hnpinfo->jobs))) {
ORTE_ERROR_LOG(ret);
}
@ -836,7 +844,7 @@ static int gather_vpid_info(orte_ps_mpirun_info_t *hnpinfo) {
/* query the HNP for info on the procs in this job */
if (ORTE_SUCCESS != (ret = orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid,
ORTE_VPID_WILDCARD, &cnt, &procs))) {
orte_ps_globals.vpid, &cnt, &procs))) {
ORTE_ERROR_LOG(ret);
}
job->procs->addr = (void**)procs;

Просмотреть файл

@ -32,18 +32,56 @@
#include "orte/mca/rml/rml_types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/comm/comm.h"
/* quick timeout loop
 *
 * File-scope state shared between the query functions in this file and
 * the two callbacks below (quicktime_cb and recv_info).
 */
/* Cleared to false before each query starts waiting; set to true by
 * either callback. Despite the name, it means "stop waiting" - it is
 * raised both when the timer expires and when a reply is received.
 */
static bool timer_fired;
/* Buffer that recv_info copies the received payload into; the query
 * functions construct it, unpack from it, and destruct it.
 */
static opal_buffer_t answer;
/* Timer event handed to ORTE_DETECT_TIMEOUT; the callbacks free it and
 * reset the pointer to NULL.
 */
static opal_event_t *quicktime=NULL;
/* Outcome of the wait: reset to ORTE_SUCCESS before each query, and set
 * to ORTE_ERR_SILENT by quicktime_cb when the timer expires. The query
 * functions return this value when it is not ORTE_SUCCESS.
 */
static int error_exit;
/*
 * Callback registered with ORTE_DETECT_TIMEOUT by the query functions.
 * It records ORTE_ERR_SILENT as the outcome of the wait, releases the
 * timer event, and raises the flag the waiting query is polling on.
 * The fd, event and cbdata arguments are not used.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    /* tell the waiting query that it did not get an answer */
    error_exit = ORTE_ERR_SILENT;

    /* release the timer event; free() ignores a NULL pointer, so no
     * explicit check is required before calling it
     */
    free(quicktime);
    quicktime = NULL;

    /* declare it fired so the wait loop terminates */
    timer_fired = true;
}
/*
 * Receive callback passed to orte_rml.recv_buffer_nb by the query
 * functions in this file.
 *
 * Cancels and frees the pending timeout event (if one is still set),
 * copies the payload of the received buffer into the file-scope
 * "answer" buffer, and then sets timer_fired so the waiting query
 * stops waiting.
 *
 * If the payload cannot be copied, the error is logged and also stored
 * in error_exit. The query functions check error_exit after the wait
 * and return it when it is not ORTE_SUCCESS, so a failed copy no longer
 * leads them to unpack a buffer that was never filled.
 *
 * The status, sender, tag and cbdata arguments are not used.
 */
static void recv_info(int status, orte_process_name_t* sender,
                      opal_buffer_t* buffer, orte_rml_tag_t tag,
                      void* cbdata)
{
    int rc;

    /* cancel the timer */
    if (NULL != quicktime) {
        opal_evtimer_del(quicktime);
        free(quicktime);
        quicktime = NULL;
    }

    /* xfer the answer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&answer, buffer))) {
        ORTE_ERROR_LOG(rc);
        /* propagate the failure to the waiting query */
        error_exit = rc;
    }

    /* declare the work done - the flag doubles as the "stop waiting"
     * signal for both the timeout and the reply paths
     */
    timer_fired = true;
}
int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t job,
int *num_jobs, orte_job_t ***job_info_array)
{
int ret;
orte_std_cntr_t cnt, cnt_jobs, n;
opal_buffer_t cmd, answer;
int32_t cnt, cnt_jobs, n;
opal_buffer_t cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD;
orte_job_t **job_info;
/* set default response */
*num_jobs = 0;
*job_info_array = NULL;
@ -65,15 +103,40 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
}
OBJ_DESTRUCT(&cmd);
/* get the answer */
/* setup for answer */
OBJ_CONSTRUCT(&answer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &answer, ORTE_RML_TAG_TOOL, 0))) {
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
/* get the answer */
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_TOOL,
ORTE_RML_NON_PERSISTENT,
recv_info,
NULL))) {
/* cancel the timer */
if (NULL != quicktime) {
opal_evtimer_del(quicktime);
free(quicktime);
quicktime = NULL;
}
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
return ret;
}
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
if (ORTE_SUCCESS != error_exit) {
OBJ_DESTRUCT(&answer);
return error_exit;
}
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_jobs, &cnt, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_jobs, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
return ret;
@ -104,8 +167,8 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
int *num_nodes, orte_node_t ***node_info_array)
{
int ret;
orte_std_cntr_t cnt, cnt_nodes;
opal_buffer_t cmd, answer;
int32_t cnt, cnt_nodes, n;
opal_buffer_t cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_NODE_INFO_CMD;
orte_node_t **node_info;
@ -132,15 +195,38 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
}
OBJ_DESTRUCT(&cmd);
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
/* get the answer */
OBJ_CONSTRUCT(&answer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &answer, ORTE_RML_TAG_TOOL, 0))) {
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_TOOL,
ORTE_RML_NON_PERSISTENT,
recv_info,
NULL))) {
/* cancel the timer */
if (NULL != quicktime) {
opal_evtimer_del(quicktime);
free(quicktime);
quicktime = NULL;
}
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
OBJ_DESTRUCT(ret);
return ret;
}
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
if (ORTE_SUCCESS != error_exit) {
OBJ_DESTRUCT(&answer);
return error_exit;
}
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_nodes, &cnt, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_nodes, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
return ret;
@ -150,11 +236,14 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
if (0 < cnt_nodes) {
node_info = (orte_node_t**)malloc(cnt_nodes * sizeof(orte_node_t*));
/* unpack the node data */
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, node_info, &cnt_nodes, ORTE_NODE))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
free(node_info);
return ret;
for (n=0; n < cnt_nodes; n++) {
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &node_info[n], &cnt, ORTE_NODE))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
free(node_info);
return ret;
}
}
*node_info_array = node_info;
*num_nodes = cnt_nodes;
@ -168,9 +257,8 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
int *num_procs, orte_proc_t ***proc_info_array)
{
int ret;
orte_std_cntr_t cnt;
orte_vpid_t cnt_procs;
opal_buffer_t cmd, answer;
int32_t cnt, cnt_procs, n;
opal_buffer_t cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
orte_proc_t **proc_info;
@ -202,15 +290,38 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
}
OBJ_DESTRUCT(&cmd);
/* get the response */
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
/* get the answer */
OBJ_CONSTRUCT(&answer, opal_buffer_t);
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &answer, ORTE_RML_TAG_TOOL, 0))) {
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_TOOL,
ORTE_RML_NON_PERSISTENT,
recv_info,
NULL))) {
/* cancel the timer */
if (NULL != quicktime) {
opal_evtimer_del(quicktime);
free(quicktime);
quicktime = NULL;
}
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
OBJ_DESTRUCT(ret);
return ret;
}
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
if (ORTE_SUCCESS != error_exit) {
OBJ_DESTRUCT(&answer);
return error_exit;
}
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, ORTE_VPID))) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
return ret;
@ -220,12 +331,14 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
if (0 < cnt_procs) {
proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*));
/* unpack the procs */
cnt = cnt_procs;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, proc_info, &cnt, ORTE_PROC))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
free(proc_info);
return ret;
for (n=0; n < cnt_procs; n++) {
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
free(proc_info);
return ret;
}
}
*proc_info_array = proc_info;
*num_procs = (int)cnt_procs;