1
1

Fix orte-ps so it properly ignores/reports stale HNPs, but continues to provide output on running ones. Add a timeout on the send side of the comm so we don't hang while trying to send the info request to the non-existent HNP.

This commit was SVN r21257.
This commit is contained in:
Ralph Castain 2009-05-21 02:42:21 +00:00
parent 26342508de
commit cc7620c210
3 changed files with 122 additions and 46 deletions

View File

@ -54,6 +54,16 @@ int orte_ess_base_tool_setup(void)
int ret;
char *error = NULL;
if (NULL != orte_process_info.my_hnp_uri) {
/* if we were given an HNP, then we were launched
* by mpirun in some fashion - in this case, we want
* to look like an application as well as being a tool.
* Need to do this before opening the routed framework
* so it will do the right things.
*/
orte_process_info.proc_type |= ORTE_PROC_NON_MPI;
}
/* Setup the communication infrastructure */
/* Runtime Messaging Layer */
@ -116,15 +126,20 @@ int orte_ess_base_tool_setup(void)
}
/* setup I/O forwarding system - must come after we init routes */
if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_iof_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_iof_base_select";
goto error;
if (NULL != orte_process_info.my_hnp_uri) {
/* only do this if we were NOT given an HNP - i.e., if we
* are a standalone tool
*/
if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_iof_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_iof_base_select";
goto error;
}
}
#if OPAL_ENABLE_FT == 1
@ -164,7 +179,9 @@ int orte_ess_base_tool_finalize(void)
* a very small subset of orte_init - ensure that
* I only back those elements out
*/
orte_iof_base_close();
if (NULL != orte_process_info.my_hnp_uri) {
orte_iof_base_close();
}
orte_routed_base_close();
orte_rml_base_close();

View File

@ -73,12 +73,27 @@ static void recv_info(int status, orte_process_name_t* sender,
timer_fired = true;
}
static void send_cbfunc(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
/* cancel the timer */
if (NULL != quicktime) {
opal_evtimer_del(quicktime);
free(quicktime);
quicktime = NULL;
}
OBJ_RELEASE(buffer);
/* declare the work done */
timer_fired = true;
}
int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t job,
int *num_jobs, orte_job_t ***job_info_array)
{
int ret;
int32_t cnt, cnt_jobs, n;
opal_buffer_t cmd;
opal_buffer_t *cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD;
orte_job_t **job_info;
@ -87,21 +102,37 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
*job_info_array = NULL;
/* send query to HNP */
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
cmd = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
}
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
/* define a max time to wait for send to complete */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
OBJ_DESTRUCT(&cmd);
/* wait for send to complete */
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
/* did it succeed? */
if (ORTE_SUCCESS != error_exit) {
return error_exit;
}
/* setup for answer */
OBJ_CONSTRUCT(&answer, opal_buffer_t);
@ -109,7 +140,7 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
/* get the answer */
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
@ -168,7 +199,7 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
{
int ret;
int32_t cnt, cnt_nodes, n;
opal_buffer_t cmd;
opal_buffer_t *cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_NODE_INFO_CMD;
orte_node_t **node_info;
@ -177,24 +208,38 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
*node_info_array = NULL;
/* query the HNP for node info */
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
cmd = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &node, 1, OPAL_STRING))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &node, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
return ret;
}
OBJ_DESTRUCT(&cmd);
/* define a max time to wait for send to complete */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
}
/* wait for send to complete */
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
/* did it succeed? */
if (ORTE_SUCCESS != error_exit) {
return error_exit;
}
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;
@ -258,7 +303,7 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
{
int ret;
int32_t cnt, cnt_procs, n;
opal_buffer_t cmd;
opal_buffer_t *cmd;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
orte_proc_t **proc_info;
@ -267,29 +312,43 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
*proc_info_array = NULL;
/* query the HNP for info on the procs in this job */
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
cmd = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &vpid, 1, ORTE_VPID))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
OBJ_RELEASE(cmd);
return ret;
}
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&cmd);
return ret;
}
OBJ_DESTRUCT(&cmd);
/* define a max time to wait for send to complete */
timer_fired = false;
error_exit = ORTE_SUCCESS;
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
}
/* wait for send to complete */
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
/* did it succeed? */
if (ORTE_SUCCESS != error_exit) {
return error_exit;
}
/* define a max time to wait for an answer */
timer_fired = false;
error_exit = ORTE_SUCCESS;

View File

@ -425,12 +425,12 @@ int orte_show_help(const char *filename, const char *topic,
}
/* if we are the HNP, or the RML has not yet been setup,
* or we don't yet know our HNP, then all we can do
* or we weren't given an HNP, then all we can do
* is process this locally
*/
if (ORTE_PROC_IS_HNP ||
NULL == orte_rml.send_buffer ||
ORTE_PROC_MY_HNP->vpid == ORTE_VPID_INVALID) {
NULL == orte_process_info.my_hnp_uri) {
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
}