Fix orte-ps so that it properly ignores/reports stale HNPs but continues to provide output for running ones. Add a timeout on the send side of the communication so that we don't hang while trying to send the info request to a non-existent HNP.
This commit was SVN r21257.
This commit is contained in:
parent
26342508de
commit
cc7620c210
@ -54,6 +54,16 @@ int orte_ess_base_tool_setup(void)
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
/* if we were given an HNP, then we were launched
|
||||
* by mpirun in some fashion - in this case, we want
|
||||
* to look like an application as well as being a tool.
|
||||
* Need to do this before opening the routed framework
|
||||
* so it will do the right things.
|
||||
*/
|
||||
orte_process_info.proc_type |= ORTE_PROC_NON_MPI;
|
||||
}
|
||||
|
||||
/* Setup the communication infrastructure */
|
||||
|
||||
/* Runtime Messaging Layer */
|
||||
@ -116,15 +126,20 @@ int orte_ess_base_tool_setup(void)
|
||||
}
|
||||
|
||||
/* setup I/O forwarding system - must come after we init routes */
|
||||
if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_iof_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_iof_base_select";
|
||||
goto error;
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
/* only do this if we were NOT given an HNP - i.e., if we
|
||||
* are a standalone tool
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_iof_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_iof_base_select";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
@ -164,7 +179,9 @@ int orte_ess_base_tool_finalize(void)
|
||||
* a very small subset of orte_init - ensure that
|
||||
* I only back those elements out
|
||||
*/
|
||||
orte_iof_base_close();
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
orte_iof_base_close();
|
||||
}
|
||||
orte_routed_base_close();
|
||||
orte_rml_base_close();
|
||||
|
||||
|
@ -73,12 +73,27 @@ static void recv_info(int status, orte_process_name_t* sender,
|
||||
timer_fired = true;
|
||||
}
|
||||
|
||||
/*
 * Completion callback handed to orte_rml.send_buffer_nb() by the
 * orte_util_comm_query_* functions in this file.
 *
 * When invoked it cancels and frees the pending timeout timer (if one is
 * still armed), releases the buffer that was sent, and sets timer_fired
 * so that the caller's ORTE_PROGRESSED_WAIT(timer_fired, ...) loop exits.
 *
 * The status, sender, tag and cbdata arguments are not used here.
 * NOTE(review): because status is never inspected, a send that completes
 * with an error is not recorded in error_exit by this callback - confirm
 * that this is intended.
 */
static void send_cbfunc(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
/* the send callback ran, so cancel the timeout timer if it is still armed */
|
||||
if (NULL != quicktime) {
|
||||
opal_evtimer_del(quicktime);
|
||||
free(quicktime);
|
||||
/* reset to NULL so the timer is not deleted or freed a second time */
|
||||
quicktime = NULL;
|
||||
}
|
||||
/* drop the reference on the command buffer passed to the non-blocking send */
|
||||
OBJ_RELEASE(buffer);
|
||||
/* declare the work done - this is the flag the sender waits on */
|
||||
timer_fired = true;
|
||||
}
|
||||
|
||||
int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t job,
|
||||
int *num_jobs, orte_job_t ***job_info_array)
|
||||
{
|
||||
int ret;
|
||||
int32_t cnt, cnt_jobs, n;
|
||||
opal_buffer_t cmd;
|
||||
opal_buffer_t *cmd;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD;
|
||||
orte_job_t **job_info;
|
||||
|
||||
@ -87,21 +102,37 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
|
||||
*job_info_array = NULL;
|
||||
|
||||
/* send query to HNP */
|
||||
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
cmd = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
/* define a max time to wait for send to complete */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
|
||||
|
||||
/* do the send */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
|
||||
send_cbfunc, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
/* wait for send to complete */
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
|
||||
/* did it succeed? */
|
||||
if (ORTE_SUCCESS != error_exit) {
|
||||
return error_exit;
|
||||
}
|
||||
|
||||
/* setup for answer */
|
||||
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
||||
@ -109,7 +140,7 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
|
||||
/* define a max time to wait for an answer */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
|
||||
|
||||
/* get the answer */
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
@ -168,7 +199,7 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
|
||||
{
|
||||
int ret;
|
||||
int32_t cnt, cnt_nodes, n;
|
||||
opal_buffer_t cmd;
|
||||
opal_buffer_t *cmd;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_NODE_INFO_CMD;
|
||||
orte_node_t **node_info;
|
||||
|
||||
@ -177,24 +208,38 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
|
||||
*node_info_array = NULL;
|
||||
|
||||
/* query the HNP for node info */
|
||||
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
cmd = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &node, 1, OPAL_STRING))) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &node, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return ret;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
/* define a max time to wait for send to complete */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
|
||||
|
||||
/* do the send */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
|
||||
send_cbfunc, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait for send to complete */
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
|
||||
/* did it succeed? */
|
||||
if (ORTE_SUCCESS != error_exit) {
|
||||
return error_exit;
|
||||
}
|
||||
|
||||
/* define a max time to wait for an answer */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
@ -258,7 +303,7 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
|
||||
{
|
||||
int ret;
|
||||
int32_t cnt, cnt_procs, n;
|
||||
opal_buffer_t cmd;
|
||||
opal_buffer_t *cmd;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
|
||||
orte_proc_t **proc_info;
|
||||
|
||||
@ -267,29 +312,43 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
|
||||
*proc_info_array = NULL;
|
||||
|
||||
/* query the HNP for info on the procs in this job */
|
||||
OBJ_CONSTRUCT(&cmd, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
cmd = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmd, &vpid, 1, ORTE_VPID))) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer((orte_process_name_t*)hnp, &cmd, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return ret;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
/* define a max time to wait for send to complete */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
|
||||
|
||||
/* do the send */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
|
||||
send_cbfunc, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(cmd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait for send to complete */
|
||||
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
||||
|
||||
/* did it succeed? */
|
||||
if (ORTE_SUCCESS != error_exit) {
|
||||
return error_exit;
|
||||
}
|
||||
|
||||
/* define a max time to wait for an answer */
|
||||
timer_fired = false;
|
||||
error_exit = ORTE_SUCCESS;
|
||||
|
@ -425,12 +425,12 @@ int orte_show_help(const char *filename, const char *topic,
|
||||
}
|
||||
|
||||
/* if we are the HNP, or the RML has not yet been setup,
|
||||
* or we don't yet know our HNP, then all we can do
|
||||
* or we weren't given an HNP, then all we can do
|
||||
* is process this locally
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP ||
|
||||
NULL == orte_rml.send_buffer ||
|
||||
ORTE_PROC_MY_HNP->vpid == ORTE_VPID_INVALID) {
|
||||
NULL == orte_process_info.my_hnp_uri) {
|
||||
rc = show_help(filename, topic, output, ORTE_PROC_MY_NAME);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user