1
1

Fix orted termination so we get the #@# relay out before we exit ourselves.

Minor change in the way we respond to job info requests - needed for coming change.

This commit was SVN r20698.
Этот коммит содержится в:
Ralph Castain 2009-03-03 13:38:29 +00:00
родитель d5eddc7541
Коммит fb1ecb7a45
2 изменённых файла: 48 добавлений и 27 удалений

Просмотреть файл

@ -92,6 +92,7 @@ static int process_commands(orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag);
static bool exit_reqd;
/* instantiate this - it is shared via orted.h */
struct timeval orte_daemon_msg_recvd;
@ -300,6 +301,9 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
/* rewind the buffer to the right place for processing the cmd */
buffer->unpack_ptr = save;
/* init flag */
exit_reqd = false;
/* process the command */
if (ORTE_SUCCESS != (ret = process_commands(sender, buffer, tag))) {
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
@ -312,6 +316,11 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
/* do the relay */
send_relay(buffer);
/* if we need to exit, do so now */
if (exit_reqd) {
orte_trigger_event(&orte_exit);
}
/* done */
goto CLEANUP;
@ -635,7 +644,7 @@ static int process_commands(orte_process_name_t* sender,
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
OBJ_DESTRUCT(&ack);
}
orte_trigger_event(&orte_exit);
exit_reqd = true;
return ORTE_SUCCESS;
break;
@ -673,7 +682,7 @@ static int process_commands(orte_process_name_t* sender,
*/
return ORTE_SUCCESS;
}
orte_trigger_event(&orte_exit);
exit_reqd = true;
return ORTE_SUCCESS;
break;
@ -802,30 +811,39 @@ static int process_commands(orte_process_name_t* sender,
if (ORTE_JOBID_WILDCARD != job) {
if (NULL != (jobdat = orte_get_job_data_object(job))) {
num_jobs = 1;
jobs = &jobdat;
}
} else {
/* count number of jobs */
for (i=0; i < orte_job_data->size; i++) {
if (NULL == orte_job_data->addr[i]) break;
num_jobs++;
}
jobs = (orte_job_t**)orte_job_data->addr;
}
/* pack the answer */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 < num_jobs) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, jobs, num_jobs, ORTE_JOB))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
} else {
/* since the job array is no longer
* left-justified and may have holes, we have
* to count the number of jobs
*/
jobs = (orte_job_t**)orte_job_data->addr;
for (i=0; i < orte_job_data->size; i++) {
if (NULL != orte_job_data->addr[i]) {
num_jobs++;
}
}
/* now pack them, one at a time */
for (i=0; i < orte_job_data->size; i++) {
if (NULL != orte_job_data->addr[i]) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobs[i], 1, ORTE_JOB))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
}
}
}
if (0 > orte_rml.send_buffer(sender, answer, ORTE_RML_TAG_TOOL, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -39,7 +39,7 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
int *num_jobs, orte_job_t ***job_info_array)
{
int ret;
orte_std_cntr_t cnt, cnt_jobs;
orte_std_cntr_t cnt, cnt_jobs, n;
opal_buffer_t cmd, answer;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD;
orte_job_t **job_info;
@ -83,12 +83,15 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
if (0 < cnt_jobs) {
job_info = (orte_job_t**)malloc(cnt_jobs * sizeof(orte_job_t*));
/* unpack the job data */
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, job_info, &cnt_jobs, ORTE_JOB))) {
for (n=0; n < cnt_jobs; n++) {
cnt = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &job_info[n], &cnt, ORTE_JOB))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&answer);
free(job_info);
return ret;
}
}
*job_info_array = job_info;
*num_jobs = cnt_jobs;
}