1
1

Fix a few things relating to a comm_spawn that causes new daemons to be launched. Ensure that all new daemons receive a full pidmap. Properly mark the daemon job as "updated" when daemons are added.

This commit was SVN r27177.
Этот коммит содержится в:
Ralph Castain 2012-08-29 03:11:37 +00:00
родитель f0077820f2
Коммит a3b08f5800
3 изменённых файлов: 26 добавлений и 5 удалений

Просмотреть файл

@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender,
orte_grpcomm_coll_id_t id;
opal_buffer_t *relay;
int rc;
/* collective - only the HNP ever gets this message, but check
* in case a developer makes a mistake!
*/
id = orte_grpcomm_base_get_coll_id();
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:receive proc %s requested coll id - returned id %d",
@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender,
* our own local procs, but this could involve a proc
* running remotely that we don't know about yet
*/
do_progress = false;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s cant find job %s - not progressing collective",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(proc.jobid)));
do_progress = false;
}
}

Просмотреть файл

@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
OBJ_DESTRUCT(&nodes);
/* mark that the daemons have reported so we can proceed */
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
daemons->updated = false;
return ORTE_SUCCESS;
}
@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
orte_routed.update_routing_plan();
}
/* mark that the daemon job changed */
daemons->updated = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
opal_buffer_t buf;
int i, j, rc = ORTE_SUCCESS;
orte_job_t *jdata;
bool include_all;
/* setup the working buffer */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* check the daemon job to see if it has changed - perhaps
* new daemons were added as the result of a comm_spawn
*/
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* if it did change, then the pidmap will be going
* to new daemons - so we need to include everything.
* also include everything if we were asked to do so
*/
if (jdata->updated || !update) {
include_all = true;
} else {
include_all = false;
}
for (j=1; j < orte_job_data->size; j++) {
/* the job array is no longer left-justified and may
* have holes in it as we recover resources at job
@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
continue;
}
/* if we want an update version and there is nothing to update, ignore it */
if (update && !jdata->updated) {
if (!include_all && !jdata->updated) {
continue;
}
/* flag that we included it so we don't do so again */