Fix a few things relating to a comm_spawn that causes new daemons to be launched. Ensure that all new daemons receive a full pidmap. Properly mark the daemon job as "updated" when daemons are added.
This commit was SVN r27177.
Этот коммит содержится в:
родитель
f0077820f2
Коммит
a3b08f5800
@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender,
|
||||
orte_grpcomm_coll_id_t id;
|
||||
opal_buffer_t *relay;
|
||||
int rc;
|
||||
/* collective - only the HNP ever gets this message, but check
|
||||
* in case a developer makes a mistake!
|
||||
*/
|
||||
|
||||
id = orte_grpcomm_base_get_coll_id();
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:base:receive proc %s requested coll id - returned id %d",
|
||||
@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender,
|
||||
* our own local procs, but this could involve a proc
|
||||
* running remotely that we don't know about yet
|
||||
*/
|
||||
do_progress = false;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s cant find job %s - not progressing collective",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(proc.jobid)));
|
||||
do_progress = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
/* mark that the daemons have reported so we can proceed */
|
||||
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||
daemons->updated = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
orte_routed.update_routing_plan();
|
||||
}
|
||||
|
||||
/* mark that the daemon job changed */
|
||||
daemons->updated = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
|
||||
opal_buffer_t buf;
|
||||
int i, j, rc = ORTE_SUCCESS;
|
||||
orte_job_t *jdata;
|
||||
bool include_all;
|
||||
|
||||
/* setup the working buffer */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
|
||||
/* check the daemon job to see if it has changed - perhaps
|
||||
* new daemons were added as the result of a comm_spawn
|
||||
*/
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
/* if it did change, then the pidmap will be going
|
||||
* to new daemons - so we need to include everything.
|
||||
* also include everything if we were asked to do so
|
||||
*/
|
||||
if (jdata->updated || !update) {
|
||||
include_all = true;
|
||||
} else {
|
||||
include_all = false;
|
||||
}
|
||||
|
||||
for (j=1; j < orte_job_data->size; j++) {
|
||||
/* the job array is no longer left-justified and may
|
||||
* have holes in it as we recover resources at job
|
||||
@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
|
||||
continue;
|
||||
}
|
||||
/* if we want an update version and there is nothing to update, ignore it */
|
||||
if (update && !jdata->updated) {
|
||||
if (!include_all && !jdata->updated) {
|
||||
continue;
|
||||
}
|
||||
/* flag that we included it so we don't do so again */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user