Fix a few issues related to comm_spawn operations that cause new daemons to be launched. Ensure that all new daemons receive a full pidmap. Properly mark the daemon job as "updated" when daemons are added.
This commit was SVN r27177.
Этот коммит содержится в:
родитель
f0077820f2
Коммит
a3b08f5800
@ -155,9 +155,7 @@ static void coll_id_req(int status, orte_process_name_t* sender,
|
|||||||
orte_grpcomm_coll_id_t id;
|
orte_grpcomm_coll_id_t id;
|
||||||
opal_buffer_t *relay;
|
opal_buffer_t *relay;
|
||||||
int rc;
|
int rc;
|
||||||
/* collective - only the HNP ever gets this message, but check
|
|
||||||
* in case a developer makes a mistake!
|
|
||||||
*/
|
|
||||||
id = orte_grpcomm_base_get_coll_id();
|
id = orte_grpcomm_base_get_coll_id();
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||||
"%s grpcomm:base:receive proc %s requested coll id - returned id %d",
|
"%s grpcomm:base:receive proc %s requested coll id - returned id %d",
|
||||||
@ -316,7 +314,11 @@ static void daemon_local_recv(int status, orte_process_name_t* sender,
|
|||||||
* our own local procs, but this could involve a proc
|
* our own local procs, but this could involve a proc
|
||||||
* running remotely that we don't know about yet
|
* running remotely that we don't know about yet
|
||||||
*/
|
*/
|
||||||
do_progress = false;
|
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||||
|
"%s cant find job %s - not progressing collective",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_JOBID_PRINT(proc.jobid)));
|
||||||
|
do_progress = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1269,6 +1269,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
|||||||
OBJ_DESTRUCT(&nodes);
|
OBJ_DESTRUCT(&nodes);
|
||||||
/* mark that the daemons have reported so we can proceed */
|
/* mark that the daemons have reported so we can proceed */
|
||||||
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||||
|
daemons->updated = false;
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1430,5 +1431,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
|||||||
orte_routed.update_routing_plan();
|
orte_routed.update_routing_plan();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mark that the daemon job changed */
|
||||||
|
daemons->updated = true;
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -509,10 +509,25 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
|
|||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
int i, j, rc = ORTE_SUCCESS;
|
int i, j, rc = ORTE_SUCCESS;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
|
bool include_all;
|
||||||
|
|
||||||
/* setup the working buffer */
|
/* setup the working buffer */
|
||||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
|
|
||||||
|
/* check the daemon job to see if it has changed - perhaps
|
||||||
|
* new daemons were added as the result of a comm_spawn
|
||||||
|
*/
|
||||||
|
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
|
/* if it did change, then the pidmap will be going
|
||||||
|
* to new daemons - so we need to include everything.
|
||||||
|
* also include everything if we were asked to do so
|
||||||
|
*/
|
||||||
|
if (jdata->updated || !update) {
|
||||||
|
include_all = true;
|
||||||
|
} else {
|
||||||
|
include_all = false;
|
||||||
|
}
|
||||||
|
|
||||||
for (j=1; j < orte_job_data->size; j++) {
|
for (j=1; j < orte_job_data->size; j++) {
|
||||||
/* the job array is no longer left-justified and may
|
/* the job array is no longer left-justified and may
|
||||||
* have holes in it as we recover resources at job
|
* have holes in it as we recover resources at job
|
||||||
@ -532,7 +547,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr, bool update)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* if we want an update version and there is nothing to update, ignore it */
|
/* if we want an update version and there is nothing to update, ignore it */
|
||||||
if (update && !jdata->updated) {
|
if (!include_all && !jdata->updated) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* flag that we included it so we don't do so again */
|
/* flag that we included it so we don't do so again */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user