Now that we have the daemon collectives, the unity routed module no longer needs the "hack" we inserted a week ago to tell the daemons how to talk directly to all the application procs. The modex and barrier messages flow cleanly across the daemons and are "dropped" into the procs where required.
Add a safeguard so that a daemon's number of procs (orte_process_info.num_procs) is updated only when that is actually intended: the contact-info update must now be about the daemon's own jobid. This commit was SVN r18118.
Этот коммит содержится в:
родитель
0b3122ee2f
Коммит
dc2f88b9f0
@ -68,6 +68,7 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
|
||||
char *rml_uri;
|
||||
orte_process_name_t name;
|
||||
int rc;
|
||||
orte_jobid_t jobid;
|
||||
|
||||
/* unpack the data for each entry */
|
||||
num_procs = 0;
|
||||
@ -96,6 +97,12 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
|
||||
* since we were given the contact info
|
||||
*/
|
||||
orte_routed.update_route(&name, &name);
|
||||
/* we only get an update from a single jobid - the command
|
||||
* that creates these doesn't cross jobid boundaries - so
|
||||
* record it here
|
||||
*/
|
||||
jobid = name.jobid;
|
||||
/* track how many procs were in the message */
|
||||
++num_procs;
|
||||
}
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
@ -103,12 +110,13 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are a daemon, this update would include updated contact info
|
||||
/* if we are a daemon and this was info about our jobid, this update would
|
||||
* include updated contact info
|
||||
* for all daemons in the system - indicating that the number of daemons
|
||||
* changed since we were initially launched. Thus, update the num_procs
|
||||
* in our process_info struct so we can correctly route any messages
|
||||
*/
|
||||
if (orte_process_info.daemon) {
|
||||
if (ORTE_PROC_MY_NAME->jobid == jobid && orte_process_info.daemon) {
|
||||
orte_process_info.num_procs = num_procs;
|
||||
}
|
||||
|
||||
|
@ -205,7 +205,6 @@ static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
||||
orte_std_cntr_t cnt;
|
||||
char *rml_uri;
|
||||
int rc;
|
||||
orte_rml_cmd_flag_t command=ORTE_RML_UPDATE_CMD;
|
||||
|
||||
/* lookup the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
@ -266,30 +265,6 @@ static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
||||
jdata->state = ORTE_JOB_STATE_RUNNING;
|
||||
}
|
||||
|
||||
/* first update the daemons so they will know how to talk to the
|
||||
* procs - this is required for support of modex and barrier
|
||||
*/
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* pack an update command */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_RML_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
/* pack the RML contact info for each proc */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(jdata->jobid, &buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
/* send it to the daemons via xcast */
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, &buf, ORTE_RML_TAG_RML_INFO_UPDATE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
/* now send to the procs so they release from their barrier */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* pack the RML contact info for each proc */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user