From dc2f88b9f08e8ac7e9069540368ef2a95ec503c5 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 10 Apr 2008 02:45:42 +0000 Subject: [PATCH] Now that we have the daemon collectives, the unity routed module no longer needs the "hack" we inserted a week ago to tell the daemons how to talk directly to all the application procs. The modex and barrier messages flow cleanly across the daemons and are "dropped" into the procs where required. Add some insurance to make certain that the daemons' number of procs only gets updated when it absolutely is intended. This commit was SVN r18118. --- orte/mca/rml/base/rml_base_contact.c | 12 ++++++++++-- orte/mca/routed/unity/routed_unity.c | 25 ------------------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/orte/mca/rml/base/rml_base_contact.c b/orte/mca/rml/base/rml_base_contact.c index 73ed100473..fa87289882 100644 --- a/orte/mca/rml/base/rml_base_contact.c +++ b/orte/mca/rml/base/rml_base_contact.c @@ -68,6 +68,7 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) char *rml_uri; orte_process_name_t name; int rc; + orte_jobid_t jobid; /* unpack the data for each entry */ num_procs = 0; @@ -96,6 +97,12 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) * since we were given the contact info */ orte_routed.update_route(&name, &name); + /* we only get an update from a single jobid - the command + * that creates these doesn't cross jobid boundaries - so + * record it here + */ + jobid = name.jobid; + /* track how many procs were in the message */ ++num_procs; } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { @@ -103,12 +110,13 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) return rc; } - /* if we are a daemon, this update would include updated contact info + /* if we are a daemon and this was info about our jobid, this update would + * include updated contact info * for all daemons in the system - indicating that the number of daemons * changed since we were initially launched. Thus, update the num_procs * in our process_info struct so we can correctly route any messages */ - if (orte_process_info.daemon) { + if (ORTE_PROC_MY_NAME->jobid == jobid && orte_process_info.daemon) { orte_process_info.num_procs = num_procs; } diff --git a/orte/mca/routed/unity/routed_unity.c b/orte/mca/routed/unity/routed_unity.c index cf224cd4d1..91ff6b32c0 100644 --- a/orte/mca/routed/unity/routed_unity.c +++ b/orte/mca/routed/unity/routed_unity.c @@ -205,7 +205,6 @@ static int process_callback(orte_jobid_t job, opal_buffer_t *buffer) orte_std_cntr_t cnt; char *rml_uri; int rc; - orte_rml_cmd_flag_t command=ORTE_RML_UPDATE_CMD; /* lookup the job object */ if (NULL == (jdata = orte_get_job_data_object(job))) { @@ -266,30 +265,6 @@ static int process_callback(orte_jobid_t job, opal_buffer_t *buffer) jdata->state = ORTE_JOB_STATE_RUNNING; } - /* first update the daemons so they will know how to talk to the - * procs - this is required for support of modex and barrier - */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pack an update command */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_RML_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* pack the RML contact info for each proc */ - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(jdata->jobid, &buf))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* send it to the daemons via xcast */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, &buf, ORTE_RML_TAG_RML_INFO_UPDATE))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - /* now send to the procs so they release from their barrier */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* pack the RML contact info for each proc */