From f9dfa03fdec0cef33d2479d38ab2e6b62237ea6a Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 21 Sep 2009 21:10:45 +0000 Subject: [PATCH] Fix a potential ordering issue with the names and RML exchange during sm coll setup. This commit was SVN r21981. --- ompi/mca/common/sm/common_sm_mmap.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ompi/mca/common/sm/common_sm_mmap.c b/ompi/mca/common/sm/common_sm_mmap.c index 92b02fb897..1acf5a919e 100644 --- a/ompi/mca/common/sm/common_sm_mmap.c +++ b/ompi/mca/common/sm/common_sm_mmap.c @@ -192,19 +192,23 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, char filename_to_send[OPAL_PATH_MAX]; opal_list_item_t *item; pending_rml_msg_t *rml_msg; + ompi_proc_t *temp_proc; /* Reorder all procs array to have all the local procs at the beginning. Simultaneously look for the local proc with the - lowest name. */ + lowest name. Ensure that procs[0] is the lowest named + process. */ for (p = 0; p < num_procs; p++) { if (OPAL_PROC_ON_LOCAL_NODE(procs[p]->proc_flags)) { procs[num_local_procs] = procs[p]; if (NULL == lowest_name) { - lowest_name = &(procs[0]->proc_name); + procs[num_local_procs] = procs[p]; } else if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &(procs[p]->proc_name), lowest_name) < 0) { - lowest_name = &(procs[p]->proc_name); + temp_proc = procs[0]; + procs[0] = procs[p]; + procs[num_local_procs] = temp_proc; } ++num_local_procs; } @@ -213,6 +217,7 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, if (0 == num_local_procs) { return NULL; } + lowest_name = &(procs[0]->proc_name); num_procs = num_local_procs; iov[0].iov_base = &sm_file_created; @@ -268,10 +273,12 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs, for (p = 1; p < num_procs; p++) { rc = orte_rml.send(&(procs[p]->proc_name), iov, 3, OMPI_RML_TAG_SM_BACK_FILE_CREATED, 0); - if (rc < 0) { + if (rc < (ssize_t) (iov[0].iov_len + iov[1].iov_len + iov[2].iov_len)) { opal_output(0, "mca_common_sm_mmap_init: " - "orte_rml.send failed to %lu with errno=%d\n", - (unsigned long)p, errno); + "orte_rml.send failed to %lu with errno=%d, ret=%d, iov_len sum=%d\n", + (unsigned long)p, errno, + rc, + (int) (iov[0].iov_len + iov[1].iov_len + iov[2].iov_len)); goto out; } }