From 21d743393fac00d91edfbcb615e61cb3d72b7746 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 28 Feb 2020 17:53:48 -0500 Subject: [PATCH] Make sure the PML is consistent across the world. Temporary solution for the PML inconsistency issue discussed in #7475. This patch addresses two things: first, it makes the PMIx key optional so that if we are not in full modex mode we don't do a direct modex; second, it gets the PML info from vpid 0 instead of from the local rank. Signed-off-by: George Bosilca --- ompi/mca/pml/base/pml_base_select.c | 9 +++--- opal/mca/pmix/pmix-internal.h | 48 ++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/ompi/mca/pml/base/pml_base_select.c b/ompi/mca/pml/base/pml_base_select.c index 20cb3010dc..33b98a93e9 100644 --- a/ompi/mca/pml/base/pml_base_select.c +++ b/ompi/mca/pml/base/pml_base_select.c @@ -324,6 +324,7 @@ mca_pml_base_pml_check_selected(const char *my_pml, size_t size; int ret; char *remote_pml; + opal_process_name_t rank0 = {.jobid = ompi_proc_local()->super.proc_name.jobid, .vpid = 0}; /* if no modex was required by the PML, then * we can assume success @@ -342,13 +343,13 @@ mca_pml_base_pml_check_selected(const char *my_pml, } /* get the name of the PML module selected by rank=0 */ - OPAL_MODEX_RECV(ret, &pml_base_component, - &procs[0]->super.proc_name, (void**) &remote_pml, &size); + OPAL_MODEX_RECV_STRING_OPTIONAL(ret, mca_base_component_to_string(&pml_base_component), + &rank0, (void**) &remote_pml, &size); /* if this key wasn't found, then just assume all is well... 
*/ - if (OMPI_SUCCESS != ret) { + if (PMIX_SUCCESS != ret) { opal_output_verbose( 10, ompi_pml_base_framework.framework_output, - "check:select: modex data not found"); + "check:select: PML modex for vpid 0 data not found"); return OMPI_SUCCESS; } diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index 3094c4d8aa..9114bb3492 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -301,14 +301,14 @@ typedef struct { * is to be returned * t - the expected data type */ -#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \ - do { \ +#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \ + do { \ pmix_proc_t _proc; \ pmix_value_t *_kv = NULL; \ pmix_info_t _info; \ size_t _sz; \ OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ - "%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \ + "%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ __FILE__, __LINE__, \ OPAL_NAME_PRINT(*(p)), (s))); \ @@ -349,7 +349,7 @@ typedef struct { OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ __FILE__, __LINE__, \ OPAL_NAME_PRINT(*(p)), (s))); \ - OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ + OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ (r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \ if (NULL == _kv) { \ (r) = PMIX_ERR_NOT_FOUND; \ @@ -363,6 +363,46 @@ typedef struct { } \ } while(0); +/** + * Provide a simplified macro for retrieving modex data + * from another process: + * + * r - the integer return status from the modex op (int) + * s - string key (char*) + * p - pointer to the opal_process_name_t of the proc that posted + * the data (opal_process_name_t*) + * d - pointer to a location wherein the data object + * is to be returned (char**) + * sz - pointer to a location wherein the number of bytes + * in the data object can be returned (size_t*) + */ +#define OPAL_MODEX_RECV_STRING_OPTIONAL(r, s, p, d, sz) \ + do { \ + pmix_proc_t _proc; \ + pmix_value_t *_kv = NULL; \ + pmix_info_t _info; \ 
+ OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ + "%s[%s:%d] MODEX RECV STRING OPTIONAL FOR PROC %s KEY %s", \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ + __FILE__, __LINE__, \ + OPAL_NAME_PRINT(*(p)), (s))); \ + *(d) = NULL; \ + *(sz) = 0; \ + OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ + PMIX_INFO_LOAD(&_info, PMIX_OPTIONAL, NULL, PMIX_BOOL); \ + (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \ + if (NULL == _kv) { \ + (r) = PMIX_ERR_NOT_FOUND; \ + } else if (PMIX_SUCCESS == (r)) { \ + *(d) = (uint8_t*)_kv->data.bo.bytes; \ + *(sz) = _kv->data.bo.size; \ + _kv->data.bo.bytes = NULL; /* protect the data */ \ + } \ + if (NULL != _kv) { \ + PMIX_VALUE_RELEASE(_kv); \ + } \ + } while(0); + /** * Provide a simplified macro for retrieving modex data * from another process: