1
1

Make sure the PML is consistent across the world.

Temporary solution for the PML inconsistency issue discussed in #7475.
This patch addresses two things: first, it makes the PMIx key optional so
that if we are not in full modex mode we don't do a direct modex; second,
it gets the PML info from vpid 0 instead of from the local
rank.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2020-02-28 17:53:48 -05:00
родитель 0276679595
Коммит 21d743393f
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
2 изменённых файлов: 49 добавлений и 8 удалений

Просмотреть файл

@ -324,6 +324,7 @@ mca_pml_base_pml_check_selected(const char *my_pml,
size_t size;
int ret;
char *remote_pml;
opal_process_name_t rank0 = {.jobid = ompi_proc_local()->super.proc_name.jobid, .vpid = 0};
/* if no modex was required by the PML, then
* we can assume success
@ -342,13 +343,13 @@ mca_pml_base_pml_check_selected(const char *my_pml,
}
/* get the name of the PML module selected by rank=0 */
OPAL_MODEX_RECV(ret, &pml_base_component,
&procs[0]->super.proc_name, (void**) &remote_pml, &size);
OPAL_MODEX_RECV_STRING_OPTIONAL(ret, mca_base_component_to_string(&pml_base_component),
&rank0, (void**) &remote_pml, &size);
/* if this key wasn't found, then just assume all is well... */
if (OMPI_SUCCESS != ret) {
if (PMIX_ERR_NOT_FOUND != ret) {
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: modex data not found");
"check:select: PML modex for vpid 0 data not found");
return OMPI_SUCCESS;
}

Просмотреть файл

@ -301,14 +301,14 @@ typedef struct {
* is to be returned
* t - the expected data type
*/
#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \
do { \
#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \
do { \
pmix_proc_t _proc; \
pmix_value_t *_kv = NULL; \
pmix_info_t _info; \
size_t _sz; \
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
"%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \
"%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s))); \
@ -349,7 +349,7 @@ typedef struct {
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s))); \
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
(r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \
if (NULL == _kv) { \
(r) = PMIX_ERR_NOT_FOUND; \
@ -363,6 +363,46 @@ typedef struct {
} \
} while(0);
/**
 * Provide a simplified macro for retrieving modex data, stored as a
 * byte object under a string key, from another process. The lookup
 * passes the PMIX_OPTIONAL attribute to PMIx_Get.
 *
 * r  - the integer return status from the modex op (int). Set to
 *      PMIX_ERR_NOT_FOUND when PMIx_Get returns no value, and to
 *      PMIX_ERR_TYPE_MISMATCH when the returned value is not a
 *      byte object; otherwise it is the PMIx_Get status.
 * s  - string key (char*)
 * p  - pointer to the opal_process_name_t of the proc that posted
 *      the data (opal_process_name_t*)
 * d  - pointer to a location wherein the data object
 *      is to be returned (char**). Reset to NULL before the lookup.
 *      On success the buffer is detached from the PMIx value before
 *      that value is released, so it is handed over to the caller
 *      (who is presumably responsible for freeing it).
 * sz - pointer to a location wherein the number of bytes
 *      in the data object can be returned (size_t*). Reset to 0
 *      before the lookup.
 *
 * NOTE: the expansion ends with a semicolon, so the macro must not be
 * used as the unbraced body of an if that has an else branch.
 */
#define OPAL_MODEX_RECV_STRING_OPTIONAL(r, s, p, d, sz)                              \
    do {                                                                             \
        pmix_proc_t _proc;                                                           \
        pmix_value_t *_kv = NULL;                                                    \
        pmix_info_t _info;                                                           \
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output,                            \
                            "%s[%s:%d] MODEX RECV STRING OPTIONAL FOR PROC %s KEY %s", \
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),                      \
                            __FILE__, __LINE__,                                      \
                            OPAL_NAME_PRINT(*(p)), (s)));                            \
        *(d) = NULL;                                                                 \
        *(sz) = 0;                                                                   \
        OPAL_PMIX_CONVERT_NAME(&_proc, (p));                                         \
        PMIX_INFO_LOAD(&_info, PMIX_OPTIONAL, NULL, PMIX_BOOL);                      \
        (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv));                          \
        if (NULL == _kv) {                                                           \
            (r) = PMIX_ERR_NOT_FOUND;                                                \
        } else if (PMIX_SUCCESS == (r)) {                                            \
            if (PMIX_BYTE_OBJECT != _kv->type) {                                     \
                /* only a byte object has a valid data.bo member */                  \
                (r) = PMIX_ERR_TYPE_MISMATCH;                                        \
            } else {                                                                 \
                *(d) = (void*)_kv->data.bo.bytes;                                    \
                *(sz) = _kv->data.bo.size;                                           \
                _kv->data.bo.bytes = NULL; /* protect the data */                    \
            }                                                                        \
        }                                                                            \
        if (NULL != _kv) {                                                           \
            PMIX_VALUE_RELEASE(_kv);                                                 \
        }                                                                            \
    } while(0);
/**
* Provide a simplified macro for retrieving modex data
* from another process: