1
1

Merge pull request #5838 from rhc54/topic/ev

Correctly notify upon process failure
Этот коммит содержится в:
Ralph Castain 2018-10-04 04:56:05 -07:00 коммит произвёл GitHub
родитель 2a6c543197 86702b71bc
Коммит 44afb59a01
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файла: 29 добавлений и 45 удалений

Просмотреть файл

@@ -516,6 +516,7 @@ int pmix4x_server_notify_event(int status,
size_t sz, n;
pmix_status_t rc;
pmix4x_opcaddy_t *op;
pmix_data_range_t range = PMIX_RANGE_SESSION;
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
if (0 >= opal_pmix_base.initialized) {
@@ -535,6 +536,9 @@ int pmix4x_server_notify_event(int status,
pinfo[n].value.data.status = pmix4x_convert_opalrc(kv->data.integer);
} else {
pmix4x_value_load(&pinfo[n].value, kv);
if (0 == strcmp(kv->key, OPAL_PMIX_EVENT_CUSTOM_RANGE)) {
range = PMIX_RANGE_CUSTOM;
}
}
++n;
}
@@ -561,7 +565,7 @@ int pmix4x_server_notify_event(int status,
rc = pmix4x_convert_opalrc(status);
/* the range must be nonlocal so the server will pass
* the event down to its local clients */
rc = PMIx_Notify_event(rc, &op->p, PMIX_RANGE_SESSION,
rc = PMIx_Notify_event(rc, &op->p, range,
pinfo, sz, opcbfunc, op);
if (PMIX_SUCCESS != rc) {
OBJ_RELEASE(op);

Просмотреть файл

@@ -551,37 +551,18 @@ static void _send_notification(int status,
return;
}
if (OPAL_ERR_PROC_ABORTED == status) {
/* we will pass three opal_value_t's */
rc = 3;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
/* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
kv.type = OPAL_NAME;
kv.data.name.jobid = proc->jobid;
kv.data.name.vpid = proc->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
if (ORTE_VPID_WILDCARD == target->vpid) {
/* we will only pass the affected proc */
rc = 1;
} else {
/* we are going to pass two opal_value_t's */
/* we have to pass the target */
rc = 2;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
}
/* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t);
@@ -598,7 +579,21 @@ static void _send_notification(int status,
}
OBJ_DESTRUCT(&kv);
/* pass along the proc(s) to be notified */
if (ORTE_VPID_WILDCARD == target->vpid) {
/* xcast it to everyone */
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
sig.sz = 1;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&sig);
OBJ_RELEASE(buf);
} else {
/* pass along the proc to be notified */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
kv.type = OPAL_NAME;
@@ -612,21 +607,6 @@ static void _send_notification(int status,
return;
}
OBJ_DESTRUCT(&kv);
/* if the targets are a wildcard, then xcast it to everyone */
if (ORTE_VPID_WILDCARD == target->vpid) {
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
sig.sz = 1;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&sig);
OBJ_RELEASE(buf);
} else {
/* get the daemon hosting the proc to be notified */
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
daemon.vpid = orte_get_proc_daemon_vpid(target);