1
1

Merge pull request #5838 from rhc54/topic/ev

Correctly notify upon process failure
Этот коммит содержится в:
Ralph Castain 2018-10-04 04:56:05 -07:00 коммит произвёл GitHub
родитель 2a6c543197 86702b71bc
Коммит 44afb59a01
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файла: 29 добавлений и 45 удалений

Просмотреть файл

@@ -516,6 +516,7 @@ int pmix4x_server_notify_event(int status,
size_t sz, n; size_t sz, n;
pmix_status_t rc; pmix_status_t rc;
pmix4x_opcaddy_t *op; pmix4x_opcaddy_t *op;
pmix_data_range_t range = PMIX_RANGE_SESSION;
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock); OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
if (0 >= opal_pmix_base.initialized) { if (0 >= opal_pmix_base.initialized) {
@@ -535,6 +536,9 @@ int pmix4x_server_notify_event(int status,
pinfo[n].value.data.status = pmix4x_convert_opalrc(kv->data.integer); pinfo[n].value.data.status = pmix4x_convert_opalrc(kv->data.integer);
} else { } else {
pmix4x_value_load(&pinfo[n].value, kv); pmix4x_value_load(&pinfo[n].value, kv);
if (0 == strcmp(kv->key, OPAL_PMIX_EVENT_CUSTOM_RANGE)) {
range = PMIX_RANGE_CUSTOM;
}
} }
++n; ++n;
} }
@@ -561,7 +565,7 @@ int pmix4x_server_notify_event(int status,
rc = pmix4x_convert_opalrc(status); rc = pmix4x_convert_opalrc(status);
/* the range must be nonlocal so the server will pass /* the range must be nonlocal so the server will pass
* the event down to its local clients */ * the event down to its local clients */
rc = PMIx_Notify_event(rc, &op->p, PMIX_RANGE_SESSION, rc = PMIx_Notify_event(rc, &op->p, range,
pinfo, sz, opcbfunc, op); pinfo, sz, opcbfunc, op);
if (PMIX_SUCCESS != rc) { if (PMIX_SUCCESS != rc) {
OBJ_RELEASE(op); OBJ_RELEASE(op);

Просмотреть файл

@@ -551,36 +551,17 @@ static void _send_notification(int status,
return; return;
} }
if (OPAL_ERR_PROC_ABORTED == status) { if (ORTE_VPID_WILDCARD == target->vpid) {
/* we will pass three opal_value_t's */ /* we will only pass the affected proc */
rc = 3; rc = 1;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
/* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
kv.type = OPAL_NAME;
kv.data.name.jobid = proc->jobid;
kv.data.name.vpid = proc->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
} else { } else {
/* we are going to pass two opal_value_t's */ /* we have to pass the target */
rc = 2; rc = 2;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { }
ORTE_ERROR_LOG(rc); if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
OBJ_RELEASE(buf); ORTE_ERROR_LOG(rc);
return; OBJ_RELEASE(buf);
} return;
} }
/* pass along the affected proc(s) */ /* pass along the affected proc(s) */
@@ -598,23 +579,8 @@ static void _send_notification(int status,
} }
OBJ_DESTRUCT(&kv); OBJ_DESTRUCT(&kv);
/* pass along the proc(s) to be notified */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
kv.type = OPAL_NAME;
kv.data.name.jobid = target->jobid;
kv.data.name.vpid = target->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
/* if the targets are a wildcard, then xcast it to everyone */
if (ORTE_VPID_WILDCARD == target->vpid) { if (ORTE_VPID_WILDCARD == target->vpid) {
/* xcast it to everyone */
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t); OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid; sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
@@ -627,6 +593,20 @@ static void _send_notification(int status,
OBJ_DESTRUCT(&sig); OBJ_DESTRUCT(&sig);
OBJ_RELEASE(buf); OBJ_RELEASE(buf);
} else { } else {
/* pass along the proc to be notified */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
kv.type = OPAL_NAME;
kv.data.name.jobid = target->jobid;
kv.data.name.vpid = target->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
/* get the daemon hosting the proc to be notified */ /* get the daemon hosting the proc to be notified */
daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.jobid = ORTE_PROC_MY_NAME->jobid;
daemon.vpid = orte_get_proc_daemon_vpid(target); daemon.vpid = orte_get_proc_daemon_vpid(target);