1
1

Merge pull request #3301 from rhc54/topic/faults

Correctly identify the source of the event when notifying of abnormal termination by a process
Этот коммит содержится в:
Ralph Castain 2017-04-06 21:43:34 -07:00 коммит произвёл GitHub
родитель 666386fc19 b526bca56c
Коммит eba6c6b827
2 изменённых файлов: 43 добавлений и 11 удалений

Просмотреть файл

@ -460,6 +460,7 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata)
} }
static void _send_notification(int status, static void _send_notification(int status,
orte_proc_state_t state,
orte_process_name_t *proc, orte_process_name_t *proc,
orte_process_name_t *target) orte_process_name_t *target)
{ {
@ -485,13 +486,36 @@ static void _send_notification(int status,
return; return;
} }
/* the source is me */ /* the source is the proc */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf); OBJ_RELEASE(buf);
return; return;
} }
if (OPAL_ERR_PROC_ABORTED == status) {
/* we will pass four opal_value_t's */
rc = 4;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return;
}
/* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
kv.type = OPAL_NAME;
kv.data.name.jobid = proc->jobid;
kv.data.name.vpid = proc->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
} else {
/* we are going to pass three opal_value_t's */ /* we are going to pass three opal_value_t's */
rc = 3; rc = 3;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
@ -499,6 +523,7 @@ static void _send_notification(int status,
OBJ_RELEASE(buf); OBJ_RELEASE(buf);
return; return;
} }
}
/* pass along the affected proc(s) */ /* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t); OBJ_CONSTRUCT(&kv, opal_value_t);
@ -699,11 +724,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
/* notify everyone who asked for it */ /* notify everyone who asked for it */
target.jobid = jdata->jobid; target.jobid = jdata->jobid;
target.vpid = ORTE_VPID_WILDCARD; target.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD); _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, ORTE_NAME_WILDCARD);
} else { } else {
target.jobid = jdata->jobid; target.jobid = jdata->jobid;
target.vpid = ORTE_VPID_WILDCARD; target.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent); _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, &parent);
} }
} }
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state && } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
@ -711,7 +736,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
/* if this was an abnormal term, notify the other procs of the termination */ /* if this was an abnormal term, notify the other procs of the termination */
parent.jobid = jdata->jobid; parent.jobid = jdata->jobid;
parent.vpid = ORTE_VPID_WILDCARD; parent.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent); _send_notification(OPAL_ERR_PROC_ABORTED, pdata->state, &pdata->name, &parent);
} }
} }

Просмотреть файл

@ -641,6 +641,13 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
OBJ_CONSTRUCT(&bucket, opal_buffer_t); OBJ_CONSTRUCT(&bucket, opal_buffer_t);
while (NULL != (item = opal_list_remove_first(&topos))) { while (NULL != (item = opal_list_remove_first(&topos))) {
rng = (orte_regex_range_t*)item; rng = (orte_regex_range_t*)item;
if (NULL == rng->t) {
/* when we pass thru here prior to launching the daemons, we
* won't have topologies for them and so this entry might
* be NULL - protect ourselves */
OBJ_RELEASE(item);
continue;
}
if (NULL == tmp) { if (NULL == tmp) {
asprintf(&tmp, "%d", rng->cnt); asprintf(&tmp, "%d", rng->cnt);
} else { } else {