1
1

Transfer across final fixes from debugger attach work

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2016-12-19 00:34:27 -08:00
родитель c1b8538216
Коммит 256b5adac5
5 изменённых файлов: 132 добавлений и 86 удалений

Просмотреть файл

@ -501,8 +501,9 @@ static void _notify_client_event(int sd, short args, void *cbdata)
} }
} }
pmix_output_verbose(2, pmix_globals.debug_output, pmix_output_verbose(2, pmix_globals.debug_output,
"pmix_server: notifying client %s:%d", "pmix_server: notifying client %s:%d of status %s",
pr->peer->info->nptr->nspace, pr->peer->info->rank); pr->peer->info->nptr->nspace, pr->peer->info->rank,
PMIx_Error_string(cd->status));
PMIX_RETAIN(cd->buf); PMIX_RETAIN(cd->buf);
PMIX_SERVER_QUEUE_REPLY(pr->peer, 0, cd->buf); PMIX_SERVER_QUEUE_REPLY(pr->peer, 0, cd->buf);
} }
@ -555,7 +556,7 @@ static pmix_status_t notify_client_of_event(pmix_status_t status,
for (n=0; n < ninfo; n++) { for (n=0; n < ninfo; n++) {
if (0 == strncmp(info[n].key, PMIX_EVENT_NON_DEFAULT, PMIX_MAX_KEYLEN)) { if (0 == strncmp(info[n].key, PMIX_EVENT_NON_DEFAULT, PMIX_MAX_KEYLEN)) {
cd->nondefault = true; cd->nondefault = true;
} else if (strncmp(info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) { } else if (0 == strncmp(info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) {
/* provides an array of pmix_proc_t identifying the procs /* provides an array of pmix_proc_t identifying the procs
* that are to receive this notification */ * that are to receive this notification */
if (PMIX_DATA_ARRAY == info[n].value.type && if (PMIX_DATA_ARRAY == info[n].value.type &&
@ -570,6 +571,7 @@ static pmix_status_t notify_client_of_event(pmix_status_t status,
memcpy(cd->targets, info[n].value.data.proc, sizeof(pmix_proc_t)); memcpy(cd->targets, info[n].value.data.proc, sizeof(pmix_proc_t));
} else { } else {
/* this is an error */ /* this is an error */
PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM);
return PMIX_ERR_BAD_PARAM; return PMIX_ERR_BAD_PARAM;
} }
} }

Просмотреть файл

@ -733,6 +733,9 @@ pmix_persistence_t pmix2x_convert_opalpersist(opal_pmix_persistence_t persist)
void pmix2x_value_load(pmix_value_t *v, void pmix2x_value_load(pmix_value_t *v,
opal_value_t *kv) opal_value_t *kv)
{ {
opal_pmix2x_jobid_trkr_t *job;
bool found;
switch(kv->type) { switch(kv->type) {
case OPAL_UNDEF: case OPAL_UNDEF:
v->type = PMIX_UNDEF; v->type = PMIX_UNDEF;
@ -829,7 +832,18 @@ void pmix2x_value_load(pmix_value_t *v,
v->type = PMIX_PROC; v->type = PMIX_PROC;
/* have to stringify the jobid */ /* have to stringify the jobid */
PMIX_PROC_CREATE(v->data.proc, 1); PMIX_PROC_CREATE(v->data.proc, 1);
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid); /* see if this job is in our list of known nspaces */
found = false;
OPAL_LIST_FOREACH(job, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
if (job->jobid == kv->data.name.jobid) {
(void)strncpy(v->data.proc->nspace, job->nspace, PMIX_MAX_NSLEN);
found = true;
break;
}
}
if (!found) {
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid);
}
v->data.proc->rank = pmix2x_convert_opalrank(kv->data.name.vpid); v->data.proc->rank = pmix2x_convert_opalrank(kv->data.name.vpid);
break; break;
case OPAL_BYTE_OBJECT: case OPAL_BYTE_OBJECT:
@ -875,7 +889,8 @@ int pmix2x_value_unload(opal_value_t *kv,
const pmix_value_t *v) const pmix_value_t *v)
{ {
int rc=OPAL_SUCCESS; int rc=OPAL_SUCCESS;
bool found;
opal_pmix2x_jobid_trkr_t *job;
switch(v->type) { switch(v->type) {
case PMIX_UNDEF: case PMIX_UNDEF:
@ -969,8 +984,19 @@ int pmix2x_value_unload(opal_value_t *kv,
break; break;
case PMIX_PROC: case PMIX_PROC:
kv->type = OPAL_NAME; kv->type = OPAL_NAME;
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc->nspace))) { /* see if this job is in our list of known nspaces */
return pmix2x_convert_opalrc(rc); found = false;
OPAL_LIST_FOREACH(job, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
if (0 == strncmp(job->nspace, v->data.proc->nspace, PMIX_MAX_NSLEN)) {
kv->data.name.jobid = job->jobid;
found = true;
break;
}
}
if (!found) {
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc->nspace))) {
return pmix2x_convert_opalrc(rc);
}
} }
kv->data.name.vpid = pmix2x_convert_rank(v->data.proc->rank); kv->data.name.vpid = pmix2x_convert_rank(v->data.proc->rank);
break; break;

Просмотреть файл

@ -920,13 +920,22 @@ static void toolcbfunc(int status,
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata; pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
pmix_status_t rc; pmix_status_t rc;
pmix_proc_t p; pmix_proc_t p;
opal_pmix2x_jobid_trkr_t *job;
/* convert the status */ /* convert the status */
rc = pmix2x_convert_opalrc(status); rc = pmix2x_convert_opalrc(status);
/* convert the process name */ memset(&p, 0, sizeof(pmix_proc_t));
(void)opal_snprintf_jobid(p.nspace, PMIX_MAX_NSLEN, proc.jobid); if (OPAL_SUCCESS == status) {
p.rank = pmix2x_convert_opalrank(proc.vpid); /* convert the process name */
(void)opal_snprintf_jobid(p.nspace, PMIX_MAX_NSLEN, proc.jobid);
p.rank = pmix2x_convert_opalrank(proc.vpid);
/* store this job in our list of known nspaces */
job = OBJ_NEW(opal_pmix2x_jobid_trkr_t);
(void)strncpy(job->nspace, p.nspace, PMIX_MAX_NSLEN);
job->jobid = proc.jobid;
opal_list_append(&mca_pmix_pmix2x_component.jobids, &job->super);
}
/* pass it down */ /* pass it down */
if (NULL != opalcaddy->toolcbfunc) { if (NULL != opalcaddy->toolcbfunc) {

Просмотреть файл

@ -499,6 +499,7 @@ int pmix2x_server_notify_event(int status,
OPAL_LIST_FOREACH(kv, info, opal_value_t) { OPAL_LIST_FOREACH(kv, info, opal_value_t) {
(void)strncpy(pinfo[n].key, kv->key, PMIX_MAX_KEYLEN); (void)strncpy(pinfo[n].key, kv->key, PMIX_MAX_KEYLEN);
pmix2x_value_load(&pinfo[n].value, kv); pmix2x_value_load(&pinfo[n].value, kv);
++n;
} }
} else { } else {
sz = 0; sz = 0;

Просмотреть файл

@ -459,74 +459,25 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata)
OBJ_RELEASE(caddy); OBJ_RELEASE(caddy);
} }
static void _send_notification(int status, orte_process_name_t *proc) static void _send_notification(int status,
{ orte_process_name_t *proc,
opal_buffer_t buf; orte_process_name_t *target)
orte_grpcomm_signature_t sig;
int rc;
opal_value_t kv, *kvptr;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* pack the status */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &status, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
return;
}
/* the source is me */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
return;
}
/* pass along the affected proc (one opal_value_t) */
rc = 1;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
kv.type = OPAL_NAME;
kv.data.name.jobid = proc->jobid;
kv.data.name.vpid = proc->vpid;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&kv);
/* xcast it to everyone */
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
sig.sz = 1;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, &buf))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&sig);
OBJ_DESTRUCT(&buf);
}
static void _send_direct_notify(int status, orte_process_name_t *proc)
{ {
opal_buffer_t *buf; opal_buffer_t *buf;
orte_grpcomm_signature_t sig;
int rc; int rc;
opal_value_t kv, *kvptr; opal_value_t kv, *kvptr;
orte_process_name_t daemon; orte_process_name_t daemon;
buf = OBJ_NEW(opal_buffer_t); buf = OBJ_NEW(opal_buffer_t);
opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:sending notification %s proc %s target %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_ERROR_NAME(status),
ORTE_NAME_PRINT(proc),
ORTE_NAME_PRINT(target));
/* pack the status */ /* pack the status */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -541,15 +492,17 @@ static void _send_direct_notify(int status, orte_process_name_t *proc)
return; return;
} }
/* pass along the proc to be notified (one opal_value_t) */ /* we are going to pass three opal_value_t's */
rc = 1; rc = 3;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf); OBJ_RELEASE(buf);
return; return;
} }
/* pass along the affected proc(s) */
OBJ_CONSTRUCT(&kv, opal_value_t); OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE); kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
kv.type = OPAL_NAME; kv.type = OPAL_NAME;
kv.data.name.jobid = proc->jobid; kv.data.name.jobid = proc->jobid;
kv.data.name.vpid = proc->vpid; kv.data.name.vpid = proc->vpid;
@ -562,17 +515,66 @@ static void _send_direct_notify(int status, orte_process_name_t *proc)
} }
OBJ_DESTRUCT(&kv); OBJ_DESTRUCT(&kv);
/* pass along the proc(s) to be notified */
/* get the daemon hosting the proc to be notified */ OBJ_CONSTRUCT(&kv, opal_value_t);
daemon.jobid = ORTE_PROC_MY_NAME->jobid; kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
daemon.vpid = orte_get_proc_daemon_vpid(proc); kv.type = OPAL_NAME;
/* send the notification to that daemon */ kv.data.name.jobid = target->jobid;
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, kv.data.name.vpid = target->vpid;
&daemon, buf, kvptr = &kv;
ORTE_RML_TAG_NOTIFICATION, if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf); OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
/* mark this as intended for non-default event handlers */
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
kv.type = OPAL_BOOL;
kv.data.flag = true;
kvptr = &kv;
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
OBJ_RELEASE(buf);
return;
}
OBJ_DESTRUCT(&kv);
/* if the targets are a wildcard, then xcast it to everyone */
if (ORTE_VPID_WILDCARD == target->vpid) {
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
sig.sz = 1;
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&sig);
OBJ_RELEASE(buf);
} else {
/* get the daemon hosting the proc to be notified */
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
daemon.vpid = orte_get_proc_daemon_vpid(target);
/* send the notification to that daemon */
opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:sending notification %s to proc %s at daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_ERROR_NAME(status),
ORTE_NAME_PRINT(target),
ORTE_NAME_PRINT(&daemon));
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&daemon, buf,
ORTE_RML_TAG_NOTIFICATION,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
}
} }
} }
@ -585,7 +587,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
orte_proc_t *pdata; orte_proc_t *pdata;
int i; int i;
char *rtmod; char *rtmod;
orte_process_name_t parent, *npptr; orte_process_name_t parent, target, *npptr;
opal_output_verbose(5, orte_state_base_framework.framework_output, opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:track_procs called for proc %s state %s", "%s state:base:track_procs called for proc %s state %s",
@ -699,15 +701,21 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
npptr = &parent; npptr = &parent;
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) { if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
/* notify everyone who asked for it */ /* notify everyone who asked for it */
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, ORTE_NAME_WILDCARD); target.jobid = jdata->jobid;
target.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
} else { } else {
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, &parent); target.jobid = jdata->jobid;
target.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
} }
} }
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state && } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
!orte_job_term_ordered) { !orte_job_term_ordered) {
/* if this was an abnormal term, notify the other procs of the termination */ /* if this was an abnormal term, notify the other procs of the termination */
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name); parent.jobid = jdata->jobid;
parent.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
} }
} }