1
1

Mostly added diagnostic messages to help chase down a problem in comm_spawn. Also fixed an error in the GPR notification system; it is not yet clear whether that fully resolves the problem, but it is definite progress.

This commit was SVN r3709.
Этот коммит содержится в:
Ralph Castain 2004-12-06 16:50:45 +00:00
родитель 8304d0c5fa
Коммит 1454832bf7
6 изменённых файлов: 42 добавлений и 16 удалений

Просмотреть файл

@ -339,15 +339,16 @@ static int mca_base_modex_subscribe(ompi_process_name_t* name)
/* otherwise - subscribe */ /* otherwise - subscribe */
asprintf(&segment, "%s-%s", OMPI_RTE_MODEX_SEGMENT, mca_ns_base_get_jobid_string(name)); asprintf(&segment, "%s-%s", OMPI_RTE_MODEX_SEGMENT, mca_ns_base_get_jobid_string(name));
rctag = ompi_registry.subscribe( rctag = ompi_registry.subscribe(
OMPI_REGISTRY_OR, OMPI_REGISTRY_OR,
OMPI_REGISTRY_NOTIFY_ADD_ENTRY|OMPI_REGISTRY_NOTIFY_DELETE_ENTRY| OMPI_REGISTRY_NOTIFY_ADD_ENTRY|OMPI_REGISTRY_NOTIFY_DELETE_ENTRY|
OMPI_REGISTRY_NOTIFY_MODIFICATION| OMPI_REGISTRY_NOTIFY_MODIFICATION|
OMPI_REGISTRY_NOTIFY_ON_STARTUP|OMPI_REGISTRY_NOTIFY_INCLUDE_STARTUP_DATA| OMPI_REGISTRY_NOTIFY_ON_STARTUP|OMPI_REGISTRY_NOTIFY_INCLUDE_STARTUP_DATA|
OMPI_REGISTRY_NOTIFY_ON_SHUTDOWN, OMPI_REGISTRY_NOTIFY_PRE_EXISTING|
segment, OMPI_REGISTRY_NOTIFY_ON_SHUTDOWN,
NULL, segment,
mca_base_modex_registry_callback, NULL,
NULL); mca_base_modex_registry_callback,
NULL);
if(rctag == OMPI_REGISTRY_NOTIFY_ID_MAX) { if(rctag == OMPI_REGISTRY_NOTIFY_ID_MAX) {
ompi_output(0, "mca_base_modex_exchange: " ompi_output(0, "mca_base_modex_exchange: "
"ompi_registry.subscribe failed with return code %d\n", (int)rctag); "ompi_registry.subscribe failed with return code %d\n", (int)rctag);

Просмотреть файл

@ -249,14 +249,16 @@ void mca_gpr_proxy_notify_recv(int status, ompi_process_name_t* sender,
char **tokptr; char **tokptr;
mca_gpr_cmd_flag_t command; mca_gpr_cmd_flag_t command;
uint32_t num_items; uint32_t num_items;
uint32_t i, id_tag; uint32_t i;
ompi_registry_notify_id_t id_tag;
ompi_registry_value_t *regval; ompi_registry_value_t *regval;
ompi_registry_notify_message_t *message; ompi_registry_notify_message_t *message;
bool found; bool found;
mca_gpr_proxy_notify_request_tracker_t *trackptr; mca_gpr_proxy_notify_request_tracker_t *trackptr;
if (mca_gpr_proxy_debug) { if (mca_gpr_proxy_debug) {
ompi_output(0, "gpr proxy: received trigger message"); ompi_output(0, "[%d,%d,%d] gpr proxy: received trigger message",
OMPI_NAME_ARGS(*ompi_rte_get_self()));
} }
message = OBJ_NEW(ompi_registry_notify_message_t); message = OBJ_NEW(ompi_registry_notify_message_t);
@ -275,9 +277,15 @@ void mca_gpr_proxy_notify_recv(int status, ompi_process_name_t* sender,
} }
message->owning_job = (mca_ns_base_jobid_t)i; message->owning_job = (mca_ns_base_jobid_t)i;
if (OMPI_SUCCESS != ompi_unpack(buffer, &id_tag, 1, OMPI_INT32)) { if (OMPI_SUCCESS != ompi_unpack(buffer, &i, 1, OMPI_INT32)) {
goto RETURN_ERROR; goto RETURN_ERROR;
} }
id_tag = (ompi_registry_notify_id_t)i;
if (mca_gpr_proxy_debug) {
ompi_output(0, "[%d,%d,%d] trigger from segment %s id %d",
OMPI_NAME_ARGS(*ompi_rte_get_self()), message->segment, (int)id_tag);
}
if (OMPI_SUCCESS != ompi_unpack(buffer, &message->trig_action, 1, MCA_GPR_OOB_PACK_ACTION)) { if (OMPI_SUCCESS != ompi_unpack(buffer, &message->trig_action, 1, MCA_GPR_OOB_PACK_ACTION)) {
goto RETURN_ERROR; goto RETURN_ERROR;
@ -330,6 +338,8 @@ void mca_gpr_proxy_notify_recv(int status, ompi_process_name_t* sender,
for (trackptr = (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_first(&mca_gpr_proxy_notify_request_tracker); for (trackptr = (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_first(&mca_gpr_proxy_notify_request_tracker);
trackptr != (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_end(&mca_gpr_proxy_notify_request_tracker) && !found; trackptr != (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_end(&mca_gpr_proxy_notify_request_tracker) && !found;
trackptr = (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_next(trackptr)) { trackptr = (mca_gpr_proxy_notify_request_tracker_t*)ompi_list_get_next(trackptr)) {
ompi_output(0, "\tchecking trigger %d for segment %s\n", trackptr->local_idtag,
trackptr->segment);
if (trackptr->local_idtag == id_tag) { if (trackptr->local_idtag == id_tag) {
found = true; found = true;
} }

Просмотреть файл

@ -84,6 +84,12 @@ mca_gpr_proxy_subscribe(ompi_registry_mode_t mode,
goto CLEANUP; goto CLEANUP;
} }
if (mca_gpr_proxy_debug) {
ompi_output(0, "[%d,%d,%d] gpr proxy subscribe: subscribing to segment %s local idtag %d",
OMPI_NAME_ARGS(*ompi_rte_get_self()), segment, (int)idtag);
}
if (0 > mca_oob_send_packed(mca_gpr_my_replica, cmd, MCA_OOB_TAG_GPR, 0)) { if (0 > mca_oob_send_packed(mca_gpr_my_replica, cmd, MCA_OOB_TAG_GPR, 0)) {
goto CLEANUP; goto CLEANUP;
} }

Просмотреть файл

@ -232,6 +232,12 @@ bool mca_gpr_replica_process_triggers(mca_gpr_replica_segment_t *seg,
cb->user_tag = NULL; cb->user_tag = NULL;
cb->message = message; cb->message = message;
cb->remote_idtag = trackptr->remote_idtag; cb->remote_idtag = trackptr->remote_idtag;
if (mca_gpr_replica_debug) {
ompi_output(0, "[%d,%d,%d] process_trig: queueing message for [%d,%d,%d] with idtag %d using remoteid %d\n",
OMPI_NAME_ARGS(*ompi_rte_get_self()), OMPI_NAME_ARGS(*(cb->requestor)),
(int)cb->remote_idtag, (int)trackptr->remote_idtag);
}
} }
ompi_list_append(&mca_gpr_replica_callbacks, &cb->item); ompi_list_append(&mca_gpr_replica_callbacks, &cb->item);
@ -316,8 +322,7 @@ mca_gpr_replica_enter_notify_request(mca_gpr_replica_segment_t *seg,
trackptr->segptr = seg; trackptr->segptr = seg;
trackptr->action = action; trackptr->action = action;
trackptr->requestor = ompi_name_server.copy_process_name(requestor); trackptr->requestor = ompi_name_server.copy_process_name(requestor);
trackptr->local_idtag = idtag; trackptr->remote_idtag = idtag;
trackptr->remote_idtag = OMPI_REGISTRY_NOTIFY_ID_MAX;
trackptr->callback = cb_func; trackptr->callback = cb_func;
trackptr->user_tag = user_tag; trackptr->user_tag = user_tag;
if (ompi_list_is_empty(&mca_gpr_replica_free_notify_id_tags)) { if (ompi_list_is_empty(&mca_gpr_replica_free_notify_id_tags)) {

Просмотреть файл

@ -926,6 +926,11 @@ static ompi_registry_notify_id_t mca_gpr_replica_recv_subscribe_cmd(ompi_process
if (NULL != sender) { /* remote sender */ if (NULL != sender) { /* remote sender */
if (mca_gpr_replica_debug) {
ompi_output(0, "[%d,%d,%d] subscribe created for remote sender [%d,%d,%d] on segment %s for idtag %d",
OMPI_NAME_ARGS(*ompi_rte_get_self()), OMPI_NAME_ARGS(*sender), segment, id_tag);
}
/* enter request on local notify tracking system */ /* enter request on local notify tracking system */
local_idtag1 = mca_gpr_replica_enter_notify_request(seg, action, sender, id_tag, NULL, NULL); local_idtag1 = mca_gpr_replica_enter_notify_request(seg, action, sender, id_tag, NULL, NULL);

Просмотреть файл

@ -323,8 +323,7 @@ void mca_gpr_replica_remote_notify(ompi_process_name_t *recipient, int recipient
if (OMPI_SUCCESS != ompi_pack(msg, regval->object, regval->object_size, OMPI_BYTE)) { if (OMPI_SUCCESS != ompi_pack(msg, regval->object, regval->object_size, OMPI_BYTE)) {
return; return;
} }
/* TSW - should we add */ OBJ_RELEASE(regval);
/* OBJ_RELEASE(regval); */
} }
} }
if (OMPI_SUCCESS != ompi_pack(msg, &message->num_tokens, 1, OMPI_INT32)) { if (OMPI_SUCCESS != ompi_pack(msg, &message->num_tokens, 1, OMPI_INT32)) {