2007-07-26 02:28:04 +04:00
|
|
|
/*
|
2011-10-04 18:50:31 +04:00
|
|
|
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
2007-07-26 02:28:04 +04:00
|
|
|
* All rights reserved.
|
2012-04-06 18:23:13 +04:00
|
|
|
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
2015-06-24 06:59:57 +03:00
|
|
|
* reserved.
|
2015-06-18 19:53:20 +03:00
|
|
|
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
2007-07-26 02:28:04 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2008-01-05 03:17:32 +03:00
|
|
|
#include "ompi_config.h"
|
2007-07-31 20:01:32 +04:00
|
|
|
#include "vprotocol_pessimist_eventlog.h"
|
2015-06-18 19:53:20 +03:00
|
|
|
#include "opal/mca/pmix/pmix.h"
|
|
|
|
#include "ompi/dpm/dpm.h"
|
2008-10-01 22:42:43 +04:00
|
|
|
|
2009-03-17 20:35:28 +03:00
|
|
|
/**
 * Connect this process to the event logger designated by el_rank.
 *
 * The port name published under VPROTOCOL_EVENT_LOGGER_NAME_FMT (formatted
 * with el_rank) is looked up, a connection is opened from MPI_COMM_SELF, and
 * a handshake is exchanged: the MPI_COMM_WORLD rank of this process is sent,
 * and two clock-sized values are received back.
 *
 * @param el_rank  rank used to build the published event logger name
 * @param el_comm  (OUT) communicator filled in by ompi_dpm_connect_accept
 *
 * @return OMPI_ERR_OUT_OF_RESOURCE when the lookup request cannot be
 *         allocated, OMPI_ERR_NOT_FOUND when no string-typed port is
 *         returned by the lookup, the error code of ompi_dpm_connect_accept
 *         when the connection fails, otherwise the status of the last
 *         handshake operation.
 */
int vprotocol_pessimist_event_logger_connect(int el_rank, ompi_communicator_t **el_comm)
{
    int rc;
    char *port;
    int rank;
    vprotocol_pessimist_clock_t connect_info[2];
    opal_list_t results;
    opal_pmix_pdata_t *pdat;

    OBJ_CONSTRUCT(&results, opal_list_t);
    pdat = OBJ_NEW(opal_pmix_pdata_t);
    if (NULL == pdat) {
        OPAL_LIST_DESTRUCT(&results);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* The list owns pdat from here on, so every exit path only has to
     * destruct the list. */
    opal_list_append(&results, &pdat->super);

    if (asprintf(&pdat->value.key, VPROTOCOL_EVENT_LOGGER_NAME_FMT, el_rank) < 0) {
        /* asprintf leaves the target pointer unspecified on failure; reset it
         * so that tearing down the list never sees a garbage key. */
        pdat->value.key = NULL;
        OPAL_LIST_DESTRUCT(&results);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = opal_pmix.lookup(OPAL_PMIX_NAMESPACE, &results);
    if (OPAL_SUCCESS != rc ||
        OPAL_STRING != pdat->value.type ||
        NULL == pdat->value.data.string) {
        OPAL_LIST_DESTRUCT(&results);
        return OMPI_ERR_NOT_FOUND;
    }

    /* Borrow the looked-up string instead of duplicating it: the result list
     * is kept alive until the connection attempt is over, so nothing has to
     * be freed separately (the former strdup'd copy was never released). */
    port = pdat->value.data.string;
    V_OUTPUT_VERBOSE(45, "Found port < %s >", port);

    rc = ompi_dpm_connect_accept(MPI_COMM_SELF, 0, port, true, el_comm);
    OPAL_LIST_DESTRUCT(&results);
    if (OMPI_SUCCESS != rc) {
        /* Without a connection there is no communicator to run the
         * handshake on; report the failure to the caller. */
        OMPI_ERROR_LOG(rc);
        return rc;
    }

    /* Send Rank, receive max buffer size and max_clock back */
    /* NOTE(review): the handshake uses mca_vprotocol_pessimist.el_comm rather
     * than *el_comm; presumably callers pass the address of that field --
     * confirm. */
    /* NOTE(review): MPI_INTEGER is the Fortran INTEGER datatype while rank is
     * a C int; kept as is to stay compatible with the peer -- confirm. */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    rc = mca_pml_v.host_pml.pml_send(&rank, 1, MPI_INTEGER, 0,
                                     VPROTOCOL_PESSIMIST_EVENTLOG_NEW_CLIENT_CMD,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     mca_vprotocol_pessimist.el_comm);
    if (OPAL_UNLIKELY(MPI_SUCCESS != rc)) {
        OMPI_ERRHANDLER_INVOKE(mca_vprotocol_pessimist.el_comm, rc,
                               __FILE__ ": failed sending event logger handshake");
    }

    /* The received values are not used any further in this function. */
    rc = mca_pml_v.host_pml.pml_recv(connect_info, 2, MPI_UNSIGNED_LONG_LONG,
                                     0, VPROTOCOL_PESSIMIST_EVENTLOG_NEW_CLIENT_CMD,
                                     mca_vprotocol_pessimist.el_comm, MPI_STATUS_IGNORE);
    if (OPAL_UNLIKELY(MPI_SUCCESS != rc)) {
        OMPI_ERRHANDLER_INVOKE(mca_vprotocol_pessimist.el_comm, rc,
                               __FILE__ ": failed receiving event logger handshake");
    }

    return rc;
}
|
|
|
|
|
|
|
|
/**
 * Disconnect from the event logger.
 *
 * @param el_comm  communicator handed to ompi_dpm_disconnect
 *
 * @return always OMPI_SUCCESS; whatever ompi_dpm_disconnect reports is
 *         deliberately not propagated.
 */
int vprotocol_pessimist_event_logger_disconnect(ompi_communicator_t *el_comm)
{
    (void) ompi_dpm_disconnect(el_comm);

    return OMPI_SUCCESS;
}
|
2008-03-28 00:05:44 +03:00
|
|
|
|
2007-07-31 20:01:32 +04:00
|
|
|
void vprotocol_pessimist_matching_replay(int *src) {
|
2009-05-07 00:11:28 +04:00
|
|
|
#if OPAL_ENABLE_DEBUG
|
2007-07-31 20:01:32 +04:00
|
|
|
vprotocol_pessimist_clock_t max = 0;
|
|
|
|
#endif
|
|
|
|
mca_vprotocol_pessimist_event_t *event;
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2007-07-31 20:01:32 +04:00
|
|
|
/* searching this request in the event list */
|
|
|
|
for(event = (mca_vprotocol_pessimist_event_t *) opal_list_get_first(&mca_vprotocol_pessimist.replay_events);
|
|
|
|
event != (mca_vprotocol_pessimist_event_t *) opal_list_get_end(&mca_vprotocol_pessimist.replay_events);
|
|
|
|
event = (mca_vprotocol_pessimist_event_t *) opal_list_get_next(event))
|
|
|
|
{
|
|
|
|
vprotocol_pessimist_matching_event_t *mevent;
|
2015-06-24 06:59:57 +03:00
|
|
|
|
|
|
|
if(VPROTOCOL_PESSIMIST_EVENT_TYPE_MATCHING != event->type) continue;
|
2007-07-31 20:01:32 +04:00
|
|
|
mevent = &(event->u_event.e_matching);
|
|
|
|
if(mevent->reqid == mca_vprotocol_pessimist.clock)
|
|
|
|
{
|
|
|
|
/* this is the event to replay */
|
2007-07-31 21:12:21 +04:00
|
|
|
V_OUTPUT_VERBOSE(70, "pessimist: replay\tmatch\t%"PRIpclock"\trecv is forced from %d", mevent->reqid, mevent->src);
|
2007-07-31 20:01:32 +04:00
|
|
|
(*src) = mevent->src;
|
2015-06-24 06:59:57 +03:00
|
|
|
opal_list_remove_item(&mca_vprotocol_pessimist.replay_events,
|
2007-07-31 20:01:32 +04:00
|
|
|
(opal_list_item_t *) event);
|
|
|
|
VPESSIMIST_EVENT_RETURN(event);
|
2015-06-24 06:59:57 +03:00
|
|
|
}
|
2009-05-07 00:11:28 +04:00
|
|
|
#if OPAL_ENABLE_DEBUG
|
2015-06-24 06:59:57 +03:00
|
|
|
else if(mevent->reqid > max)
|
|
|
|
max = mevent->reqid;
|
2007-07-31 20:01:32 +04:00
|
|
|
}
|
|
|
|
/* not forcing a ANY SOURCE event whose recieve clock is lower than max
|
|
|
|
* is a bug indicating we have missed an event during logging ! */
|
|
|
|
assert(((*src) != MPI_ANY_SOURCE) || (mca_vprotocol_pessimist.clock > max));
|
|
|
|
#else
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2007-07-31 20:01:32 +04:00
|
|
|
void vprotocol_pessimist_delivery_replay(size_t n, ompi_request_t **reqs,
|
2015-06-24 06:59:57 +03:00
|
|
|
int *outcount, int *index,
|
2007-12-07 11:17:30 +03:00
|
|
|
ompi_status_public_t *status) {
|
2007-07-31 20:01:32 +04:00
|
|
|
mca_vprotocol_pessimist_event_t *event;
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2007-07-31 20:01:32 +04:00
|
|
|
for(event = (mca_vprotocol_pessimist_event_t *) opal_list_get_first(&mca_vprotocol_pessimist.replay_events);
|
|
|
|
event != (mca_vprotocol_pessimist_event_t *) opal_list_get_end(&mca_vprotocol_pessimist.replay_events);
|
|
|
|
event = (mca_vprotocol_pessimist_event_t *) opal_list_get_next(event))
|
2007-07-21 01:36:11 +04:00
|
|
|
{
|
2015-06-24 06:59:57 +03:00
|
|
|
vprotocol_pessimist_delivery_event_t *devent;
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2007-07-31 20:01:32 +04:00
|
|
|
if(VPROTOCOL_PESSIMIST_EVENT_TYPE_DELIVERY != event->type) continue;
|
|
|
|
devent = &(event->u_event.e_delivery);
|
|
|
|
if(devent->probeid < mca_vprotocol_pessimist.clock)
|
|
|
|
{
|
|
|
|
/* this particular test have to return no request completed yet */
|
2007-07-31 21:12:21 +04:00
|
|
|
V_OUTPUT_VERBOSE(70, "pessimist:\treplay\tdeliver\t%"PRIpclock"\tnone", mca_vprotocol_pessimist.clock);
|
2007-07-31 20:01:32 +04:00
|
|
|
*index = MPI_UNDEFINED;
|
2007-12-07 11:17:30 +03:00
|
|
|
*outcount = 0;
|
2007-07-31 20:01:32 +04:00
|
|
|
mca_vprotocol_pessimist.clock++;
|
|
|
|
/* This request have to stay in the queue until probeid matches */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
else if(devent->probeid == mca_vprotocol_pessimist.clock)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for(i = 0; i < (int) n; i++)
|
|
|
|
{
|
2008-03-28 00:05:44 +03:00
|
|
|
if(VPESSIMIST_FTREQ(reqs[i])->reqid == devent->reqid)
|
2007-07-31 20:01:32 +04:00
|
|
|
{
|
2007-07-31 21:12:21 +04:00
|
|
|
V_OUTPUT_VERBOSE(70, "pessimist:\treplay\tdeliver\t%"PRIpclock"\t%"PRIpclock, devent->probeid, devent->reqid);
|
2007-07-31 20:01:32 +04:00
|
|
|
opal_list_remove_item(&mca_vprotocol_pessimist.replay_events,
|
|
|
|
(opal_list_item_t *) event);
|
|
|
|
VPESSIMIST_EVENT_RETURN(event);
|
|
|
|
*index = i;
|
2007-12-07 11:17:30 +03:00
|
|
|
*outcount = 1;
|
2007-07-31 20:01:32 +04:00
|
|
|
mca_vprotocol_pessimist.clock++;
|
|
|
|
ompi_request_wait(&reqs[i], status);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2007-07-31 21:12:21 +04:00
|
|
|
V_OUTPUT_VERBOSE(70, "pessimist:\treplay\tdeliver\t%"PRIpclock"\tnone", mca_vprotocol_pessimist.clock);
|
2007-07-31 20:01:32 +04:00
|
|
|
assert(devent->reqid == 0); /* make sure we don't missed a request */
|
|
|
|
*index = MPI_UNDEFINED;
|
2007-12-07 11:17:30 +03:00
|
|
|
*outcount = 0;
|
2007-07-31 20:01:32 +04:00
|
|
|
mca_vprotocol_pessimist.clock++;
|
|
|
|
opal_list_remove_item(&mca_vprotocol_pessimist.replay_events,
|
|
|
|
(opal_list_item_t *) event);
|
|
|
|
VPESSIMIST_EVENT_RETURN(event);
|
|
|
|
return;
|
|
|
|
}
|
2007-07-21 01:36:11 +04:00
|
|
|
}
|
2007-07-31 21:12:21 +04:00
|
|
|
V_OUTPUT_VERBOSE(50, "pessimist:\treplay\tdeliver\t%"PRIpclock"\tnot forced", mca_vprotocol_pessimist.clock);
|
2007-07-21 01:36:11 +04:00
|
|
|
}
|