
Merge pull request #1987 from hjelmn/cid

comm/cid: fix threaded CID allocation
This commit is contained in:
Nathan Hjelm 2016-08-19 14:26:39 -06:00 committed by GitHub
parent bb6b87f086 fbbf743c36
commit f3e9a72f1a


@@ -181,6 +181,7 @@ static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t
     context->newcomm = newcomm;
     context->comm = comm;
     context->bridgecomm = bridgecomm;
+    context->pml_tag = 0;
 
     /* Determine which implementation of allreduce we have to use
      * for the current mode. */
@@ -245,8 +246,8 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request);
 static int ompi_comm_checkcid (ompi_comm_request_t *request);
 /* verify that the cid was available globally */
 static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request);
-/* lock the cid generator */
-static int ompi_comm_cid_lock (ompi_comm_request_t *request);
+static volatile int64_t ompi_comm_cid_lowest_id = INT64_MAX;
 
 int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
                           ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1,
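
Note on the hunk above: the forward declaration of ompi_comm_cid_lock (and its "lock the cid generator" step) is replaced by a global ompi_comm_cid_lowest_id that records the lowest-ordered CID allocation currently in flight. Each allocation computes a 64-bit ordering key (the parent communicator's CID in the high bits, the pml_tag in the low bits, as the next hunks show), and only the holder of the lowest key may keep the generator; everyone else backs off and retries. A minimal sketch of that entry protocol, assuming pthread_mutex_trylock as a stand-in for OPAL_THREAD_TRYLOCK and simplified names for ompi_cid_lock / ompi_comm_cid_lowest_id:

    #include <stdint.h>
    #include <stdbool.h>
    #include <pthread.h>

    /* hypothetical stand-ins for ompi_cid_lock / ompi_comm_cid_lowest_id */
    static pthread_mutex_t cid_lock = PTHREAD_MUTEX_INITIALIZER;
    static volatile int64_t lowest_id_in_flight = INT64_MAX;

    /* Returns true if the caller now holds the generator and may allocate.
     * false means "back off and reschedule", which the diff implements by
     * re-appending ompi_comm_allreduce_getnextcid to the request schedule. */
    static bool cid_try_enter (int64_t my_id)
    {
        if (0 != pthread_mutex_trylock (&cid_lock)) {
            return false;                  /* generator busy: retry later */
        }
        if (lowest_id_in_flight < my_id) { /* a lower-ordered allocation is in
                                            * flight; yield so it can finish */
            pthread_mutex_unlock (&cid_lock);
            return false;
        }
        lowest_id_in_flight = my_id;       /* we are (now) the lowest */
        return true;                       /* caller must unlock when done */
    }

Because every thread agrees on the key order, the allocation holding the lowest key always makes progress, which is what fixes the threaded-allocation deadlock this PR addresses.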
@@ -271,7 +272,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
     request->context = &context->super;
 
-    ompi_comm_request_schedule_append (request, ompi_comm_cid_lock, NULL, 0);
+    ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
     ompi_comm_request_start (request);
 
     *req = &request->super;
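
With the lock step gone, ompi_comm_nextcid_nb schedules ompi_comm_allreduce_getnextcid directly, and each callback in the chain now opens with a trylock that re-appends itself on failure instead of blocking inside the progress engine. A toy model of this self-rescheduling pattern (request_t, schedule_append, and resource_busy are simplified stand-ins, not the Open MPI API):

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct request request_t;
    typedef int (*step_fn) (request_t *);

    struct request {
        step_fn next;   /* next step to run when the request is progressed */
    };

    static int schedule_append (request_t *req, step_fn fn)
    {
        req->next = fn; /* the real code appends to a list; one slot suffices */
        return 0;
    }

    static bool resource_busy = true;  /* pretend the CID generator is taken */

    static int get_next_cid (request_t *req)
    {
        if (resource_busy) {
            /* cannot block in a progress callback: try again next round */
            return schedule_append (req, get_next_cid);
        }
        printf ("acquired the generator, allocating...\n");
        return 0;
    }

    int main (void)
    {
        request_t req;
        schedule_append (&req, get_next_cid);
        req.next (&req);        /* first progress pass: busy, reschedules */
        resource_busy = false;
        req.next (&req);        /* second pass: proceeds */
        return 0;
    }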
@@ -299,30 +300,33 @@ int ompi_comm_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
     return rc;
 }
 
-static int ompi_comm_cid_lock (ompi_comm_request_t *request)
-{
-    if (!OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
-        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
-    }
-
-    return ompi_comm_request_schedule_append (request, ompi_comm_cid_lock, NULL, 0);
-}
-
 static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
+    int64_t my_id = ((int64_t) ompi_comm_get_cid (context->comm) << 32 | context->pml_tag);
     ompi_request_t *subreq;
     bool flag;
     int ret;
 
+    if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
+        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
+    }
+
+    if (ompi_comm_cid_lowest_id < my_id) {
+        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
+    }
+
+    ompi_comm_cid_lowest_id = my_id;
+
     /**
      * This is the real algorithm described in the doc
      */
     flag = false;
     context->nextlocal_cid = mca_pml.pml_max_contextid;
     for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
-        flag = opal_pointer_array_test_and_set_item(&ompi_mpi_communicators,
-                                                    i, context->comm);
+        flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                     context->comm);
         if (true == flag) {
             context->nextlocal_cid = i;
             break;
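
The loop above reserves the lowest free slot in ompi_mpi_communicators at or after context->start by atomically test-and-setting it; if the scan runs off the end, nextlocal_cid keeps its sentinel value mca_pml.pml_max_contextid, which the error path below turns into OMPI_ERR_OUT_OF_RESOURCE. A simplified, runnable model of the scan (a plain array and a non-thread-safe test-and-set stand in for opal_pointer_array_test_and_set_item):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MAX_CID 16U   /* stands in for mca_pml.pml_max_contextid */

    static void *communicators[MAX_CID];  /* models ompi_mpi_communicators */

    /* The real routine is thread safe (compare-and-swap); this is not. */
    static bool test_and_set_item (size_t i, void *value)
    {
        if (NULL != communicators[i]) {
            return false;          /* slot already taken */
        }
        communicators[i] = value;
        return true;
    }

    /* Returns the first free CID at or after 'start', or MAX_CID if none. */
    static unsigned int scan_next_local_cid (unsigned int start, void *comm)
    {
        for (unsigned int i = start; i < MAX_CID; ++i) {
            if (test_and_set_item (i, comm)) {
                return i;          /* reserved: our local candidate */
            }
        }
        return MAX_CID;            /* no CID available locally */
    }

    int main (void)
    {
        int dummy_comm;
        communicators[0] = &dummy_comm;   /* cid 0 is already in use */
        printf ("candidate cid: %u\n", scan_next_local_cid (0, &dummy_comm));
        return 0;
    }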
@@ -332,6 +336,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX,
                                  context, &subreq);
     if (OMPI_SUCCESS != ret) {
+        ompi_comm_cid_lowest_id = INT64_MAX;
         OPAL_THREAD_UNLOCK(&ompi_cid_lock);
         return ret;
     }
@@ -341,10 +346,12 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
         if (flag) {
             opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
         }
 
+        ompi_comm_cid_lowest_id = INT64_MAX;
         OPAL_THREAD_UNLOCK(&ompi_cid_lock);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
 
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+
     /* next we want to verify that the resulting commid is ok */
     return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
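
For context (a hedged reading of the surrounding code, which this hunk only touches at its edges): every process proposes its lowest locally free CID in nextlocal_cid, and the nonblocking MPI_MAX allreduce turns the proposals into one candidate in nextcid. The candidate is usable only if it is free on every process, hence the ompi_comm_checkcid step that follows. A small worked example:

    #include <stdio.h>

    /* Three ranks propose their lowest locally free CIDs; the candidate is
     * the MPI_MAX reduction of the proposals (modeled by a plain loop). */
    int main (void)
    {
        int proposals[3] = { 3, 5, 3 };   /* nextlocal_cid on ranks 0..2 */
        int candidate = proposals[0];

        for (int r = 1; r < 3; ++r) {
            if (proposals[r] > candidate) {
                candidate = proposals[r];
            }
        }
        printf ("agreed candidate cid: %d\n", candidate);   /* prints 5 */

        /* ranks 0 and 2 proposed 3, so (nextcid == nextlocal_cid) fails
         * there; they must still reserve cid 5 locally before all ranks
         * can declare success in the nextcid_check_flag step */
        return 0;
    }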
@@ -356,6 +363,10 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
     ompi_request_t *subreq;
     int ret;
 
+    if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
+        return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
+    }
+
     context->flag = (context->nextcid == context->nextlocal_cid);
 
     if (!context->flag) {
@@ -372,6 +383,8 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
         ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1);
     }
 
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+
     return ret;
 }
@@ -379,12 +392,17 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
 
+    if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
+        return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
+    }
+
     if (1 == context->rflag) {
         /* set the according values to the newcomm */
         context->newcomm->c_contextid = context->nextcid;
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm);
 
         /* unlock the cid generator */
+        ompi_comm_cid_lowest_id = INT64_MAX;
         OPAL_THREAD_UNLOCK(&ompi_cid_lock);
 
         /* done! */
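
Every exit path in this diff now pairs the unlock with a reset of ompi_comm_cid_lowest_id to INT64_MAX, on success and failure alike, so that the next-lowest pending allocation can claim the generator. A sketch of that release step, with the same simplified stand-ins as the entry sketch above:

    #include <stdint.h>
    #include <pthread.h>

    /* hypothetical stand-ins for ompi_cid_lock / ompi_comm_cid_lowest_id */
    static pthread_mutex_t cid_lock = PTHREAD_MUTEX_INITIALIZER;
    static volatile int64_t lowest_id_in_flight = INT64_MAX;

    /* Clear the in-flight marker first, then drop the lock, matching the
     * order used at every unlock site in the diff. */
    static void cid_release (void)
    {
        lowest_id_in_flight = INT64_MAX;
        pthread_mutex_unlock (&cid_lock);
    }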
@@ -399,6 +417,8 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
     ++context->iter;
 
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+
     /* try again */
     return ompi_comm_allreduce_getnextcid (request);
 }