1
1

comm: detect if we run out of communicator ids (cids)

Due to a leak in the osc/rdma component we were running out of cids on
a one-sided tests. This resulted in a hang instead of an error. This
commit causes the nextcid algorithm to return an error if we run out
of cids.

cmr=v1.8.2:reviewer=jsquyres

This commit was SVN r31538.
Этот коммит содержится в:
Nathan Hjelm 2014-04-28 19:55:09 +00:00
родитель 3723b39f30
Коммит e410401523

Просмотреть файл

@ -1,4 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -14,7 +14,7 @@
* Copyright (c) 2007 Voltaire All rights reserved.
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
@ -248,6 +248,7 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
}
OPAL_THREAD_UNLOCK(&ompi_cid_lock);
nextlocal_cid = mca_pml.pml_max_contextid;
for (i=start; i < mca_pml.pml_max_contextid ; i++) {
flag = opal_pointer_array_test_and_set_item(&ompi_mpi_communicators,
i, comm);
@ -263,6 +264,16 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
opal_pointer_array_set_item(&ompi_mpi_communicators, nextlocal_cid, NULL);
goto release_and_return;
}
if (mca_pml.pml_max_contextid == (unsigned int) nextcid) {
/* at least one peer ran out of CIDs */
if (1 == flag) {
opal_pointer_array_set_item(&ompi_mpi_communicators, nextlocal_cid, NULL);
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto release_and_return;
}
}
if (nextcid == nextlocal_cid) {
response = 1; /* fine with me */
}
@ -401,6 +412,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
}
OPAL_THREAD_UNLOCK(&ompi_cid_lock);
context->nextlocal_cid = mca_pml.pml_max_contextid;
for (i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
flag = opal_pointer_array_test_and_set_item(&ompi_mpi_communicators,
i, context->comm);
@ -422,6 +434,15 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
return ret;
}
if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
/* at least one peer ran out of CIDs */
if (flag) {
opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
}
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* next we want to verify that the resulting commid is ok */
ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);