comm: detect if we run out of communicator ids (cids)
Due to a leak in the osc/rdma component, we were running out of cids in one-sided tests. This resulted in a hang instead of an error. This commit causes the nextcid algorithm to return an error if we run out of cids. cmr=v1.8.2:reviewer=jsquyres This commit was SVN r31538.
Этот коммит содержится в:
родитель
3723b39f30
Коммит
e410401523
@ -1,4 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
@ -248,6 +248,7 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_cid_lock);
|
||||
|
||||
nextlocal_cid = mca_pml.pml_max_contextid;
|
||||
for (i=start; i < mca_pml.pml_max_contextid ; i++) {
|
||||
flag = opal_pointer_array_test_and_set_item(&ompi_mpi_communicators,
|
||||
i, comm);
|
||||
@ -263,6 +264,16 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
|
||||
opal_pointer_array_set_item(&ompi_mpi_communicators, nextlocal_cid, NULL);
|
||||
goto release_and_return;
|
||||
}
|
||||
|
||||
if (mca_pml.pml_max_contextid == (unsigned int) nextcid) {
|
||||
/* at least one peer ran out of CIDs */
|
||||
if (1 == flag) {
|
||||
opal_pointer_array_set_item(&ompi_mpi_communicators, nextlocal_cid, NULL);
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto release_and_return;
|
||||
}
|
||||
}
|
||||
|
||||
if (nextcid == nextlocal_cid) {
|
||||
response = 1; /* fine with me */
|
||||
}
|
||||
@ -401,6 +412,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_cid_lock);
|
||||
|
||||
context->nextlocal_cid = mca_pml.pml_max_contextid;
|
||||
for (i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
|
||||
flag = opal_pointer_array_test_and_set_item(&ompi_mpi_communicators,
|
||||
i, context->comm);
|
||||
@ -422,6 +434,15 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
|
||||
/* at least one peer ran out of CIDs */
|
||||
if (flag) {
|
||||
opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
|
||||
}
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* next we want to verify that the resulting commid is ok */
|
||||
ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user