From 9c788ff9400960823637dc0eebb3a8640414fa05 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 25 Sep 2014 17:43:29 +0000 Subject: [PATCH] coll/basic: fix segmentation fault in neighborhood collectives if the degree of the topology is higher than the communicator size It is possible to have a topology degree higher than the size of the communicator. For example, a periodic cartesian communicator on MPI_COMM_SELF. This will leave the neighborhood collectives with a request buffer that is too small. This commit adds a call that will dynamically increase the size of the request buffer if it is too small. A better fix would be to create the topology *before* calling the coll_select routine on a communicator. This will take some discussion and the solution will not likely be ready anytime soon. Thanks to Lisandro Dalcin for reporting this. Original thread: http://www.open-mpi.org/community/lists/devel/2014/08/15713.php cmr=v1.8.3:reviewer=jsquyres This commit was SVN r32796. --- ompi/mca/coll/basic/coll_basic.h | 21 ++++++++++++++++++- ompi/mca/coll/basic/coll_basic_module.c | 13 +++++++----- .../basic/coll_basic_neighbor_allgather.c | 20 +++++++++++++++++- .../basic/coll_basic_neighbor_allgatherv.c | 20 +++++++++++++++++- .../coll/basic/coll_basic_neighbor_alltoall.c | 20 +++++++++++++++++- .../basic/coll_basic_neighbor_alltoallv.c | 20 +++++++++++++++++- .../basic/coll_basic_neighbor_alltoallw.c | 19 ++++++++++++++++- 7 files changed, 122 insertions(+), 11 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic.h b/ompi/mca/coll/basic/coll_basic.h index ca0b6d558f..425dd3066d 100644 --- a/ompi/mca/coll/basic/coll_basic.h +++ b/ompi/mca/coll/basic/coll_basic.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. 
All rights * reserved. * $COPYRIGHT$ * @@ -354,6 +354,25 @@ struct mca_coll_basic_module_t { int mccb_num_reqs; }; typedef struct mca_coll_basic_module_t mca_coll_basic_module_t; + +static inline int mca_coll_basic_check_for_requests (mca_coll_basic_module_t *basic_module, int max_reqs) +{ + if (basic_module->mccb_num_reqs < max_reqs) { + void *tmp; + + basic_module->mccb_num_reqs = max_reqs; + + tmp = realloc (basic_module->mccb_reqs, sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); + if (NULL == tmp) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + basic_module->mccb_reqs = tmp; + } + + return OMPI_SUCCESS; +} + OBJ_CLASS_DECLARATION(mca_coll_basic_module_t); END_C_DECLS diff --git a/ompi/mca/coll/basic/coll_basic_module.c b/ompi/mca/coll/basic/coll_basic_module.c index 8bb929a1f1..cb9b0d769b 100644 --- a/ompi/mca/coll/basic/coll_basic_module.c +++ b/ompi/mca/coll/basic/coll_basic_module.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -55,7 +55,7 @@ mca_coll_base_module_t * mca_coll_basic_comm_query(struct ompi_communicator_t *comm, int *priority) { - int size; + int size, ret; mca_coll_basic_module_t *basic_module; basic_module = OBJ_NEW(mca_coll_basic_module_t); @@ -70,9 +70,12 @@ mca_coll_basic_comm_query(struct ompi_communicator_t *comm, } else { size = ompi_comm_size(comm); } - basic_module->mccb_num_reqs = size * 2; - basic_module->mccb_reqs = (ompi_request_t**) - malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); + + ret = mca_coll_basic_check_for_requests (basic_module, size * 2); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(basic_module); + return NULL; + } /* Choose whether to use [intra|inter], and [linear|log]-based * algorithms. 
*/ diff --git a/ompi/mca/coll/basic/coll_basic_neighbor_allgather.c b/ompi/mca/coll/basic/coll_basic_neighbor_allgather.c index 8d8242dc9b..31aba0d4bf 100644 --- a/ompi/mca/coll/basic/coll_basic_neighbor_allgather.c +++ b/ompi/mca/coll/basic/coll_basic_neighbor_allgather.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -49,6 +49,12 @@ mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount, ptrdiff_t lb, extent; int rc = MPI_SUCCESS, dim, nreqs; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); + if (OMPI_SUCCESS != rc) { + return rc; + } + ompi_datatype_get_extent(rdtype, &lb, &extent); /* The ordering is defined as -1 then +1 in each dimension in @@ -126,6 +132,12 @@ mca_coll_basic_neighbor_allgather_graph(const void *sbuf, int scount, mca_topo_base_graph_neighbors_count (comm, rank, &degree); + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, degree * 2); + if (OMPI_SUCCESS != rc) { + return rc; + } + edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; @@ -173,6 +185,12 @@ mca_coll_basic_neighbor_allgather_dist_graph(const void *sbuf, int scount, indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, indegree + outdegree); + if (OMPI_SUCCESS != rc) { + return rc; + } + inedges = dist_graph->in; outedges = dist_graph->out; diff --git a/ompi/mca/coll/basic/coll_basic_neighbor_allgatherv.c 
b/ompi/mca/coll/basic/coll_basic_neighbor_allgatherv.c index cdcf91de95..89e4d584af 100644 --- a/ompi/mca/coll/basic/coll_basic_neighbor_allgatherv.c +++ b/ompi/mca/coll/basic/coll_basic_neighbor_allgatherv.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -48,6 +48,12 @@ mca_coll_basic_neighbor_allgatherv_cart(const void *sbuf, int scount, struct omp ptrdiff_t lb, extent; int rc = MPI_SUCCESS, dim, i, nreqs; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); + if (OMPI_SUCCESS != rc) { + return rc; + } + ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = basic_module->mccb_reqs; @@ -113,6 +119,12 @@ mca_coll_basic_neighbor_allgatherv_graph(const void *sbuf, int scount, struct om mca_topo_base_graph_neighbors_count (comm, rank, &degree); + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, degree * 2); + if (OMPI_SUCCESS != rc) { + return rc; + } + edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; @@ -158,6 +170,12 @@ mca_coll_basic_neighbor_allgatherv_dist_graph(const void *sbuf, int scount, stru indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, indegree + outdegree); + if (OMPI_SUCCESS != rc) { + return rc; + } + inedges = dist_graph->in; outedges = dist_graph->out; diff --git a/ompi/mca/coll/basic/coll_basic_neighbor_alltoall.c b/ompi/mca/coll/basic/coll_basic_neighbor_alltoall.c index 289f60acbc..4517fd8f4e 100644 
--- a/ompi/mca/coll/basic/coll_basic_neighbor_alltoall.c +++ b/ompi/mca/coll/basic/coll_basic_neighbor_alltoall.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -47,6 +47,12 @@ mca_coll_basic_neighbor_alltoall_cart(const void *sbuf, int scount, struct ompi_ ptrdiff_t lb, rdextent, sdextent; int rc = MPI_SUCCESS, dim, nreqs; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); + if (OMPI_SUCCESS != rc) { + return rc; + } + ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); @@ -143,6 +149,12 @@ mca_coll_basic_neighbor_alltoall_graph(const void *sbuf, int scount, struct ompi mca_topo_base_graph_neighbors_count (comm, rank, &degree); + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, degree * 2); + if (OMPI_SUCCESS != rc) { + return rc; + } + edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; @@ -193,6 +205,12 @@ mca_coll_basic_neighbor_alltoall_dist_graph(const void *sbuf, int scount,struct indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, indegree + outdegree); + if (OMPI_SUCCESS != rc) { + return rc; + } + inedges = dist_graph->in; outedges = dist_graph->out; diff --git a/ompi/mca/coll/basic/coll_basic_neighbor_alltoallv.c b/ompi/mca/coll/basic/coll_basic_neighbor_alltoallv.c index 9ace900662..3c909968cc 100644 --- a/ompi/mca/coll/basic/coll_basic_neighbor_alltoallv.c +++ 
b/ompi/mca/coll/basic/coll_basic_neighbor_alltoallv.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -48,6 +48,12 @@ mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], co ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); + if (OMPI_SUCCESS != rc) { + return rc; + } + ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); @@ -130,6 +136,12 @@ mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], c mca_topo_base_graph_neighbors_count (comm, rank, &degree); + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, 2 * degree); + if (OMPI_SUCCESS != rc) { + return rc; + } + edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; @@ -183,6 +195,12 @@ mca_coll_basic_neighbor_alltoallv_dist_graph(const void *sbuf, const int scounts indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, indegree + outdegree); + if (OMPI_SUCCESS != rc) { + return rc; + } + inedges = dist_graph->in; outedges = dist_graph->out; diff --git a/ompi/mca/coll/basic/coll_basic_neighbor_alltoallw.c b/ompi/mca/coll/basic/coll_basic_neighbor_alltoallw.c index 28ecf04cbb..dcf3592250 100644 --- a/ompi/mca/coll/basic/coll_basic_neighbor_alltoallw.c +++ b/ompi/mca/coll/basic/coll_basic_neighbor_alltoallw.c @@ -10,7 +10,7 @@ * 
University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -47,6 +47,11 @@ mca_coll_basic_neighbor_alltoallw_cart(const void *sbuf, const int scounts[], co int rc = MPI_SUCCESS, dim, i, nreqs; ompi_request_t **reqs; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); + if (OMPI_SUCCESS != rc) { + return rc; + } /* post receives first */ for (dim = 0, i = 0, nreqs = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) { @@ -126,6 +131,12 @@ mca_coll_basic_neighbor_alltoallw_graph(const void *sbuf, const int scounts[], c mca_topo_base_graph_neighbors_count (comm, rank, &degree); + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, 2 * degree); + if (OMPI_SUCCESS != rc) { + return rc; + } + edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; @@ -175,6 +186,12 @@ mca_coll_basic_neighbor_alltoallw_dist_graph(const void *sbuf, const int scounts indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; + /* ensure we have enough storage for requests */ + rc = mca_coll_basic_check_for_requests (basic_module, indegree + outdegree); + if (OMPI_SUCCESS != rc) { + return rc; + } + inedges = dist_graph->in; outedges = dist_graph->out;