From edfbeba7bf0a54e6ee59692b8bd13f2ea08a1aa9 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 1 Sep 2014 10:00:49 +0000 Subject: [PATCH] coll/ml: better error handling. When CHECK_AND_RECYCLE detects an error, a message is displayed. If the error occurs on an intrinsic communicator, the program then aborts (instead of trying to free the communicator). cmr=v1.8.3:reviewer=hjelmn This commit was SVN r32659. --- ompi/mca/coll/ml/coll_ml_colls.h | 18 +++++++++++++++--- ompi/mca/coll/ml/help-mpi-coll-ml.txt | 14 ++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/ompi/mca/coll/ml/coll_ml_colls.h b/ompi/mca/coll/ml/coll_ml_colls.h index f5f8b5d822..0b8519ee20 100644 --- a/ompi/mca/coll/ml/coll_ml_colls.h +++ b/ompi/mca/coll/ml/coll_ml_colls.h @@ -4,6 +4,8 @@ * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * Copyright (c) 2014 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -424,9 +426,19 @@ do { /* release potentially may trigger ML module distraction and having */ \ /* the element not on the list may cause memory leak. 
*/ \ if (OPAL_UNLIKELY(is_coll_sync)) { \ - OBJ_RELEASE(comm); \ - /* After this point it is UNSAFE to touch ml module */ \ - /* or communicator */ \ + if (OMPI_COMM_IS_INTRINSIC(comm)) { \ + opal_show_help("help-mpi-coll-ml.txt", \ + "coll-ml-check-fatal-error", true, \ + comm->c_name); \ + ompi_mpi_abort(comm, 6); \ + } else { \ + opal_show_help("help-mpi-coll-ml.txt", \ + "coll-ml-check-error", true, \ + comm->c_name); \ + /* After this point it is UNSAFE to touch ml module */ \ + /* or communicator */ \ + OBJ_RELEASE(comm); \ + } \ } \ } \ } while (0) diff --git a/ompi/mca/coll/ml/help-mpi-coll-ml.txt b/ompi/mca/coll/ml/help-mpi-coll-ml.txt index 613ea83b15..874516f7ac 100644 --- a/ompi/mca/coll/ml/help-mpi-coll-ml.txt +++ b/ompi/mca/coll/ml/help-mpi-coll-ml.txt @@ -1,6 +1,8 @@ # -*- text -*- # # Copyright (c) 2009-2014 Oak Ridge National Laboratory. All rights reserved. +# Copyright (c) 2014 Research Organization for Information Science +# and Technology (RIST). All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -48,3 +50,15 @@ zero copy method. ML could not be used because the mca param coll_ml_bcast_algorithm was not set to static and other broadcast implementation was available. + +[coll-ml-check-error] + +ML detected an error on communicator %s + +This communicator cannot be used any more + +[coll-ml-check-fatal-error] + +ML detected an unrecoverable error on intrinsic communicator %s + +The program will now abort