1
1

btl/openib: fix locking bugs with XRC ib_addr lock

This bug fixes two issue with the ib_addr lock:

 - The ib_addr lock must always be obtained regardless of
   opal_using_threads() as the CPC is run in a seperate thread.

 - The ib_addr lock is held in mca_btl_openib_endpoint_connected when
   calling back into the CPC start_connect on any pending
   connections. This will attempt to obtain the ib_addr lock
   again. Since this is not a performance-critical part of the code
   the lock has been changed to be recursive.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-02-18 15:55:34 -07:00
родитель 4dc73d7765
Коммит 371df45bf8
2 изменённых файлов: 10 добавлений и 4 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
@ -579,7 +579,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
opal_output(-1, "Now we are CONNECTED");
if (MCA_BTL_XRC_ENABLED) {
OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
opal_mutex_lock (&endpoint->ib_addr->addr_lock);
if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
/* We are not xrc master */
/* set our qp pointer to master qp */
@ -622,7 +622,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
}
}
}
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
opal_mutex_unlock (&endpoint->ib_addr->addr_lock);
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
@ -5,6 +6,8 @@
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Bull SAS. All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -122,7 +125,10 @@ static void ib_address_constructor(ib_address_t *ib_addr)
ib_addr->lid = 0;
ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
ib_addr->qp = NULL;
OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t);
/* NTH: make the addr_lock recursive because mca_btl_openib_endpoint_connected can call
* into the CPC with the lock held. The alternative would be to drop the lock but the
* lock is never obtained in a critical path. */
OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_recursive_mutex_t);
OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t);
}