1
1
openmpi/ompi/mca/osc/pt2pt/osc_pt2pt_module.c
Nathan Hjelm 362ac8b87e osc/pt2pt: fix threading issues
This commit fixes a number of threading issues discovered in
osc/pt2pt. This includes:

 - Lock the synchronization object not the module in osc_pt2pt_start.
   This fixes a race between the start function and processing post
   messages.

 - Always lock before calling cond_broadcast. Fixes a race between
   the waiting thread and signaling thread.

 - Make all atomically updated values volatile.

 - Make the module lock recursive to protect against some deadlock
   conditions. Will roll this back once the locks have been
   re-designed.

 - Mark incoming complete *after* completing an accumulate not
   before. This was causing an incorrect answer under certain
   conditions.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2017-02-01 10:33:01 -07:00

116 строки
3.7 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_pt2pt.h"
/**
 * Attach hook of the pt2pt one-sided component.
 *
 * This implementation performs no work: none of the arguments are read and
 * the call always reports success.
 *
 * @param win   window the memory would be attached to (unused)
 * @param base  start of the memory region (unused)
 * @param len   length of the memory region in bytes (unused)
 *
 * @return OMPI_SUCCESS, unconditionally.
 */
int
ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
{
    /* intentionally a no-op; mark the parameters as deliberately unused */
    (void) win;
    (void) base;
    (void) len;

    return OMPI_SUCCESS;
}
/**
 * Detach hook of the pt2pt one-sided component.
 *
 * This implementation performs no work: none of the arguments are read and
 * the call always reports success.
 *
 * @param win   window the memory would be detached from (unused)
 * @param base  start of the memory region (unused)
 *
 * @return OMPI_SUCCESS, unconditionally.
 */
int ompi_osc_pt2pt_detach(struct ompi_win_t *win, const void *base)
{
    /* intentionally a no-op; mark the parameters as deliberately unused */
    (void) win;
    (void) base;

    return OMPI_SUCCESS;
}
/**
 * Destroy the pt2pt one-sided module attached to a window.
 *
 * When the window has a module, this function:
 *   - runs a barrier on the module's communicator (only if the window's group
 *     has more than one member) and removes the module from the component's
 *     module table,
 *   - clears the window's module pointer,
 *   - destructs the objects embedded in the module,
 *   - releases every peer stored in the module's peer hash,
 *   - destructs and frees the receive fragments and frees the remaining
 *     heap-allocated arrays,
 *   - frees the module's communicator (if any) and the module itself.
 *
 * @param win  window whose module is being torn down
 *
 * @return OMPI_SUCCESS in all cases. A window without a module is a no-op,
 *         and the result of the closing barrier is deliberately discarded.
 */
int ompi_osc_pt2pt_free(ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_peer_t *peer;
    uint32_t key;
    void *node;
    int ret;

    /* no module attached to this window: nothing to tear down */
    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "pt2pt component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier */
        if (ompi_group_size(win->w_group) > 1) {
            (void) module->comm->c_coll.coll_barrier (module->comm,
                                                      module->comm->c_coll.coll_barrier_module);
        }

        /* remove from component information */
        OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
                                opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
                                                                    ompi_comm_get_cid(module->comm)));
    }

    /* the window no longer refers to this module from here on */
    win->w_osc_module = NULL;

    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->locks_pending);
    OBJ_DESTRUCT(&module->locks_pending_lock);
    OBJ_DESTRUCT(&module->cond);
    OBJ_DESTRUCT(&module->lock);
    OBJ_DESTRUCT(&module->all_sync);

    /* it is erroneous to close a window with active operations on it so we should
     * probably produce an error here instead of cleaning up */
    OPAL_LIST_DESTRUCT(&module->pending_acc);
    OBJ_DESTRUCT(&module->pending_acc_lock);

    /* run the gc helper before its list and lock are destructed
     * NOTE(review): presumably the helper uses buffer_gc/gc_lock -- confirm */
    osc_pt2pt_gc_clean (module);
    OPAL_LIST_DESTRUCT(&module->buffer_gc);
    OBJ_DESTRUCT(&module->gc_lock);

    /* release every peer stored in the hash before destructing the table itself;
     * ret is only the iteration status here, not the function's return value */
    ret = opal_hash_table_get_first_key_uint32 (&module->peer_hash, &key, (void **) &peer, &node);
    while (OPAL_SUCCESS == ret) {
        OBJ_RELEASE(peer);
        ret = opal_hash_table_get_next_key_uint32 (&module->peer_hash, &key, (void **) &peer, node,
                                                   &node);
    }

    OBJ_DESTRUCT(&module->peer_hash);
    OBJ_DESTRUCT(&module->peer_lock);

    /* the guard protects the element-wise destruct loop, not the free() call */
    if (NULL != module->recv_frags) {
        for (unsigned int i = 0 ; i < module->recv_frag_count ; ++i) {
            OBJ_DESTRUCT(module->recv_frags + i);
        }

        free (module->recv_frags);
    }

    /* free(NULL) is a no-op, so no NULL guard is needed */
    free (module->epoch_outgoing_frag_count);

    if (NULL != module->comm) {
        ompi_comm_free(&module->comm);
    }

    /* free(NULL) is a no-op, so no NULL guard is needed */
    free (module->free_after);

    free (module);

    return OMPI_SUCCESS;
}