1
1
openmpi/ompi/mca/osc/pt2pt/osc_pt2pt_module.c
Nathan Hjelm 7589a25377 osc/pt2pt: do not repost receive from request callback
This commit fixes an issue that can occur if a target gets overwhelmed with
requests. This can cause osc/pt2pt to go into deep recursion with a stack
like req_complete_cb -> ompi_osc_pt2pt_callback -> start -> req_complete_cb
-> ... . At small scale this is fine as the recursion depth stays small but
at larger scale we can quickly exhaust the stack processing frag requests.
To fix the issue the request callback now simply puts the request on a
list and returns. The osc/pt2pt progress function then handles the
processing and reposting of the request.

As part of this change osc/pt2pt can now post multiple fragment receive
requests per window. This should help prevent a target from being overwhelmed.

Signed-off-by: Nathan Hjelm <hjelmn@me.com>
2016-08-11 15:33:07 -06:00

116 строки
3.7 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_pt2pt.h"
/**
 * Attach hook of the pt2pt one-sided component.
 *
 * This implementation does no work: none of the arguments are inspected
 * and success is reported unconditionally.
 *
 * @param[in] win   window the region would be attached to (unused)
 * @param[in] base  start of the region (unused)
 * @param[in] len   length of the region (unused)
 *
 * @return OMPI_SUCCESS, always
 */
int ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
{
    /* the arguments are intentionally ignored */
    (void) win;
    (void) base;
    (void) len;

    return OMPI_SUCCESS;
}
/**
 * Detach hook of the pt2pt one-sided component.
 *
 * Counterpart of ompi_osc_pt2pt_attach(). Nothing is tracked per attached
 * region in this file, so the call ignores its arguments and reports
 * success unconditionally.
 *
 * @param[in] win   window the region would be detached from (unused)
 * @param[in] base  start of the region (unused)
 *
 * @return OMPI_SUCCESS, always
 */
int ompi_osc_pt2pt_detach(struct ompi_win_t *win, const void *base)
{
    /* the arguments are intentionally ignored */
    (void) win;
    (void) base;

    return OMPI_SUCCESS;
}
/**
 * Tear down the pt2pt module attached to a window and release its storage.
 *
 * If the window has no module the call is a no-op. Otherwise, in order:
 *   1. when the module owns a communicator: log the destruction, run a
 *      barrier if the window group has more than one member (the barrier
 *      result is discarded), and remove the module from the component's
 *      module table under the component lock;
 *   2. clear the window's module pointer;
 *   3. destruct the module's embedded objects and lists, running the
 *      garbage-collection pass before the gc lists and gc lock go away;
 *   4. release every peer stored in the peer hash, then destruct the hash;
 *   5. destruct and free the receive fragments, free the remaining heap
 *      buffers, free the communicator and finally the module itself.
 *
 * The order of these steps is significant and must be preserved.
 *
 * @param[in] win  window whose module is destroyed
 *
 * @return OMPI_SUCCESS, always
 */
int ompi_osc_pt2pt_free(ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_peer_t *peer;
    uint32_t peer_key;
    void *cursor;
    int rc;

    /* no module attached to this window: nothing to tear down */
    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "pt2pt component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier (pointless for a single-member group);
         * its return code is deliberately dropped */
        if (1 < ompi_group_size(win->w_group)) {
            (void) module->comm->c_coll.coll_barrier (module->comm,
                                                      module->comm->c_coll.coll_barrier_module);
        }

        /* drop this module from the component-wide table, keyed by the
         * communicator id, while holding the component lock */
        OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
                                opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
                                                                    ompi_comm_get_cid(module->comm)));
    }

    /* the window no longer refers to this module */
    win->w_osc_module = NULL;

    /* lock bookkeeping and synchronization objects */
    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->locks_pending);
    OBJ_DESTRUCT(&module->locks_pending_lock);
    OBJ_DESTRUCT(&module->cond);
    OBJ_DESTRUCT(&module->lock);
    OBJ_DESTRUCT(&module->all_sync);

    /* it is erroneous to close a window with active operations on it so we
     * should probably produce an error here instead of cleaning up */
    OPAL_LIST_DESTRUCT(&module->pending_acc);

    /* run the garbage-collection pass first, then destroy the gc lists and
     * the lock associated with them */
    osc_pt2pt_gc_clean (module);
    OPAL_LIST_DESTRUCT(&module->request_gc);
    OPAL_LIST_DESTRUCT(&module->buffer_gc);
    OBJ_DESTRUCT(&module->gc_lock);

    /* walk the peer hash and drop the reference held on every entry; the
     * walk ends as soon as an iterator call reports anything other than
     * OPAL_SUCCESS */
    for (rc = opal_hash_table_get_first_key_uint32 (&module->peer_hash, &peer_key,
                                                    (void **) &peer, &cursor) ;
         OPAL_SUCCESS == rc ;
         rc = opal_hash_table_get_next_key_uint32 (&module->peer_hash, &peer_key,
                                                   (void **) &peer, cursor, &cursor)) {
        OBJ_RELEASE(peer);
    }

    OBJ_DESTRUCT(&module->peer_hash);
    OBJ_DESTRUCT(&module->peer_lock);

    /* the receive fragments live in one array: destruct each element before
     * releasing the array itself */
    if (NULL != module->recv_frags) {
        for (int i = 0 ; i < module->recv_frag_count ; ++i) {
            OBJ_DESTRUCT(&module->recv_frags[i]);
        }

        free (module->recv_frags);
    }

    /* NOTE(review): the NULL checks before free() are kept exactly as in the
     * original; confirm how the project's allocator wrappers treat NULL
     * before simplifying them */
    if (NULL != module->epoch_outgoing_frag_count) {
        free (module->epoch_outgoing_frag_count);
    }

    if (NULL != module->comm) {
        ompi_comm_free (&module->comm);
    }

    if (NULL != module->free_after) {
        free (module->free_after);
    }

    free (module);

    return OMPI_SUCCESS;
}