Fix a number of issues in the new one-sided code.
- Fix several typos in osc/rdma. - Fix a locking issue in osc/sm that was caused by an incorrect assumption about the semantics of opal_atomic_add_32. - Always unlock the accumulation lock in osc/sm. - The base of a process's shared memory window should be NULL if the size is zero. Fixed. cmr=v1.7.5:ticket=trac:4304 This commit was SVN r30853. The following Trac tickets were found above: Ticket 4304 --> https://svn.open-mpi.org/trac/ompi/ticket/4304
Этот коммит содержится в:
родитель
4e282a3295
Коммит
30b61a3333
@ -152,6 +152,9 @@ typedef struct udcm_module {
|
||||
|
||||
/* This module's modex message */
|
||||
modex_msg_t modex;
|
||||
|
||||
/** The channel is being monitored */
|
||||
bool channel_monitored;
|
||||
} udcm_module_t;
|
||||
|
||||
/*
|
||||
@ -401,7 +404,7 @@ static void udcm_component_register(void)
|
||||
0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&udcm_timeout);
|
||||
|
||||
udcm_max_retry = 10;
|
||||
udcm_max_retry = 25;
|
||||
(void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
|
||||
"connect_udcm_max_retry", "Maximum number of times "
|
||||
"to retry sending a udcm connection message",
|
||||
@ -607,6 +610,7 @@ static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl)
|
||||
/* Monitor the fd associated with the completion channel */
|
||||
ompi_btl_openib_fd_monitor(m->cm_channel->fd, OPAL_EV_READ,
|
||||
udcm_cq_event_dispatch, m);
|
||||
m->channel_monitored = true;
|
||||
|
||||
OBJ_CONSTRUCT(&m->cm_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&m->cm_send_lock, opal_mutex_t);
|
||||
@ -705,11 +709,13 @@ static int udcm_module_finalize(mca_btl_openib_module_t *btl,
|
||||
|
||||
m->cm_exiting = true;
|
||||
|
||||
/* stop monitoring the channel's fd before destroying the listen qp */
|
||||
ompi_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, (void *)&barrier);
|
||||
if (m->channel_monitored) {
|
||||
/* stop monitoring the channel's fd before destroying the listen qp */
|
||||
ompi_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, (void *)&barrier);
|
||||
|
||||
while (0 == barrier) {
|
||||
sched_yield();
|
||||
while (0 == barrier) {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
|
||||
opal_mutex_lock (&m->cm_lock);
|
||||
|
@ -103,7 +103,7 @@ typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t;
|
||||
struct ompi_osc_rdma_header_get_acc_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
int16_t tag;
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
@ -114,7 +114,7 @@ typedef struct ompi_osc_rdma_header_get_acc_t ompi_osc_rdma_header_get_acc_t;
|
||||
struct ompi_osc_rdma_header_cswap_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
int16_t tag;
|
||||
uint16_t tag;
|
||||
|
||||
uint32_t len;
|
||||
uint64_t displacement;
|
||||
|
@ -113,7 +113,7 @@ static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_
|
||||
module->shared_count++;
|
||||
}
|
||||
|
||||
lock->lock_acks_received = 1;
|
||||
lock->lock_acks_received++;
|
||||
} else {
|
||||
/* queue the lock */
|
||||
queue_lock (module, my_rank, lock->type, lock->serial_number);
|
||||
@ -415,7 +415,7 @@ int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
|
||||
|
||||
/* reset all fragment counters */
|
||||
memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
|
||||
memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * module->passive_eager_send_active[0]);
|
||||
memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));
|
||||
|
||||
opal_list_remove_item (&module->outstanding_locks, &lock->super);
|
||||
OBJ_RELEASE(lock);
|
||||
@ -477,6 +477,12 @@ static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rd
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_target (module, i);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
flush_req.frag_count = module->epoch_outgoing_frag_count[target];
|
||||
@ -486,12 +492,12 @@ static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rd
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_all (module);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_target (module, target);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* wait for all the requests and the flush ack (meaning remote completion) */
|
||||
@ -521,9 +527,7 @@ int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
|
||||
"ompi_osc_rdma_flush starting..."));
|
||||
|
||||
if (ompi_comm_rank (module->comm) == target) {
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"calling opal_progress. incoming complete = %d",
|
||||
module->frag_request->req_complete));
|
||||
/* nothing to flush */
|
||||
opal_progress ();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -665,7 +669,7 @@ static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
|
||||
}
|
||||
}
|
||||
|
||||
lock->lock_acks_received = 1;
|
||||
lock->lock_acks_received++;
|
||||
opal_condition_broadcast (&module->cond);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -198,8 +198,8 @@ ompi_osc_sm_rget_accumulate(void *origin_addr,
|
||||
if (OMPI_SUCCESS != ret || op == &ompi_mpi_op_no_op.op) goto done;
|
||||
|
||||
if (op == &ompi_mpi_op_replace.op) {
|
||||
return ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt);
|
||||
ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt);
|
||||
} else {
|
||||
ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt,
|
||||
@ -354,8 +354,8 @@ ompi_osc_sm_get_accumulate(void *origin_addr,
|
||||
if (OMPI_SUCCESS != ret || op == &ompi_mpi_op_no_op.op) goto done;
|
||||
|
||||
if (op == &ompi_mpi_op_replace.op) {
|
||||
return ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt);
|
||||
ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt);
|
||||
} else {
|
||||
ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt,
|
||||
remote_address, target_count, target_dt,
|
||||
|
@ -291,8 +291,12 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
|
||||
for (i = 0, total = state_size ; i < ompi_comm_size(comm) ; ++i) {
|
||||
module->sizes[i] = rbuf[i];
|
||||
module->bases[i] = ((char *) module->segment_base) + total;
|
||||
total += rbuf[i];
|
||||
if (module->sizes[i]) {
|
||||
module->bases[i] = ((char *) module->segment_base) + total;
|
||||
total += rbuf[i];
|
||||
} else {
|
||||
module->bases[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
free(rbuf);
|
||||
|
@ -1,5 +1,8 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -22,8 +25,10 @@ lk_fetch_add32(ompi_osc_sm_module_t *module,
|
||||
size_t offset,
|
||||
uint32_t delta)
|
||||
{
|
||||
/* opal_atomic_add_32 is an add then fetch so delta needs to be subtracted out to get the
|
||||
* old value */
|
||||
return opal_atomic_add_32((int32_t*) ((char*) &module->node_states[target].lock + offset),
|
||||
delta);
|
||||
delta) - delta;
|
||||
}
|
||||
|
||||
|
||||
@ -149,8 +154,10 @@ ompi_osc_sm_unlock(int target,
|
||||
ret = OMPI_SUCCESS;
|
||||
} else if (module->outstanding_locks[target] == lock_exclusive) {
|
||||
ret = end_exclusive(module, target);
|
||||
module->outstanding_locks[target] = lock_none;
|
||||
} else if (module->outstanding_locks[target] == lock_shared) {
|
||||
ret = end_shared(module, target);
|
||||
module->outstanding_locks[target] = lock_none;
|
||||
} else {
|
||||
ret = MPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user