Simplify the bcast code by taking abstract actions and making them
macros -- will help with the other algorithms. This commit was SVN r7214.
Этот коммит содержится в:
родитель
3e002203a0
Коммит
9302f924ea
@ -284,7 +284,6 @@ extern "C" {
|
||||
*/
|
||||
extern mca_coll_sm_component_t mca_coll_sm_component;
|
||||
|
||||
|
||||
/*
|
||||
* coll module functions
|
||||
*/
|
||||
@ -385,4 +384,109 @@ extern "C" {
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Global variables used in the macros (essentially constants, so
|
||||
* these are thread safe)
|
||||
*/
|
||||
extern uint32_t mca_coll_sm_iov_size;
|
||||
extern int32_t mca_coll_sm_bogus_free_after;
|
||||
|
||||
|
||||
/**
 * Macro to setup flag usage: point (flag) at in-use flag number
 * (flag_num) of the communicator data (data).
 *
 * The flag is located (flag_num) slots past (data)->mcb_in_use_flags,
 * where each slot is mca_coll_sm_component.sm_control_size bytes wide.
 *
 * @param flag_num  index of the in-use flag to select
 * @param flag      lvalue that receives the mca_coll_sm_in_use_flag_t*
 * @param data      pointer to a structure with an mcb_in_use_flags member
 *
 * The body is wrapped in do/while(0) so that the macro plus the
 * caller's trailing semicolon is exactly one statement (safe inside
 * an un-braced if/else).
 */
#define FLAG_SETUP(flag_num, flag, data)                                  \
    do {                                                                  \
        (flag) = (mca_coll_sm_in_use_flag_t*)                             \
            (((char *) (data)->mcb_in_use_flags) +                        \
             ((flag_num) * mca_coll_sm_component.sm_control_size));       \
    } while (0)
|
||||
|
||||
/**
 * Macro to wait for the in-use flag to become idle (used by the root).
 *
 * Spins until (flag)->mcsiuf_num_procs_using reads as zero.
 *
 * NOTE(review): the loop only observes updates made by other
 * processes if the member is declared volatile (or is otherwise
 * re-read on every iteration); the declaration is not visible here
 * -- confirm.
 *
 * @param flag  pointer to the in-use flag to poll
 *
 * Wrapped in do/while(0) so the macro plus the caller's semicolon is
 * a single statement.
 */
#define FLAG_WAIT_FOR_IDLE(flag)                                          \
    do {                                                                  \
        while (0 != (flag)->mcsiuf_num_procs_using) {                     \
            continue;                                                     \
        }                                                                 \
    } while (0)
|
||||
|
||||
/**
 * Macro to wait for a flag to indicate that it's ready for this
 * operation (used by non-root processes to know when the root has
 * claimed the flag for operation (op)).
 *
 * Spins until (flag)->mcsiuf_operation_count equals (op).
 *
 * @param flag  pointer to the in-use flag to poll (any pointer
 *              expression; it is parenthesized in the expansion)
 * @param op    operation count to wait for
 *
 * Wrapped in do/while(0) so the macro plus the caller's semicolon is
 * a single statement.
 */
#define FLAG_WAIT_FOR_OP(flag, op)                                        \
    do {                                                                  \
        while ((op) != (flag)->mcsiuf_operation_count) {                  \
            continue;                                                     \
        }                                                                 \
    } while (0)
|
||||
|
||||
/**
 * Macro to set an in-use flag with relevant data to claim it.
 *
 * Stores (num_procs) into (flag)->mcsiuf_num_procs_using and then
 * (op_count) into (flag)->mcsiuf_operation_count, in that order.
 *
 * @param flag       pointer to the in-use flag to claim
 * @param num_procs  number of processes that will use the flag
 * @param op_count   operation count to publish in the flag
 *
 * Both stores are wrapped in do/while(0): without it an un-braced
 * "if (cond) FLAG_RETAIN(...);" would guard only the first store.
 */
#define FLAG_RETAIN(flag, num_procs, op_count)                            \
    do {                                                                  \
        (flag)->mcsiuf_num_procs_using = (num_procs);                     \
        (flag)->mcsiuf_operation_count = (op_count);                      \
    } while (0)
|
||||
|
||||
/**
 * Macro to release an in-use flag from this process.
 *
 * Calls opal_atomic_add() with a delta of -1 on
 * (flag)->mcsiuf_num_procs_using; the call's result is discarded.
 *
 * @param flag  pointer to the in-use flag being released
 *
 * Wrapped in do/while(0) so the macro plus the caller's semicolon is
 * a single statement.
 */
#define FLAG_RELEASE(flag)                                                \
    do {                                                                  \
        opal_atomic_add(&(flag)->mcsiuf_num_procs_using, -1);             \
    } while (0)
|
||||
|
||||
/**
 * Macro to copy a single segment in from a user buffer to a shared
 * segment.
 *
 * Steps, in order:
 *   1. (iov).iov_base is pointed at this process's fragment, i.e.
 *      (index)->mcbmi_data advanced by
 *      rank * mca_coll_sm_component.sm_fragment_size bytes;
 *   2. (iov).iov_len and (max_data) are both set to
 *      mca_coll_sm_component.sm_fragment_size;
 *   3. ompi_convertor_pack() is called with (convertor), (iov),
 *      mca_coll_sm_iov_size, (max_data) and
 *      mca_coll_sm_bogus_free_after; its return value is discarded.
 *
 * @param convertor  convertor object (passed by address to the pack call)
 * @param index      pointer to a structure with an mcbmi_data member
 * @param iov        iovec-like lvalue with iov_base / iov_len members
 * @param max_data   lvalue passed by address to the pack call
 *
 * NOTE: "rank" is not a parameter -- a variable with that name must be
 * in scope where the macro is expanded.
 *
 * The base pointer is cast to char* so the offset is applied in
 * bytes, the same form COPY_FRAGMENT_OUT uses.  The whole sequence is
 * wrapped in do/while(0) so it behaves as one statement.
 */
#define COPY_FRAGMENT_IN(convertor, index, iov, max_data)                 \
    do {                                                                  \
        (iov).iov_base = (((char*) (index)->mcbmi_data) +                 \
            (rank * mca_coll_sm_component.sm_fragment_size));             \
        (max_data) = (iov).iov_len =                                      \
            mca_coll_sm_component.sm_fragment_size;                       \
        ompi_convertor_pack(&(convertor), &(iov), &mca_coll_sm_iov_size,  \
                            &(max_data), &mca_coll_sm_bogus_free_after);  \
    } while (0)
|
||||
|
||||
/**
 * Macro to copy a single segment out from a shared segment to a user
 * buffer.
 *
 * Steps, in order:
 *   1. (iov).iov_base is pointed at the fragment belonging to
 *      (src_rank), i.e. (index)->mcbmi_data advanced by
 *      (src_rank) * mca_coll_sm_component.sm_fragment_size bytes;
 *   2. ompi_convertor_unpack() is called with (convertor), (iov),
 *      mca_coll_sm_iov_size, (max_data) and
 *      mca_coll_sm_bogus_free_after; its return value is discarded.
 *
 * Unlike COPY_FRAGMENT_IN, this macro does not assign (iov).iov_len
 * or (max_data) before the call; the caller's current values are used.
 *
 * @param convertor  convertor object (passed by address to the unpack call)
 * @param src_rank   rank whose fragment is the copy source
 * @param index      pointer to a structure with an mcbmi_data member
 * @param iov        iovec-like lvalue with an iov_base member
 * @param max_data   lvalue passed by address to the unpack call
 *
 * Wrapped in do/while(0) so both steps behave as one statement.
 */
#define COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data)      \
    do {                                                                  \
        (iov).iov_base = (((char*) (index)->mcbmi_data) +                 \
            ((src_rank) * mca_coll_sm_component.sm_fragment_size));       \
        ompi_convertor_unpack(&(convertor), &(iov),                       \
                              &mca_coll_sm_iov_size,                      \
                              &(max_data),                                \
                              &mca_coll_sm_bogus_free_after);             \
    } while (0)
|
||||
|
||||
/**
 * Macro to memcpy a fragment between one shared segment and another.
 *
 * Copies (len) bytes from the fragment of (src_rank) to the fragment
 * of (dest_rank).  Both fragments live in (index)->mcbmi_data, at byte
 * offset rank * mca_coll_sm_component.sm_fragment_size.
 *
 * @param src_rank   rank whose fragment is read
 * @param dest_rank  rank whose fragment is written
 * @param index      pointer to a structure with an mcbmi_data member
 * @param len        number of bytes to copy
 *
 * memcpy() is used, so the two fragments must not overlap (src_rank
 * and dest_rank must differ).  The base pointer is cast to char* so
 * the offsets are applied in bytes, the same form COPY_FRAGMENT_OUT
 * uses.  Wrapped in do/while(0) so the macro plus the caller's
 * semicolon is a single statement.
 */
#define COPY_FRAGMENT_BETWEEN(src_rank, dest_rank, index, len)            \
    do {                                                                  \
        memcpy((((char*) (index)->mcbmi_data) +                           \
                ((dest_rank) * mca_coll_sm_component.sm_fragment_size)),  \
               (((char*) (index)->mcbmi_data) +                           \
                ((src_rank) * mca_coll_sm_component.sm_fragment_size)),   \
               (len));                                                    \
    } while (0)
|
||||
|
||||
/**
 * Macro to tell children that a segment is ready (normalize the
 * child's ID based on the shift used to calculate the "me" node in
 * the tree).
 *
 * For each of the first (num_children) entries of (children), stores
 * (value) into that child's control word.  A child's control word is
 * located in (index)->mcbmi_control at byte offset
 * mca_coll_sm_component.sm_control_size *
 * (((children)[i]->mcstn_id + root) % size).
 *
 * @param children      array of pointers to tree nodes (mcstn_id member)
 * @param num_children  number of entries of (children) to notify
 * @param index         pointer to a structure with an mcbmi_control member
 * @param value         value to publish; must be non-zero to be seen
 *                      by a child polling with CHILD_WAIT_FOR_NOTIFY
 *
 * NOTE: "i", "root" and "size" are not parameters -- variables with
 * those names must be in scope where the macro is expanded, and "i"
 * is overwritten.
 *
 * The store is 32 bits wide because CHILD_WAIT_FOR_NOTIFY polls and
 * clears the same word as a uint32_t.  A wider (size_t) store would
 * write past the word the child owns and, on a big-endian LP64
 * system, would put the value in bytes the child never reads.
 *
 * Wrapped in do/while(0) so the macro plus the caller's semicolon is
 * a single statement.
 */
#define PARENT_NOTIFY_CHILDREN(children, num_children, index, value)      \
    do {                                                                  \
        for (i = 0; i < (num_children); ++i) {                            \
            *((volatile uint32_t*)                                        \
              (((char*) (index)->mcbmi_control) +                         \
               (mca_coll_sm_component.sm_control_size *                   \
                (((children)[i]->mcstn_id + root) % size)))) =            \
                (uint32_t) (value);                                       \
        }                                                                 \
    } while (0)
|
||||
|
||||
/**
 * Macro for children to wait for parent notification (use real rank).
 * Save the value passed and then reset it when done.
 *
 * The control word of (rank) lives in (index)->mcbmi_control at byte
 * offset (rank) * mca_coll_sm_component.sm_control_size and is
 * accessed as a volatile uint32_t.  The macro:
 *   1. spins while the control word reads as zero;
 *   2. stores the (non-zero) control word into (value);
 *   3. writes zero back to the control word.
 *
 * @param rank   real rank of the waiting process
 * @param index  pointer to a structure with an mcbmi_control member
 * @param value  lvalue that receives the value published by the parent
 *
 * The address is computed once into a block-local pointer, so (rank)
 * and (index) are evaluated a single time.  Wrapped in do/while(0) so
 * the whole sequence behaves as one statement.
 */
#define CHILD_WAIT_FOR_NOTIFY(rank, index, value)                         \
    do {                                                                  \
        volatile uint32_t *cwfn_control_ = (volatile uint32_t*)           \
            (((char*) (index)->mcbmi_control) +                           \
             ((rank) * mca_coll_sm_component.sm_control_size));           \
        while (0 == *cwfn_control_) {                                     \
            continue;                                                     \
        }                                                                 \
        (value) = *cwfn_control_;                                         \
        *cwfn_control_ = 0;                                               \
    } while (0)
|
||||
|
||||
#endif /* MCA_COLL_SM_EXPORT_H */
|
||||
|
@ -46,17 +46,15 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
struct iovec iov;
|
||||
uint32_t iov_size = 1;
|
||||
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
||||
int i, ret, rank, size, num_children;
|
||||
int i, ret, rank, size, num_children, src_rank;
|
||||
int flag_num, segment_num, max_segment_num;
|
||||
int parent_rank, child_rank;
|
||||
int parent_rank;
|
||||
size_t total_size, max_data, bytes;
|
||||
volatile uint32_t *my_control;
|
||||
mca_coll_sm_in_use_flag_t *flag;
|
||||
ompi_convertor_t convertor;
|
||||
mca_coll_sm_tree_node_t *me, *parent, **children;
|
||||
int32_t bogus_free_after = 0;
|
||||
mca_coll_base_mpool_index_t *index;
|
||||
|
||||
/* Setup some identities */
|
||||
|
||||
@ -106,27 +104,9 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
flag_num = (data->mcb_operation_count++ %
|
||||
mca_coll_sm_component.sm_comm_num_in_use_flags);
|
||||
|
||||
/* Wait for the set of segments to become available */
|
||||
flag = (mca_coll_sm_in_use_flag_t*)
|
||||
(((char *) data->mcb_in_use_flags) +
|
||||
(flag_num * mca_coll_sm_component.sm_control_size));
|
||||
D(("root waiting for in_use flag %d (value %d), %p\n",
|
||||
flag_num, flag->mcsiuf_num_procs_using, flag));
|
||||
while (0 != flag->mcsiuf_num_procs_using) {
|
||||
continue;
|
||||
}
|
||||
D(("root got in_use flag %d (value %d), %p\n",
|
||||
flag_num, flag->mcsiuf_num_procs_using, flag));
|
||||
|
||||
/* Now that the set of segments is available, mark it as
|
||||
used. No need to include the root in the count (we'd
|
||||
only have to decrement it later). Don't need a write
|
||||
barrier here -- we have another later that will
|
||||
guarantee that the write has completed, if
|
||||
necessary. */
|
||||
|
||||
flag->mcsiuf_num_procs_using = size - 1;
|
||||
flag->mcsiuf_operation_count = data->mcb_operation_count - 1;
|
||||
FLAG_SETUP(flag_num, flag, data);
|
||||
FLAG_WAIT_FOR_IDLE(flag);
|
||||
FLAG_RETAIN(flag, size - 1, data->mcb_operation_count - 1);
|
||||
|
||||
/* Loop over all the segments in this set */
|
||||
|
||||
@ -135,40 +115,19 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
max_segment_num = (flag_num + 1) *
|
||||
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||
do {
|
||||
index = &(data->mcb_mpool_index[segment_num]);
|
||||
|
||||
/* Copy the fragment from the user buffer to my fragment
|
||||
in the current segment */
|
||||
iov.iov_base =
|
||||
data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||
(rank * mca_coll_sm_component.sm_fragment_size);
|
||||
max_data = iov.iov_len;
|
||||
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
|
||||
(unsigned long) iov.iov_len, segment_num, iov.iov_base));
|
||||
ompi_convertor_pack(&convertor, &iov, &iov_size,
|
||||
&max_data, &bogus_free_after);
|
||||
COPY_FRAGMENT_IN(convertor, index, iov, max_data);
|
||||
bytes += max_data;
|
||||
|
||||
/* Wait for the write to absolutely complete */
|
||||
opal_atomic_wmb();
|
||||
|
||||
/* Tell my children that this fragment is ready (be
|
||||
sure to normalize the child's ID based on the shift
|
||||
we did above to calculate the "me" node in the
|
||||
tree) */
|
||||
for (i = 0; i < num_children; ++i) {
|
||||
child_rank = (children[i]->mcstn_id + root) % size;
|
||||
*((size_t*)
|
||||
(((char*)
|
||||
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||
(mca_coll_sm_component.sm_control_size *
|
||||
child_rank))) = max_data;
|
||||
D(("root sent notice to child %d (vrank %d), control to %p\n",
|
||||
i, children[i]->mcstn_id,
|
||||
(((char*)
|
||||
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||
(mca_coll_sm_component.sm_control_size *
|
||||
child_rank))));
|
||||
}
|
||||
/* Tell my children that this fragment is ready */
|
||||
PARENT_NOTIFY_CHILDREN(children, num_children, index,
|
||||
max_data);
|
||||
|
||||
++segment_num;
|
||||
} while (bytes < total_size && segment_num < max_segment_num);
|
||||
@ -207,13 +166,8 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
|
||||
/* Wait for the root to mark this set of segments as
|
||||
ours */
|
||||
flag = (mca_coll_sm_in_use_flag_t*)
|
||||
(((char *) data->mcb_in_use_flags) +
|
||||
(flag_num * mca_coll_sm_component.sm_control_size));
|
||||
D(("rank %d waiting for root to claim in-use flag %d, %p (op count %d)\n", rank, flag_num, flag, data->mcb_operation_count));
|
||||
while (data->mcb_operation_count != flag->mcsiuf_operation_count) {
|
||||
continue;
|
||||
}
|
||||
FLAG_SETUP(flag_num, flag, data);
|
||||
FLAG_WAIT_FOR_OP(flag, data->mcb_operation_count);
|
||||
++data->mcb_operation_count;
|
||||
|
||||
/* Loop over all the segments in this set */
|
||||
@ -224,71 +178,32 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||
do {
|
||||
|
||||
/* Pre-calculate some pointers */
|
||||
|
||||
/* Pre-calculate some values */
|
||||
parent_rank = (parent->mcstn_id + root) % size;
|
||||
D(("rank %d parent rank is %d\n", rank, parent_rank));
|
||||
my_control = (uint32_t *)
|
||||
(((char*)
|
||||
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||
(rank * mca_coll_sm_component.sm_control_size));
|
||||
index = &(data->mcb_mpool_index[segment_num]);
|
||||
|
||||
/* Wait for the fragment: the parent will mark the segment
|
||||
as ready */
|
||||
D(("rank %d waiting for fragment in segment %d (control %p)\n",
|
||||
rank, segment_num, (char*) my_control));
|
||||
while (0 == *my_control) {
|
||||
continue;
|
||||
}
|
||||
max_data = *my_control;
|
||||
D(("rank %d: fragment ready in segment %d\n", rank, segment_num));
|
||||
/* Wait for my parent to tell me that the segment is ready */
|
||||
CHILD_WAIT_FOR_NOTIFY(rank, index, max_data);
|
||||
|
||||
/* If I have children, send the data to them */
|
||||
if (num_children > 0) {
|
||||
/* No need to wait for the segment to become
|
||||
available -- the root has already claimed it
|
||||
and we're all already using it. So copy the
|
||||
fragment from the parent's portion in the
|
||||
segment to my portion in the segment. This is
|
||||
a simple memcpy because it's already been
|
||||
packed into the parent's segment. */
|
||||
memcpy(/* my data fan out section in the segment */
|
||||
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||
(rank * mca_coll_sm_component.sm_fragment_size)),
|
||||
/* parent's fan out section in the segment */
|
||||
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||
(parent_rank *
|
||||
mca_coll_sm_component.sm_fragment_size)),
|
||||
/* length */
|
||||
*my_control);
|
||||
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
|
||||
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||
(parent_rank * mca_coll_sm_component.sm_fragment_size)),
|
||||
(unsigned long) *my_control));
|
||||
/* Copy the fragment from the parent's portion in
|
||||
the segment to my portion in the segment. */
|
||||
COPY_FRAGMENT_BETWEEN(parent_rank, rank, index, max_data);
|
||||
|
||||
/* Wait for the write to absolutely complete */
|
||||
opal_atomic_wmb();
|
||||
|
||||
/* Tell my children that this fragment is ready */
|
||||
for (i = 0; i < num_children; ++i) {
|
||||
child_rank = (children[i]->mcstn_id + root) % size;
|
||||
*((size_t*)
|
||||
(((char*)
|
||||
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||
(mca_coll_sm_component.sm_control_size *
|
||||
child_rank))) = *my_control;
|
||||
D(("rank %d notifying child %d (vrank %d, rank %d)\n",
|
||||
rank, i, children[i]->mcstn_id, child_rank));
|
||||
}
|
||||
PARENT_NOTIFY_CHILDREN(children, num_children, index,
|
||||
max_data);
|
||||
|
||||
/* Set the "copy from buffer" to be my local
|
||||
segment buffer so that we don't potentially
|
||||
incur a non-local memory copy from the parent's
|
||||
fan out data segment [again] when copying to
|
||||
the user's buffer */
|
||||
iov.iov_base =
|
||||
data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||
(rank * mca_coll_sm_component.sm_fragment_size);
|
||||
src_rank = rank;
|
||||
}
|
||||
|
||||
/* If I don't have any children, set the "copy from
|
||||
@ -296,20 +211,12 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
directly from my parent */
|
||||
|
||||
else {
|
||||
iov.iov_base =
|
||||
(((char*)
|
||||
data->mcb_mpool_index[segment_num].mcbmi_data) +
|
||||
(parent_rank *
|
||||
mca_coll_sm_component.sm_fragment_size));
|
||||
src_rank = parent_rank;
|
||||
}
|
||||
|
||||
/* Copy to my output buffer */
|
||||
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
|
||||
rank, iov.iov_base, (unsigned long) iov.iov_len));
|
||||
ompi_convertor_unpack(&convertor, &iov, &iov_size,
|
||||
&max_data, &bogus_free_after);
|
||||
COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);
|
||||
|
||||
*my_control = 0;
|
||||
bytes += max_data;
|
||||
++segment_num;
|
||||
} while (bytes < total_size && segment_num < max_segment_num);
|
||||
@ -319,10 +226,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
opal_atomic_wmb();
|
||||
|
||||
/* We're finished with this set of segments */
|
||||
|
||||
D(("rank %d done with in-use flag %d (value %d)\n",
|
||||
rank, flag_num, flag->mcsiuf_num_procs_using));
|
||||
opal_atomic_add(&flag->mcsiuf_num_procs_using, -1);
|
||||
FLAG_RELEASE(flag);
|
||||
} while (bytes < total_size);
|
||||
}
|
||||
|
||||
|
@ -44,6 +44,13 @@
|
||||
#include "coll_sm.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
uint32_t mca_coll_sm_iov_size = 1;
|
||||
int32_t mca_coll_sm_bogus_free_after = 0;
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user