bunches of updates
- Finally added "in use" flags -- one flag protects a set of segments. These flags are now used in bcast to protect, for example, the case where a message is so long that the root loops around the segments and has to re-use old segments -- now it knows that it has to wait until the non-root processes have finished with that set of segments before it can start using them. (A simplified sketch of this handshake follows this list.)
- Implement allreduce as a reduce followed by a bcast (per discussion with Rich).
- Removed some redundant data on various data structures.
- Implemented a query MCA param ("coll_sm_shared_mem_used_data") that tells you how much shared memory will be used for a given set of MCA params (e.g., number of segments, etc.). For example:

  ompi_info --mca coll_sm_info_num_procs 4 --param coll sm | \
      grep shared_mem_used_data

  tells you that for the default MCA param values (as of r7172), for 4 processes, sm will use 548864 bytes of shared memory for its data transfer section. (See the arithmetic check just before the diff.)
- Remove a bunch of .c files from the Makefile.am that aren't implemented yet (i.e., all they do is return ERR_NOT_IMPLEMENTED).

Now on to the big Altix to test that this stuff really works...

This commit was SVN r7205. The following SVN revision numbers were found above: r7172 --> open-mpi/ompi@bc72a7722b
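To make the first bullet concrete, here is a minimal, single-process sketch of the in-use-flag handshake. It uses pthreads and C11 atomics in place of the component's real setup (separate MPI processes, an mmap'ed shared-memory area, and OPAL atomics), so it is an analogue rather than the component's code; the struct mirrors mca_coll_sm_in_use_flag_t from the diff below, while NUM_FLAGS, NUM_OPS, and the thread functions are invented for illustration.

/* Thread-based analogue of the "in use" flag handshake (sketch only,
 * not the component's code).  One "root" thread re-uses NUM_FLAGS sets
 * of segments round-robin; it may not reclaim a set until every
 * "non-root" thread has finished with that set's previous operation. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NUM_PROCS 4   /* one root + three non-roots (threads here)  */
#define NUM_FLAGS 2   /* each flag protects one set of segments     */
#define NUM_OPS   8   /* enough operations to force flag re-use     */

struct in_use_flag {
    atomic_uint num_procs_using;   /* cf. mcsiuf_num_procs_using    */
    atomic_uint operation_count;   /* cf. mcsiuf_operation_count    */
};

static struct in_use_flag flags[NUM_FLAGS];

static void *root_thread(void *arg)
{
    (void) arg;
    for (unsigned op = 0; op < NUM_OPS; ++op) {
        struct in_use_flag *f = &flags[op % NUM_FLAGS];

        /* Wait until all non-roots have released this set of segments
           from its previous use before re-using it. */
        while (atomic_load(&f->num_procs_using) != 0) { /* spin */ }

        /* Claim the set: publish the user count, then the operation
           number that the non-roots are waiting for. */
        atomic_store(&f->num_procs_using, NUM_PROCS - 1);
        atomic_store(&f->operation_count, op);

        /* ... the real root would now pack fragments into the segments
           and notify its children via the per-segment control words ... */
    }
    return NULL;
}

static void *nonroot_thread(void *arg)
{
    (void) arg;
    for (unsigned op = 0; op < NUM_OPS; ++op) {
        struct in_use_flag *f = &flags[op % NUM_FLAGS];

        /* Wait for the root to claim this set for operation "op". */
        while (atomic_load(&f->operation_count) != op ||
               atomic_load(&f->num_procs_using) == 0) { /* spin */ }

        /* ... the real non-root would copy fragments out of the
           segments (and forward them to its own children) here ... */

        /* Release the set so the root may eventually re-use it. */
        atomic_fetch_sub(&f->num_procs_using, 1);
    }
    return NULL;
}

int main(void)
{
    pthread_t tids[NUM_PROCS];

    pthread_create(&tids[0], NULL, root_thread, NULL);
    for (int i = 1; i < NUM_PROCS; ++i) {
        pthread_create(&tids[i], NULL, nonroot_thread, NULL);
    }
    for (int i = 0; i < NUM_PROCS; ++i) {
        pthread_join(tids[i], NULL);
    }
    puts("all operations finished; every segment set was re-used safely");
    return 0;
}

Compiled with cc -pthread, the program only terminates if every claim by the root is matched by a release from each non-root, which is the property the new flags give bcast when the root wraps around and re-uses a set of segments.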
parent d38316ddbf
commit 7bab4ed269
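Before the diff, a quick arithmetic check of the 548864-byte figure quoted in the commit message. The expression below has the same shape as the size2 computation registered for the coll_sm_shared_mem_used_data parameter in sm_open() (see the coll_sm_component.c hunks further down); the parameter values plugged in (4096-byte control size, 8192-byte fragments, 2 in-use flags, 8 segments, 4 processes) are assumptions chosen because they reproduce the quoted number, not values read from the source.

#include <stdio.h>

int main(void)
{
    /* Assumed defaults -- consistent with the 548864-byte figure quoted
       in the commit message, but not read from the component's source. */
    const long control_size = 4096;   /* coll_sm_control_size              */
    const long frag_size    = 8192;   /* coll_sm_fragment_size             */
    const long num_in_use   = 2;      /* coll_sm_comm_num_in_use_flags     */
    const long num_segments = 8;      /* coll_sm_comm_num_segments         */
    const long num_procs    = 4;      /* coll_sm_info_num_procs, as in the
                                         ompi_info example above           */

    /* Same shape as the size2 expression in sm_open(): barrier pages +
       in-use flags + per-segment control + per-segment fragment data. */
    long size = 4 * control_size +
        (num_in_use * control_size) +
        (num_segments * (num_procs * control_size * 2)) +
        (num_segments * (num_procs * frag_size));

    printf("shared_mem_used_data = %ld bytes\n", size);   /* prints 548864 */
    return 0;
}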
@@ -18,27 +18,29 @@

include $(top_ompi_srcdir)/config/Makefile.options

sources = \
coll_sm.h \
not_used_yet = \
coll_sm_allgather.c \
coll_sm_allgatherv.c \
coll_sm_allreduce.c \
coll_sm_alltoall.c \
coll_sm_alltoallv.c \
coll_sm_alltoallw.c \
coll_sm_barrier.c \
coll_sm_bcast.c \
coll_sm_component.c \
coll_sm_gather.c \
coll_sm_gatherv.c \
coll_sm_module.c \
coll_sm_reduce.c \
coll_sm_reduce_scatter.c \
coll_sm_scan.c \
coll_sm_exscan.c \
coll_sm_scatter.c \
coll_sm_scatterv.c

sources = \
coll_sm.h \
coll_sm_allreduce.c \
coll_sm_barrier.c \
coll_sm_bcast.c \
coll_sm_component.c \
coll_sm_module.c \
coll_sm_reduce.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
@@ -13,9 +13,7 @@
*
* $HEADER$
*/
/**
* @file
*/
/** @file */

#ifndef MCA_COLL_SM_EXPORT_H
#define MCA_COLL_SM_EXPORT_H
@@ -52,15 +50,15 @@ extern "C" {
collective sm operations -- use this value plus the base
of the mpool to obtain the pointer to this comm's
mca_coll_sm_mpool_area_t */
size_t smbcs_data_mpool_offset;

/** Number of segments in the data mpool area for this
communicator */
int smbcs_comm_num_segments;
volatile size_t smbcs_data_mpool_offset;

/** Number of processes in this communicator who have seen
this value already. */
int smbcs_count;
volatile int smbcs_count;

/** Mechanism for letting multiple processes know whether this
allocation succeeded or not */
volatile bool smbcs_success;
};
/**
* Convenience typedef
@@ -75,19 +73,17 @@ extern "C" {
/** upper-level control structure */
mca_common_sm_file_header_t super;

/** Number of segments in the bootstrap mmap file */
int smbhe_num_segments;

/** Pointer to the start of the segments in the bootstrap area
(map->seg_data only points to just beyond the
mca_common_sm_file_header_t) */
mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments;

/** Pointer to array containing smhe_num_segments CIDs for use
in bootstrap phase -- will always point immediately after
the end of this struct (i.e., still within this header,
but since it's variable size (set by MCA param), we can't
just have it here in the struct. Bonk). */
/** Pointer to array containing
component.sm_bootstrap_num_segments CIDs for use in
bootstrap phase -- will always point immediately after the
end of this struct (i.e., still within this header, but
since it's variable size (set by MCA param), we can't just
have it here in the struct. Bonk). */
uint32_t *smbhe_cids;
};
/**
@@ -143,6 +139,8 @@ extern "C" {
calculation of the "info" MCA parameter */
int sm_info_comm_size;

/******* end of MCA params ********/

/** Size of the bootstrap area -- calculated in
coll_sm_component.c */
size_t sm_bootstrap_size;
@@ -185,6 +183,24 @@ extern "C" {
*/
typedef struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t;

/**
* Simple structure comprising the "in use" flags. Contains two
* members: the number of processes that are currently using this
* set of segments and the operation number of the current
* operation.
*/
struct mca_coll_sm_in_use_flag_t {
/** Number of processes currently using this set of
segments */
volatile uint32_t mcsiuf_num_procs_using;
/** Must match data->mcb_count */
volatile uint32_t mcsiuf_operation_count;
};
/**
* Convenienve typedef
*/
typedef struct mca_coll_sm_in_use_flag_t mca_coll_sm_in_use_flag_t;

/**
* Structure containing pointers to various arrays of data in the
* data mpool area (one of these indexes a single segment in the
@@ -195,8 +211,6 @@ extern "C" {
struct mca_coll_base_mpool_index_t {
/** Pointer to beginning of control data */
uint32_t *mcbmi_control;
/** Pointer to the "in use" buffer for this segment */
uint32_t *mcbmi_in_use;
/** Pointer to beginning of message fragment data */
char *mcbmi_data;
};
@@ -223,8 +237,6 @@ extern "C" {
operations area (i.e., mcb_mpool_base +
mcb_mpool_offset) */
unsigned char *mcb_mpool_area;
/** Number of segments in this comm's area in the data mpool */
int mcb_mpool_num_segments;

/** Pointer to my barrier control pages (odd index pages are
"in", even index pages are "out") */
@@ -246,6 +258,9 @@ extern "C" {
of barrier buffers to use). */
int mcb_barrier_count;

/** "In use" flags indicating which segments are available */
mca_coll_sm_in_use_flag_t *mcb_in_use_flags;

/** Array of indexes into the mpool area for control and data
fragment passing (containing pointers to each segments
control and data areas). */
@@ -256,7 +271,7 @@ extern "C" {
mca_coll_sm_tree_node_t *mcb_tree;

/** Operation number (i.e., which segment number to use) */
int mcb_operation_count;
uint32_t mcb_operation_count;
};
/**
* Convenience typedef
@@ -13,6 +13,7 @@
*
* $HEADER$
*/
/** @file */

#include "ompi_config.h"

@@ -20,17 +21,20 @@
#include "coll_sm.h"


/*
* allreduce_intra
/**
* Shared memory allreduce.
*
* Function: - allreduce using other MPI collectives
* Accepts: - same as MPI_Allreduce()
* Returns: - MPI_SUCCESS or error code
* For the moment, all we're doing is a reduce to root==0 and then a
* broadcast. It is possible that we'll do something better someday.
*/
int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm)
{
return OMPI_ERR_NOT_IMPLEMENTED;
int ret;

ret = mca_coll_sm_reduce_intra(sbuf, rbuf, count, dtype, op, 0, comm);
return (ret == OMPI_SUCCESS) ?
mca_coll_sm_bcast_intra(rbuf, count, dtype, 0, comm) : ret;
}
@@ -48,224 +48,290 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
struct iovec iov;
uint32_t iov_size = 1;
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
int i, ret, rank, vrank, size, segment, num_children;
int i, ret, rank, size, num_children;
int flag_num, segment_num, max_segment_num;
int parent_rank, child_rank;
size_t total_size, max_data, bytes;
uint32_t *my_control, *parent_control;
ompi_convertor_t send_convertor;
ompi_convertor_t recv_convertor;
volatile uint32_t *my_control;
mca_coll_sm_in_use_flag_t *flag;
ompi_convertor_t convertor;
mca_coll_sm_tree_node_t *me, *parent, **children;
int32_t bogus_free_after;
int32_t bogus_free_after = 0;

/* Setup some identities */

rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
vrank = (rank + size - root) % size;

me = &data->mcb_tree[vrank];
D(("rank %d: virtual rank %d\n", rank, vrank));
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
iov.iov_len = mca_coll_sm_component.sm_fragment_size;
bytes = 0;

me = &data->mcb_tree[(rank + size - root) % size];
D(("rank %d: virtual rank %d\n", rank, me - data->mcb_tree));
parent = me->mcstn_parent;
children = me->mcstn_children;
num_children = me->mcstn_num_children;

/* If I'm the root, I need a send convertor to pack from the
user's buffer to shared memory */
/* Only have one top-level decision as to whether I'm the root or
not. Do this at the slight expense of repeating a little logic
-- but it's better than a conditional branch in every loop
iteration. */

/*********************************************************************
* Root
*********************************************************************/

if (root == rank) {
OBJ_CONSTRUCT(&send_convertor, ompi_convertor_t);

/* The root needs a send convertor to pack from the user's
buffer to shared memory */

if (OMPI_SUCCESS !=
(ret =
ompi_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
datatype,
count,
buff,
&send_convertor))) {
&convertor))) {
return ret;
}
ompi_convertor_get_packed_size(&send_convertor, &total_size);
ompi_convertor_get_packed_size(&convertor, &total_size);
D(("root got send convertor w/ total_size == %lu\n",
(unsigned long) total_size));

/* Main loop over sending fragments */

do {
flag_num = (data->mcb_operation_count++ %
mca_coll_sm_component.sm_comm_num_in_use_flags);

/* Wait for the set of segments to become available */
flag = (mca_coll_sm_in_use_flag_t*)
(((char *) data->mcb_in_use_flags) +
(flag_num * mca_coll_sm_component.sm_control_size));
D(("root waiting for in_use flag %d (value %d), %p\n",
flag_num, flag->mcsiuf_num_procs_using, flag));
while (0 != flag->mcsiuf_num_procs_using) {
continue;
}
D(("root got in_use flag %d (value %d), %p\n",
flag_num, flag->mcsiuf_num_procs_using, flag));

/* Now that the set of segments is availble, mark it as
used. No need to include the root in the count (we'd
only have to decrement it later). Don't need a write
barrier here -- we have another later that will
guarantee that the write has completed, if
necessary. */

flag->mcsiuf_num_procs_using = size - 1;
flag->mcsiuf_operation_count = data->mcb_operation_count - 1;

/* Loop over all the segments in this set */

segment_num = flag_num *
mca_coll_sm_component.sm_comm_num_in_use_flags;
max_segment_num = (flag_num + 1) *
mca_coll_sm_component.sm_comm_num_in_use_flags;
do {

/* Copy the fragment from the user buffer to my fragment
in the current segment */
iov.iov_base =
data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
max_data = iov.iov_len;
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
(unsigned long) iov.iov_len, segment_num, iov.iov_base));
ompi_convertor_pack(&convertor, &iov, &iov_size,
&max_data, &bogus_free_after);
bytes += max_data;

/* Wait for the write to absolutely complete */
opal_atomic_wmb();

/* Tell my children that this fragment is ready (be
sure to normalize the child's ID based on the shift
we did above to calculate the "me" node in the
tree) */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = max_data;
D(("root sent notice to child %d (vrank %d), control to %p\n",
i, children[i]->mcstn_id,
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))));
}

++segment_num;
} while (bytes < total_size && segment_num < max_segment_num);
} while (bytes < total_size);
}

/* If I'm not the root, I need a receive convertor to unpack from
shared mmory to the user's buffer */
/*********************************************************************
* Non-root
*********************************************************************/

else {
OBJ_CONSTRUCT(&recv_convertor, ompi_convertor_t);

/* Non-root processes need a receive convertor to unpack from
shared mmory to the user's buffer */

OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
if (OMPI_SUCCESS !=
(ret =
ompi_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
datatype,
count,
buff,
&recv_convertor))) {
&convertor))) {
return ret;
}
ompi_convertor_get_packed_size(&recv_convertor, &total_size);
ompi_convertor_get_packed_size(&convertor, &total_size);
D(("rank %d got recv convertor w/ total_size == %lu\n",
rank, (unsigned long) total_size));
}

/* Setup some more constants */
/* Loop over receiving (and possibly re-sending) the
fragments */

do {
flag_num = (data->mcb_operation_count %
mca_coll_sm_component.sm_comm_num_in_use_flags);

iov.iov_len = mca_coll_sm_component.sm_fragment_size;
bytes = 0;

/* Loop over the fragments */

do {
segment = (data->mcb_operation_count++ %
data->mcb_mpool_num_segments);

/* Root */

if (root == rank) {

/* Wait for the segment to become available */
/* JMS */

/* Copy the fragment from the user buffer to my fragment
in the current segment */
iov.iov_base =
data->mcb_mpool_index[segment].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
max_data = iov.iov_len;
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
(unsigned long) iov.iov_len, segment, iov.iov_base));
ompi_convertor_pack(&send_convertor, &iov, &iov_size,
&max_data, &bogus_free_after);

/* Wait for the write to absolutely complete */
opal_atomic_wmb();

/* Tell my children that this fragment is ready (be sure
to normalize the child's ID based on the shift we did
above to calculate the "me" node in the tree) */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = max_data;
D(("root sent notice to child %d (vrank %d), control to %p\n",
i, children[i]->mcstn_id,
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))));
}
}

/* Non-root */

else {

/* Pre-calculate some pointers */

parent_rank = (parent->mcstn_id + root) % size;
D(("rank %d parent rank is %d\n", rank, parent_rank));
my_control = (uint32_t *)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(rank * mca_coll_sm_component.sm_control_size));
parent_control = (uint32_t *)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(parent_rank * mca_coll_sm_component.sm_control_size));

/* Wait for the fragment: the parent will mark the segment
as ready */
D(("rank %d waiting for fragment in segment %d (control %p)\n",
rank, segment, (char*) my_control));
while (0 == *my_control) {
/* Wait for the root to mark this set of segments as
ours */
flag = (mca_coll_sm_in_use_flag_t*)
(((char *) data->mcb_in_use_flags) +
(flag_num * mca_coll_sm_component.sm_control_size));
D(("rank %d waiting for root to claim in-use flag %d, %p (op count %d)\n", rank, flag_num, flag, data->mcb_operation_count));
while (data->mcb_operation_count != flag->mcsiuf_operation_count) {
continue;
}
max_data = *my_control;
D(("rank %d: fragment ready in segment %d\n", rank, segment));
++data->mcb_operation_count;

/* If I have children, send the data to them */
if (num_children > 0) {
max_data = iov.iov_len;

/* No need to wait for the segment to become available
-- the root has already claimed it and we're all
already using it. So copy the fragment from the
parent's portion in the segment to my portion in
the segment. This is a simply memcpy because it's
already been packed into the parent's segment. */
memcpy(/* my data fan out section in the segment */
(data->mcb_mpool_index[segment].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size)),
/* parent's fan out section in the segment */
(data->mcb_mpool_index[segment].mcbmi_data +
(parent_rank *
mca_coll_sm_component.sm_fragment_size)),
/* length */
*my_control);
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
(data->mcb_mpool_index[segment].mcbmi_data +
(parent_rank * mca_coll_sm_component.sm_fragment_size)),
(unsigned long) *my_control));
/* Loop over all the segments in this set */

/* Wait for the write to absolutely complete */
opal_atomic_wmb();
segment_num = flag_num *
mca_coll_sm_component.sm_comm_num_in_use_flags;
max_segment_num = (flag_num + 1) *
mca_coll_sm_component.sm_comm_num_in_use_flags;
do {

/* Tell my children that this fragment is ready */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = *my_control;
D(("rank %d notifying child %d (vrank %d, rank %d)\n",
rank, i, children[i]->mcstn_id, child_rank));
}

/* Set the "copy from buffer" to be my local segment
buffer so that we don't potentially incur a
non-local memory copy from the parent's fan out
data segment [again] when copying to the user's
buffer */
iov.iov_base =
data->mcb_mpool_index[segment].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
}

/* If I don't have any children, set the "copy from
buffer" to be my parent's fan out segment to copy
directly from my parent */

else {
iov.iov_base =
/* Pre-calculate some pointers */

parent_rank = (parent->mcstn_id + root) % size;
D(("rank %d parent rank is %d\n", rank, parent_rank));
my_control = (uint32_t *)
(((char*)
data->mcb_mpool_index[segment].mcbmi_data) +
(parent_rank * mca_coll_sm_component.sm_fragment_size));
}

/* Copy to my output buffer */
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
rank, iov.iov_base, (unsigned long) iov.iov_len));
ompi_convertor_unpack(&recv_convertor, &iov, &iov_size,
&max_data, &bogus_free_after);
}
data->mcb_mpool_index[segment_num].mcbmi_control) +
(rank * mca_coll_sm_component.sm_control_size));

/* It's ok to only look at the max_data from the last
operation because it will be the same value for all of
them */
bytes += max_data;
} while (bytes < total_size);
/* Wait for the fragment: the parent will mark the segment
as ready */
D(("rank %d waiting for fragment in segment %d (control %p)\n",
rank, segment_num, (char*) my_control));
while (0 == *my_control) {
continue;
}
max_data = *my_control;
D(("rank %d: fragment ready in segment %d\n", rank, segment_num));

/* If I have children, send the data to them */
if (num_children > 0) {
/* No need to wait for the segment to become
available -- the root has already claimed it
and we're all already using it. So copy the
fragment from the parent's portion in the
segment to my portion in the segment. This is
a simply memcpy because it's already been
packed into the parent's segment. */
memcpy(/* my data fan out section in the segment */
(data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size)),
/* parent's fan out section in the segment */
(data->mcb_mpool_index[segment_num].mcbmi_data +
(parent_rank *
mca_coll_sm_component.sm_fragment_size)),
/* length */
*my_control);
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
(data->mcb_mpool_index[segment_num].mcbmi_data +
(parent_rank * mca_coll_sm_component.sm_fragment_size)),
(unsigned long) *my_control));

/* Wait for the write to absolutely complete */
opal_atomic_wmb();

/* Tell my children that this fragment is ready */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = *my_control;
D(("rank %d notifying child %d (vrank %d, rank %d)\n",
rank, i, children[i]->mcstn_id, child_rank));
}

/* Set the "copy from buffer" to be my local
segment buffer so that we don't potentially
incur a non-local memory copy from the parent's
fan out data segment [again] when copying to
the user's buffer */
iov.iov_base =
data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
}

/* If I don't have any children, set the "copy from
buffer" to be my parent's fan out segment to copy
directly from my parent */

else {
iov.iov_base =
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_data) +
(parent_rank *
mca_coll_sm_component.sm_fragment_size));
}

/* Copy to my output buffer */
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
rank, iov.iov_base, (unsigned long) iov.iov_len));
ompi_convertor_unpack(&convertor, &iov, &iov_size,
&max_data, &bogus_free_after);

if (root == rank) {
OBJ_DESTRUCT(&send_convertor);
} else {
OBJ_DESTRUCT(&recv_convertor);
*my_control = 0;
bytes += max_data;
++segment_num;
} while (bytes < total_size && segment_num < max_segment_num);

/* Wait for all copy-out writes to complete before I say
I'm done with the segments */
opal_atomic_wmb();

/* We're finished with this set of segments */

D(("rank %d done with in-use flag %d (value %d)\n",
rank, flag_num, flag->mcsiuf_num_procs_using));
opal_atomic_add(&flag->mcsiuf_num_procs_using, -1);
} while (bytes < total_size);
}
D(("rank %d done with bcast\n", rank));

/* Kill the convertor */

OBJ_DESTRUCT(&convertor);

/* All done */

D(("rank %d done with bcast\n", rank));
return OMPI_SUCCESS;
}
@@ -173,7 +173,7 @@ static int sm_open(void)
&cs->sm_bootstrap_num_segments);

mca_base_param_reg_int(c, "fragment_size",
"Fragment size (in bytes) used for passing data through shared memory (bytes, will be rounded up to the nearest control_size size)",
"Fragment size (in bytes) used for passing data through shared memory (will be rounded up to the nearest control_size size)",
false, false,
cs->sm_fragment_size,
&cs->sm_fragment_size);
@@ -202,8 +202,8 @@ static int sm_open(void)
false, false,
cs->sm_comm_num_segments,
&cs->sm_comm_num_segments);
if (cs->sm_comm_num_segments < 2) {
cs->sm_comm_num_segments = 2;
if (cs->sm_comm_num_segments < cs->sm_comm_num_in_use_flags) {
cs->sm_comm_num_segments = cs->sm_comm_num_in_use_flags;
}
if (0 != (cs->sm_comm_num_segments % cs->sm_comm_num_in_use_flags)) {
cs->sm_comm_num_segments += cs->sm_comm_num_in_use_flags -
@@ -240,26 +240,8 @@ static int sm_open(void)
false, true,
size1, NULL);

/* Calculate how much space we need in the data mpool. There are
several values to add (one of these for each segment):

- size of the control data:
- fan-in data (num_procs * control_size size)
- fan-out data (num_procs * control_size size)
- size of message data
- fan-in data (num_procs * (frag_size rounded up to
control_size size))
- fan-out data (num_procs * (frag_size rounded up
to control_size size))

So it's:

num_segs * ((num_procs * control_size * 2) + (num_procs * frag * 2))

Which reduces to:

num_segs * num_procs * 2 * (control_size + frag)
*/
/* Calculate how much space we need in the data mpool. This
formula taken directly from coll_sm_module.c. */

mca_base_param_reg_int(c, "info_num_procs",
"Number of processes to use for the calculation of the shared_mem_size MCA information parameter (must be => 2)",
@@ -267,10 +249,12 @@ static int sm_open(void)
cs->sm_info_comm_size,
&cs->sm_info_comm_size);

size2 = cs->sm_comm_num_segments * cs->sm_info_comm_size *
(cs->sm_control_size + cs->sm_fragment_size);
size2 = 4 * cs->sm_control_size +
(cs->sm_comm_num_in_use_flags * cs->sm_control_size) +
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_control_size * 2)) +
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_fragment_size));
mca_base_param_reg_int(c, "shared_mem_used_data",
"Amount of shared memory used in the shared memory data area for one process (in bytes)",
"Amount of shared memory used in the shared memory data area for info_num_procs processes (in bytes)",
false, true,
size2, NULL);
@@ -16,6 +16,11 @@
/**
* @file
*
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remeber that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/

#include "ompi_config.h"
@@ -70,7 +75,7 @@ static const mca_coll_base_module_1_0_0_t module = {

NULL,
NULL,
NULL,
mca_coll_sm_allreduce_intra,
NULL,
NULL,
NULL,
@@ -79,7 +84,7 @@ static const mca_coll_base_module_1_0_0_t module = {
NULL,
NULL,
NULL,
NULL,
mca_coll_sm_reduce_intra,
NULL,
NULL,
NULL,
@@ -199,7 +204,7 @@ sm_module_init(struct ompi_communicator_t *comm)
mca_coll_base_comm_t *data;
size_t control_size, frag_size;
mca_coll_sm_component_t *c = &mca_coll_sm_component;
opal_maffinity_base_segment_t *segments;
opal_maffinity_base_segment_t *maffinity;
int parent, min_child, max_child, num_children;
char *base;
const int num_barrier_buffers = 2;
@@ -207,9 +212,9 @@ sm_module_init(struct ompi_communicator_t *comm)
/* Get some space to setup memory affinity (just easier to try to
alloc here to handle the error case) */

segments = malloc(sizeof(opal_maffinity_base_segment_t) *
c->sm_bootstrap_num_segments * 5);
if (NULL == segments) {
maffinity = malloc(sizeof(opal_maffinity_base_segment_t) *
c->sm_comm_num_segments * 3);
if (NULL == maffinity) {
return NULL;
}

@@ -334,13 +339,37 @@ sm_module_init(struct ompi_communicator_t *comm)
}
data->mcb_barrier_count = 0;

/* Next, setup the mca_coll_base_mpool_index_t pointers to point
to the appropriate places in the mpool. */
/* Next, setup the pointer to the in-use flags. The number of
segments will be an even multiple of the number of in-use
flags. */

base += (c->sm_control_size * size * num_barrier_buffers * 2);
data->mcb_in_use_flags = (mca_coll_sm_in_use_flag_t*) base;

/* All things being equal, if we're rank 0, then make the in-use
flags be local (memory affinity). Then zero them all out so
that they're marked as unused. */

j = 0;
if (0 == rank) {
maffinity[j].mbs_start_addr = base;
maffinity[j].mbs_len = c->sm_control_size *
c->sm_comm_num_in_use_flags;
memset(maffinity[j].mbs_start_addr, 0, maffinity[j].mbs_len);
D(("rank 0 zeroed in-use flags (num %d, len %d): %p - %p\n",
c->sm_comm_num_in_use_flags,
maffinity[j].mbs_len,
base, base + maffinity[j].mbs_len));
++j;
}

/* Next, setup pointers to the control and data portions of the
segments, as well as to the relevant in-use flags. */

base += (c->sm_comm_num_in_use_flags * c->sm_control_size);
control_size = size * c->sm_control_size;
frag_size = size * c->sm_fragment_size;
base += (control_size * num_barrier_buffers * 2);
for (j = i = 0; i < data->mcb_mpool_num_segments; ++i) {
for (i = 0; i < c->sm_comm_num_segments; ++i) {
data->mcb_mpool_index[i].mcbmi_control = (uint32_t*)
(base + (i * (control_size + frag_size)));
data->mcb_mpool_index[i].mcbmi_data =
@@ -349,16 +378,16 @@ sm_module_init(struct ompi_communicator_t *comm)

/* Memory affinity: control */

segments[j].mbs_len = c->sm_control_size;
segments[j].mbs_start_addr =
maffinity[j].mbs_len = c->sm_control_size;
maffinity[j].mbs_start_addr =
data->mcb_mpool_index[i].mcbmi_control +
(rank * c->sm_control_size);
++j;

/* Memory affinity: data */

segments[j].mbs_len = c->sm_fragment_size;
segments[j].mbs_start_addr =
maffinity[j].mbs_len = c->sm_fragment_size;
maffinity[j].mbs_start_addr =
data->mcb_mpool_index[i].mcbmi_data +
(rank * c->sm_control_size);
++j;
@@ -367,14 +396,14 @@ sm_module_init(struct ompi_communicator_t *comm)
/* Setup memory affinity so that the pages that belong to this
process are local to this process */

opal_maffinity_base_set(segments, j);
free(segments);
opal_maffinity_base_set(maffinity, j);
free(maffinity);

/* Zero out the control structures that belong to this process */

memset(data->mcb_barrier_control_me, 0,
num_barrier_buffers * 2 * c->sm_control_size);
for (i = 0; i < data->mcb_mpool_num_segments; ++i) {
for (i = 0; i < c->sm_comm_num_segments; ++i) {
memset(data->mcb_mpool_index[i].mcbmi_control, 0,
c->sm_control_size);
}
@@ -476,8 +505,6 @@ static int bootstrap_init(void)
opal_atomic_lock(&bshe->super.seg_lock);
opal_atomic_wmb();
if (!bshe->super.seg_inited) {
bshe->smbhe_num_segments =
mca_coll_sm_component.sm_bootstrap_num_segments;
bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *)
(((char *) bshe) +
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
@@ -485,7 +512,7 @@ static int bootstrap_init(void)
mca_coll_sm_component.sm_bootstrap_num_segments));
bshe->smbhe_cids = (uint32_t *)
(((char *) bshe) + sizeof(*bshe));
for (i = 0; i < bshe->smbhe_num_segments; ++i) {
for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
bshe->smbhe_cids[i] = INT_MAX;
}

@@ -510,6 +537,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
int comm_size = ompi_comm_size(comm);
int num_segments = c->sm_comm_num_segments;
int num_in_use = c->sm_comm_num_in_use_flags;
int frag_size = c->sm_fragment_size;
int control_size = c->sm_control_size;

@@ -525,7 +553,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
opal_atomic_wmb();
found = false;
empty_index = -1;
for (i = 0; i < bshe->smbhe_num_segments; ++i) {
for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
if (comm->c_contextid == bshe->smbhe_cids[i]) {
found = true;
break;
@@ -552,7 +580,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
i = empty_index;
bshe->smbhe_cids[i] = comm->c_contextid;

bscs[i].smbcs_comm_num_segments = num_segments;
bscs[i].smbcs_count = comm_size;

/* Calculate how much space we need in the data mpool.
@@ -570,12 +597,14 @@ static int bootstrap_comm(ompi_communicator_t *comm)
So it's:

barrier: 2 * control_size + 2 * control_size
in use: num_in_use * control_size
control: num_segments * (num_procs * control_size * 2 +
num_procs * control_size)
message: num_segments * (num_procs * frag_size)
*/

size = 4 * c->sm_control_size +
size = 4 * control_size +
(num_in_use * control_size) +
(num_segments * (comm_size * control_size * 2)) +
(num_segments * (comm_size * frag_size));

@@ -591,11 +620,12 @@ static int bootstrap_comm(ompi_communicator_t *comm)
component would have been chosen), so we don't need
to do that cleanup. */
bscs[i].smbcs_data_mpool_offset = 0;
bscs[i].smbcs_comm_num_segments = 0;
bscs[i].smbcs_success = false;
--bscs[i].smbcs_count;
opal_atomic_unlock(&bshe->super.seg_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
bscs[i].smbcs_success = true;

/* Calculate the offset and put it in the bootstrap
area */
@@ -624,7 +654,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)

/* Check to see if there was an error while allocating the shared
memory */
if (0 == bscs[i].smbcs_comm_num_segments) {
if (!bscs[i].smbcs_success) {
err = OMPI_ERR_OUT_OF_RESOURCE;
}

@@ -636,7 +666,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool);
data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset;
data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset;
data->mcb_mpool_num_segments = bscs[i].smbcs_comm_num_segments;
data->mcb_operation_count = 0;
}