1
1
- finally added "in use" flags -- one flag protects a set of segments
- these flags now used in bcast to protect (for example) when a
  message is so long that the root loops around the segments and has
  to re-use old segments -- now it knows that it has to wait until the
  non-root processes have finished with that set of segments before it
  can start using them
- implement allreduce as a reduce followed by a bcast (per discussion
  with rich)
- removed some redundant data on various data structures
- implemented query MCA param ("coll_sm_shared_mem_used_data") that
  tells you how much shared memory will be used for a given set of MCA
  params (e.g., number of segments, etc.).  For example:

  ompi_info --mca coll_sm_info_num_procs 4 --param coll sm | \
	    grep shared_mem_used_data

  tells you that for the default MCA param values (as of r7172), for 4
  processes, sm will use 548864 bytes of shared memory for its data
  transfer section
- remove a bunch of .c files from the Makefile.am that aren't
  implemented yet (i.e., all they do is return ERR_NOT_IMPLEMENTED)

Now on to the big Altix to test that this stuff really works...

This commit was SVN r7205.

The following SVN revision numbers were found above:
  r7172 --> open-mpi/ompi@bc72a7722b
Этот коммит содержится в:
Jeff Squyres 2005-09-06 21:41:55 +00:00
родитель d38316ddbf
Коммит 7bab4ed269
6 изменённых файлов: 354 добавлений и 254 удалений

Просмотреть файл

@ -18,27 +18,29 @@
include $(top_ompi_srcdir)/config/Makefile.options include $(top_ompi_srcdir)/config/Makefile.options
sources = \ not_used_yet = \
coll_sm.h \
coll_sm_allgather.c \ coll_sm_allgather.c \
coll_sm_allgatherv.c \ coll_sm_allgatherv.c \
coll_sm_allreduce.c \
coll_sm_alltoall.c \ coll_sm_alltoall.c \
coll_sm_alltoallv.c \ coll_sm_alltoallv.c \
coll_sm_alltoallw.c \ coll_sm_alltoallw.c \
coll_sm_barrier.c \
coll_sm_bcast.c \
coll_sm_component.c \
coll_sm_gather.c \ coll_sm_gather.c \
coll_sm_gatherv.c \ coll_sm_gatherv.c \
coll_sm_module.c \
coll_sm_reduce.c \
coll_sm_reduce_scatter.c \ coll_sm_reduce_scatter.c \
coll_sm_scan.c \ coll_sm_scan.c \
coll_sm_exscan.c \ coll_sm_exscan.c \
coll_sm_scatter.c \ coll_sm_scatter.c \
coll_sm_scatterv.c coll_sm_scatterv.c
sources = \
coll_sm.h \
coll_sm_allreduce.c \
coll_sm_barrier.c \
coll_sm_bcast.c \
coll_sm_component.c \
coll_sm_module.c \
coll_sm_reduce.c
# Make the output library in this directory, and name it either # Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds). # (for static builds).

Просмотреть файл

@ -13,9 +13,7 @@
* *
* $HEADER$ * $HEADER$
*/ */
/** /** @file */
* @file
*/
#ifndef MCA_COLL_SM_EXPORT_H #ifndef MCA_COLL_SM_EXPORT_H
#define MCA_COLL_SM_EXPORT_H #define MCA_COLL_SM_EXPORT_H
@ -52,15 +50,15 @@ extern "C" {
collective sm operations -- use this value plus the base collective sm operations -- use this value plus the base
of the mpool to obtain the pointer to this comm's of the mpool to obtain the pointer to this comm's
mca_coll_sm_mpool_area_t */ mca_coll_sm_mpool_area_t */
size_t smbcs_data_mpool_offset; volatile size_t smbcs_data_mpool_offset;
/** Number of segments in the data mpool area for this
communicator */
int smbcs_comm_num_segments;
/** Number of processes in this communicator who have seen /** Number of processes in this communicator who have seen
this value already. */ this value already. */
int smbcs_count; volatile int smbcs_count;
/** Mechanism for letting multiple processes know whether this
allocation succeeded or not */
volatile bool smbcs_success;
}; };
/** /**
* Convenience typedef * Convenience typedef
@ -75,19 +73,17 @@ extern "C" {
/** upper-level control structure */ /** upper-level control structure */
mca_common_sm_file_header_t super; mca_common_sm_file_header_t super;
/** Number of segments in the bootstrap mmap file */
int smbhe_num_segments;
/** Pointer to the start of the segments in the bootstrap area /** Pointer to the start of the segments in the bootstrap area
(map->seg_data only points to just beyond the (map->seg_data only points to just beyond the
mca_common_sm_file_header_t) */ mca_common_sm_file_header_t) */
mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments; mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments;
/** Pointer to array containing smhe_num_segments CIDs for use /** Pointer to array containing
in bootstrap phase -- will always point immediately after component.sm_bootstrap_num_segments CIDs for use in
the end of this struct (i.e., still within this header, bootstrap phase -- will always point immediately after the
but since it's variable size (set by MCA param), we can't end of this struct (i.e., still within this header, but
just have it here in the struct. Bonk). */ since it's variable size (set by MCA param), we can't just
have it here in the struct. Bonk). */
uint32_t *smbhe_cids; uint32_t *smbhe_cids;
}; };
/** /**
@ -143,6 +139,8 @@ extern "C" {
calculation of the "info" MCA parameter */ calculation of the "info" MCA parameter */
int sm_info_comm_size; int sm_info_comm_size;
/******* end of MCA params ********/
/** Size of the bootstrap area -- calculated in /** Size of the bootstrap area -- calculated in
coll_sm_component.c */ coll_sm_component.c */
size_t sm_bootstrap_size; size_t sm_bootstrap_size;
@ -185,6 +183,24 @@ extern "C" {
*/ */
typedef struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t; typedef struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t;
/**
* Simple structure comprising the "in use" flags. Contains two
* members: the number of processes that are currently using this
* set of segments and the operation number of the current
* operation.
*/
struct mca_coll_sm_in_use_flag_t {
/** Number of processes currently using this set of
segments */
volatile uint32_t mcsiuf_num_procs_using;
/** Must match data->mcb_count */
volatile uint32_t mcsiuf_operation_count;
};
/**
* Convenience typedef
*/
typedef struct mca_coll_sm_in_use_flag_t mca_coll_sm_in_use_flag_t;
/** /**
* Structure containing pointers to various arrays of data in the * Structure containing pointers to various arrays of data in the
* data mpool area (one of these indexes a single segment in the * data mpool area (one of these indexes a single segment in the
@ -195,8 +211,6 @@ extern "C" {
struct mca_coll_base_mpool_index_t { struct mca_coll_base_mpool_index_t {
/** Pointer to beginning of control data */ /** Pointer to beginning of control data */
uint32_t *mcbmi_control; uint32_t *mcbmi_control;
/** Pointer to the "in use" buffer for this segment */
uint32_t *mcbmi_in_use;
/** Pointer to beginning of message fragment data */ /** Pointer to beginning of message fragment data */
char *mcbmi_data; char *mcbmi_data;
}; };
@ -223,8 +237,6 @@ extern "C" {
operations area (i.e., mcb_mpool_base + operations area (i.e., mcb_mpool_base +
mcb_mpool_offset) */ mcb_mpool_offset) */
unsigned char *mcb_mpool_area; unsigned char *mcb_mpool_area;
/** Number of segments in this comm's area in the data mpool */
int mcb_mpool_num_segments;
/** Pointer to my barrier control pages (odd index pages are /** Pointer to my barrier control pages (odd index pages are
"in", even index pages are "out") */ "in", even index pages are "out") */
@ -246,6 +258,9 @@ extern "C" {
of barrier buffers to use). */ of barrier buffers to use). */
int mcb_barrier_count; int mcb_barrier_count;
/** "In use" flags indicating which segments are available */
mca_coll_sm_in_use_flag_t *mcb_in_use_flags;
/** Array of indexes into the mpool area for control and data /** Array of indexes into the mpool area for control and data
fragment passing (containing pointers to each segments fragment passing (containing pointers to each segments
control and data areas). */ control and data areas). */
@ -256,7 +271,7 @@ extern "C" {
mca_coll_sm_tree_node_t *mcb_tree; mca_coll_sm_tree_node_t *mcb_tree;
/** Operation number (i.e., which segment number to use) */ /** Operation number (i.e., which segment number to use) */
int mcb_operation_count; uint32_t mcb_operation_count;
}; };
/** /**
* Convenience typedef * Convenience typedef

Просмотреть файл

@ -13,6 +13,7 @@
* *
* $HEADER$ * $HEADER$
*/ */
/** @file */
#include "ompi_config.h" #include "ompi_config.h"
@ -20,17 +21,20 @@
#include "coll_sm.h" #include "coll_sm.h"
/* /**
* allreduce_intra * Shared memory allreduce.
* *
* Function: - allreduce using other MPI collectives * For the moment, all we're doing is a reduce to root==0 and then a
* Accepts: - same as MPI_Allreduce() * broadcast. It is possible that we'll do something better someday.
* Returns: - MPI_SUCCESS or error code
*/ */
int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count, int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_op_t *op,
struct ompi_communicator_t *comm) struct ompi_communicator_t *comm)
{ {
return OMPI_ERR_NOT_IMPLEMENTED; int ret;
ret = mca_coll_sm_reduce_intra(sbuf, rbuf, count, dtype, op, 0, comm);
return (ret == OMPI_SUCCESS) ?
mca_coll_sm_bcast_intra(rbuf, count, dtype, 0, comm) : ret;
} }

Просмотреть файл

@ -48,119 +48,181 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
struct iovec iov; struct iovec iov;
uint32_t iov_size = 1; uint32_t iov_size = 1;
mca_coll_base_comm_t *data = comm->c_coll_selected_data; mca_coll_base_comm_t *data = comm->c_coll_selected_data;
int i, ret, rank, vrank, size, segment, num_children; int i, ret, rank, size, num_children;
int flag_num, segment_num, max_segment_num;
int parent_rank, child_rank; int parent_rank, child_rank;
size_t total_size, max_data, bytes; size_t total_size, max_data, bytes;
uint32_t *my_control, *parent_control; volatile uint32_t *my_control;
ompi_convertor_t send_convertor; mca_coll_sm_in_use_flag_t *flag;
ompi_convertor_t recv_convertor; ompi_convertor_t convertor;
mca_coll_sm_tree_node_t *me, *parent, **children; mca_coll_sm_tree_node_t *me, *parent, **children;
int32_t bogus_free_after; int32_t bogus_free_after = 0;
/* Setup some identities */ /* Setup some identities */
rank = ompi_comm_rank(comm); rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm); size = ompi_comm_size(comm);
vrank = (rank + size - root) % size;
me = &data->mcb_tree[vrank]; OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
D(("rank %d: virtual rank %d\n", rank, vrank)); iov.iov_len = mca_coll_sm_component.sm_fragment_size;
bytes = 0;
me = &data->mcb_tree[(rank + size - root) % size];
D(("rank %d: virtual rank %d\n", rank, me - data->mcb_tree));
parent = me->mcstn_parent; parent = me->mcstn_parent;
children = me->mcstn_children; children = me->mcstn_children;
num_children = me->mcstn_num_children; num_children = me->mcstn_num_children;
/* If I'm the root, I need a send convertor to pack from the /* Only have one top-level decision as to whether I'm the root or
user's buffer to shared memory */ not. Do this at the slight expense of repeating a little logic
-- but it's better than a conditional branch in every loop
iteration. */
/*********************************************************************
* Root
*********************************************************************/
if (root == rank) { if (root == rank) {
OBJ_CONSTRUCT(&send_convertor, ompi_convertor_t);
/* The root needs a send convertor to pack from the user's
buffer to shared memory */
if (OMPI_SUCCESS != if (OMPI_SUCCESS !=
(ret = (ret =
ompi_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, ompi_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
datatype, datatype,
count, count,
buff, buff,
&send_convertor))) { &convertor))) {
return ret; return ret;
} }
ompi_convertor_get_packed_size(&send_convertor, &total_size); ompi_convertor_get_packed_size(&convertor, &total_size);
D(("root got send convertor w/ total_size == %lu\n", D(("root got send convertor w/ total_size == %lu\n",
(unsigned long) total_size)); (unsigned long) total_size));
/* Main loop over sending fragments */
do {
flag_num = (data->mcb_operation_count++ %
mca_coll_sm_component.sm_comm_num_in_use_flags);
/* Wait for the set of segments to become available */
flag = (mca_coll_sm_in_use_flag_t*)
(((char *) data->mcb_in_use_flags) +
(flag_num * mca_coll_sm_component.sm_control_size));
D(("root waiting for in_use flag %d (value %d), %p\n",
flag_num, flag->mcsiuf_num_procs_using, flag));
while (0 != flag->mcsiuf_num_procs_using) {
continue;
}
D(("root got in_use flag %d (value %d), %p\n",
flag_num, flag->mcsiuf_num_procs_using, flag));
/* Now that the set of segments is available, mark it as
used. No need to include the root in the count (we'd
only have to decrement it later). Don't need a write
barrier here -- we have another later that will
guarantee that the write has completed, if
necessary. */
flag->mcsiuf_num_procs_using = size - 1;
flag->mcsiuf_operation_count = data->mcb_operation_count - 1;
/* Loop over all the segments in this set */
segment_num = flag_num *
mca_coll_sm_component.sm_comm_num_in_use_flags;
max_segment_num = (flag_num + 1) *
mca_coll_sm_component.sm_comm_num_in_use_flags;
do {
/* Copy the fragment from the user buffer to my fragment
in the current segment */
iov.iov_base =
data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
max_data = iov.iov_len;
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
(unsigned long) iov.iov_len, segment_num, iov.iov_base));
ompi_convertor_pack(&convertor, &iov, &iov_size,
&max_data, &bogus_free_after);
bytes += max_data;
/* Wait for the write to absolutely complete */
opal_atomic_wmb();
/* Tell my children that this fragment is ready (be
sure to normalize the child's ID based on the shift
we did above to calculate the "me" node in the
tree) */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = max_data;
D(("root sent notice to child %d (vrank %d), control to %p\n",
i, children[i]->mcstn_id,
(((char*)
data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))));
} }
/* If I'm not the root, I need a receive convertor to unpack from ++segment_num;
shared memory to the user's buffer */ } while (bytes < total_size && segment_num < max_segment_num);
} while (bytes < total_size);
}
/*********************************************************************
* Non-root
*********************************************************************/
else { else {
OBJ_CONSTRUCT(&recv_convertor, ompi_convertor_t);
/* Non-root processes need a receive convertor to unpack from
shared memory to the user's buffer */
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
if (OMPI_SUCCESS != if (OMPI_SUCCESS !=
(ret = (ret =
ompi_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, ompi_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
datatype, datatype,
count, count,
buff, buff,
&recv_convertor))) { &convertor))) {
return ret; return ret;
} }
ompi_convertor_get_packed_size(&recv_convertor, &total_size); ompi_convertor_get_packed_size(&convertor, &total_size);
D(("rank %d got recv convertor w/ total_size == %lu\n", D(("rank %d got recv convertor w/ total_size == %lu\n",
rank, (unsigned long) total_size)); rank, (unsigned long) total_size));
}
/* Setup some more constants */ /* Loop over receiving (and possibly re-sending) the
fragments */
iov.iov_len = mca_coll_sm_component.sm_fragment_size;
bytes = 0;
/* Loop over the fragments */
do { do {
segment = (data->mcb_operation_count++ % flag_num = (data->mcb_operation_count %
data->mcb_mpool_num_segments); mca_coll_sm_component.sm_comm_num_in_use_flags);
/* Root */ /* Wait for the root to mark this set of segments as
ours */
if (root == rank) { flag = (mca_coll_sm_in_use_flag_t*)
(((char *) data->mcb_in_use_flags) +
/* Wait for the segment to become available */ (flag_num * mca_coll_sm_component.sm_control_size));
/* JMS */ D(("rank %d waiting for root to claim in-use flag %d, %p (op count %d)\n", rank, flag_num, flag, data->mcb_operation_count));
while (data->mcb_operation_count != flag->mcsiuf_operation_count) {
/* Copy the fragment from the user buffer to my fragment continue;
in the current segment */
iov.iov_base =
data->mcb_mpool_index[segment].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size);
max_data = iov.iov_len;
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
(unsigned long) iov.iov_len, segment, iov.iov_base));
ompi_convertor_pack(&send_convertor, &iov, &iov_size,
&max_data, &bogus_free_after);
/* Wait for the write to absolutely complete */
opal_atomic_wmb();
/* Tell my children that this fragment is ready (be sure
to normalize the child's ID based on the shift we did
above to calculate the "me" node in the tree) */
for (i = 0; i < num_children; ++i) {
child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))) = max_data;
D(("root sent notice to child %d (vrank %d), control to %p\n",
i, children[i]->mcstn_id,
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(mca_coll_sm_component.sm_control_size *
child_rank))));
}
} }
++data->mcb_operation_count;
/* Non-root */ /* Loop over all the segments in this set */
else { segment_num = flag_num *
mca_coll_sm_component.sm_comm_num_in_use_flags;
max_segment_num = (flag_num + 1) *
mca_coll_sm_component.sm_comm_num_in_use_flags;
do {
/* Pre-calculate some pointers */ /* Pre-calculate some pointers */
@ -168,44 +230,39 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
D(("rank %d parent rank is %d\n", rank, parent_rank)); D(("rank %d parent rank is %d\n", rank, parent_rank));
my_control = (uint32_t *) my_control = (uint32_t *)
(((char*) (((char*)
data->mcb_mpool_index[segment].mcbmi_control) + data->mcb_mpool_index[segment_num].mcbmi_control) +
(rank * mca_coll_sm_component.sm_control_size)); (rank * mca_coll_sm_component.sm_control_size));
parent_control = (uint32_t *)
(((char*)
data->mcb_mpool_index[segment].mcbmi_control) +
(parent_rank * mca_coll_sm_component.sm_control_size));
/* Wait for the fragment: the parent will mark the segment /* Wait for the fragment: the parent will mark the segment
as ready */ as ready */
D(("rank %d waiting for fragment in segment %d (control %p)\n", D(("rank %d waiting for fragment in segment %d (control %p)\n",
rank, segment, (char*) my_control)); rank, segment_num, (char*) my_control));
while (0 == *my_control) { while (0 == *my_control) {
continue; continue;
} }
max_data = *my_control; max_data = *my_control;
D(("rank %d: fragment ready in segment %d\n", rank, segment)); D(("rank %d: fragment ready in segment %d\n", rank, segment_num));
/* If I have children, send the data to them */ /* If I have children, send the data to them */
if (num_children > 0) { if (num_children > 0) {
max_data = iov.iov_len; /* No need to wait for the segment to become
available -- the root has already claimed it
/* No need to wait for the segment to become available and we're all already using it. So copy the
-- the root has already claimed it and we're all fragment from the parent's portion in the
already using it. So copy the fragment from the segment to my portion in the segment. This is
parent's portion in the segment to my portion in a simple memcpy because it's already been
the segment. This is a simple memcpy because it's packed into the parent's segment. */
already been packed into the parent's segment. */
memcpy(/* my data fan out section in the segment */ memcpy(/* my data fan out section in the segment */
(data->mcb_mpool_index[segment].mcbmi_data + (data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size)), (rank * mca_coll_sm_component.sm_fragment_size)),
/* parent's fan out section in the segment */ /* parent's fan out section in the segment */
(data->mcb_mpool_index[segment].mcbmi_data + (data->mcb_mpool_index[segment_num].mcbmi_data +
(parent_rank * (parent_rank *
mca_coll_sm_component.sm_fragment_size)), mca_coll_sm_component.sm_fragment_size)),
/* length */ /* length */
*my_control); *my_control);
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank, D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
(data->mcb_mpool_index[segment].mcbmi_data + (data->mcb_mpool_index[segment_num].mcbmi_data +
(parent_rank * mca_coll_sm_component.sm_fragment_size)), (parent_rank * mca_coll_sm_component.sm_fragment_size)),
(unsigned long) *my_control)); (unsigned long) *my_control));
@ -217,20 +274,20 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
child_rank = (children[i]->mcstn_id + root) % size; child_rank = (children[i]->mcstn_id + root) % size;
*((size_t*) *((size_t*)
(((char*) (((char*)
data->mcb_mpool_index[segment].mcbmi_control) + data->mcb_mpool_index[segment_num].mcbmi_control) +
(mca_coll_sm_component.sm_control_size * (mca_coll_sm_component.sm_control_size *
child_rank))) = *my_control; child_rank))) = *my_control;
D(("rank %d notifying child %d (vrank %d, rank %d)\n", D(("rank %d notifying child %d (vrank %d, rank %d)\n",
rank, i, children[i]->mcstn_id, child_rank)); rank, i, children[i]->mcstn_id, child_rank));
} }
/* Set the "copy from buffer" to be my local segment /* Set the "copy from buffer" to be my local
buffer so that we don't potentially incur a segment buffer so that we don't potentially
non-local memory copy from the parent's fan out incur a non-local memory copy from the parent's
data segment [again] when copying to the user's fan out data segment [again] when copying to
buffer */ the user's buffer */
iov.iov_base = iov.iov_base =
data->mcb_mpool_index[segment].mcbmi_data + data->mcb_mpool_index[segment_num].mcbmi_data +
(rank * mca_coll_sm_component.sm_fragment_size); (rank * mca_coll_sm_component.sm_fragment_size);
} }
@ -241,31 +298,40 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
else { else {
iov.iov_base = iov.iov_base =
(((char*) (((char*)
data->mcb_mpool_index[segment].mcbmi_data) + data->mcb_mpool_index[segment_num].mcbmi_data) +
(parent_rank * mca_coll_sm_component.sm_fragment_size)); (parent_rank *
mca_coll_sm_component.sm_fragment_size));
} }
/* Copy to my output buffer */ /* Copy to my output buffer */
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n", D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
rank, iov.iov_base, (unsigned long) iov.iov_len)); rank, iov.iov_base, (unsigned long) iov.iov_len));
ompi_convertor_unpack(&recv_convertor, &iov, &iov_size, ompi_convertor_unpack(&convertor, &iov, &iov_size,
&max_data, &bogus_free_after); &max_data, &bogus_free_after);
}
/* It's ok to only look at the max_data from the last *my_control = 0;
operation because it will be the same value for all of
them */
bytes += max_data; bytes += max_data;
} while (bytes < total_size); ++segment_num;
} while (bytes < total_size && segment_num < max_segment_num);
if (root == rank) { /* Wait for all copy-out writes to complete before I say
OBJ_DESTRUCT(&send_convertor); I'm done with the segments */
} else { opal_atomic_wmb();
OBJ_DESTRUCT(&recv_convertor);
/* We're finished with this set of segments */
D(("rank %d done with in-use flag %d (value %d)\n",
rank, flag_num, flag->mcsiuf_num_procs_using));
opal_atomic_add(&flag->mcsiuf_num_procs_using, -1);
} while (bytes < total_size);
} }
D(("rank %d done with bcast\n", rank));
/* Kill the convertor */
OBJ_DESTRUCT(&convertor);
/* All done */ /* All done */
D(("rank %d done with bcast\n", rank));
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -173,7 +173,7 @@ static int sm_open(void)
&cs->sm_bootstrap_num_segments); &cs->sm_bootstrap_num_segments);
mca_base_param_reg_int(c, "fragment_size", mca_base_param_reg_int(c, "fragment_size",
"Fragment size (in bytes) used for passing data through shared memory (bytes, will be rounded up to the nearest control_size size)", "Fragment size (in bytes) used for passing data through shared memory (will be rounded up to the nearest control_size size)",
false, false, false, false,
cs->sm_fragment_size, cs->sm_fragment_size,
&cs->sm_fragment_size); &cs->sm_fragment_size);
@ -202,8 +202,8 @@ static int sm_open(void)
false, false, false, false,
cs->sm_comm_num_segments, cs->sm_comm_num_segments,
&cs->sm_comm_num_segments); &cs->sm_comm_num_segments);
if (cs->sm_comm_num_segments < 2) { if (cs->sm_comm_num_segments < cs->sm_comm_num_in_use_flags) {
cs->sm_comm_num_segments = 2; cs->sm_comm_num_segments = cs->sm_comm_num_in_use_flags;
} }
if (0 != (cs->sm_comm_num_segments % cs->sm_comm_num_in_use_flags)) { if (0 != (cs->sm_comm_num_segments % cs->sm_comm_num_in_use_flags)) {
cs->sm_comm_num_segments += cs->sm_comm_num_in_use_flags - cs->sm_comm_num_segments += cs->sm_comm_num_in_use_flags -
@ -240,26 +240,8 @@ static int sm_open(void)
false, true, false, true,
size1, NULL); size1, NULL);
/* Calculate how much space we need in the data mpool. There are /* Calculate how much space we need in the data mpool. This
several values to add (one of these for each segment): formula taken directly from coll_sm_module.c. */
- size of the control data:
- fan-in data (num_procs * control_size size)
- fan-out data (num_procs * control_size size)
- size of message data
- fan-in data (num_procs * (frag_size rounded up to
control_size size))
- fan-out data (num_procs * (frag_size rounded up
to control_size size))
So it's:
num_segs * ((num_procs * control_size * 2) + (num_procs * frag * 2))
Which reduces to:
num_segs * num_procs * 2 * (control_size + frag)
*/
mca_base_param_reg_int(c, "info_num_procs", mca_base_param_reg_int(c, "info_num_procs",
"Number of processes to use for the calculation of the shared_mem_size MCA information parameter (must be => 2)", "Number of processes to use for the calculation of the shared_mem_size MCA information parameter (must be => 2)",
@ -267,10 +249,12 @@ static int sm_open(void)
cs->sm_info_comm_size, cs->sm_info_comm_size,
&cs->sm_info_comm_size); &cs->sm_info_comm_size);
size2 = cs->sm_comm_num_segments * cs->sm_info_comm_size * size2 = 4 * cs->sm_control_size +
(cs->sm_control_size + cs->sm_fragment_size); (cs->sm_comm_num_in_use_flags * cs->sm_control_size) +
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_control_size * 2)) +
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_fragment_size));
mca_base_param_reg_int(c, "shared_mem_used_data", mca_base_param_reg_int(c, "shared_mem_used_data",
"Amount of shared memory used in the shared memory data area for one process (in bytes)", "Amount of shared memory used in the shared memory data area for info_num_procs processes (in bytes)",
false, true, false, true,
size2, NULL); size2, NULL);

Просмотреть файл

@ -16,6 +16,11 @@
/** /**
* @file * @file
* *
* Warning: this is not for the faint of heart -- don't even bother
* reading this source code if you don't have a strong understanding
* of nested data structures and pointer math (remember that
* associativity and order of C operations is *critical* in terms of
* pointer math!).
*/ */
#include "ompi_config.h" #include "ompi_config.h"
@ -70,7 +75,7 @@ static const mca_coll_base_module_1_0_0_t module = {
NULL, NULL,
NULL, NULL,
NULL, mca_coll_sm_allreduce_intra,
NULL, NULL,
NULL, NULL,
NULL, NULL,
@ -79,7 +84,7 @@ static const mca_coll_base_module_1_0_0_t module = {
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, mca_coll_sm_reduce_intra,
NULL, NULL,
NULL, NULL,
NULL, NULL,
@ -199,7 +204,7 @@ sm_module_init(struct ompi_communicator_t *comm)
mca_coll_base_comm_t *data; mca_coll_base_comm_t *data;
size_t control_size, frag_size; size_t control_size, frag_size;
mca_coll_sm_component_t *c = &mca_coll_sm_component; mca_coll_sm_component_t *c = &mca_coll_sm_component;
opal_maffinity_base_segment_t *segments; opal_maffinity_base_segment_t *maffinity;
int parent, min_child, max_child, num_children; int parent, min_child, max_child, num_children;
char *base; char *base;
const int num_barrier_buffers = 2; const int num_barrier_buffers = 2;
@ -207,9 +212,9 @@ sm_module_init(struct ompi_communicator_t *comm)
/* Get some space to setup memory affinity (just easier to try to /* Get some space to setup memory affinity (just easier to try to
alloc here to handle the error case) */ alloc here to handle the error case) */
segments = malloc(sizeof(opal_maffinity_base_segment_t) * maffinity = malloc(sizeof(opal_maffinity_base_segment_t) *
c->sm_bootstrap_num_segments * 5); c->sm_comm_num_segments * 3);
if (NULL == segments) { if (NULL == maffinity) {
return NULL; return NULL;
} }
@ -334,13 +339,37 @@ sm_module_init(struct ompi_communicator_t *comm)
} }
data->mcb_barrier_count = 0; data->mcb_barrier_count = 0;
/* Next, setup the mca_coll_base_mpool_index_t pointers to point /* Next, setup the pointer to the in-use flags. The number of
to the appropriate places in the mpool. */ segments will be an even multiple of the number of in-use
flags. */
base += (c->sm_control_size * size * num_barrier_buffers * 2);
data->mcb_in_use_flags = (mca_coll_sm_in_use_flag_t*) base;
/* All things being equal, if we're rank 0, then make the in-use
flags be local (memory affinity). Then zero them all out so
that they're marked as unused. */
j = 0;
if (0 == rank) {
maffinity[j].mbs_start_addr = base;
maffinity[j].mbs_len = c->sm_control_size *
c->sm_comm_num_in_use_flags;
memset(maffinity[j].mbs_start_addr, 0, maffinity[j].mbs_len);
D(("rank 0 zeroed in-use flags (num %d, len %d): %p - %p\n",
c->sm_comm_num_in_use_flags,
maffinity[j].mbs_len,
base, base + maffinity[j].mbs_len));
++j;
}
/* Next, setup pointers to the control and data portions of the
segments, as well as to the relevant in-use flags. */
base += (c->sm_comm_num_in_use_flags * c->sm_control_size);
control_size = size * c->sm_control_size; control_size = size * c->sm_control_size;
frag_size = size * c->sm_fragment_size; frag_size = size * c->sm_fragment_size;
base += (control_size * num_barrier_buffers * 2); for (i = 0; i < c->sm_comm_num_segments; ++i) {
for (j = i = 0; i < data->mcb_mpool_num_segments; ++i) {
data->mcb_mpool_index[i].mcbmi_control = (uint32_t*) data->mcb_mpool_index[i].mcbmi_control = (uint32_t*)
(base + (i * (control_size + frag_size))); (base + (i * (control_size + frag_size)));
data->mcb_mpool_index[i].mcbmi_data = data->mcb_mpool_index[i].mcbmi_data =
@ -349,16 +378,16 @@ sm_module_init(struct ompi_communicator_t *comm)
/* Memory affinity: control */ /* Memory affinity: control */
segments[j].mbs_len = c->sm_control_size; maffinity[j].mbs_len = c->sm_control_size;
segments[j].mbs_start_addr = maffinity[j].mbs_start_addr =
data->mcb_mpool_index[i].mcbmi_control + data->mcb_mpool_index[i].mcbmi_control +
(rank * c->sm_control_size); (rank * c->sm_control_size);
++j; ++j;
/* Memory affinity: data */ /* Memory affinity: data */
segments[j].mbs_len = c->sm_fragment_size; maffinity[j].mbs_len = c->sm_fragment_size;
segments[j].mbs_start_addr = maffinity[j].mbs_start_addr =
data->mcb_mpool_index[i].mcbmi_data + data->mcb_mpool_index[i].mcbmi_data +
(rank * c->sm_control_size); (rank * c->sm_control_size);
++j; ++j;
@ -367,14 +396,14 @@ sm_module_init(struct ompi_communicator_t *comm)
/* Setup memory affinity so that the pages that belong to this /* Setup memory affinity so that the pages that belong to this
process are local to this process */ process are local to this process */
opal_maffinity_base_set(segments, j); opal_maffinity_base_set(maffinity, j);
free(segments); free(maffinity);
/* Zero out the control structures that belong to this process */ /* Zero out the control structures that belong to this process */
memset(data->mcb_barrier_control_me, 0, memset(data->mcb_barrier_control_me, 0,
num_barrier_buffers * 2 * c->sm_control_size); num_barrier_buffers * 2 * c->sm_control_size);
for (i = 0; i < data->mcb_mpool_num_segments; ++i) { for (i = 0; i < c->sm_comm_num_segments; ++i) {
memset(data->mcb_mpool_index[i].mcbmi_control, 0, memset(data->mcb_mpool_index[i].mcbmi_control, 0,
c->sm_control_size); c->sm_control_size);
} }
@ -476,8 +505,6 @@ static int bootstrap_init(void)
opal_atomic_lock(&bshe->super.seg_lock); opal_atomic_lock(&bshe->super.seg_lock);
opal_atomic_wmb(); opal_atomic_wmb();
if (!bshe->super.seg_inited) { if (!bshe->super.seg_inited) {
bshe->smbhe_num_segments =
mca_coll_sm_component.sm_bootstrap_num_segments;
bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *) bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *)
(((char *) bshe) + (((char *) bshe) +
sizeof(mca_coll_sm_bootstrap_header_extension_t) + sizeof(mca_coll_sm_bootstrap_header_extension_t) +
@ -485,7 +512,7 @@ static int bootstrap_init(void)
mca_coll_sm_component.sm_bootstrap_num_segments)); mca_coll_sm_component.sm_bootstrap_num_segments));
bshe->smbhe_cids = (uint32_t *) bshe->smbhe_cids = (uint32_t *)
(((char *) bshe) + sizeof(*bshe)); (((char *) bshe) + sizeof(*bshe));
for (i = 0; i < bshe->smbhe_num_segments; ++i) { for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
bshe->smbhe_cids[i] = INT_MAX; bshe->smbhe_cids[i] = INT_MAX;
} }
@ -510,6 +537,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
mca_coll_base_comm_t *data = comm->c_coll_selected_data; mca_coll_base_comm_t *data = comm->c_coll_selected_data;
int comm_size = ompi_comm_size(comm); int comm_size = ompi_comm_size(comm);
int num_segments = c->sm_comm_num_segments; int num_segments = c->sm_comm_num_segments;
int num_in_use = c->sm_comm_num_in_use_flags;
int frag_size = c->sm_fragment_size; int frag_size = c->sm_fragment_size;
int control_size = c->sm_control_size; int control_size = c->sm_control_size;
@ -525,7 +553,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
opal_atomic_wmb(); opal_atomic_wmb();
found = false; found = false;
empty_index = -1; empty_index = -1;
for (i = 0; i < bshe->smbhe_num_segments; ++i) { for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
if (comm->c_contextid == bshe->smbhe_cids[i]) { if (comm->c_contextid == bshe->smbhe_cids[i]) {
found = true; found = true;
break; break;
@ -552,7 +580,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
i = empty_index; i = empty_index;
bshe->smbhe_cids[i] = comm->c_contextid; bshe->smbhe_cids[i] = comm->c_contextid;
bscs[i].smbcs_comm_num_segments = num_segments;
bscs[i].smbcs_count = comm_size; bscs[i].smbcs_count = comm_size;
/* Calculate how much space we need in the data mpool. /* Calculate how much space we need in the data mpool.
@ -570,12 +597,14 @@ static int bootstrap_comm(ompi_communicator_t *comm)
So it's: So it's:
barrier: 2 * control_size + 2 * control_size barrier: 2 * control_size + 2 * control_size
in use: num_in_use * control_size
control: num_segments * (num_procs * control_size * 2 + control: num_segments * (num_procs * control_size * 2 +
num_procs * control_size) num_procs * control_size)
message: num_segments * (num_procs * frag_size) message: num_segments * (num_procs * frag_size)
*/ */
size = 4 * c->sm_control_size + size = 4 * control_size +
(num_in_use * control_size) +
(num_segments * (comm_size * control_size * 2)) + (num_segments * (comm_size * control_size * 2)) +
(num_segments * (comm_size * frag_size)); (num_segments * (comm_size * frag_size));
@ -591,11 +620,12 @@ static int bootstrap_comm(ompi_communicator_t *comm)
component would have been chosen), so we don't need component would have been chosen), so we don't need
to do that cleanup. */ to do that cleanup. */
bscs[i].smbcs_data_mpool_offset = 0; bscs[i].smbcs_data_mpool_offset = 0;
bscs[i].smbcs_comm_num_segments = 0; bscs[i].smbcs_success = false;
--bscs[i].smbcs_count; --bscs[i].smbcs_count;
opal_atomic_unlock(&bshe->super.seg_lock); opal_atomic_unlock(&bshe->super.seg_lock);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
bscs[i].smbcs_success = true;
/* Calculate the offset and put it in the bootstrap /* Calculate the offset and put it in the bootstrap
area */ area */
@ -624,7 +654,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
/* Check to see if there was an error while allocating the shared /* Check to see if there was an error while allocating the shared
memory */ memory */
if (0 == bscs[i].smbcs_comm_num_segments) { if (!bscs[i].smbcs_success) {
err = OMPI_ERR_OUT_OF_RESOURCE; err = OMPI_ERR_OUT_OF_RESOURCE;
} }
@ -636,7 +666,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool); data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool);
data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset; data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset;
data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset; data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset;
data->mcb_mpool_num_segments = bscs[i].smbcs_comm_num_segments;
data->mcb_operation_count = 0; data->mcb_operation_count = 0;
} }