bunches of updates
- finally added "in use" flags -- one flag protects a set of segments - these flags now used in bcast to protect (for example) when a message is so long that the root loops around the segments and has to re-use old segments -- now it knows that it has to wait until the non-root processes have finished with that set of segments before it can start using them - implement allreduce as a reduce followed by a bcast (per discussion with rich) - removed some redundant data on various data structures - implemented query MCA param ("coll_sm_shared_mem_used_data") that tells you how much shared memory will be used for a given set of MCA params (e.g., number of segments, etc.). For example: ompi_info --mca coll_sm_info_num_procs 4 --param coll sm | \ grep shared_mem_used_data tells you that for the default MCA param values (as of r7172), for 4 processes, sm will use 548864 bytes of shared memory for its data transfer section - remove a bunch of .c files from the Makefile.am that aren't implemented yet (i.e., all they do is return ERR_NOT_IMPLEMENTED) Now on to the big Altix to test that this stuff really works... This commit was SVN r7205. The following SVN revision numbers were found above: r7172 --> open-mpi/ompi@bc72a7722b
Этот коммит содержится в:
родитель
d38316ddbf
Коммит
7bab4ed269
@ -18,27 +18,29 @@
|
|||||||
|
|
||||||
include $(top_ompi_srcdir)/config/Makefile.options
|
include $(top_ompi_srcdir)/config/Makefile.options
|
||||||
|
|
||||||
sources = \
|
not_used_yet = \
|
||||||
coll_sm.h \
|
|
||||||
coll_sm_allgather.c \
|
coll_sm_allgather.c \
|
||||||
coll_sm_allgatherv.c \
|
coll_sm_allgatherv.c \
|
||||||
coll_sm_allreduce.c \
|
|
||||||
coll_sm_alltoall.c \
|
coll_sm_alltoall.c \
|
||||||
coll_sm_alltoallv.c \
|
coll_sm_alltoallv.c \
|
||||||
coll_sm_alltoallw.c \
|
coll_sm_alltoallw.c \
|
||||||
coll_sm_barrier.c \
|
|
||||||
coll_sm_bcast.c \
|
|
||||||
coll_sm_component.c \
|
|
||||||
coll_sm_gather.c \
|
coll_sm_gather.c \
|
||||||
coll_sm_gatherv.c \
|
coll_sm_gatherv.c \
|
||||||
coll_sm_module.c \
|
|
||||||
coll_sm_reduce.c \
|
|
||||||
coll_sm_reduce_scatter.c \
|
coll_sm_reduce_scatter.c \
|
||||||
coll_sm_scan.c \
|
coll_sm_scan.c \
|
||||||
coll_sm_exscan.c \
|
coll_sm_exscan.c \
|
||||||
coll_sm_scatter.c \
|
coll_sm_scatter.c \
|
||||||
coll_sm_scatterv.c
|
coll_sm_scatterv.c
|
||||||
|
|
||||||
|
sources = \
|
||||||
|
coll_sm.h \
|
||||||
|
coll_sm_allreduce.c \
|
||||||
|
coll_sm_barrier.c \
|
||||||
|
coll_sm_bcast.c \
|
||||||
|
coll_sm_component.c \
|
||||||
|
coll_sm_module.c \
|
||||||
|
coll_sm_reduce.c
|
||||||
|
|
||||||
# Make the output library in this directory, and name it either
|
# Make the output library in this directory, and name it either
|
||||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
# (for static builds).
|
# (for static builds).
|
||||||
|
@ -13,9 +13,7 @@
|
|||||||
*
|
*
|
||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
/**
|
/** @file */
|
||||||
* @file
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_COLL_SM_EXPORT_H
|
#ifndef MCA_COLL_SM_EXPORT_H
|
||||||
#define MCA_COLL_SM_EXPORT_H
|
#define MCA_COLL_SM_EXPORT_H
|
||||||
@ -52,15 +50,15 @@ extern "C" {
|
|||||||
collective sm operations -- use this value plus the base
|
collective sm operations -- use this value plus the base
|
||||||
of the mpool to obtain the pointer to this comm's
|
of the mpool to obtain the pointer to this comm's
|
||||||
mca_coll_sm_mpool_area_t */
|
mca_coll_sm_mpool_area_t */
|
||||||
size_t smbcs_data_mpool_offset;
|
volatile size_t smbcs_data_mpool_offset;
|
||||||
|
|
||||||
/** Number of segments in the data mpool area for this
|
|
||||||
communicator */
|
|
||||||
int smbcs_comm_num_segments;
|
|
||||||
|
|
||||||
/** Number of processes in this communicator who have seen
|
/** Number of processes in this communicator who have seen
|
||||||
this value already. */
|
this value already. */
|
||||||
int smbcs_count;
|
volatile int smbcs_count;
|
||||||
|
|
||||||
|
/** Mechanism for letting multiple processes know whether this
|
||||||
|
allocation succeeded or not */
|
||||||
|
volatile bool smbcs_success;
|
||||||
};
|
};
|
||||||
/**
|
/**
|
||||||
* Convenience typedef
|
* Convenience typedef
|
||||||
@ -75,19 +73,17 @@ extern "C" {
|
|||||||
/** upper-level control structure */
|
/** upper-level control structure */
|
||||||
mca_common_sm_file_header_t super;
|
mca_common_sm_file_header_t super;
|
||||||
|
|
||||||
/** Number of segments in the bootstrap mmap file */
|
|
||||||
int smbhe_num_segments;
|
|
||||||
|
|
||||||
/** Pointer to the start of the segments in the bootstrap area
|
/** Pointer to the start of the segments in the bootstrap area
|
||||||
(map->seg_data only points to just beyond the
|
(map->seg_data only points to just beyond the
|
||||||
mca_common_sm_file_header_t) */
|
mca_common_sm_file_header_t) */
|
||||||
mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments;
|
mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments;
|
||||||
|
|
||||||
/** Pointer to array containing smhe_num_segments CIDs for use
|
/** Pointer to array containing
|
||||||
in bootstrap phase -- will always point immediately after
|
component.sm_bootstrap_num_segments CIDs for use in
|
||||||
the end of this struct (i.e., still within this header,
|
bootstrap phase -- will always point immediately after the
|
||||||
but since it's variable size (set by MCA param), we can't
|
end of this struct (i.e., still within this header, but
|
||||||
just have it here in the struct. Bonk). */
|
since it's variable size (set by MCA param), we can't just
|
||||||
|
have it here in the struct. Bonk). */
|
||||||
uint32_t *smbhe_cids;
|
uint32_t *smbhe_cids;
|
||||||
};
|
};
|
||||||
/**
|
/**
|
||||||
@ -143,6 +139,8 @@ extern "C" {
|
|||||||
calculation of the "info" MCA parameter */
|
calculation of the "info" MCA parameter */
|
||||||
int sm_info_comm_size;
|
int sm_info_comm_size;
|
||||||
|
|
||||||
|
/******* end of MCA params ********/
|
||||||
|
|
||||||
/** Size of the bootstrap area -- calculated in
|
/** Size of the bootstrap area -- calculated in
|
||||||
coll_sm_component.c */
|
coll_sm_component.c */
|
||||||
size_t sm_bootstrap_size;
|
size_t sm_bootstrap_size;
|
||||||
@ -185,6 +183,24 @@ extern "C" {
|
|||||||
*/
|
*/
|
||||||
typedef struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t;
|
typedef struct mca_coll_sm_tree_node_t mca_coll_sm_tree_node_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple structure comprising the "in use" flags. Contains two
|
||||||
|
* members: the number of processes that are currently using this
|
||||||
|
* set of segments and the operation number of the current
|
||||||
|
* operation.
|
||||||
|
*/
|
||||||
|
struct mca_coll_sm_in_use_flag_t {
|
||||||
|
/** Number of processes currently using this set of
|
||||||
|
segments */
|
||||||
|
volatile uint32_t mcsiuf_num_procs_using;
|
||||||
|
/** Must match data->mcb_count */
|
||||||
|
volatile uint32_t mcsiuf_operation_count;
|
||||||
|
};
|
||||||
|
/**
|
||||||
|
* Convenienve typedef
|
||||||
|
*/
|
||||||
|
typedef struct mca_coll_sm_in_use_flag_t mca_coll_sm_in_use_flag_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure containing pointers to various arrays of data in the
|
* Structure containing pointers to various arrays of data in the
|
||||||
* data mpool area (one of these indexes a single segment in the
|
* data mpool area (one of these indexes a single segment in the
|
||||||
@ -195,8 +211,6 @@ extern "C" {
|
|||||||
struct mca_coll_base_mpool_index_t {
|
struct mca_coll_base_mpool_index_t {
|
||||||
/** Pointer to beginning of control data */
|
/** Pointer to beginning of control data */
|
||||||
uint32_t *mcbmi_control;
|
uint32_t *mcbmi_control;
|
||||||
/** Pointer to the "in use" buffer for this segment */
|
|
||||||
uint32_t *mcbmi_in_use;
|
|
||||||
/** Pointer to beginning of message fragment data */
|
/** Pointer to beginning of message fragment data */
|
||||||
char *mcbmi_data;
|
char *mcbmi_data;
|
||||||
};
|
};
|
||||||
@ -223,8 +237,6 @@ extern "C" {
|
|||||||
operations area (i.e., mcb_mpool_base +
|
operations area (i.e., mcb_mpool_base +
|
||||||
mcb_mpool_offset) */
|
mcb_mpool_offset) */
|
||||||
unsigned char *mcb_mpool_area;
|
unsigned char *mcb_mpool_area;
|
||||||
/** Number of segments in this comm's area in the data mpool */
|
|
||||||
int mcb_mpool_num_segments;
|
|
||||||
|
|
||||||
/** Pointer to my barrier control pages (odd index pages are
|
/** Pointer to my barrier control pages (odd index pages are
|
||||||
"in", even index pages are "out") */
|
"in", even index pages are "out") */
|
||||||
@ -246,6 +258,9 @@ extern "C" {
|
|||||||
of barrier buffers to use). */
|
of barrier buffers to use). */
|
||||||
int mcb_barrier_count;
|
int mcb_barrier_count;
|
||||||
|
|
||||||
|
/** "In use" flags indicating which segments are available */
|
||||||
|
mca_coll_sm_in_use_flag_t *mcb_in_use_flags;
|
||||||
|
|
||||||
/** Array of indexes into the mpool area for control and data
|
/** Array of indexes into the mpool area for control and data
|
||||||
fragment passing (containing pointers to each segments
|
fragment passing (containing pointers to each segments
|
||||||
control and data areas). */
|
control and data areas). */
|
||||||
@ -256,7 +271,7 @@ extern "C" {
|
|||||||
mca_coll_sm_tree_node_t *mcb_tree;
|
mca_coll_sm_tree_node_t *mcb_tree;
|
||||||
|
|
||||||
/** Operation number (i.e., which segment number to use) */
|
/** Operation number (i.e., which segment number to use) */
|
||||||
int mcb_operation_count;
|
uint32_t mcb_operation_count;
|
||||||
};
|
};
|
||||||
/**
|
/**
|
||||||
* Convenience typedef
|
* Convenience typedef
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
*
|
*
|
||||||
* $HEADER$
|
* $HEADER$
|
||||||
*/
|
*/
|
||||||
|
/** @file */
|
||||||
|
|
||||||
#include "ompi_config.h"
|
#include "ompi_config.h"
|
||||||
|
|
||||||
@ -20,17 +21,20 @@
|
|||||||
#include "coll_sm.h"
|
#include "coll_sm.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* allreduce_intra
|
* Shared memory allreduce.
|
||||||
*
|
*
|
||||||
* Function: - allreduce using other MPI collectives
|
* For the moment, all we're doing is a reduce to root==0 and then a
|
||||||
* Accepts: - same as MPI_Allreduce()
|
* broadcast. It is possible that we'll do something better someday.
|
||||||
* Returns: - MPI_SUCCESS or error code
|
|
||||||
*/
|
*/
|
||||||
int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count,
|
int mca_coll_sm_allreduce_intra(void *sbuf, void *rbuf, int count,
|
||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm)
|
struct ompi_communicator_t *comm)
|
||||||
{
|
{
|
||||||
return OMPI_ERR_NOT_IMPLEMENTED;
|
int ret;
|
||||||
|
|
||||||
|
ret = mca_coll_sm_reduce_intra(sbuf, rbuf, count, dtype, op, 0, comm);
|
||||||
|
return (ret == OMPI_SUCCESS) ?
|
||||||
|
mca_coll_sm_bcast_intra(rbuf, count, dtype, 0, comm) : ret;
|
||||||
}
|
}
|
||||||
|
@ -48,119 +48,181 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
|||||||
struct iovec iov;
|
struct iovec iov;
|
||||||
uint32_t iov_size = 1;
|
uint32_t iov_size = 1;
|
||||||
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
||||||
int i, ret, rank, vrank, size, segment, num_children;
|
int i, ret, rank, size, num_children;
|
||||||
|
int flag_num, segment_num, max_segment_num;
|
||||||
int parent_rank, child_rank;
|
int parent_rank, child_rank;
|
||||||
size_t total_size, max_data, bytes;
|
size_t total_size, max_data, bytes;
|
||||||
uint32_t *my_control, *parent_control;
|
volatile uint32_t *my_control;
|
||||||
ompi_convertor_t send_convertor;
|
mca_coll_sm_in_use_flag_t *flag;
|
||||||
ompi_convertor_t recv_convertor;
|
ompi_convertor_t convertor;
|
||||||
mca_coll_sm_tree_node_t *me, *parent, **children;
|
mca_coll_sm_tree_node_t *me, *parent, **children;
|
||||||
int32_t bogus_free_after;
|
int32_t bogus_free_after = 0;
|
||||||
|
|
||||||
/* Setup some identities */
|
/* Setup some identities */
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
vrank = (rank + size - root) % size;
|
|
||||||
|
|
||||||
me = &data->mcb_tree[vrank];
|
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
|
||||||
D(("rank %d: virtual rank %d\n", rank, vrank));
|
iov.iov_len = mca_coll_sm_component.sm_fragment_size;
|
||||||
|
bytes = 0;
|
||||||
|
|
||||||
|
me = &data->mcb_tree[(rank + size - root) % size];
|
||||||
|
D(("rank %d: virtual rank %d\n", rank, me - data->mcb_tree));
|
||||||
parent = me->mcstn_parent;
|
parent = me->mcstn_parent;
|
||||||
children = me->mcstn_children;
|
children = me->mcstn_children;
|
||||||
num_children = me->mcstn_num_children;
|
num_children = me->mcstn_num_children;
|
||||||
|
|
||||||
/* If I'm the root, I need a send convertor to pack from the
|
/* Only have one top-level decision as to whether I'm the root or
|
||||||
user's buffer to shared memory */
|
not. Do this at the slight expense of repeating a little logic
|
||||||
|
-- but it's better than a conditional branch in every loop
|
||||||
|
iteration. */
|
||||||
|
|
||||||
|
/*********************************************************************
|
||||||
|
* Root
|
||||||
|
*********************************************************************/
|
||||||
|
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
OBJ_CONSTRUCT(&send_convertor, ompi_convertor_t);
|
|
||||||
|
/* The root needs a send convertor to pack from the user's
|
||||||
|
buffer to shared memory */
|
||||||
|
|
||||||
if (OMPI_SUCCESS !=
|
if (OMPI_SUCCESS !=
|
||||||
(ret =
|
(ret =
|
||||||
ompi_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
|
ompi_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
|
||||||
datatype,
|
datatype,
|
||||||
count,
|
count,
|
||||||
buff,
|
buff,
|
||||||
&send_convertor))) {
|
&convertor))) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
ompi_convertor_get_packed_size(&send_convertor, &total_size);
|
ompi_convertor_get_packed_size(&convertor, &total_size);
|
||||||
D(("root got send convertor w/ total_size == %lu\n",
|
D(("root got send convertor w/ total_size == %lu\n",
|
||||||
(unsigned long) total_size));
|
(unsigned long) total_size));
|
||||||
|
|
||||||
|
/* Main loop over sending fragments */
|
||||||
|
|
||||||
|
do {
|
||||||
|
flag_num = (data->mcb_operation_count++ %
|
||||||
|
mca_coll_sm_component.sm_comm_num_in_use_flags);
|
||||||
|
|
||||||
|
/* Wait for the set of segments to become available */
|
||||||
|
flag = (mca_coll_sm_in_use_flag_t*)
|
||||||
|
(((char *) data->mcb_in_use_flags) +
|
||||||
|
(flag_num * mca_coll_sm_component.sm_control_size));
|
||||||
|
D(("root waiting for in_use flag %d (value %d), %p\n",
|
||||||
|
flag_num, flag->mcsiuf_num_procs_using, flag));
|
||||||
|
while (0 != flag->mcsiuf_num_procs_using) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
D(("root got in_use flag %d (value %d), %p\n",
|
||||||
|
flag_num, flag->mcsiuf_num_procs_using, flag));
|
||||||
|
|
||||||
|
/* Now that the set of segments is availble, mark it as
|
||||||
|
used. No need to include the root in the count (we'd
|
||||||
|
only have to decrement it later). Don't need a write
|
||||||
|
barrier here -- we have another later that will
|
||||||
|
guarantee that the write has completed, if
|
||||||
|
necessary. */
|
||||||
|
|
||||||
|
flag->mcsiuf_num_procs_using = size - 1;
|
||||||
|
flag->mcsiuf_operation_count = data->mcb_operation_count - 1;
|
||||||
|
|
||||||
|
/* Loop over all the segments in this set */
|
||||||
|
|
||||||
|
segment_num = flag_num *
|
||||||
|
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||||
|
max_segment_num = (flag_num + 1) *
|
||||||
|
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||||
|
do {
|
||||||
|
|
||||||
|
/* Copy the fragment from the user buffer to my fragment
|
||||||
|
in the current segment */
|
||||||
|
iov.iov_base =
|
||||||
|
data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||||
|
(rank * mca_coll_sm_component.sm_fragment_size);
|
||||||
|
max_data = iov.iov_len;
|
||||||
|
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
|
||||||
|
(unsigned long) iov.iov_len, segment_num, iov.iov_base));
|
||||||
|
ompi_convertor_pack(&convertor, &iov, &iov_size,
|
||||||
|
&max_data, &bogus_free_after);
|
||||||
|
bytes += max_data;
|
||||||
|
|
||||||
|
/* Wait for the write to absolutely complete */
|
||||||
|
opal_atomic_wmb();
|
||||||
|
|
||||||
|
/* Tell my children that this fragment is ready (be
|
||||||
|
sure to normalize the child's ID based on the shift
|
||||||
|
we did above to calculate the "me" node in the
|
||||||
|
tree) */
|
||||||
|
for (i = 0; i < num_children; ++i) {
|
||||||
|
child_rank = (children[i]->mcstn_id + root) % size;
|
||||||
|
*((size_t*)
|
||||||
|
(((char*)
|
||||||
|
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||||
|
(mca_coll_sm_component.sm_control_size *
|
||||||
|
child_rank))) = max_data;
|
||||||
|
D(("root sent notice to child %d (vrank %d), control to %p\n",
|
||||||
|
i, children[i]->mcstn_id,
|
||||||
|
(((char*)
|
||||||
|
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||||
|
(mca_coll_sm_component.sm_control_size *
|
||||||
|
child_rank))));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If I'm not the root, I need a receive convertor to unpack from
|
++segment_num;
|
||||||
shared mmory to the user's buffer */
|
} while (bytes < total_size && segment_num < max_segment_num);
|
||||||
|
} while (bytes < total_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*********************************************************************
|
||||||
|
* Non-root
|
||||||
|
*********************************************************************/
|
||||||
|
|
||||||
else {
|
else {
|
||||||
OBJ_CONSTRUCT(&recv_convertor, ompi_convertor_t);
|
|
||||||
|
/* Non-root processes need a receive convertor to unpack from
|
||||||
|
shared mmory to the user's buffer */
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
|
||||||
if (OMPI_SUCCESS !=
|
if (OMPI_SUCCESS !=
|
||||||
(ret =
|
(ret =
|
||||||
ompi_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
|
ompi_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
|
||||||
datatype,
|
datatype,
|
||||||
count,
|
count,
|
||||||
buff,
|
buff,
|
||||||
&recv_convertor))) {
|
&convertor))) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
ompi_convertor_get_packed_size(&recv_convertor, &total_size);
|
ompi_convertor_get_packed_size(&convertor, &total_size);
|
||||||
D(("rank %d got recv convertor w/ total_size == %lu\n",
|
D(("rank %d got recv convertor w/ total_size == %lu\n",
|
||||||
rank, (unsigned long) total_size));
|
rank, (unsigned long) total_size));
|
||||||
}
|
|
||||||
|
|
||||||
/* Setup some more constants */
|
/* Loop over receiving (and possibly re-sending) the
|
||||||
|
fragments */
|
||||||
iov.iov_len = mca_coll_sm_component.sm_fragment_size;
|
|
||||||
bytes = 0;
|
|
||||||
|
|
||||||
/* Loop over the fragments */
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
segment = (data->mcb_operation_count++ %
|
flag_num = (data->mcb_operation_count %
|
||||||
data->mcb_mpool_num_segments);
|
mca_coll_sm_component.sm_comm_num_in_use_flags);
|
||||||
|
|
||||||
/* Root */
|
/* Wait for the root to mark this set of segments as
|
||||||
|
ours */
|
||||||
if (root == rank) {
|
flag = (mca_coll_sm_in_use_flag_t*)
|
||||||
|
(((char *) data->mcb_in_use_flags) +
|
||||||
/* Wait for the segment to become available */
|
(flag_num * mca_coll_sm_component.sm_control_size));
|
||||||
/* JMS */
|
D(("rank %d waiting for root to claim in-use flag %d, %p (op count %d)\n", rank, flag_num, flag, data->mcb_operation_count));
|
||||||
|
while (data->mcb_operation_count != flag->mcsiuf_operation_count) {
|
||||||
/* Copy the fragment from the user buffer to my fragment
|
continue;
|
||||||
in the current segment */
|
|
||||||
iov.iov_base =
|
|
||||||
data->mcb_mpool_index[segment].mcbmi_data +
|
|
||||||
(rank * mca_coll_sm_component.sm_fragment_size);
|
|
||||||
max_data = iov.iov_len;
|
|
||||||
D(("root copying %lu bytes to data fan out, seg %d: %p\n",
|
|
||||||
(unsigned long) iov.iov_len, segment, iov.iov_base));
|
|
||||||
ompi_convertor_pack(&send_convertor, &iov, &iov_size,
|
|
||||||
&max_data, &bogus_free_after);
|
|
||||||
|
|
||||||
/* Wait for the write to absolutely complete */
|
|
||||||
opal_atomic_wmb();
|
|
||||||
|
|
||||||
/* Tell my children that this fragment is ready (be sure
|
|
||||||
to normalize the child's ID based on the shift we did
|
|
||||||
above to calculate the "me" node in the tree) */
|
|
||||||
for (i = 0; i < num_children; ++i) {
|
|
||||||
child_rank = (children[i]->mcstn_id + root) % size;
|
|
||||||
*((size_t*)
|
|
||||||
(((char*)
|
|
||||||
data->mcb_mpool_index[segment].mcbmi_control) +
|
|
||||||
(mca_coll_sm_component.sm_control_size *
|
|
||||||
child_rank))) = max_data;
|
|
||||||
D(("root sent notice to child %d (vrank %d), control to %p\n",
|
|
||||||
i, children[i]->mcstn_id,
|
|
||||||
(((char*)
|
|
||||||
data->mcb_mpool_index[segment].mcbmi_control) +
|
|
||||||
(mca_coll_sm_component.sm_control_size *
|
|
||||||
child_rank))));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
++data->mcb_operation_count;
|
||||||
|
|
||||||
/* Non-root */
|
/* Loop over all the segments in this set */
|
||||||
|
|
||||||
else {
|
segment_num = flag_num *
|
||||||
|
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||||
|
max_segment_num = (flag_num + 1) *
|
||||||
|
mca_coll_sm_component.sm_comm_num_in_use_flags;
|
||||||
|
do {
|
||||||
|
|
||||||
/* Pre-calculate some pointers */
|
/* Pre-calculate some pointers */
|
||||||
|
|
||||||
@ -168,44 +230,39 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
|||||||
D(("rank %d parent rank is %d\n", rank, parent_rank));
|
D(("rank %d parent rank is %d\n", rank, parent_rank));
|
||||||
my_control = (uint32_t *)
|
my_control = (uint32_t *)
|
||||||
(((char*)
|
(((char*)
|
||||||
data->mcb_mpool_index[segment].mcbmi_control) +
|
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||||
(rank * mca_coll_sm_component.sm_control_size));
|
(rank * mca_coll_sm_component.sm_control_size));
|
||||||
parent_control = (uint32_t *)
|
|
||||||
(((char*)
|
|
||||||
data->mcb_mpool_index[segment].mcbmi_control) +
|
|
||||||
(parent_rank * mca_coll_sm_component.sm_control_size));
|
|
||||||
|
|
||||||
/* Wait for the fragment: the parent will mark the segment
|
/* Wait for the fragment: the parent will mark the segment
|
||||||
as ready */
|
as ready */
|
||||||
D(("rank %d waiting for fragment in segment %d (control %p)\n",
|
D(("rank %d waiting for fragment in segment %d (control %p)\n",
|
||||||
rank, segment, (char*) my_control));
|
rank, segment_num, (char*) my_control));
|
||||||
while (0 == *my_control) {
|
while (0 == *my_control) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
max_data = *my_control;
|
max_data = *my_control;
|
||||||
D(("rank %d: fragment ready in segment %d\n", rank, segment));
|
D(("rank %d: fragment ready in segment %d\n", rank, segment_num));
|
||||||
|
|
||||||
/* If I have children, send the data to them */
|
/* If I have children, send the data to them */
|
||||||
if (num_children > 0) {
|
if (num_children > 0) {
|
||||||
max_data = iov.iov_len;
|
/* No need to wait for the segment to become
|
||||||
|
available -- the root has already claimed it
|
||||||
/* No need to wait for the segment to become available
|
and we're all already using it. So copy the
|
||||||
-- the root has already claimed it and we're all
|
fragment from the parent's portion in the
|
||||||
already using it. So copy the fragment from the
|
segment to my portion in the segment. This is
|
||||||
parent's portion in the segment to my portion in
|
a simply memcpy because it's already been
|
||||||
the segment. This is a simply memcpy because it's
|
packed into the parent's segment. */
|
||||||
already been packed into the parent's segment. */
|
|
||||||
memcpy(/* my data fan out section in the segment */
|
memcpy(/* my data fan out section in the segment */
|
||||||
(data->mcb_mpool_index[segment].mcbmi_data +
|
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||||
(rank * mca_coll_sm_component.sm_fragment_size)),
|
(rank * mca_coll_sm_component.sm_fragment_size)),
|
||||||
/* parent's fan out section in the segment */
|
/* parent's fan out section in the segment */
|
||||||
(data->mcb_mpool_index[segment].mcbmi_data +
|
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||||
(parent_rank *
|
(parent_rank *
|
||||||
mca_coll_sm_component.sm_fragment_size)),
|
mca_coll_sm_component.sm_fragment_size)),
|
||||||
/* length */
|
/* length */
|
||||||
*my_control);
|
*my_control);
|
||||||
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
|
D(("rank %d memcopy'ed fragment (%p) to my data fan out (%lu bytes)\n", rank,
|
||||||
(data->mcb_mpool_index[segment].mcbmi_data +
|
(data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||||
(parent_rank * mca_coll_sm_component.sm_fragment_size)),
|
(parent_rank * mca_coll_sm_component.sm_fragment_size)),
|
||||||
(unsigned long) *my_control));
|
(unsigned long) *my_control));
|
||||||
|
|
||||||
@ -217,20 +274,20 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
|||||||
child_rank = (children[i]->mcstn_id + root) % size;
|
child_rank = (children[i]->mcstn_id + root) % size;
|
||||||
*((size_t*)
|
*((size_t*)
|
||||||
(((char*)
|
(((char*)
|
||||||
data->mcb_mpool_index[segment].mcbmi_control) +
|
data->mcb_mpool_index[segment_num].mcbmi_control) +
|
||||||
(mca_coll_sm_component.sm_control_size *
|
(mca_coll_sm_component.sm_control_size *
|
||||||
child_rank))) = *my_control;
|
child_rank))) = *my_control;
|
||||||
D(("rank %d notifying child %d (vrank %d, rank %d)\n",
|
D(("rank %d notifying child %d (vrank %d, rank %d)\n",
|
||||||
rank, i, children[i]->mcstn_id, child_rank));
|
rank, i, children[i]->mcstn_id, child_rank));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set the "copy from buffer" to be my local segment
|
/* Set the "copy from buffer" to be my local
|
||||||
buffer so that we don't potentially incur a
|
segment buffer so that we don't potentially
|
||||||
non-local memory copy from the parent's fan out
|
incur a non-local memory copy from the parent's
|
||||||
data segment [again] when copying to the user's
|
fan out data segment [again] when copying to
|
||||||
buffer */
|
the user's buffer */
|
||||||
iov.iov_base =
|
iov.iov_base =
|
||||||
data->mcb_mpool_index[segment].mcbmi_data +
|
data->mcb_mpool_index[segment_num].mcbmi_data +
|
||||||
(rank * mca_coll_sm_component.sm_fragment_size);
|
(rank * mca_coll_sm_component.sm_fragment_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -241,31 +298,40 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
|||||||
else {
|
else {
|
||||||
iov.iov_base =
|
iov.iov_base =
|
||||||
(((char*)
|
(((char*)
|
||||||
data->mcb_mpool_index[segment].mcbmi_data) +
|
data->mcb_mpool_index[segment_num].mcbmi_data) +
|
||||||
(parent_rank * mca_coll_sm_component.sm_fragment_size));
|
(parent_rank *
|
||||||
|
mca_coll_sm_component.sm_fragment_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Copy to my output buffer */
|
/* Copy to my output buffer */
|
||||||
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
|
D(("rank %d convertor copied from parent data %p to user buffer (%lu bytes)\n",
|
||||||
rank, iov.iov_base, (unsigned long) iov.iov_len));
|
rank, iov.iov_base, (unsigned long) iov.iov_len));
|
||||||
ompi_convertor_unpack(&recv_convertor, &iov, &iov_size,
|
ompi_convertor_unpack(&convertor, &iov, &iov_size,
|
||||||
&max_data, &bogus_free_after);
|
&max_data, &bogus_free_after);
|
||||||
}
|
|
||||||
|
|
||||||
/* It's ok to only look at the max_data from the last
|
*my_control = 0;
|
||||||
operation because it will be the same value for all of
|
|
||||||
them */
|
|
||||||
bytes += max_data;
|
bytes += max_data;
|
||||||
} while (bytes < total_size);
|
++segment_num;
|
||||||
|
} while (bytes < total_size && segment_num < max_segment_num);
|
||||||
|
|
||||||
if (root == rank) {
|
/* Wait for all copy-out writes to complete before I say
|
||||||
OBJ_DESTRUCT(&send_convertor);
|
I'm done with the segments */
|
||||||
} else {
|
opal_atomic_wmb();
|
||||||
OBJ_DESTRUCT(&recv_convertor);
|
|
||||||
|
/* We're finished with this set of segments */
|
||||||
|
|
||||||
|
D(("rank %d done with in-use flag %d (value %d)\n",
|
||||||
|
rank, flag_num, flag->mcsiuf_num_procs_using));
|
||||||
|
opal_atomic_add(&flag->mcsiuf_num_procs_using, -1);
|
||||||
|
} while (bytes < total_size);
|
||||||
}
|
}
|
||||||
D(("rank %d done with bcast\n", rank));
|
|
||||||
|
/* Kill the convertor */
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&convertor);
|
||||||
|
|
||||||
/* All done */
|
/* All done */
|
||||||
|
|
||||||
|
D(("rank %d done with bcast\n", rank));
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -173,7 +173,7 @@ static int sm_open(void)
|
|||||||
&cs->sm_bootstrap_num_segments);
|
&cs->sm_bootstrap_num_segments);
|
||||||
|
|
||||||
mca_base_param_reg_int(c, "fragment_size",
|
mca_base_param_reg_int(c, "fragment_size",
|
||||||
"Fragment size (in bytes) used for passing data through shared memory (bytes, will be rounded up to the nearest control_size size)",
|
"Fragment size (in bytes) used for passing data through shared memory (will be rounded up to the nearest control_size size)",
|
||||||
false, false,
|
false, false,
|
||||||
cs->sm_fragment_size,
|
cs->sm_fragment_size,
|
||||||
&cs->sm_fragment_size);
|
&cs->sm_fragment_size);
|
||||||
@ -202,8 +202,8 @@ static int sm_open(void)
|
|||||||
false, false,
|
false, false,
|
||||||
cs->sm_comm_num_segments,
|
cs->sm_comm_num_segments,
|
||||||
&cs->sm_comm_num_segments);
|
&cs->sm_comm_num_segments);
|
||||||
if (cs->sm_comm_num_segments < 2) {
|
if (cs->sm_comm_num_segments < cs->sm_comm_num_in_use_flags) {
|
||||||
cs->sm_comm_num_segments = 2;
|
cs->sm_comm_num_segments = cs->sm_comm_num_in_use_flags;
|
||||||
}
|
}
|
||||||
if (0 != (cs->sm_comm_num_segments % cs->sm_comm_num_in_use_flags)) {
|
if (0 != (cs->sm_comm_num_segments % cs->sm_comm_num_in_use_flags)) {
|
||||||
cs->sm_comm_num_segments += cs->sm_comm_num_in_use_flags -
|
cs->sm_comm_num_segments += cs->sm_comm_num_in_use_flags -
|
||||||
@ -240,26 +240,8 @@ static int sm_open(void)
|
|||||||
false, true,
|
false, true,
|
||||||
size1, NULL);
|
size1, NULL);
|
||||||
|
|
||||||
/* Calculate how much space we need in the data mpool. There are
|
/* Calculate how much space we need in the data mpool. This
|
||||||
several values to add (one of these for each segment):
|
formula taken directly from coll_sm_module.c. */
|
||||||
|
|
||||||
- size of the control data:
|
|
||||||
- fan-in data (num_procs * control_size size)
|
|
||||||
- fan-out data (num_procs * control_size size)
|
|
||||||
- size of message data
|
|
||||||
- fan-in data (num_procs * (frag_size rounded up to
|
|
||||||
control_size size))
|
|
||||||
- fan-out data (num_procs * (frag_size rounded up
|
|
||||||
to control_size size))
|
|
||||||
|
|
||||||
So it's:
|
|
||||||
|
|
||||||
num_segs * ((num_procs * control_size * 2) + (num_procs * frag * 2))
|
|
||||||
|
|
||||||
Which reduces to:
|
|
||||||
|
|
||||||
num_segs * num_procs * 2 * (control_size + frag)
|
|
||||||
*/
|
|
||||||
|
|
||||||
mca_base_param_reg_int(c, "info_num_procs",
|
mca_base_param_reg_int(c, "info_num_procs",
|
||||||
"Number of processes to use for the calculation of the shared_mem_size MCA information parameter (must be => 2)",
|
"Number of processes to use for the calculation of the shared_mem_size MCA information parameter (must be => 2)",
|
||||||
@ -267,10 +249,12 @@ static int sm_open(void)
|
|||||||
cs->sm_info_comm_size,
|
cs->sm_info_comm_size,
|
||||||
&cs->sm_info_comm_size);
|
&cs->sm_info_comm_size);
|
||||||
|
|
||||||
size2 = cs->sm_comm_num_segments * cs->sm_info_comm_size *
|
size2 = 4 * cs->sm_control_size +
|
||||||
(cs->sm_control_size + cs->sm_fragment_size);
|
(cs->sm_comm_num_in_use_flags * cs->sm_control_size) +
|
||||||
|
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_control_size * 2)) +
|
||||||
|
(cs->sm_comm_num_segments * (cs->sm_info_comm_size * cs->sm_fragment_size));
|
||||||
mca_base_param_reg_int(c, "shared_mem_used_data",
|
mca_base_param_reg_int(c, "shared_mem_used_data",
|
||||||
"Amount of shared memory used in the shared memory data area for one process (in bytes)",
|
"Amount of shared memory used in the shared memory data area for info_num_procs processes (in bytes)",
|
||||||
false, true,
|
false, true,
|
||||||
size2, NULL);
|
size2, NULL);
|
||||||
|
|
||||||
|
@ -16,6 +16,11 @@
|
|||||||
/**
|
/**
|
||||||
* @file
|
* @file
|
||||||
*
|
*
|
||||||
|
* Warning: this is not for the faint of heart -- don't even bother
|
||||||
|
* reading this source code if you don't have a strong understanding
|
||||||
|
* of nested data structures and pointer math (remeber that
|
||||||
|
* associativity and order of C operations is *critical* in terms of
|
||||||
|
* pointer math!).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "ompi_config.h"
|
#include "ompi_config.h"
|
||||||
@ -70,7 +75,7 @@ static const mca_coll_base_module_1_0_0_t module = {
|
|||||||
|
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
mca_coll_sm_allreduce_intra,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
@ -79,7 +84,7 @@ static const mca_coll_base_module_1_0_0_t module = {
|
|||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
mca_coll_sm_reduce_intra,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
@ -199,7 +204,7 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
mca_coll_base_comm_t *data;
|
mca_coll_base_comm_t *data;
|
||||||
size_t control_size, frag_size;
|
size_t control_size, frag_size;
|
||||||
mca_coll_sm_component_t *c = &mca_coll_sm_component;
|
mca_coll_sm_component_t *c = &mca_coll_sm_component;
|
||||||
opal_maffinity_base_segment_t *segments;
|
opal_maffinity_base_segment_t *maffinity;
|
||||||
int parent, min_child, max_child, num_children;
|
int parent, min_child, max_child, num_children;
|
||||||
char *base;
|
char *base;
|
||||||
const int num_barrier_buffers = 2;
|
const int num_barrier_buffers = 2;
|
||||||
@ -207,9 +212,9 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
/* Get some space to setup memory affinity (just easier to try to
|
/* Get some space to setup memory affinity (just easier to try to
|
||||||
alloc here to handle the error case) */
|
alloc here to handle the error case) */
|
||||||
|
|
||||||
segments = malloc(sizeof(opal_maffinity_base_segment_t) *
|
maffinity = malloc(sizeof(opal_maffinity_base_segment_t) *
|
||||||
c->sm_bootstrap_num_segments * 5);
|
c->sm_comm_num_segments * 3);
|
||||||
if (NULL == segments) {
|
if (NULL == maffinity) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -334,13 +339,37 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
}
|
}
|
||||||
data->mcb_barrier_count = 0;
|
data->mcb_barrier_count = 0;
|
||||||
|
|
||||||
/* Next, setup the mca_coll_base_mpool_index_t pointers to point
|
/* Next, setup the pointer to the in-use flags. The number of
|
||||||
to the appropriate places in the mpool. */
|
segments will be an even multiple of the number of in-use
|
||||||
|
flags. */
|
||||||
|
|
||||||
|
base += (c->sm_control_size * size * num_barrier_buffers * 2);
|
||||||
|
data->mcb_in_use_flags = (mca_coll_sm_in_use_flag_t*) base;
|
||||||
|
|
||||||
|
/* All things being equal, if we're rank 0, then make the in-use
|
||||||
|
flags be local (memory affinity). Then zero them all out so
|
||||||
|
that they're marked as unused. */
|
||||||
|
|
||||||
|
j = 0;
|
||||||
|
if (0 == rank) {
|
||||||
|
maffinity[j].mbs_start_addr = base;
|
||||||
|
maffinity[j].mbs_len = c->sm_control_size *
|
||||||
|
c->sm_comm_num_in_use_flags;
|
||||||
|
memset(maffinity[j].mbs_start_addr, 0, maffinity[j].mbs_len);
|
||||||
|
D(("rank 0 zeroed in-use flags (num %d, len %d): %p - %p\n",
|
||||||
|
c->sm_comm_num_in_use_flags,
|
||||||
|
maffinity[j].mbs_len,
|
||||||
|
base, base + maffinity[j].mbs_len));
|
||||||
|
++j;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Next, setup pointers to the control and data portions of the
|
||||||
|
segments, as well as to the relevant in-use flags. */
|
||||||
|
|
||||||
|
base += (c->sm_comm_num_in_use_flags * c->sm_control_size);
|
||||||
control_size = size * c->sm_control_size;
|
control_size = size * c->sm_control_size;
|
||||||
frag_size = size * c->sm_fragment_size;
|
frag_size = size * c->sm_fragment_size;
|
||||||
base += (control_size * num_barrier_buffers * 2);
|
for (i = 0; i < c->sm_comm_num_segments; ++i) {
|
||||||
for (j = i = 0; i < data->mcb_mpool_num_segments; ++i) {
|
|
||||||
data->mcb_mpool_index[i].mcbmi_control = (uint32_t*)
|
data->mcb_mpool_index[i].mcbmi_control = (uint32_t*)
|
||||||
(base + (i * (control_size + frag_size)));
|
(base + (i * (control_size + frag_size)));
|
||||||
data->mcb_mpool_index[i].mcbmi_data =
|
data->mcb_mpool_index[i].mcbmi_data =
|
||||||
@ -349,16 +378,16 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
|
|
||||||
/* Memory affinity: control */
|
/* Memory affinity: control */
|
||||||
|
|
||||||
segments[j].mbs_len = c->sm_control_size;
|
maffinity[j].mbs_len = c->sm_control_size;
|
||||||
segments[j].mbs_start_addr =
|
maffinity[j].mbs_start_addr =
|
||||||
data->mcb_mpool_index[i].mcbmi_control +
|
data->mcb_mpool_index[i].mcbmi_control +
|
||||||
(rank * c->sm_control_size);
|
(rank * c->sm_control_size);
|
||||||
++j;
|
++j;
|
||||||
|
|
||||||
/* Memory affinity: data */
|
/* Memory affinity: data */
|
||||||
|
|
||||||
segments[j].mbs_len = c->sm_fragment_size;
|
maffinity[j].mbs_len = c->sm_fragment_size;
|
||||||
segments[j].mbs_start_addr =
|
maffinity[j].mbs_start_addr =
|
||||||
data->mcb_mpool_index[i].mcbmi_data +
|
data->mcb_mpool_index[i].mcbmi_data +
|
||||||
(rank * c->sm_control_size);
|
(rank * c->sm_control_size);
|
||||||
++j;
|
++j;
|
||||||
@ -367,14 +396,14 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
/* Setup memory affinity so that the pages that belong to this
|
/* Setup memory affinity so that the pages that belong to this
|
||||||
process are local to this process */
|
process are local to this process */
|
||||||
|
|
||||||
opal_maffinity_base_set(segments, j);
|
opal_maffinity_base_set(maffinity, j);
|
||||||
free(segments);
|
free(maffinity);
|
||||||
|
|
||||||
/* Zero out the control structures that belong to this process */
|
/* Zero out the control structures that belong to this process */
|
||||||
|
|
||||||
memset(data->mcb_barrier_control_me, 0,
|
memset(data->mcb_barrier_control_me, 0,
|
||||||
num_barrier_buffers * 2 * c->sm_control_size);
|
num_barrier_buffers * 2 * c->sm_control_size);
|
||||||
for (i = 0; i < data->mcb_mpool_num_segments; ++i) {
|
for (i = 0; i < c->sm_comm_num_segments; ++i) {
|
||||||
memset(data->mcb_mpool_index[i].mcbmi_control, 0,
|
memset(data->mcb_mpool_index[i].mcbmi_control, 0,
|
||||||
c->sm_control_size);
|
c->sm_control_size);
|
||||||
}
|
}
|
||||||
@ -476,8 +505,6 @@ static int bootstrap_init(void)
|
|||||||
opal_atomic_lock(&bshe->super.seg_lock);
|
opal_atomic_lock(&bshe->super.seg_lock);
|
||||||
opal_atomic_wmb();
|
opal_atomic_wmb();
|
||||||
if (!bshe->super.seg_inited) {
|
if (!bshe->super.seg_inited) {
|
||||||
bshe->smbhe_num_segments =
|
|
||||||
mca_coll_sm_component.sm_bootstrap_num_segments;
|
|
||||||
bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *)
|
bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *)
|
||||||
(((char *) bshe) +
|
(((char *) bshe) +
|
||||||
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
|
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
|
||||||
@ -485,7 +512,7 @@ static int bootstrap_init(void)
|
|||||||
mca_coll_sm_component.sm_bootstrap_num_segments));
|
mca_coll_sm_component.sm_bootstrap_num_segments));
|
||||||
bshe->smbhe_cids = (uint32_t *)
|
bshe->smbhe_cids = (uint32_t *)
|
||||||
(((char *) bshe) + sizeof(*bshe));
|
(((char *) bshe) + sizeof(*bshe));
|
||||||
for (i = 0; i < bshe->smbhe_num_segments; ++i) {
|
for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
|
||||||
bshe->smbhe_cids[i] = INT_MAX;
|
bshe->smbhe_cids[i] = INT_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -510,6 +537,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
||||||
int comm_size = ompi_comm_size(comm);
|
int comm_size = ompi_comm_size(comm);
|
||||||
int num_segments = c->sm_comm_num_segments;
|
int num_segments = c->sm_comm_num_segments;
|
||||||
|
int num_in_use = c->sm_comm_num_in_use_flags;
|
||||||
int frag_size = c->sm_fragment_size;
|
int frag_size = c->sm_fragment_size;
|
||||||
int control_size = c->sm_control_size;
|
int control_size = c->sm_control_size;
|
||||||
|
|
||||||
@ -525,7 +553,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
opal_atomic_wmb();
|
opal_atomic_wmb();
|
||||||
found = false;
|
found = false;
|
||||||
empty_index = -1;
|
empty_index = -1;
|
||||||
for (i = 0; i < bshe->smbhe_num_segments; ++i) {
|
for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
|
||||||
if (comm->c_contextid == bshe->smbhe_cids[i]) {
|
if (comm->c_contextid == bshe->smbhe_cids[i]) {
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
@ -552,7 +580,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
i = empty_index;
|
i = empty_index;
|
||||||
bshe->smbhe_cids[i] = comm->c_contextid;
|
bshe->smbhe_cids[i] = comm->c_contextid;
|
||||||
|
|
||||||
bscs[i].smbcs_comm_num_segments = num_segments;
|
|
||||||
bscs[i].smbcs_count = comm_size;
|
bscs[i].smbcs_count = comm_size;
|
||||||
|
|
||||||
/* Calculate how much space we need in the data mpool.
|
/* Calculate how much space we need in the data mpool.
|
||||||
@ -570,12 +597,14 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
So it's:
|
So it's:
|
||||||
|
|
||||||
barrier: 2 * control_size + 2 * control_size
|
barrier: 2 * control_size + 2 * control_size
|
||||||
|
in use: num_in_use * control_size
|
||||||
control: num_segments * (num_procs * control_size * 2 +
|
control: num_segments * (num_procs * control_size * 2 +
|
||||||
num_procs * control_size)
|
num_procs * control_size)
|
||||||
message: num_segments * (num_procs * frag_size)
|
message: num_segments * (num_procs * frag_size)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
size = 4 * c->sm_control_size +
|
size = 4 * control_size +
|
||||||
|
(num_in_use * control_size) +
|
||||||
(num_segments * (comm_size * control_size * 2)) +
|
(num_segments * (comm_size * control_size * 2)) +
|
||||||
(num_segments * (comm_size * frag_size));
|
(num_segments * (comm_size * frag_size));
|
||||||
|
|
||||||
@ -591,11 +620,12 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
component would have been chosen), so we don't need
|
component would have been chosen), so we don't need
|
||||||
to do that cleanup. */
|
to do that cleanup. */
|
||||||
bscs[i].smbcs_data_mpool_offset = 0;
|
bscs[i].smbcs_data_mpool_offset = 0;
|
||||||
bscs[i].smbcs_comm_num_segments = 0;
|
bscs[i].smbcs_success = false;
|
||||||
--bscs[i].smbcs_count;
|
--bscs[i].smbcs_count;
|
||||||
opal_atomic_unlock(&bshe->super.seg_lock);
|
opal_atomic_unlock(&bshe->super.seg_lock);
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
bscs[i].smbcs_success = true;
|
||||||
|
|
||||||
/* Calculate the offset and put it in the bootstrap
|
/* Calculate the offset and put it in the bootstrap
|
||||||
area */
|
area */
|
||||||
@ -624,7 +654,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
|
|
||||||
/* Check to see if there was an error while allocating the shared
|
/* Check to see if there was an error while allocating the shared
|
||||||
memory */
|
memory */
|
||||||
if (0 == bscs[i].smbcs_comm_num_segments) {
|
if (!bscs[i].smbcs_success) {
|
||||||
err = OMPI_ERR_OUT_OF_RESOURCE;
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -636,7 +666,6 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool);
|
data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool);
|
||||||
data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset;
|
data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset;
|
||||||
data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset;
|
data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset;
|
||||||
data->mcb_mpool_num_segments = bscs[i].smbcs_comm_num_segments;
|
|
||||||
data->mcb_operation_count = 0;
|
data->mcb_operation_count = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user