First cut of sm coll component infrastrcutre (this is what took so
much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985.
Этот коммит содержится в:
родитель
5a399e8150
Коммит
31065f1cc0
@ -26,12 +26,64 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/common/sm/common_sm_mmap.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define PUB(foo) mca_coll_sm##foo
|
||||
/**
|
||||
* Structure used within bootstrap mmap file for setting up a coll
|
||||
* sm component on a communicator
|
||||
*/
|
||||
struct mca_coll_sm_bootstrap_comm_setup_t {
|
||||
/** Offset to be used in the data mpool for this comm's
|
||||
collective sm operations -- use this value plus the base
|
||||
of the mpool to obtain the pointer to this comm's
|
||||
mca_coll_sm_mpool_area_t */
|
||||
size_t smbcs_data_mpool_offset;
|
||||
|
||||
/** Number of segments in the data mpool area for this
|
||||
communicator */
|
||||
int smbcs_communicator_num_segments;
|
||||
|
||||
/** Number of processes in this communicator who have seen
|
||||
this value already. */
|
||||
int smbcs_count;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct mca_coll_sm_bootstrap_comm_setup_t
|
||||
mca_coll_sm_bootstrap_comm_setup_t;
|
||||
|
||||
/**
|
||||
* Extension to the control structure in the bootstrap mmap file
|
||||
*/
|
||||
struct mca_coll_sm_bootstrap_header_extension_t {
|
||||
/** upper-level control structure */
|
||||
mca_common_sm_file_header_t super;
|
||||
|
||||
/** Number of segments in the bootstrap mmap file */
|
||||
int smbhe_num_segments;
|
||||
|
||||
/** Pointer to the start of the segments in the bootstrap area
|
||||
(map->seg_data only points to just beyond the
|
||||
mca_common_sm_file_header_t) */
|
||||
mca_coll_sm_bootstrap_comm_setup_t *smbhe_segments;
|
||||
|
||||
/** Pointer to array containing smhe_num_segments CIDs for use
|
||||
in bootstrap phase -- will always point immediately after
|
||||
the end of this struct (i.e., still within this header,
|
||||
but since it's variable size (set by MCA param), we can't
|
||||
just have it here in the struct. Bonk). */
|
||||
uint32_t *smbhe_cids;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct mca_coll_sm_bootstrap_header_extension_t
|
||||
mca_coll_sm_bootstrap_header_extension_t;
|
||||
|
||||
/**
|
||||
* Structure to hold the sm coll component. First it holds the
|
||||
@ -39,33 +91,119 @@ extern "C" {
|
||||
* sm-coll-component-specific stuff (e.g., current MCA param
|
||||
* values).
|
||||
*/
|
||||
typedef struct mca_coll_sm_component_t {
|
||||
struct mca_coll_sm_component_t {
|
||||
/** Base coll component */
|
||||
mca_coll_base_component_1_0_0_t super;
|
||||
|
||||
/** Priority of this component */
|
||||
/** MCA parameter: Priority of this component */
|
||||
int sm_priority;
|
||||
/** Name of the mpool that this component will look for */
|
||||
|
||||
/** MCA parameter: Length of a cache line or page (in bytes) */
|
||||
int sm_control_size;
|
||||
|
||||
/** MCA parameter: Name of shared memory control / bootstrap
|
||||
mmap file */
|
||||
char *sm_bootstrap_filename;
|
||||
|
||||
/** MCA parameter: Number of segments in the bootstrap file
|
||||
(for use with setting up multiple comm's with sm
|
||||
components simultaneously) */
|
||||
int sm_bootstrap_num_segments;
|
||||
|
||||
/** MCA parameter: Name of the mpool that this component will
|
||||
use */
|
||||
char *sm_mpool_name;
|
||||
/** Mpool that will be used */
|
||||
mca_mpool_base_module_t *sm_mpool;
|
||||
} mca_coll_sm_component_t;
|
||||
|
||||
/** MCA parameter: Number of segments for each communicator in
|
||||
the data mpool */
|
||||
int sm_communicator_num_segments;
|
||||
|
||||
/** MCA parameter: Fragment size for data */
|
||||
int sm_fragment_size;
|
||||
|
||||
/** MCA parameter: Degree of tree for tree-based collectives */
|
||||
int sm_tree_degree;
|
||||
|
||||
/** Size of the bootstrap area -- calculated in
|
||||
coll_sm_component.c */
|
||||
size_t sm_bootstrap_size;
|
||||
|
||||
/** Size of the data mpool area -- calculated in
|
||||
coll_sm_component.c */
|
||||
size_t sm_data_mpool_size;
|
||||
|
||||
/** Data mpool that will be used */
|
||||
mca_mpool_base_module_t *sm_data_mpool;
|
||||
|
||||
/** Whether we ended up creating the sm mpool or whether
|
||||
someone else created it and we just found it */
|
||||
bool sm_data_mpool_created;
|
||||
|
||||
/** Meta struct containing information about the bootstrap area */
|
||||
mca_common_sm_mmap_t *sm_bootstrap_meta;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct mca_coll_sm_component_t mca_coll_sm_component_t;
|
||||
|
||||
/**
|
||||
* Structure for sm collective module, per communicator. The
|
||||
* structure mainly stores memory pointers to the specific
|
||||
* poritions in the shared memory area. Each shared memory area is
|
||||
* reserved for special functions. The shared memory is split
|
||||
* between two types of areas. One is control section that stores
|
||||
* shared flags used during synchronization, while other section
|
||||
* is purely used to pass messages from one process to other.
|
||||
* Structure containing pointers to various arrays of data in the
|
||||
* data mpool area (one of these indexes a single segment in the
|
||||
* data mpool). Nothing is hard-coded because all the array
|
||||
* lengths and displacements of the pointers all depend on how
|
||||
* many processes are in the communicator.
|
||||
*/
|
||||
typedef struct mca_coll_base_module_comm_t {
|
||||
struct mca_coll_base_mpool_index_t {
|
||||
/** Pointer to beginning of control fan-in data */
|
||||
char *mcbmi_control_fan_in;
|
||||
/** Pointer to beginning of control fan-out data */
|
||||
char *mcbmi_control_fan_out;
|
||||
/** Pointer to beginning of message data fan-in data */
|
||||
char *mcbmi_data_fan_in;
|
||||
/** Pointer to beginning of message data fan-out data */
|
||||
char *mcbmi_data_fan_out;
|
||||
};
|
||||
typedef struct mca_coll_base_mpool_index_t mca_coll_base_mpool_index_t;
|
||||
|
||||
/* JMS fill in here */
|
||||
int foo;
|
||||
/**
|
||||
* Structure for the sm coll module to hang off the communicator.
|
||||
* Contains communicator-specific information, including pointers
|
||||
* into the data mpool for this comm's sm collective operations
|
||||
* area.
|
||||
*/
|
||||
struct mca_coll_base_comm_t {
|
||||
/** If this process is the one that invoked mpool_alloc() for
|
||||
the data segment, the return value will be in here.
|
||||
Otherwise, it will be NULL (i.e., only the allocating
|
||||
process will call free). */
|
||||
void *mcb_data_mpool_malloc_addr;
|
||||
/** Base of the data mpool */
|
||||
char *mcb_mpool_base;
|
||||
/** Offset into the data mpool where this comm's operations
|
||||
area is */
|
||||
size_t mcb_mpool_offset;
|
||||
/** Pointer in the data mpool to the beginning of this comm's
|
||||
operations area (i.e., mcb_mpool_base +
|
||||
mcb_mpool_offset) */
|
||||
char *mcb_mpool_area;
|
||||
/** Number of segments in this comm's area in the data mpool */
|
||||
int mcb_mpool_num_segments;
|
||||
|
||||
} mca_coll_base_module_comm_t;
|
||||
/** Array of indexes into the mpool area (containing pointers
|
||||
to each segments control and data areas). This array will
|
||||
be located immediately after the instance of this struct
|
||||
in memory (i.e., so this struct and the array that this
|
||||
member points to will be adjacent in memory). */
|
||||
mca_coll_base_mpool_index_t **mcb_mpool_index;
|
||||
|
||||
/** Operation number (i.e., which segment number to use) */
|
||||
int mcb_operation_count;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct mca_coll_base_comm_t mca_coll_base_comm_t;
|
||||
|
||||
|
||||
/**
|
||||
@ -88,10 +226,7 @@ extern "C" {
|
||||
int mca_coll_sm_comm_unquery(struct ompi_communicator_t *comm,
|
||||
struct mca_coll_base_comm_t *data);
|
||||
|
||||
const struct mca_coll_base_module_1_0_0_t *
|
||||
mca_coll_sm_module_init(struct ompi_communicator_t *comm);
|
||||
|
||||
int mca_coll_sm_module_finalize(struct ompi_communicator_t *comm);
|
||||
int mca_coll_sm_bootstrap_finalize(void);
|
||||
|
||||
int mca_coll_sm_allgather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
|
@ -17,8 +17,16 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "ompi/include/constants.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "opal/include/sys/atomic.h"
|
||||
#include "coll_sm.h"
|
||||
|
||||
#if 0
|
||||
#define D(foo) printf foo
|
||||
#else
|
||||
#define D(foo)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* barrier
|
||||
@ -29,5 +37,108 @@
|
||||
*/
|
||||
int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm)
|
||||
{
|
||||
return OMPI_ERR_NOT_IMPLEMENTED;
|
||||
mca_coll_base_comm_t *data = comm->c_coll_selected_data;
|
||||
uint32_t *my_control_in, *my_control_out;
|
||||
uint32_t *parent_control_in;
|
||||
int i, rank, start_rank, parent, num_children, segment;
|
||||
char *control_in, *control_out;
|
||||
|
||||
segment = (++data->mcb_operation_count % data->mcb_mpool_num_segments);
|
||||
control_in = data->mcb_mpool_index[segment]->mcbmi_control_fan_in;
|
||||
control_out = data->mcb_mpool_index[segment]->mcbmi_control_fan_out;
|
||||
|
||||
/* THIS CAN BE PRECOMPUTED */
|
||||
/* Figure out some identities */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
num_children = mca_coll_sm_component.sm_tree_degree;
|
||||
parent = (rank - 1) / mca_coll_sm_component.sm_tree_degree;
|
||||
|
||||
/* Do we have children? If so, how many? */
|
||||
|
||||
if ((rank * num_children) + 1 >= ompi_comm_size(comm)) {
|
||||
/* Leaves */
|
||||
num_children = 0;
|
||||
} else {
|
||||
int min_child = rank * num_children + 1;
|
||||
int max_child = rank * num_children + num_children;
|
||||
if (max_child >= ompi_comm_size(comm)) {
|
||||
max_child = ompi_comm_size(comm) - 1;;
|
||||
}
|
||||
D(("rank %d: min child: %d, max child: %d\n", rank, min_child, max_child));
|
||||
num_children = max_child - min_child + 1;
|
||||
}
|
||||
D(("rank %d: segment %d (opn count: %d), parent %d, num_children = %d\n",
|
||||
rank, segment, data->mcb_operation_count, parent, num_children));
|
||||
fflush(stdout);
|
||||
|
||||
/* Pre-calculate some pointers */
|
||||
|
||||
my_control_in = (uint32_t *)
|
||||
(control_in + (rank * mca_coll_sm_component.sm_control_size));
|
||||
my_control_out = (uint32_t *)
|
||||
(control_out + (rank * mca_coll_sm_component.sm_control_size));
|
||||
*my_control_out = 0;
|
||||
|
||||
if (0 != rank) {
|
||||
parent_control_in = (uint32_t *)
|
||||
(control_in + (parent * mca_coll_sm_component.sm_control_size));
|
||||
} else {
|
||||
parent_control_in = NULL;
|
||||
}
|
||||
|
||||
/* Fan in: wait for my children */
|
||||
|
||||
if (0 != num_children) {
|
||||
D(("rank %d waiting for fan in from %d children...\n", rank, num_children));
|
||||
while (*my_control_in != (uint32_t) num_children) {
|
||||
opal_atomic_wmb();
|
||||
}
|
||||
*my_control_in = 0;
|
||||
D(("rank %d got fan in\n", rank));
|
||||
}
|
||||
|
||||
/* Fan in: send to my parent */
|
||||
|
||||
if (NULL != parent_control_in) {
|
||||
D(("rank %d writing to parent\n", rank));
|
||||
opal_atomic_add(parent_control_in, 1);
|
||||
D(("rank %d wrote to parent: %d\n", rank, *parent_control_in));
|
||||
}
|
||||
|
||||
/* Fan out: wait for my parent to write to me (don't poll on
|
||||
parent's out buffer -- that would cause a lot of network
|
||||
traffic / contention / faults / etc. -- this way, the children
|
||||
poll on local memory and therefore only num_children messages
|
||||
are sent across the network [vs. num_children *each* time all
|
||||
the children poll] -- i.e., the memory is only being polled by
|
||||
one process, and it is only changed *once* by an external
|
||||
process) */
|
||||
|
||||
if (NULL != parent_control_in) {
|
||||
D(("rank %d waiting for fan out from parent\n", rank));
|
||||
while (0 == *my_control_out) {
|
||||
opal_atomic_wmb();
|
||||
}
|
||||
D(("rank %d got fan out from parent\n", rank));
|
||||
}
|
||||
|
||||
/* Fan out: send to my children */
|
||||
|
||||
start_rank = (rank * mca_coll_sm_component.sm_tree_degree) + 1;
|
||||
for (i = 0; i < num_children; ++i) {
|
||||
D(("rank %d writing fan out to child %d, rank %d (start %d, num_children %d)\n", rank, i, start_rank + i, start_rank, num_children));
|
||||
*((uint32_t *)
|
||||
(control_out + ((start_rank + i) * mca_coll_sm_component.sm_control_size))) = 1;
|
||||
|
||||
}
|
||||
D(("rank %d done with barrier\n", rank));
|
||||
|
||||
/* All done! End state of the control segment:
|
||||
|
||||
my_control_in: 0
|
||||
my_control_out: 1
|
||||
*/
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -23,7 +23,9 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "ompi/include/constants.h"
|
||||
#include "mca/coll/coll.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "coll_sm.h"
|
||||
|
||||
|
||||
@ -91,12 +93,38 @@ mca_coll_sm_component_t mca_coll_sm_component = {
|
||||
|
||||
/* sm-component specifc information */
|
||||
|
||||
/* priority */
|
||||
/* (default) priority */
|
||||
75,
|
||||
|
||||
/* mpool name and instance */
|
||||
/* (default) control unit size (bytes) */
|
||||
64,
|
||||
|
||||
/* (default) bootstrap filename */
|
||||
"coll-sm-bootstrap",
|
||||
|
||||
/* (default) number of segments in bootstrap file */
|
||||
8,
|
||||
|
||||
/* (default) mpool name to use */
|
||||
"sm",
|
||||
NULL
|
||||
|
||||
/* (default) number of segments for each communicator in the mpool
|
||||
area */
|
||||
2,
|
||||
|
||||
/* (default) fragment size */
|
||||
8192,
|
||||
|
||||
/* (default) degree of tree for tree-based operations (must be <=
|
||||
control unit size) */
|
||||
4,
|
||||
|
||||
/* default values for non-MCA parameters */
|
||||
0, /* bootstrap size -- filled in below */
|
||||
0, /* mpool data size -- filled in below */
|
||||
NULL, /* data mpool pointer */
|
||||
false, /* whether this process created the data mpool */
|
||||
NULL /* pointer to meta data about bootstrap area */
|
||||
};
|
||||
|
||||
|
||||
@ -105,19 +133,115 @@ mca_coll_sm_component_t mca_coll_sm_component = {
|
||||
*/
|
||||
static int sm_open(void)
|
||||
{
|
||||
int p, ival;
|
||||
char *sval;
|
||||
size_t size1, size2;
|
||||
mca_base_component_t *c = &mca_coll_sm_component.super.collm_version;
|
||||
mca_coll_sm_component_t *cs = &mca_coll_sm_component;
|
||||
|
||||
/* If we want to be selected (i.e., all procs on one node), then
|
||||
we should have a high priority */
|
||||
|
||||
p = mca_base_param_register_int("coll", "sm", "priority", NULL, 75);
|
||||
mca_base_param_lookup_int(p, &ival);
|
||||
mca_coll_sm_component.sm_priority = ival;
|
||||
mca_base_param_reg_int(c, "priority",
|
||||
"Priority of the sm coll component",
|
||||
false, false,
|
||||
cs->sm_priority,
|
||||
&cs->sm_priority);
|
||||
|
||||
p = mca_base_param_register_string("coll", "sm", "mpool", NULL, "sm");
|
||||
mca_base_param_lookup_string(p, &sval);
|
||||
mca_coll_sm_component.sm_mpool_name = sval;
|
||||
mca_base_param_reg_int(c, "control_size",
|
||||
"Length of the control data -- should usually be either a cache line on most SMPs, or a page on machine where pages that support direct memory affinity placement (in bytes)",
|
||||
false, false,
|
||||
cs->sm_control_size,
|
||||
&cs->sm_control_size);
|
||||
|
||||
mca_base_param_reg_string(c, "bootstrap_filename",
|
||||
"Filename (in the Open MPI session directory) of the coll sm component bootstrap rendezvous mmap file",
|
||||
false, false,
|
||||
cs->sm_bootstrap_filename,
|
||||
&cs->sm_bootstrap_filename);
|
||||
|
||||
mca_base_param_reg_int(c, "bootstrap_num_segments",
|
||||
"Number of segments in the bootstrap file",
|
||||
false, false,
|
||||
cs->sm_bootstrap_num_segments,
|
||||
&cs->sm_bootstrap_num_segments);
|
||||
|
||||
mca_base_param_reg_int(c, "fragment_size",
|
||||
"Fragment size (in bytes) used for passing data through shared memory (will be rounded up to the nearest control_size size)",
|
||||
false, false,
|
||||
cs->sm_fragment_size,
|
||||
&cs->sm_fragment_size);
|
||||
if (0 != (cs->sm_fragment_size % cs->sm_control_size)) {
|
||||
cs->sm_fragment_size += cs->sm_control_size -
|
||||
(cs->sm_fragment_size % cs->sm_control_size);
|
||||
}
|
||||
|
||||
mca_base_param_reg_string(c, "mpool",
|
||||
"Name of the mpool component to use",
|
||||
false, false,
|
||||
cs->sm_mpool_name,
|
||||
&cs->sm_mpool_name);
|
||||
|
||||
mca_base_param_reg_int(c, "communicator_num_segments",
|
||||
"Number of shared memory collective segments on each communicator",
|
||||
false, false,
|
||||
cs->sm_communicator_num_segments,
|
||||
&cs->sm_communicator_num_segments);
|
||||
|
||||
mca_base_param_reg_int(c, "tree_degree",
|
||||
"Degree of the tree for tree-based operations (must be <= control size and <= 255)",
|
||||
false, false,
|
||||
cs->sm_tree_degree,
|
||||
&cs->sm_tree_degree);
|
||||
if (cs->sm_tree_degree > cs->sm_control_size) {
|
||||
opal_show_help("help-coll-sm.txt",
|
||||
"tree-degree-larger-than-control", true,
|
||||
cs->sm_tree_degree, cs->sm_control_size);
|
||||
cs->sm_tree_degree = cs->sm_control_size;
|
||||
}
|
||||
if (cs->sm_tree_degree > 255) {
|
||||
opal_show_help("help-coll-sm.txt",
|
||||
"tree-degree-larger-than-255", true,
|
||||
cs->sm_tree_degree);
|
||||
cs->sm_tree_degree = 255;
|
||||
}
|
||||
|
||||
/* Size of the bootstrap shared mb
|
||||
emory area. */
|
||||
|
||||
size1 =
|
||||
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
|
||||
(mca_coll_sm_component.sm_bootstrap_num_segments *
|
||||
sizeof(mca_coll_sm_bootstrap_comm_setup_t)) +
|
||||
(sizeof(uint32_t) * mca_coll_sm_component.sm_bootstrap_num_segments);
|
||||
mca_base_param_reg_int(c, "shared_mem_used_bootstrap",
|
||||
"Amount of shared memory used in the shared memory bootstrap area (in bytes)",
|
||||
false, true,
|
||||
size1, NULL);
|
||||
|
||||
/* Calculate how much space we need in the data mpool. There are
|
||||
several values to add (one of these for each segment):
|
||||
|
||||
- size of the control data:
|
||||
- fan-in data (num_procs * control_size size)
|
||||
- fan-out data (num_procs * control_size size)
|
||||
- size of message data
|
||||
- fan-in data (num_procs * (frag_size rounded up to
|
||||
control_size size))
|
||||
- fan-out data (num_procs * (frag_size rounded up
|
||||
to control_size size))
|
||||
|
||||
So it's:
|
||||
|
||||
num_segs * ((num_procs * control_size * 2) + (num_procs * frag * 2))
|
||||
|
||||
Which reduces to:
|
||||
|
||||
num_segs * num_procs * 2 * (control_size + frag)
|
||||
|
||||
For this example, assume num_procs = 1.
|
||||
*/
|
||||
|
||||
size2 = cs->sm_communicator_num_segments * 2 *
|
||||
(cs->sm_control_size + cs->sm_fragment_size);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -133,5 +257,7 @@ static int sm_close(void)
|
||||
mca_coll_sm_component.sm_mpool_name = NULL;
|
||||
}
|
||||
|
||||
mca_coll_sm_bootstrap_finalize();
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -17,6 +17,10 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_SCHED_H
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
@ -27,6 +31,23 @@
|
||||
#include "coll_sm.h"
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static const struct mca_coll_base_module_1_0_0_t *
|
||||
sm_module_init(struct ompi_communicator_t *comm);
|
||||
static int sm_module_finalize(struct ompi_communicator_t *comm);
|
||||
static bool have_local_peers(ompi_proc_t **procs, size_t size);
|
||||
static int bootstrap_init(void);
|
||||
static int bootstrap_comm(ompi_communicator_t *comm);
|
||||
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static bool bootstrap_inited = false;
|
||||
|
||||
|
||||
/*
|
||||
* Linear set of collective algorithms
|
||||
*/
|
||||
@ -34,8 +55,8 @@ static const mca_coll_base_module_1_0_0_t module = {
|
||||
|
||||
/* Initialization / finalization functions */
|
||||
|
||||
mca_coll_sm_module_init,
|
||||
mca_coll_sm_module_finalize,
|
||||
sm_module_init,
|
||||
sm_module_finalize,
|
||||
|
||||
/* Collective function pointers */
|
||||
|
||||
@ -61,12 +82,59 @@ static const mca_coll_base_module_1_0_0_t module = {
|
||||
/*
|
||||
* Initial query function that is invoked during MPI_INIT, allowing
|
||||
* this component to disqualify itself if it doesn't support the
|
||||
* required level of thread support.
|
||||
* required level of thread support. This function is invoked exactly
|
||||
* once.
|
||||
*/
|
||||
int mca_coll_sm_init_query(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
/* Nothing to do */
|
||||
int ret;
|
||||
#if 0
|
||||
/* JMS: Arrgh. Unfortunately, we don't have this information by
|
||||
the time this is invoked -- the GPR compound command doesn't
|
||||
fire until after coll_base_open() in ompi_mpi_init(). */
|
||||
|
||||
ompi_proc_t **procs;
|
||||
size_t size;
|
||||
|
||||
/* Check to see if anyone is local on this machine. If not, don't
|
||||
bother doing anything else. */
|
||||
|
||||
procs = ompi_proc_all(&size);
|
||||
if (NULL == procs || 0 == size) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (!have_local_peers(procs, size)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
free(procs);
|
||||
#endif
|
||||
|
||||
/* Ok, we have local peers. So setup the bootstrap file */
|
||||
|
||||
if (OMPI_SUCCESS != (ret = bootstrap_init())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Can we get an mpool allocation? See if there was one created
|
||||
already. If not, try to make one. */
|
||||
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
||||
NULL, NULL);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_bootstrap_finalize();
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
mca_coll_sm_component.sm_data_mpool_created = true;
|
||||
} else {
|
||||
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||
}
|
||||
|
||||
/* Alles gut */
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -81,12 +149,13 @@ const mca_coll_base_module_1_0_0_t *
|
||||
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
struct mca_coll_base_comm_t **data)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* If we're intercomm, or if there's only one process in the
|
||||
communicator, we don't want to run */
|
||||
communicator, or if not all the processes in the communicator
|
||||
are not on this node, then we don't want to run */
|
||||
|
||||
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm)) {
|
||||
if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) ||
|
||||
!have_local_peers(comm->c_local_group->grp_proc_pointers,
|
||||
ompi_comm_size(comm))) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -94,30 +163,6 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
|
||||
*priority = mca_coll_sm_component.sm_priority;
|
||||
|
||||
/* We only want to run if all the processes in the communicator
|
||||
are on the same node */
|
||||
|
||||
for (i = 0; i < ompi_comm_size(comm); ++i) {
|
||||
if (0 == (comm->c_local_group->grp_proc_pointers[i]->proc_flags &
|
||||
OMPI_PROC_FLAG_LOCAL)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* Can we get an mpool allocation? See if there was one created
|
||||
already. If not, try to make one. */
|
||||
|
||||
mca_coll_sm_component.sm_mpool =
|
||||
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
||||
if (NULL == mca_coll_sm_component.sm_mpool) {
|
||||
mca_coll_sm_component.sm_mpool =
|
||||
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
||||
NULL, NULL);
|
||||
if (NULL == mca_coll_sm_component.sm_mpool) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* All is good -- return a module */
|
||||
|
||||
return &module;
|
||||
@ -130,9 +175,6 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
int mca_coll_sm_comm_unquery(struct ompi_communicator_t *comm,
|
||||
struct mca_coll_base_comm_t *data)
|
||||
{
|
||||
/* JMS */
|
||||
/* Remove mpool query, if we got one */
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -140,10 +182,77 @@ int mca_coll_sm_comm_unquery(struct ompi_communicator_t *comm,
|
||||
/*
|
||||
* Init module on the communicator
|
||||
*/
|
||||
const struct mca_coll_base_module_1_0_0_t *
|
||||
mca_coll_sm_module_init(struct ompi_communicator_t *comm)
|
||||
static const struct mca_coll_base_module_1_0_0_t *
|
||||
sm_module_init(struct ompi_communicator_t *comm)
|
||||
{
|
||||
/* JMS */
|
||||
int i;
|
||||
mca_coll_base_comm_t *data;
|
||||
size_t control_size, data_size, total_size;
|
||||
mca_coll_sm_component_t *c = &mca_coll_sm_component;
|
||||
|
||||
/* Allocate data to hang off the communicator. The memory we
|
||||
alloc will be laid out as follows:
|
||||
|
||||
1. mca_coll_base_comm_t
|
||||
2. array of num_segments pointers to
|
||||
mca_coll_base_mpool_index_t instances
|
||||
3. array of num_segments mca_coll_base_mpool_index_t instances
|
||||
(pointed to by the array in 2)
|
||||
*/
|
||||
|
||||
data = malloc(sizeof(mca_coll_base_comm_t) +
|
||||
(mca_coll_sm_component.sm_bootstrap_num_segments *
|
||||
(sizeof(mca_coll_base_mpool_index_t *) +
|
||||
sizeof(mca_coll_base_mpool_index_t))));
|
||||
if (NULL == data) {
|
||||
return NULL;
|
||||
}
|
||||
data->mcb_data_mpool_malloc_addr = NULL;
|
||||
|
||||
/* Setup #2: set the array to point immediately beyond the
|
||||
mca_coll_base_comm_t */
|
||||
data->mcb_mpool_index = (mca_coll_base_mpool_index_t**) (data + 1);
|
||||
/* Setup the first pointer in #2 to point to the first instance in
|
||||
#3 */
|
||||
data->mcb_mpool_index[0] = (mca_coll_base_mpool_index_t*)
|
||||
(((char*) data->mcb_mpool_index) +
|
||||
(mca_coll_sm_component.sm_bootstrap_num_segments *
|
||||
sizeof(mca_coll_base_mpool_index_t *)));
|
||||
/* Now setup the rest of the pointers in #2 to point to their
|
||||
corresponding instances in #3 */
|
||||
for (i = 1; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
|
||||
data->mcb_mpool_index[i] = data->mcb_mpool_index[i - 1] + 1;
|
||||
}
|
||||
|
||||
comm->c_coll_selected_data = data;
|
||||
|
||||
/* Bootstrap this communicator; set us up with the global mpool,
|
||||
etc. */
|
||||
|
||||
if (OMPI_SUCCESS != bootstrap_comm(comm)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Once the communicator is bootstrapped, setup the
|
||||
mca_coll_base_mpool_index_t pointers to point to the
|
||||
appropriate places in the mpool. */
|
||||
|
||||
control_size = ompi_comm_size(comm) * c->sm_control_size;
|
||||
data_size = ompi_comm_size(comm) * c->sm_fragment_size;
|
||||
total_size = (control_size + data_size) * 2;
|
||||
for (i = 0; i < data->mcb_mpool_num_segments; ++i) {
|
||||
data->mcb_mpool_index[i]->mcbmi_control_fan_in =
|
||||
data->mcb_mpool_base + data->mcb_mpool_offset +
|
||||
(i * total_size);
|
||||
data->mcb_mpool_index[i]->mcbmi_control_fan_out =
|
||||
data->mcb_mpool_index[i]->mcbmi_control_fan_in + control_size;
|
||||
data->mcb_mpool_index[i]->mcbmi_data_fan_in =
|
||||
data->mcb_mpool_index[i]->mcbmi_control_fan_out + control_size;
|
||||
data->mcb_mpool_index[i]->mcbmi_data_fan_out =
|
||||
data->mcb_mpool_index[i]->mcbmi_data_fan_in + data_size;
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return &module;
|
||||
}
|
||||
@ -152,9 +261,310 @@ mca_coll_sm_module_init(struct ompi_communicator_t *comm)
|
||||
/*
|
||||
* Finalize module on the communicator
|
||||
*/
|
||||
int mca_coll_sm_module_finalize(struct ompi_communicator_t *comm)
|
||||
static int sm_module_finalize(struct ompi_communicator_t *comm)
|
||||
{
|
||||
/* JMS */
|
||||
mca_coll_base_comm_t *data;
|
||||
|
||||
/* Free the space in the data mpool and the data hanging off the
|
||||
communicator */
|
||||
|
||||
data = comm->c_coll_selected_data;
|
||||
if (NULL != data) {
|
||||
/* If this was the process that allocated the space in the
|
||||
data mpool, then this is the process that frees it */
|
||||
|
||||
if (NULL != data->mcb_data_mpool_malloc_addr) {
|
||||
mca_coll_sm_component.sm_data_mpool->mpool_free(mca_coll_sm_component.sm_data_mpool,
|
||||
data->mcb_data_mpool_malloc_addr, NULL);
|
||||
}
|
||||
|
||||
/* Now free the data hanging off the communicator */
|
||||
|
||||
free(data);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static bool have_local_peers(ompi_proc_t **procs, size_t size)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < size; ++i) {
|
||||
if (0 == (procs[i]->proc_flags & OMPI_PROC_FLAG_LOCAL)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static int bootstrap_init(void)
|
||||
{
|
||||
int i;
|
||||
size_t size;
|
||||
char *fullpath;
|
||||
mca_common_sm_mmap_t *meta;
|
||||
mca_coll_sm_bootstrap_header_extension_t *bshe;
|
||||
|
||||
/* Create/open the sm coll bootstrap mmap. Make it have enough
|
||||
space for the top-level control structure and
|
||||
sm_bootstrap_num_segments per-communicator setup struct's
|
||||
(i.e., enough for sm_bootstrap_num_segments communicators to
|
||||
simultaneously set themselves up) */
|
||||
|
||||
if (NULL == mca_coll_sm_component.sm_bootstrap_filename) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
orte_proc_info();
|
||||
asprintf(&fullpath, "%s/%s", orte_process_info.job_session_dir,
|
||||
mca_coll_sm_component.sm_bootstrap_filename);
|
||||
if (NULL == fullpath) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
size =
|
||||
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
|
||||
(mca_coll_sm_component.sm_bootstrap_num_segments *
|
||||
sizeof(mca_coll_sm_bootstrap_comm_setup_t)) +
|
||||
(sizeof(uint32_t) * mca_coll_sm_component.sm_bootstrap_num_segments);
|
||||
|
||||
mca_coll_sm_component.sm_bootstrap_meta = meta =
|
||||
mca_common_sm_mmap_init(size, fullpath,
|
||||
sizeof(mca_coll_sm_bootstrap_header_extension_t),
|
||||
8);
|
||||
if (NULL == meta) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
free(fullpath);
|
||||
|
||||
/* set the pointer to the bootstrap control structure */
|
||||
bshe = (mca_coll_sm_bootstrap_header_extension_t *) meta->map_seg;
|
||||
|
||||
/* Lock the bootstrap control structure. If it's not already
|
||||
initialized, then we're the first one in and we setup the data
|
||||
structures */
|
||||
|
||||
opal_atomic_lock(&bshe->super.seg_lock);
|
||||
opal_atomic_wmb();
|
||||
if (!bshe->super.seg_inited) {
|
||||
bshe->smbhe_num_segments =
|
||||
mca_coll_sm_component.sm_bootstrap_num_segments;
|
||||
bshe->smbhe_segments = (mca_coll_sm_bootstrap_comm_setup_t *)
|
||||
(((char *) bshe) +
|
||||
sizeof(mca_coll_sm_bootstrap_header_extension_t) +
|
||||
(sizeof(uint32_t) *
|
||||
mca_coll_sm_component.sm_bootstrap_num_segments));
|
||||
bshe->smbhe_cids = (uint32_t *)
|
||||
(((char *) bshe) + sizeof(*bshe));
|
||||
for (i = 0; i < bshe->smbhe_num_segments; ++i) {
|
||||
bshe->smbhe_cids[i] = INT_MAX;
|
||||
}
|
||||
|
||||
bshe->super.seg_inited = true;
|
||||
}
|
||||
opal_atomic_unlock(&bshe->super.seg_lock);
|
||||
|
||||
/* All done */
|
||||
|
||||
bootstrap_inited = true;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Set up this communicator's shared memory area via the bootstrap segment.
 *
 * Looks for this communicator's context ID (CID) in the bootstrap
 * area's CID array.  If it is not there, the first process in claims
 * the first empty slot (marked with INT_MAX), allocates the
 * communicator's shared data area from the data mpool, and publishes
 * the mpool offset so the other local processes can find it.  If the
 * CID is absent and no slot is free, the lock is dropped, the CPU is
 * yielded, and the search retries until a slot opens up.
 *
 * @param comm  Communicator being bootstrapped.
 * @return OMPI_SUCCESS on success, or OMPI_ERR_OUT_OF_RESOURCE if the
 *         mpool allocation failed (either in this process or, as
 *         signaled via smbcs_communicator_num_segments == 0, in the
 *         peer process that claimed the slot).
 *
 * NOTE(review): all reads/writes of the bootstrap area below happen
 * while holding bshe->super.seg_lock, except across the explicit
 * unlock/yield/relock in the "no free slot" retry path.
 */
static int bootstrap_comm(ompi_communicator_t *comm)
{
    int i, empty_index, err;
    bool found;
    mca_coll_sm_component_t *c = &mca_coll_sm_component;
    mca_coll_sm_bootstrap_header_extension_t *bshe;
    mca_coll_sm_bootstrap_comm_setup_t *bscs;
    mca_coll_base_comm_t *data = comm->c_coll_selected_data;

    /* Is our CID in the CIDs array?  If not, loop until we can find
       an open slot in the array to use in the bootstrap to setup our
       communicator. */

    bshe = (mca_coll_sm_bootstrap_header_extension_t *)
        c->sm_bootstrap_meta->map_seg;
    bscs = bshe->smbhe_segments;
    opal_atomic_lock(&bshe->super.seg_lock);
    while (1) {
        opal_atomic_wmb();
        found = false;
        empty_index = -1;
        /* Single pass: remember the first empty (INT_MAX) slot in
           case our CID is not found. */
        for (i = 0; i < bshe->smbhe_num_segments; ++i) {
            if (comm->c_contextid == bshe->smbhe_cids[i]) {
                found = true;
                break;
            } else if (INT_MAX == bshe->smbhe_cids[i] && -1 == empty_index) {
                empty_index = i;
            }
        }

        /* Did we find our CID? */

        if (found) {
            break;
        }

        /* Nope.  Did we find an empty slot?  If so, initialize that
           slot and its corresponding segment for our CID.  Get an
           mpool allocation big enough to handle all the shared memory
           collective stuff. */

        else if (-1 != empty_index) {
            char *tmp;
            size_t size;

            i = empty_index;
            bshe->smbhe_cids[i] = comm->c_contextid;

            bscs[i].smbcs_communicator_num_segments =
                c->sm_communicator_num_segments;
            /* Every local process decrements this on its way out; the
               last one (count reaches 0) recycles the slot. */
            bscs[i].smbcs_count = ompi_comm_size(comm);

            /* Calculate how much space we need in the data mpool.
               There are several values to add (one of these for each
               segment):

               - size of the control data:
                   - fan-in data (num_procs * control size)
                   - fan-out data (num_procs * control size)
               - size of message data
                   - fan-in data (num_procs * (frag_size rounded up to
                     control size))
                   - fan-out data (num_procs * (frag_size rounded up
                     to control size))

               So it's:

                 num_segs * ((num_procs * cacheline * 2) + (num_procs *
                 frag * 2))

               Which reduces to:

                 num_segs * num_procs * 2 * (control + frag)
            */

            size = c->sm_communicator_num_segments * ompi_comm_size(comm) * 2 *
                (c->sm_control_size + c->sm_fragment_size);

            data->mcb_data_mpool_malloc_addr = tmp =
                c->sm_data_mpool->mpool_alloc(c->sm_data_mpool, size,
                                              sizeof(double), NULL);
            if (NULL == tmp) {
                /* Cleanup before returning; allow other processes in
                   this communicator to learn of the failure.  Note
                   that by definition, bscs[i].smbcs_count won't be
                   zero after the decrement (because there must be >=2
                   processes in this communicator, or the self coll
                   component would have been chosen), so we don't need
                   to do that cleanup. */
                bscs[i].smbcs_data_mpool_offset = 0;
                /* num_segments == 0 is the "allocation failed" flag
                   read by the peers below. */
                bscs[i].smbcs_communicator_num_segments = 0;
                --bscs[i].smbcs_count;
                opal_atomic_unlock(&bshe->super.seg_lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            /* Zero it out.  We really only need to zero out the
               control structures, but zeroing out the entire area is
               easy and "good enough". */

            memset(tmp, 0, size);

            /* Calculate the offset and put it in the bootstrap
               area */

            bscs[i].smbcs_data_mpool_offset = (size_t)
                (tmp - ((char *) c->sm_data_mpool->mpool_base(c->sm_data_mpool)));

            /* Now setup memory affinity such that various control
               data pages are local to their target processors, if
               supported. */

            /* JMS continue here */

            break;
        }

        /* Bad luck all around -- we didn't find our CID in the array
           and there were no empty slots.  So give up the lock and let
           some other processes / threads in there to try to free up
           some slots, and then try again once we have reacquired the
           lock. */

        else {
            opal_atomic_unlock(&bshe->super.seg_lock);
#ifdef HAVE_SCHED_YIELD
            sched_yield();
#endif
            opal_atomic_lock(&bshe->super.seg_lock);
        }
    }

    /* Check to see if there was an error while allocating the shared
       memory (possibly in the peer that claimed the slot -- see the
       failure path above) */
    if (0 == bscs[i].smbcs_communicator_num_segments) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Look at the comm_setup_t section (in the data segment of the
       bootstrap) and fill in the values on our communicator */

    else {
        err = OMPI_SUCCESS;
        data->mcb_mpool_base = c->sm_data_mpool->mpool_base(c->sm_data_mpool);
        data->mcb_mpool_offset = bscs[i].smbcs_data_mpool_offset;
        data->mcb_mpool_area = data->mcb_mpool_base + data->mcb_mpool_offset;
        data->mcb_mpool_num_segments = bscs[i].smbcs_communicator_num_segments;
        data->mcb_operation_count = 0;
    }

    /* If the count is now zero, then we're finished with this section
       in the bootstrap segment, and we should release it for others
       to use */

    --bscs[i].smbcs_count;
    if (0 == bscs[i].smbcs_count) {
        bscs[i].smbcs_data_mpool_offset = 0;
        bshe->smbhe_cids[i] = INT_MAX;
    }

    /* All done */

    opal_atomic_unlock(&bshe->super.seg_lock);
    return err;
}
|
||||
|
||||
|
||||
/*
|
||||
* This function is not static and has a prefix-rule-enabled name
|
||||
* because it gets called from the component. This is only called
|
||||
* once -- no need for reference counting or thread protection.
|
||||
*/
|
||||
int mca_coll_sm_bootstrap_finalize(void)
|
||||
{
|
||||
mca_common_sm_mmap_t *meta;
|
||||
|
||||
if (bootstrap_inited) {
|
||||
meta = mca_coll_sm_component.sm_bootstrap_meta;
|
||||
|
||||
/* Free the area in the mpool that we were using */
|
||||
if (mca_coll_sm_component.sm_data_mpool_created) {
|
||||
/* JMS: there does not yet seem to be any opposite to
|
||||
mca_mpool_base_module_create()... */
|
||||
}
|
||||
|
||||
/* Free the entire bootstrap area (no need to zero out
|
||||
anything in here -- all data structures are referencing
|
||||
within the bootstrap area, so the one top-level unmap does
|
||||
it all) */
|
||||
|
||||
munmap(meta->map_seg, meta->map_size);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
31
ompi/mca/coll/sm/help-coll-sm.txt
Обычный файл
31
ompi/mca/coll/sm/help-coll-sm.txt
Обычный файл
@ -0,0 +1,31 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open MPI's "sm"
# (shared memory) collective component.
|
||||
#
|
||||
[tree-degree-larger-than-control]
|
||||
The specified shared memory collective tree degree
|
||||
(coll_sm_tree_degree = %d) is too large. It must be less than or
|
||||
equal to the control size (coll_sm_control_size = %d).
|
||||
|
||||
Automatically adjusting the tree degree to be equal to the control
|
||||
size and continuing...
|
||||
[tree-degree-larger-than-255]
|
||||
The specified shared memory collective tree degree
|
||||
(coll_sm_tree_degree = %d) is too large. It must be less than or
|
||||
equal to 255.
|
||||
|
||||
Automatically adjusting the tree degree to be 255 and continuing...
|
Загрузка…
Ссылка в новой задаче
Block a user