This commit does a bunch of things:

 * Address all remaining code review items from CMR #2023:

   * Defer mmap setup to be lazy; only set it up the first time we
     invoke a collective.  In this way, we don't penalize apps that
     make lots of communicators but don't invoke collectives on them
     (per #2027).  A condensed sketch of this guard pattern follows
     this list.
   * Remove the extra assignments of mca_coll_sm_one (fixing a
     convertor count setup that was the real problem).
   * Remove another extra/unnecessary assignment.
   * Increase libevent polling frequency when using the RML to
     bootstrap mmap'ed memory.
   * Fix a minor procs-related memory leak in btl_sm.
 * Commit a datatype fix that George and I discovered along the way to
   fixing the coll sm.
 * Improve error messages when mmap fails, and try to de-allocate any
   memory that was already allocated when that happens.
 * Fix a previously-unnoticed confusion between extent and true_extent
   in coll sm reduce (a short example of the distinction also follows
   this list).
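
Below is a condensed, illustrative sketch (not Open MPI code; the toy_*
names are stand-ins) of the lazy-enable guard pattern from the first
review item above.  The real entry points key off sm_module->enabled
and call ompi_coll_sm_lazy_enable(), as the diffs below show:

    #include <stdio.h>

    /* Stand-in for the real module structure; 'enabled' mirrors
       sm_module->enabled in the diffs below. */
    typedef struct {
        int enabled;
        /* ...shared-memory bookkeeping would live here... */
    } toy_module_t;

    /* Expensive one-time setup (mmap, RML bootstrap, ...) happens here,
       and only when a collective is first invoked. */
    static int toy_lazy_enable(toy_module_t *m)
    {
        printf("expensive setup runs now\n");
        m->enabled = 1;
        return 0;   /* OMPI_SUCCESS in the real code */
    }

    /* Every collective entry point starts with this guard. */
    static int toy_collective(toy_module_t *m)
    {
        int ret;
        if (!m->enabled) {
            if (0 != (ret = toy_lazy_enable(m))) {
                return ret;
            }
        }
        /* ...normal collective work... */
        return 0;
    }

    int main(void)
    {
        toy_module_t m = { 0 };
        toy_collective(&m);   /* triggers the one-time setup */
        toy_collective(&m);   /* setup is skipped this time */
        return 0;
    }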
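
And a short, self-contained example (plain MPI calls, not Open MPI
internals; the 'padded' type is hypothetical) of the extent vs.
true_extent distinction behind the reduce fix: with a resized datatype,
consecutive elements of a count-element buffer are spaced by the
extent, so the extent is the stride the reduction target must advance
by.

    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        MPI_Aint lb, extent, true_lb, true_extent;
        MPI_Datatype padded;

        MPI_Init(&argc, &argv);

        /* a double padded out to twice its size: extent becomes 16,
           while true_extent stays 8 (on typical platforms) */
        MPI_Type_create_resized(MPI_DOUBLE, 0, 2 * sizeof(double), &padded);
        MPI_Type_commit(&padded);

        MPI_Type_get_extent(padded, &lb, &extent);
        MPI_Type_get_true_extent(padded, &true_lb, &true_extent);

        /* element i of a count-element buffer starts at buf + i * extent;
           stepping by true_extent would drift off the element boundaries,
           which is the confusion fixed in coll sm reduce */
        printf("extent=%ld true_extent=%ld\n",
               (long) extent, (long) true_extent);

        MPI_Type_free(&padded);
        MPI_Finalize();
        return 0;
    }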

This commit was SVN r22049.

The following Trac tickets were found above:
  Ticket 2023 --> https://svn.open-mpi.org/trac/ompi/ticket/2023
This commit is contained in:
Jeff Squyres 2009-10-02 17:13:56 +00:00
parent dcab61ad83
commit 0f8ac9223f
10 changed files with 183 additions and 40 deletions

View File

@ -264,10 +264,11 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory "
"BTL coordinating strucure :: size %lu \n",
(unsigned long)size);
free(procs);
free(sm_ctl_file);
return OMPI_ERROR;
}
free(procs);
free(sm_ctl_file);
/* set the pointer to the shared memory control structure */

View File

@ -185,6 +185,9 @@ BEGIN_C_DECLS
/** Base module */
mca_coll_base_module_t super;
/* Whether this module has been lazily initialized or not yet */
bool enabled;
/* Data that hangs off the communicator */
mca_coll_sm_comm_t *sm_comm_data;
@ -207,6 +210,11 @@ BEGIN_C_DECLS
mca_coll_base_module_t *
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority);
/* Lazily enable a module (since it involves expensive/slow mmap
allocation, etc.) */
int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm);
int mca_coll_sm_allgather_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
@ -360,7 +368,6 @@ extern uint32_t mca_coll_sm_one;
(index)->mcbmi_data + \
((rank) * mca_coll_sm_component.sm_fragment_size); \
(iov).iov_len = (max_data); \
mca_coll_sm_one = 1; \
opal_convertor_pack(&(convertor), &(iov), &mca_coll_sm_one, \
&(max_data) )
@ -372,7 +379,6 @@ extern uint32_t mca_coll_sm_one;
(iov).iov_base = (((char*) (index)->mcbmi_data) + \
((src_rank) * (mca_coll_sm_component.sm_fragment_size))); \
(iov).iov_len = (max_data); \
mca_coll_sm_one = 1; \
opal_convertor_unpack(&(convertor), &(iov), &mca_coll_sm_one, \
&(max_data) )

View File

@ -59,6 +59,15 @@ int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,
int uint_control_size;
mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
/* Lazily enable the module the first time we invoke a collective
on it */
if (!sm_module->enabled) {
int ret;
if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
return ret;
}
}
uint_control_size =
mca_coll_sm_component.sm_control_size / sizeof(uint32_t);
data = sm_module->sm_comm_data;

View File

@ -62,7 +62,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
{
struct iovec iov;
mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
mca_coll_sm_comm_t *data = sm_module->sm_comm_data;
mca_coll_sm_comm_t *data;
int i, ret, rank, size, num_children, src_rank;
int flag_num, segment_num, max_segment_num;
int parent_rank;
@ -72,6 +72,15 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
mca_coll_sm_tree_node_t *me, *parent, **children;
mca_coll_sm_data_index_t *index;
/* Lazily enable the module the first time we invoke a collective
on it */
if (!sm_module->enabled) {
if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
return ret;
}
}
data = sm_module->sm_comm_data;
/* Setup some identities */
rank = ompi_comm_rank(comm);
@ -228,7 +237,6 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
}
/* Copy to my output buffer */
max_data = mca_coll_sm_component.sm_fragment_size;
COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);
bytes += max_data;

View File

@ -81,6 +81,7 @@ static int bootstrap_comm(ompi_communicator_t *comm,
*/
static void mca_coll_sm_module_construct(mca_coll_sm_module_t *module)
{
module->enabled = false;
module->sm_comm_data = NULL;
module->previous_reduce = NULL;
module->previous_reduce_module = NULL;
@ -107,6 +108,8 @@ static void mca_coll_sm_module_destruct(mca_coll_sm_module_t *module)
if (NULL != module->previous_reduce_module) {
OBJ_RELEASE(module->previous_reduce_module);
}
module->enabled = false;
}
@ -222,9 +225,23 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority)
/*
* Init module on the communicator
*/
static int
sm_module_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm)
static int sm_module_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm)
{
if (NULL == comm->c_coll.coll_reduce ||
NULL == comm->c_coll.coll_reduce_module) {
opal_output_verbose(10, mca_coll_base_output,
"coll:sm:enable (%d/%s): no underlying reduce; disqualifying myself",
comm->c_contextid, comm->c_name);
return OMPI_ERROR;
}
/* We do everything lazily in ompi_coll_sm_lazy_enable() */
return OMPI_SUCCESS;
}
int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm)
{
int i, j, root, ret;
int rank = ompi_comm_rank(comm);
@ -238,13 +255,11 @@ sm_module_enable(mca_coll_base_module_t *module,
unsigned char *base = NULL;
const int num_barrier_buffers = 2;
if (NULL == comm->c_coll.coll_reduce ||
NULL == comm->c_coll.coll_reduce_module) {
opal_output_verbose(10, mca_coll_base_output,
"coll:sm:enable (%d/%s): no underlying reduce; disqualifying myself",
comm->c_contextid, comm->c_name);
return OMPI_ERROR;
/* Just make sure we haven't been here already */
if (sm_module->enabled) {
return OMPI_SUCCESS;
}
sm_module->enabled = true;
/* Get some space to setup memory affinity (just easier to try to
alloc here to handle the error case) */

View File

@ -92,15 +92,39 @@ int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count,
sm_module->previous_reduce_module);
}
#if WANT_REDUCE_NO_ORDER
else if (!ompi_op_is_intrinsic(op) ||
(ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
0 != (dtype->flags & OMPI_DATATYPE_FLAG_DATA_FLOAT))) {
return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm, module);
} else {
return reduce_no_order(sbuf, rbuf, count, dtype, op, root, comm, module);
else {
/* Lazily enable the module the first time we invoke a
collective on it */
if (!sm_module->enabled) {
if (OMPI_SUCCESS !=
(ret = ompi_coll_sm_lazy_enable(module, comm))) {
return ret;
}
}
if (!ompi_op_is_intrinsic(op) ||
(ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
0 != (dtype->flags & OMPI_DATATYPE_FLAG_DATA_FLOAT))) {
return reduce_inorder(sbuf, rbuf, count, dtype, op,
root, comm, module);
} else {
return reduce_no_order(sbuf, rbuf, count, dtype, op,
root, comm, module);
}
}
#else
else {
/* Lazily enable the module the first time we invoke a
collective on it */
if (!sm_module->enabled) {
int ret;
if (OMPI_SUCCESS !=
(ret = ompi_coll_sm_lazy_enable(module, comm))) {
return ret;
}
}
return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm, module);
}
#endif
@ -148,6 +172,7 @@ int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count,
* buffer at the end.
*/
static int reduce_inorder(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
@ -208,7 +233,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
int frag_num = 0;
bool first_operation = true;
bool sbuf_copied_to_rbuf = false;
/* If the datatype is the same packed as it is unpacked, we
can save a memory copy and just do the reduction operation
directly from the shared memory segment. However, if the
@ -271,7 +296,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
(ret = opal_convertor_copy_and_prepare_for_recv(
ompi_mpi_local_convertor,
&(dtype->super),
segment_ddt_count,
count,
rbuf,
0,
&rbuf_convertor))) {
@ -317,13 +342,12 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
max_segment_num =
(flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
reduce_target = (((char*) rbuf) + (frag_num * true_extent * segment_ddt_count));
reduce_target = (((char*) rbuf) + (frag_num * extent * segment_ddt_count));
do {
/* Note that all the other coll modules reduce from
process (size-1) to 0, so that's the order we'll do
it here. */
/* Process (size-1) is the root (special case) */
if (size - 1 == rank) {
/* If we're the root *and* the first process to be
@ -382,7 +406,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
if (rank == peer) {
ompi_op_reduce(op,
((char *) sbuf) +
frag_num * true_extent * segment_ddt_count,
frag_num * extent * segment_ddt_count,
reduce_target,
min(count_left, segment_ddt_count),
dtype);
@ -436,7 +460,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
bytes += segment_ddt_bytes;
++segment_num;
++frag_num;
reduce_target += true_extent * segment_ddt_count;
reduce_target += extent * segment_ddt_count;
} while (bytes < total_size && segment_num < max_segment_num);
/* Root is now done with this set of segments */

View File

@ -50,6 +50,10 @@ headers = \
sources = \
common_sm_mmap.c
# Help file
dist_pkgdata_DATA = help-mpi-common-sm.txt
# As per above, we'll either have an installable or noinst result.
# The installable one should follow the same MCA prefix naming rules
# (i.e., libmca_<type>_<name>.la). The noinst one can be named

View File

@ -48,6 +48,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "ompi/constants.h"
@ -102,7 +103,11 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name,
/* map the file and initialize segment state */
seg = (mca_common_sm_file_header_t*)
mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if((void*)-1 == seg) {
if ((void*)-1 == seg) {
orte_show_help("help-mpi-common-sm.txt", "sys call fail",
orte_process_info.nodename,
"mmap(2)", "",
strerror(errno), errno);
return NULL;
}
@ -117,15 +122,18 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name,
addr = ((unsigned char *)seg) + size_ctl_structure;
/* If we have a data segment (i.e., if 0 != data_seg_alignment),
then make it the first aligned address after the control
structure. */
structure. IF THIS HAPPENS, THIS IS A PROGRAMMING ERROR IN
OPEN MPI!*/
if (0 != data_seg_alignment) {
addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char*);
/* is addr past end of file ? */
if((unsigned char*)seg + size < addr) {
opal_output(0, "mca_common_sm_mmap_init: "
"memory region too small len %lu addr %p\n",
(unsigned long)size, addr);
orte_show_help("help-mpi-common-sm.txt", "mmap too small",
orte_process_info.nodename,
(unsigned long) size,
(unsigned long) size_ctl_structure,
(unsigned long) data_seg_alignment);
return NULL;
}
}
@ -215,11 +223,18 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
/* process initializing the file */
fd = open(file_name, O_CREAT|O_RDWR, 0600);
if (fd < 0) {
opal_output(0, "mca_common_sm_mmap_init: "
"open %s failed with errno=%d\n", file_name, errno);
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
orte_process_info.nodename,
"open(2)", file_name,
strerror(errno), errno);
} else if (ftruncate(fd, size) != 0) {
opal_output(0, "mca_common_sm_mmap_init: "
"ftruncate failed with errno=%d\n", errno);
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
orte_process_info.nodename,
"ftruncate(2)", "",
strerror(errno), errno);
close(fd);
unlink(file_name);
fd = -1;
} else {
map = create_map(fd, size, file_name, size_ctl_structure,
data_seg_alignment);
@ -233,11 +248,17 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
map->map_seg->seg_size = size - mem_offset;
opal_atomic_unlock(&map->map_seg->seg_lock);
map->map_seg->seg_inited = 0;
} else {
close(fd);
unlink(file_name);
fd = -1;
}
}
/* signal the rest of the local procs that the backing file
has been created */
/* Signal the rest of the local procs that the backing file
has been created. Bump up the libevent polling frequency
while we're using the RML. */
opal_progress_event_users_increment();
for (p = 1; p < num_procs; p++) {
rc = orte_rml.send(&(procs[p]->proc_name), iov, 3,
OMPI_RML_TAG_SM_BACK_FILE_CREATED, 0);
@ -247,9 +268,17 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
(unsigned long)p, errno,
rc,
(int) (iov[0].iov_len + iov[1].iov_len + iov[2].iov_len));
opal_progress_event_users_decrement();
/* Free it all -- bad things are going to happen */
munmap(map, size);
close(fd);
unlink(file_name);
fd = -1;
goto out;
}
}
opal_progress_event_users_decrement();
} else {
/* All other procs wait for the file to be initialized before
using the backing file. However, since these shared
@ -273,12 +302,21 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
receiving from the RML. */
if (opal_list_get_end(&pending_rml_msgs) == item) {
while (1) {
/* Bump up the libevent polling frequency while we're
in this RML recv, just to ensure we're checking
libevent frequently. */
opal_progress_event_users_increment();
rc = orte_rml.recv(&(procs[0]->proc_name), iov, 3,
OMPI_RML_TAG_SM_BACK_FILE_CREATED, 0);
opal_progress_event_users_decrement();
if (rc < 0) {
opal_output(0, "mca_common_sm_mmap_init: "
"orte_rml.recv failed from %d with errno=%d\n",
0, errno);
munmap(map, size);
close(fd);
unlink(file_name);
fd = -1;
goto out;
}
@ -291,6 +329,10 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
rml_msg = OBJ_NEW(pending_rml_msg_t);
if (NULL == rml_msg) {
opal_output(0, "mca_common_sm_mmap_init: failed to create pending rml message");
munmap(map, size);
close(fd);
unlink(file_name);
fd = -1;
goto out;
}
memcpy(rml_msg->file_name, filename_to_send,

ompi/mca/common/sm/help-mpi-common-sm.txt (new file, 31 lines)
View File

@ -0,0 +1,31 @@
# -*- text -*-
#
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for Open MPI's common shmem support.
#
[sys call fail]
A system call failed during shared memory initialization that should
not have. It is likely that your MPI job will now either abort or
experience performance degradation.
Local host: %s
System call: %s %s
Error: %s (errno %d)
#
[mmap too small]
Open MPI requested a shared memory segment that was too small to do
anything useful. This is likely an error in Open MPI itself. If you
see this error, you should see if there is an update available for
Open MPI, and if not, contact the Open MPI developers.
Local host: %s
Requested size: %lu
Control seg size: %lu
Data seg alignment: %lu

View File

@ -450,9 +450,11 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
(long)pStack->disp, (unsigned long)iov_len_local ); );
if( --(pStack->count) == 0 ) { /* end of loop */
if( pConvertor->stack_pos == 0 ) {
/* Force the conversion to stop by lowering the number of iovecs. */
*out_size = iov_count;
goto complete_loop; /* completed */
/* Do the same thing as when the loop is completed (complete_loop:) */
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_unpacked += iov[iov_count].iov_len;
iov_count++; /* go to the next */
goto complete_conversion;
}
pConvertor->stack_pos--;
pStack--;
@ -498,6 +500,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_unpacked += iov[iov_count].iov_len;
}
complete_conversion:
*max_data = total_unpacked;
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
*out_size = iov_count;