Refs trac:2023, #2027.
This commit does a bunch of things: * Address all remaining code review items from CMR #2023: * Defer mmap setup to be lazy; only set it up the first time we invoke a collective. In this way, we don't penalize apps that make lots of communicators but don't invoke collectives on them (per #2027). * Remove the extra assignments of mca_coll_sm_one (fixing a convertor count setup that was the real problem). * Remove another extra/unnecessary assignment. * Increase libevent polling frequency when using the RML to bootstrap mmap'ed memory. * Fix a minor procs-related memory leak in btl_sm. * Commit a datatype fix that George and I discovered along the way to fixing the coll sm. * Improve error messages when mmap fails, potentially trying to de-alloc any allocated memory when that happens. * Fix a previously-unnoticed confusion between extent and true_extent in coll sm reduce. This commit was SVN r22049. The following Trac tickets were found above: Ticket 2023 --> https://svn.open-mpi.org/trac/ompi/ticket/2023
Этот коммит содержится в:
родитель
dcab61ad83
Коммит
0f8ac9223f
@ -264,10 +264,11 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
|
||||
opal_output(0, "mca_btl_sm_add_procs: unable to create shared memory "
|
||||
"BTL coordinating strucure :: size %lu \n",
|
||||
(unsigned long)size);
|
||||
free(procs);
|
||||
free(sm_ctl_file);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
free(procs);
|
||||
free(sm_ctl_file);
|
||||
|
||||
/* set the pointer to the shared memory control structure */
|
||||
|
@ -185,6 +185,9 @@ BEGIN_C_DECLS
|
||||
/** Base module */
|
||||
mca_coll_base_module_t super;
|
||||
|
||||
/* Whether this module has been lazily initialized or not yet */
|
||||
bool enabled;
|
||||
|
||||
/* Data that hangs off the communicator */
|
||||
mca_coll_sm_comm_t *sm_comm_data;
|
||||
|
||||
@ -207,6 +210,11 @@ BEGIN_C_DECLS
|
||||
|
||||
mca_coll_base_module_t *
|
||||
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority);
|
||||
|
||||
/* Lazily enable a module (since it involves expensive/slow mmap
|
||||
allocation, etc.) */
|
||||
int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
|
||||
int mca_coll_sm_allgather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
@ -360,7 +368,6 @@ extern uint32_t mca_coll_sm_one;
|
||||
(index)->mcbmi_data + \
|
||||
((rank) * mca_coll_sm_component.sm_fragment_size); \
|
||||
(iov).iov_len = (max_data); \
|
||||
mca_coll_sm_one = 1; \
|
||||
opal_convertor_pack(&(convertor), &(iov), &mca_coll_sm_one, \
|
||||
&(max_data) )
|
||||
|
||||
@ -372,7 +379,6 @@ extern uint32_t mca_coll_sm_one;
|
||||
(iov).iov_base = (((char*) (index)->mcbmi_data) + \
|
||||
((src_rank) * (mca_coll_sm_component.sm_fragment_size))); \
|
||||
(iov).iov_len = (max_data); \
|
||||
mca_coll_sm_one = 1; \
|
||||
opal_convertor_unpack(&(convertor), &(iov), &mca_coll_sm_one, \
|
||||
&(max_data) )
|
||||
|
||||
|
@ -59,6 +59,15 @@ int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,
|
||||
int uint_control_size;
|
||||
mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
|
||||
|
||||
/* Lazily enable the module the first time we invoke a collective
|
||||
on it */
|
||||
if (!sm_module->enabled) {
|
||||
int ret;
|
||||
if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
uint_control_size =
|
||||
mca_coll_sm_component.sm_control_size / sizeof(uint32_t);
|
||||
data = sm_module->sm_comm_data;
|
||||
|
@ -62,7 +62,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
{
|
||||
struct iovec iov;
|
||||
mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
|
||||
mca_coll_sm_comm_t *data = sm_module->sm_comm_data;
|
||||
mca_coll_sm_comm_t *data;
|
||||
int i, ret, rank, size, num_children, src_rank;
|
||||
int flag_num, segment_num, max_segment_num;
|
||||
int parent_rank;
|
||||
@ -72,6 +72,15 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
mca_coll_sm_tree_node_t *me, *parent, **children;
|
||||
mca_coll_sm_data_index_t *index;
|
||||
|
||||
/* Lazily enable the module the first time we invoke a collective
|
||||
on it */
|
||||
if (!sm_module->enabled) {
|
||||
if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
data = sm_module->sm_comm_data;
|
||||
|
||||
/* Setup some identities */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
@ -228,7 +237,6 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
|
||||
}
|
||||
|
||||
/* Copy to my output buffer */
|
||||
max_data = mca_coll_sm_component.sm_fragment_size;
|
||||
COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);
|
||||
|
||||
bytes += max_data;
|
||||
|
@ -81,6 +81,7 @@ static int bootstrap_comm(ompi_communicator_t *comm,
|
||||
*/
|
||||
static void mca_coll_sm_module_construct(mca_coll_sm_module_t *module)
|
||||
{
|
||||
module->enabled = false;
|
||||
module->sm_comm_data = NULL;
|
||||
module->previous_reduce = NULL;
|
||||
module->previous_reduce_module = NULL;
|
||||
@ -107,6 +108,8 @@ static void mca_coll_sm_module_destruct(mca_coll_sm_module_t *module)
|
||||
if (NULL != module->previous_reduce_module) {
|
||||
OBJ_RELEASE(module->previous_reduce_module);
|
||||
}
|
||||
|
||||
module->enabled = false;
|
||||
}
|
||||
|
||||
|
||||
@ -222,9 +225,23 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
/*
|
||||
* Init module on the communicator
|
||||
*/
|
||||
static int
|
||||
sm_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm)
|
||||
static int sm_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
if (NULL == comm->c_coll.coll_reduce ||
|
||||
NULL == comm->c_coll.coll_reduce_module) {
|
||||
opal_output_verbose(10, mca_coll_base_output,
|
||||
"coll:sm:enable (%d/%s): no underlying reduce; disqualifying myself",
|
||||
comm->c_contextid, comm->c_name);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* We do everything lazily in ompi_coll_sm_lazy_enable() */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int i, j, root, ret;
|
||||
int rank = ompi_comm_rank(comm);
|
||||
@ -238,13 +255,11 @@ sm_module_enable(mca_coll_base_module_t *module,
|
||||
unsigned char *base = NULL;
|
||||
const int num_barrier_buffers = 2;
|
||||
|
||||
if (NULL == comm->c_coll.coll_reduce ||
|
||||
NULL == comm->c_coll.coll_reduce_module) {
|
||||
opal_output_verbose(10, mca_coll_base_output,
|
||||
"coll:sm:enable (%d/%s): no underlying reduce; disqualifying myself",
|
||||
comm->c_contextid, comm->c_name);
|
||||
return OMPI_ERROR;
|
||||
/* Just make sure we haven't been here already */
|
||||
if (sm_module->enabled) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
sm_module->enabled = true;
|
||||
|
||||
/* Get some space to setup memory affinity (just easier to try to
|
||||
alloc here to handle the error case) */
|
||||
|
@ -92,15 +92,39 @@ int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count,
|
||||
sm_module->previous_reduce_module);
|
||||
}
|
||||
#if WANT_REDUCE_NO_ORDER
|
||||
else if (!ompi_op_is_intrinsic(op) ||
|
||||
(ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
|
||||
0 != (dtype->flags & OMPI_DATATYPE_FLAG_DATA_FLOAT))) {
|
||||
return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm, module);
|
||||
} else {
|
||||
return reduce_no_order(sbuf, rbuf, count, dtype, op, root, comm, module);
|
||||
else {
|
||||
/* Lazily enable the module the first time we invoke a
|
||||
collective on it */
|
||||
if (!sm_module->enabled) {
|
||||
if (OMPI_SUCCESS !=
|
||||
(ret = ompi_coll_sm_lazy_enable(module, comm))) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ompi_op_is_intrinsic(op) ||
|
||||
(ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
|
||||
0 != (dtype->flags & OMPI_DATATYPE_FLAG_DATA_FLOAT))) {
|
||||
return reduce_inorder(sbuf, rbuf, count, dtype, op,
|
||||
root, comm, module);
|
||||
} else {
|
||||
return reduce_no_order(sbuf, rbuf, count, dtype, op,
|
||||
root, comm, module);
|
||||
}
|
||||
}
|
||||
#else
|
||||
else {
|
||||
/* Lazily enable the module the first time we invoke a
|
||||
collective on it */
|
||||
if (!sm_module->enabled) {
|
||||
int ret;
|
||||
|
||||
if (OMPI_SUCCESS !=
|
||||
(ret = ompi_coll_sm_lazy_enable(module, comm))) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm, module);
|
||||
}
|
||||
#endif
|
||||
@ -148,6 +172,7 @@ int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count,
|
||||
* buffer at the end.
|
||||
*/
|
||||
|
||||
|
||||
static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
@ -208,7 +233,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
int frag_num = 0;
|
||||
bool first_operation = true;
|
||||
bool sbuf_copied_to_rbuf = false;
|
||||
|
||||
|
||||
/* If the datatype is the same packed as it is unpacked, we
|
||||
can save a memory copy and just do the reduction operation
|
||||
directly from the shared memory segment. However, if the
|
||||
@ -271,7 +296,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
(ret = opal_convertor_copy_and_prepare_for_recv(
|
||||
ompi_mpi_local_convertor,
|
||||
&(dtype->super),
|
||||
segment_ddt_count,
|
||||
count,
|
||||
rbuf,
|
||||
0,
|
||||
&rbuf_convertor))) {
|
||||
@ -317,13 +342,12 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
|
||||
max_segment_num =
|
||||
(flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
|
||||
reduce_target = (((char*) rbuf) + (frag_num * true_extent * segment_ddt_count));
|
||||
reduce_target = (((char*) rbuf) + (frag_num * extent * segment_ddt_count));
|
||||
do {
|
||||
|
||||
/* Note that all the other coll modules reduce from
|
||||
process (size-1) to 0, so that's the order we'll do
|
||||
it here. */
|
||||
|
||||
/* Process (size-1) is the root (special case) */
|
||||
if (size - 1 == rank) {
|
||||
/* If we're the root *and* the first process to be
|
||||
@ -382,7 +406,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
if (rank == peer) {
|
||||
ompi_op_reduce(op,
|
||||
((char *) sbuf) +
|
||||
frag_num * true_extent * segment_ddt_count,
|
||||
frag_num * extent * segment_ddt_count,
|
||||
reduce_target,
|
||||
min(count_left, segment_ddt_count),
|
||||
dtype);
|
||||
@ -436,7 +460,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
|
||||
bytes += segment_ddt_bytes;
|
||||
++segment_num;
|
||||
++frag_num;
|
||||
reduce_target += true_extent * segment_ddt_count;
|
||||
reduce_target += extent * segment_ddt_count;
|
||||
} while (bytes < total_size && segment_num < max_segment_num);
|
||||
|
||||
/* Root is now done with this set of segments */
|
||||
|
@ -50,6 +50,10 @@ headers = \
|
||||
sources = \
|
||||
common_sm_mmap.c
|
||||
|
||||
# Help file
|
||||
|
||||
dist_pkgdata_DATA = help-mpi-common-sm.txt
|
||||
|
||||
# As per above, we'll either have an installable or noinst result.
|
||||
# The installable one should follow the same MCA prefix naming rules
|
||||
# (i.e., libmca_<type>_<name>.la). The noinst one can be named
|
||||
|
@ -48,6 +48,7 @@
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
@ -102,7 +103,11 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name,
|
||||
/* map the file and initialize segment state */
|
||||
seg = (mca_common_sm_file_header_t*)
|
||||
mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if((void*)-1 == seg) {
|
||||
if ((void*)-1 == seg) {
|
||||
orte_show_help("help-mpi-common-sm.txt", "sys call fail",
|
||||
orte_process_info.nodename,
|
||||
"mmap(2)", "",
|
||||
strerror(errno), errno);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -117,15 +122,18 @@ static mca_common_sm_mmap_t* create_map(int fd, size_t size, char *file_name,
|
||||
addr = ((unsigned char *)seg) + size_ctl_structure;
|
||||
/* If we have a data segment (i.e., if 0 != data_seg_alignment),
|
||||
then make it the first aligned address after the control
|
||||
structure. */
|
||||
structure. IF THIS HAPPENS, THIS IS A PROGRAMMING ERROR IN
|
||||
OPEN MPI!*/
|
||||
if (0 != data_seg_alignment) {
|
||||
addr = OPAL_ALIGN_PTR(addr, data_seg_alignment, unsigned char*);
|
||||
|
||||
/* is addr past end of file ? */
|
||||
if((unsigned char*)seg + size < addr) {
|
||||
opal_output(0, "mca_common_sm_mmap_init: "
|
||||
"memory region too small len %lu addr %p\n",
|
||||
(unsigned long)size, addr);
|
||||
orte_show_help("help-mpi-common-sm.txt", "mmap too small",
|
||||
orte_process_info.nodename,
|
||||
(unsigned long) size,
|
||||
(unsigned long) size_ctl_structure,
|
||||
(unsigned long) data_seg_alignment);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -215,11 +223,18 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
|
||||
/* process initializing the file */
|
||||
fd = open(file_name, O_CREAT|O_RDWR, 0600);
|
||||
if (fd < 0) {
|
||||
opal_output(0, "mca_common_sm_mmap_init: "
|
||||
"open %s failed with errno=%d\n", file_name, errno);
|
||||
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
|
||||
orte_process_info.nodename,
|
||||
"open(2)", file_name,
|
||||
strerror(errno), errno);
|
||||
} else if (ftruncate(fd, size) != 0) {
|
||||
opal_output(0, "mca_common_sm_mmap_init: "
|
||||
"ftruncate failed with errno=%d\n", errno);
|
||||
orte_show_help("help-mpi-common-sm.txt", "sys call fail", 1,
|
||||
orte_process_info.nodename,
|
||||
"ftruncate(2)", "",
|
||||
strerror(errno), errno);
|
||||
close(fd);
|
||||
unlink(file_name);
|
||||
fd = -1;
|
||||
} else {
|
||||
map = create_map(fd, size, file_name, size_ctl_structure,
|
||||
data_seg_alignment);
|
||||
@ -233,11 +248,17 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
|
||||
map->map_seg->seg_size = size - mem_offset;
|
||||
opal_atomic_unlock(&map->map_seg->seg_lock);
|
||||
map->map_seg->seg_inited = 0;
|
||||
} else {
|
||||
close(fd);
|
||||
unlink(file_name);
|
||||
fd = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* signal the rest of the local procs that the backing file
|
||||
has been created */
|
||||
/* Signal the rest of the local procs that the backing file
|
||||
has been created. Bump up the libevent polling frequency
|
||||
while we're using the RML. */
|
||||
opal_progress_event_users_increment();
|
||||
for (p = 1; p < num_procs; p++) {
|
||||
rc = orte_rml.send(&(procs[p]->proc_name), iov, 3,
|
||||
OMPI_RML_TAG_SM_BACK_FILE_CREATED, 0);
|
||||
@ -247,9 +268,17 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
|
||||
(unsigned long)p, errno,
|
||||
rc,
|
||||
(int) (iov[0].iov_len + iov[1].iov_len + iov[2].iov_len));
|
||||
opal_progress_event_users_decrement();
|
||||
|
||||
/* Free it all -- bad things are going to happen */
|
||||
munmap(map, size);
|
||||
close(fd);
|
||||
unlink(file_name);
|
||||
fd = -1;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
opal_progress_event_users_decrement();
|
||||
} else {
|
||||
/* All other procs wait for the file to be initialized before
|
||||
using the backing file. However, since these shared
|
||||
@ -273,12 +302,21 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
|
||||
receiving from the RML. */
|
||||
if (opal_list_get_end(&pending_rml_msgs) == item) {
|
||||
while (1) {
|
||||
/* Bump up the libevent polling frequency while we're
|
||||
in this RML recv, just to ensure we're checking
|
||||
libevent frequently. */
|
||||
opal_progress_event_users_increment();
|
||||
rc = orte_rml.recv(&(procs[0]->proc_name), iov, 3,
|
||||
OMPI_RML_TAG_SM_BACK_FILE_CREATED, 0);
|
||||
opal_progress_event_users_decrement();
|
||||
if (rc < 0) {
|
||||
opal_output(0, "mca_common_sm_mmap_init: "
|
||||
"orte_rml.recv failed from %d with errno=%d\n",
|
||||
0, errno);
|
||||
munmap(map, size);
|
||||
close(fd);
|
||||
unlink(file_name);
|
||||
fd = -1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -291,6 +329,10 @@ mca_common_sm_mmap_t* mca_common_sm_mmap_init(ompi_proc_t **procs,
|
||||
rml_msg = OBJ_NEW(pending_rml_msg_t);
|
||||
if (NULL == rml_msg) {
|
||||
opal_output(0, "mca_common_sm_mmap_init: failed to create pending rml message");
|
||||
munmap(map, size);
|
||||
close(fd);
|
||||
unlink(file_name);
|
||||
fd = -1;
|
||||
goto out;
|
||||
}
|
||||
memcpy(rml_msg->file_name, filename_to_send,
|
||||
|
31
ompi/mca/common/sm/help-mpi-common-sm.txt
Обычный файл
31
ompi/mca/common/sm/help-mpi-common-sm.txt
Обычный файл
@ -0,0 +1,31 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI's common shmem support.
|
||||
#
|
||||
[sys call fail]
|
||||
A system call failed during shared memory initialization that should
|
||||
not have. It is likely that your MPI job will now either abort or
|
||||
experience performance degradation.
|
||||
|
||||
Local host: %s
|
||||
System call: %s %s
|
||||
Error: %s (errno %d)
|
||||
#
|
||||
[mmap too small]
|
||||
Open MPI requested a shared memory segment that was too small to do
|
||||
anything useful. This is likely an error in Open MPI itself. If you
|
||||
see this error, you should see if there is an update available for
|
||||
Open MPI, and if not, contact the Open MPI developers.
|
||||
|
||||
Local host: %s
|
||||
Requested size: %lu
|
||||
Control seg size: %lu
|
||||
Data seg alignment: %lu
|
@ -450,9 +450,11 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
|
||||
(long)pStack->disp, (unsigned long)iov_len_local ); );
|
||||
if( --(pStack->count) == 0 ) { /* end of loop */
|
||||
if( pConvertor->stack_pos == 0 ) {
|
||||
/* Force the conversion to stop by lowering the number of iovecs. */
|
||||
*out_size = iov_count;
|
||||
goto complete_loop; /* completed */
|
||||
/* Do the same thing as when the loop is completed (complete_loop:) */
|
||||
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
|
||||
total_unpacked += iov[iov_count].iov_len;
|
||||
iov_count++; /* go to the next */
|
||||
goto complete_conversion;
|
||||
}
|
||||
pConvertor->stack_pos--;
|
||||
pStack--;
|
||||
@ -498,6 +500,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
|
||||
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
|
||||
total_unpacked += iov[iov_count].iov_len;
|
||||
}
|
||||
complete_conversion:
|
||||
*max_data = total_unpacked;
|
||||
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
|
||||
*out_size = iov_count;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user