2007-07-26 02:28:04 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2007 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2008-01-05 03:17:32 +03:00
|
|
|
#include "ompi_config.h"
|
2007-07-21 01:36:11 +04:00
|
|
|
#include "vprotocol_pessimist_sender_based.h"
|
|
|
|
#include <sys/types.h>
|
2007-08-16 09:52:30 +04:00
|
|
|
#if defined(HAVE_SYS_MMAN_H)
|
2007-07-21 01:36:11 +04:00
|
|
|
#include <sys/mman.h>
|
2007-08-16 09:52:30 +04:00
|
|
|
#endif /* defined(HAVE_SYS_MMAN_H) */
|
|
|
|
#if defined(HAVE_UNISTD_H)
|
2007-07-21 01:36:11 +04:00
|
|
|
#include <unistd.h>
|
2007-08-16 09:52:30 +04:00
|
|
|
#endif
|
2008-01-23 23:24:54 +03:00
|
|
|
#include <string.h>
|
|
|
|
#include <errno.h>
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, committed)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
#include "opal/datatype/opal_datatype_memcpy.h"
|
2007-07-21 01:36:11 +04:00
|
|
|
#include <fcntl.h>
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/util/proc_info.h"
|
2008-01-08 02:35:44 +03:00
|
|
|
|
2007-07-21 01:36:11 +04:00
|
|
|
#define sb mca_vprotocol_pessimist.sender_based
|
|
|
|
|
2008-01-08 02:35:44 +03:00
|
|
|
static int sb_mmap_file_open(const char *path)
|
|
|
|
{
|
|
|
|
#if defined(__WINDOWS__)
|
|
|
|
sb.sb_fd = CreateFile(path, GENERIC_READ | GENERIC_WRITE, 0, NULL,
|
|
|
|
CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
|
|
if(INVALID_HANDLE_VALUE == sb.sb_fd)
|
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_init: open (%s): %s",
|
|
|
|
path, GetLastError());
|
|
|
|
return OPAL_ERR_FILE_OPEN_FAILURE;
|
|
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
#else
|
|
|
|
sb.sb_fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0600);
|
|
|
|
if(-1 == sb.sb_fd)
|
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_init: open (%s): %s",
|
|
|
|
path, strerror(errno));
|
|
|
|
return OPAL_ERR_FILE_OPEN_FAILURE;
|
|
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sb_mmap_file_close(void)
|
|
|
|
{
|
|
|
|
#if defined(__WINDOWS__)
|
2008-01-26 21:43:06 +03:00
|
|
|
CloseHandle(sb.sb_fd);
|
2008-01-08 02:35:44 +03:00
|
|
|
#else
|
|
|
|
int ret = close(sb.sb_fd);
|
|
|
|
if(-1 == ret)
|
|
|
|
V_OUTPUT_ERR("pml_v: protocol_pessimist: sender_based_finalize: close (%d): %s",
|
|
|
|
sb.sb_fd, strerror(errno));
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sb_mmap_alloc(void)
|
|
|
|
{
|
|
|
|
#if defined(__WINDOWS__)
|
|
|
|
sb.sb_map = CreateFileMapping(sb.sb_fd, NULL, PAGE_READWRITE, 0,
|
|
|
|
(DWORD)sb.sb_offset + sb.sb_length, NULL);
|
|
|
|
if(NULL == sb.sb_map)
|
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: CreateFileMapping : %s",
|
|
|
|
GetLastError());
|
|
|
|
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
sb.sb_addr = (uintptr_t) MapViewOfFile(sb.sb_map, FILE_MAP_ALL_ACCESS, 0,
|
|
|
|
sb.sb_offset, sb.sb_length);
|
2008-01-26 21:43:06 +03:00
|
|
|
if(NULL == (void*)sb.sb_addr)
|
2008-01-08 02:35:44 +03:00
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: mmap: %s",
|
|
|
|
GetLastError());
|
2008-01-26 21:43:06 +03:00
|
|
|
CloseHandle(sb.sb_map);
|
|
|
|
CloseHandle(sb.sb_fd);
|
2008-01-08 02:35:44 +03:00
|
|
|
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE, false);
|
|
|
|
}
|
|
|
|
#else
|
2008-01-23 22:29:19 +03:00
|
|
|
#ifndef MAP_NOCACHE
|
|
|
|
# define MAP_NOCACHE 0
|
|
|
|
#endif
|
2008-01-08 02:35:44 +03:00
|
|
|
if(-1 == ftruncate(sb.sb_fd, sb.sb_offset + sb.sb_length))
|
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: ftruncate: %s",
|
|
|
|
strerror(errno));
|
|
|
|
close(sb.sb_fd);
|
|
|
|
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE, false);
|
|
|
|
}
|
|
|
|
sb.sb_addr = (uintptr_t) mmap((void *) sb.sb_addr, sb.sb_length,
|
|
|
|
PROT_WRITE | PROT_READ,
|
|
|
|
MAP_PRIVATE | MAP_NOCACHE, sb.sb_fd,
|
|
|
|
sb.sb_offset);
|
|
|
|
if(((uintptr_t) -1) == sb.sb_addr)
|
|
|
|
{
|
|
|
|
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: mmap: %s",
|
|
|
|
strerror(errno));
|
|
|
|
close(sb.sb_fd);
|
|
|
|
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE, false);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sb_mmap_free(void)
|
|
|
|
{
|
|
|
|
#if defined(__WINDOWS__)
|
2008-01-26 21:43:06 +03:00
|
|
|
UnmapViewOfFile( (LPCVOID)sb.sb_addr);
|
|
|
|
CloseHandle(sb.sb_map);
|
2008-01-08 02:35:44 +03:00
|
|
|
#else
|
|
|
|
int ret = munmap((void *) sb.sb_addr, sb.sb_length);
|
|
|
|
if(-1 == ret)
|
|
|
|
V_OUTPUT_ERR("pml_v: protocol_pessimsit: sender_based_finalize: munmap (%p): %s",
|
|
|
|
(void *) sb.sb_addr, strerror(errno));
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2007-07-21 01:36:11 +04:00
|
|
|
int vprotocol_pessimist_sender_based_init(const char *mmapfile, size_t size)
|
|
|
|
{
|
2008-03-28 00:24:27 +03:00
|
|
|
char *path;
|
2007-09-21 01:57:21 +04:00
|
|
|
#ifdef SB_USE_CONVERTOR_METHOD
|
|
|
|
mca_pml_base_send_request_t pml_req;
|
2009-03-17 20:35:28 +03:00
|
|
|
sb.sb_conv_to_pessimist_offset = (uintptr_t) VPROTOCOL_SEND_REQ(NULL) -
|
|
|
|
((uintptr_t) &pml_req.req_base.req_convertor -
|
|
|
|
(uintptr_t) &pml_req);
|
2007-09-27 00:54:18 +04:00
|
|
|
V_OUTPUT_VERBOSE(500, "pessimist: conv_to_pessimist_offset: %p", (void *) sb.sb_conv_to_pessimist_offset);
|
2007-09-21 01:57:21 +04:00
|
|
|
#endif
|
2007-07-21 01:36:11 +04:00
|
|
|
sb.sb_offset = 0;
|
|
|
|
sb.sb_length = size;
|
|
|
|
sb.sb_pagesize = getpagesize();
|
2007-08-16 09:52:30 +04:00
|
|
|
sb.sb_cursor = sb.sb_addr = (uintptr_t) NULL;
|
2007-08-16 21:54:26 +04:00
|
|
|
sb.sb_available = 0;
|
2008-03-28 00:19:45 +03:00
|
|
|
#ifdef SB_USE_PROGRESS_METHOD
|
|
|
|
OBJ_CONSTRUCT(&sb.sb_sendreq, opal_list_t);
|
|
|
|
#endif
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2009-03-06 00:56:03 +03:00
|
|
|
asprintf(&path, "%s"OPAL_PATH_SEP"%s", orte_process_info.proc_session_dir,
|
2007-07-21 01:36:11 +04:00
|
|
|
mmapfile);
|
2008-01-08 02:35:44 +03:00
|
|
|
if(OPAL_SUCCESS != sb_mmap_file_open(path))
|
|
|
|
return OPAL_ERR_FILE_OPEN_FAILURE;
|
2008-03-28 00:24:27 +03:00
|
|
|
free(path);
|
2007-09-21 01:57:21 +04:00
|
|
|
return OMPI_SUCCESS;
|
2007-07-21 01:36:11 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void vprotocol_pessimist_sender_based_finalize(void)
|
|
|
|
{
|
2007-09-19 07:42:56 +04:00
|
|
|
if(((uintptr_t) NULL) != sb.sb_addr)
|
2008-01-08 02:35:44 +03:00
|
|
|
sb_mmap_free();
|
|
|
|
sb_mmap_file_close();
|
2007-07-21 01:36:11 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Manage mmap floating window, allocating enough memory for the message to be
|
|
|
|
* asynchronously copied to disk.
|
|
|
|
*/
|
|
|
|
void vprotocol_pessimist_sender_based_alloc(size_t len)
|
|
|
|
{
|
2007-09-19 07:42:56 +04:00
|
|
|
if(((uintptr_t) NULL) != sb.sb_addr)
|
2008-01-08 02:35:44 +03:00
|
|
|
sb_mmap_free();
|
2007-09-21 01:57:21 +04:00
|
|
|
#ifdef SB_USE_SELFCOMM_METHOD
|
2007-09-19 07:42:56 +04:00
|
|
|
else
|
|
|
|
ompi_comm_dup(MPI_COMM_SELF, &sb.sb_comm, 1);
|
|
|
|
#endif
|
|
|
|
|
2007-07-21 01:36:11 +04:00
|
|
|
/* Take care of alignement of sb_offset */
|
2007-08-16 09:52:30 +04:00
|
|
|
sb.sb_offset += sb.sb_cursor - sb.sb_addr;
|
|
|
|
sb.sb_cursor = sb.sb_offset % sb.sb_pagesize;
|
|
|
|
sb.sb_offset -= sb.sb_cursor;
|
2007-07-21 01:36:11 +04:00
|
|
|
|
|
|
|
/* Adjusting sb_length for the largest application message to fit */
|
2007-08-16 21:54:26 +04:00
|
|
|
len += sb.sb_cursor + sizeof(vprotocol_pessimist_sender_based_header_t);
|
2007-07-21 01:36:11 +04:00
|
|
|
if(sb.sb_length < len)
|
2007-08-16 21:54:26 +04:00
|
|
|
sb.sb_length = len;
|
|
|
|
/* How much space left for application data */
|
|
|
|
sb.sb_available = sb.sb_length - sb.sb_cursor;
|
2007-07-21 01:36:11 +04:00
|
|
|
|
2008-01-08 02:35:44 +03:00
|
|
|
sb_mmap_alloc();
|
|
|
|
|
2007-08-16 09:52:30 +04:00
|
|
|
sb.sb_cursor += sb.sb_addr; /* set absolute addr of sender_based buffer */
|
|
|
|
V_OUTPUT_VERBOSE(30, "pessimist:\tsb\tgrow\toffset %llu\tlength %llu\tbase %p\tcursor %p", (unsigned long long) sb.sb_offset, (unsigned long long) sb.sb_length, (void *) sb.sb_addr, (void *) sb.sb_cursor);
|
2007-07-21 01:36:11 +04:00
|
|
|
}
|
|
|
|
|
2009-03-17 20:35:28 +03:00
|
|
|
#undef sb
|
|
|
|
|
2007-09-19 07:42:56 +04:00
|
|
|
#ifdef SB_USE_CONVERTOR_METHOD
|
- Split the datatype engine into two parts: an MPI specific part in
OMPI
and a language agnostic part in OPAL. The convertor is completely
moved into OPAL. This offers several benefits as described in RFC
http://www.open-mpi.org/community/lists/devel/2009/07/6387.php
namely:
- Fewer basic types (int* and float* types, boolean and wchar
- Fixing naming scheme to ompi-nomenclature.
- Usability outside of the ompi-layer.
- Due to the fixed nature of simple opal types, their information is
completely
known at compile time and therefore constified
- With fewer datatypes (22), the actual sizes of bit-field types may be
reduced
from 64 to 32 bits, allowing reorganizing the opal_datatype
structure, eliminating holes and keeping data required in convertor
(upon send/recv) in one cacheline...
This has implications to the convertor-datastructure and other parts
of the code.
- Several performance tests have been run, the netpipe latency does not
change with
this patch on Linux/x86-64 on the smoky cluster.
- Extensive tests have been done to verify correctness (no new
regressions) using:
1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and
ompi-ddt:
a. running both trunk and ompi-ddt resulted in no differences
(except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run
correctly).
b. with --enable-memchecker and running under valgrind (one buglet
when run with static found in test-suite, committed)
2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt:
all passed (except for the dynamic/ tests failed!! as trunk/MTT)
3. compilation and usage of HDF5 tests on Jaguar using PGI and
PathScale compilers.
4. compilation and usage on Scicortex.
- Please note, that for the heterogeneous case, (-m32 compiled
binaries/ompi), neither
ompi-trunk, nor ompi-ddt branch would successfully launch.
This commit was SVN r21641.
2009-07-13 08:56:31 +04:00
|
|
|
/**
 * Convertor advance hook that logs packed data into the sender-based buffer.
 *
 * The convertor's original flags and advance function, saved in the
 * request, are restored for the duration of the call to
 * opal_convertor_pack(). Every iovec produced by the pack is then copied to
 * the request's sender-based cursor, which is advanced accordingly. Before
 * returning, CONVERTOR_NO_OP is cleared and this function is installed
 * again as the convertor's advance hook.
 *
 * @param pConvertor  Convertor being advanced.
 * @param iov         In/out array of iovecs, as for opal_convertor_pack().
 * @param out_size    In/out number of iovecs, as for opal_convertor_pack().
 * @param max_data    In/out number of bytes, as for opal_convertor_pack().
 * @return The value returned by opal_convertor_pack().
 */
int32_t vprotocol_pessimist_sender_based_convertor_advance(opal_convertor_t* pConvertor,
                                                           struct iovec* iov,
                                                           uint32_t* out_size,
                                                           size_t* max_data) {
    int ret;
    unsigned int i;
    size_t pending_length;
    mca_vprotocol_pessimist_send_request_t *ftreq;

    ftreq = VPESSIMIST_CONV_REQ(pConvertor);

    /* Put the convertor back in its original state for the real pack. */
    pConvertor->flags = ftreq->sb.conv_flags;
    pConvertor->fAdvance = ftreq->sb.conv_advance;
    ret = opal_convertor_pack(pConvertor, iov, out_size, max_data);
    V_OUTPUT_VERBOSE(39, "pessimist:\tsb\tpack\t%"PRIsize_t, *max_data);

    /* Mirror what was packed into the sender-based log. The iovec index is
     * bounded in the loop condition itself so that a build with assertions
     * disabled can never read past the *out_size entries that were filled. */
    pending_length = *max_data;
    for(i = 0; i < *out_size && pending_length > 0; i++) {
        MEMCPY((void *) ftreq->sb.cursor, iov[i].iov_base, iov[i].iov_len);
        pending_length -= iov[i].iov_len;
        ftreq->sb.cursor += iov[i].iov_len;
    }
    /* The iovecs are expected to add up to exactly *max_data bytes. */
    assert(pending_length == 0);

    /* Re-arm the interception for the next advance. */
    pConvertor->flags &= ~CONVERTOR_NO_OP;
    pConvertor->fAdvance = &vprotocol_pessimist_sender_based_convertor_advance;
    return ret;
}
|
|
|
|
#endif
|
|
|
|
|