2013-09-10 19:34:09 +04:00
/*
* Copyright ( c ) 2013 Mellanox Technologies , Inc .
* All rights reserved .
* $ COPYRIGHT $
*
* Additional copyrights may follow
*
* $ HEADER $
*/
# include "oshmem_config.h"
# include "opal/util/output.h"
# include "opal/dss/dss.h"
# include "ompi/mca/dpm/dpm.h"
# include "oshmem/proc/proc.h"
2014-01-29 18:30:55 +04:00
# include "oshmem/util/oshmem_util.h"
2013-09-10 19:34:09 +04:00
# include "oshmem/runtime/runtime.h"
# include "oshmem/mca/memheap/memheap.h"
# include "oshmem/mca/memheap/base/base.h"
# include "oshmem/mca/spml/spml.h"
# ifdef HAVE_SYS_MMAN_H
# include <sys/mman.h>
# endif
# include <sys/ipc.h>
# include <sys/shm.h>
# if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
# include <infiniband/verbs.h>
# endif /* MPAGE_ENABLE */
/* Turn ON/OFF debug output from build (default 0) */
# ifndef MEMHEAP_BASE_DEBUG
# define MEMHEAP_BASE_DEBUG 0
# endif
# define MEMHEAP_RKEY_REQ 0xA1
# define MEMHEAP_RKEY_RESP 0xA2
# define MEMHEAP_RKEY_RESP_FAIL 0xA3
2013-12-23 13:20:42 +04:00
# define MEMHEAP_MKEY_MAXSIZE 4096
2013-09-10 19:34:09 +04:00
struct oob_comm {
opal_mutex_t lck ;
opal_condition_t cond ;
mca_spml_mkey_t * mkeys ;
int mkeys_rcvd ;
2013-12-23 13:20:42 +04:00
MPI_Request recv_req ;
char buf [ MEMHEAP_MKEY_MAXSIZE ] ;
2013-09-10 19:34:09 +04:00
} ;
# define MEMHEAP_VERBOSE_FASTPATH(...)
static mca_memheap_map_t * memheap_map = NULL ;
struct oob_comm memheap_oob ;
2013-12-23 13:20:42 +04:00
static int send_buffer ( int pe , opal_buffer_t * msg ) ;
2013-12-25 06:28:22 +04:00
static int oshmem_mkey_recv_cb ( void ) ;
2013-12-24 20:18:48 +04:00
2013-09-10 19:34:09 +04:00
/* pickup list of rkeys and remote va */
static int memheap_oob_get_mkeys ( int pe ,
uint32_t va_seg_num ,
mca_spml_mkey_t * mkey ) ;
2013-09-14 00:37:30 +04:00
static inline void * __seg2base_va ( int seg )
2013-09-10 19:34:09 +04:00
{
return memheap_map - > mem_segs [ seg ] . start ;
}
2013-11-26 16:46:56 +04:00
static int _seg_cmp ( const void * k , const void * v )
2013-09-10 19:34:09 +04:00
{
2013-09-14 00:37:30 +04:00
uintptr_t va = ( uintptr_t ) k ;
2013-09-10 19:34:09 +04:00
map_segment_t * s = ( map_segment_t * ) v ;
2013-09-14 00:37:30 +04:00
if ( va < ( uintptr_t ) s - > start )
2013-09-10 19:34:09 +04:00
return - 1 ;
2013-09-14 00:37:30 +04:00
if ( va > = ( uintptr_t ) s - > end )
2013-09-10 19:34:09 +04:00
return 1 ;
return 0 ;
}
2013-09-14 00:37:30 +04:00
static inline map_segment_t * __find_va ( const void * va )
2013-09-10 19:34:09 +04:00
{
map_segment_t * s ;
2013-09-14 00:37:30 +04:00
if ( OPAL_LIKELY ( ( uintptr_t ) va > = ( uintptr_t ) memheap_map - > mem_segs [ HEAP_SEG_INDEX ] . start & &
( uintptr_t ) va < ( uintptr_t ) memheap_map - > mem_segs [ HEAP_SEG_INDEX ] . end ) ) {
2013-09-10 19:34:09 +04:00
s = & memheap_map - > mem_segs [ HEAP_SEG_INDEX ] ;
} else {
2013-09-14 00:37:30 +04:00
s = bsearch ( va ,
2013-09-10 19:34:09 +04:00
& memheap_map - > mem_segs [ SYMB_SEG_INDEX ] ,
memheap_map - > n_segments - 1 ,
sizeof ( * s ) ,
2013-11-26 16:46:56 +04:00
_seg_cmp ) ;
2013-09-10 19:34:09 +04:00
}
# if MEMHEAP_BASE_DEBUG == 1
if ( s ) {
MEMHEAP_VERBOSE ( 5 , " match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p " ,
s - memheap_map - > mem_segs ,
( long long ) s - > start ,
( long long ) s - > end ,
( long long ) ( s - > end - s - > start ) ,
( void * ) va ) ;
}
# endif
return s ;
}
2013-12-23 13:20:42 +04:00
/**
 * Append the local mkeys of segment `seg` to `msg`.
 *
 * Written layout: the transport count, then for every transport its id,
 * the mkey's va_base, and either the 64-bit key (when va_base is 0) or the
 * 16-bit length followed by `len` raw bytes.
 *
 * @param msg      buffer to append to
 * @param pe       remote pe whose transport list selects the mkeys
 * @param seg      index of the local segment
 * @param all_trs
 *  0 - pack mkeys for transports to given pe
 *  1 - pack mkeys for ALL possible transports. value of pe is ignored
 *
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR when the proc for `pe` cannot be
 *         found, a local mkey is missing, or packing fails. On error `msg`
 *         may already contain a partial record.
 */
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs)
{
    oshmem_proc_t *proc = NULL;
    int i, n, tr_id;
    mca_spml_mkey_t *mkey;

    /* go over all transports to remote pe and pack mkeys */
    if (!all_trs) {
        n = oshmem_get_transport_count(pe);
        proc = oshmem_proc_group_find(oshmem_group_all, pe);
        /* proc is only dereferenced inside the loop, so a missing proc is
         * fatal only when there is at least one transport to walk. */
        if (NULL == proc && n > 0) {
            MEMHEAP_ERROR("seg#%d: failed to find proc for pe %d", seg, pe);
            return OSHMEM_ERROR;
        }
    } else {
        n = memheap_map->num_transports;
    }

    if (OPAL_SUCCESS != opal_dss.pack(msg, &n, 1, OPAL_UINT32)) {
        goto pack_failed;
    }
    MEMHEAP_VERBOSE(5, " found %d transports to %d ", n, pe);

    for (i = 0; i < n; i++) {
        if (!all_trs) {
            tr_id = proc->transport_ids[i];
        } else {
            tr_id = i;
        }

        mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id);
        if (!mkey) {
            MEMHEAP_ERROR(" seg#%d tr_id: %d failed to find local mkey ",
                          seg, tr_id);
            return OSHMEM_ERROR;
        }

        if (OPAL_SUCCESS != opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32) ||
            OPAL_SUCCESS != opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64)) {
            goto pack_failed;
        }

        if (0 == mkey->va_base) {
            /* va_base of 0 marks the form where only the 64-bit key is sent */
            if (OPAL_SUCCESS != opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64)) {
                goto pack_failed;
            }
        } else {
            if (OPAL_SUCCESS != opal_dss.pack(msg, &mkey->len, 1, OPAL_UINT16)) {
                goto pack_failed;
            }
            if (0 < mkey->len &&
                OPAL_SUCCESS != opal_dss.pack(msg, mkey->u.data, mkey->len, OPAL_BYTE)) {
                goto pack_failed;
            }
        }

        MEMHEAP_VERBOSE(5,
                        " seg#%d tr_id: %d %s ",
                        seg, tr_id, mca_spml_base_mkey2str(mkey));
    }
    return OSHMEM_SUCCESS;

pack_failed:
    MEMHEAP_ERROR("seg#%d: failed to pack mkeys for pe %d", seg, pe);
    return OSHMEM_ERROR;
}
/*
 * Attach to a peer's segment that is described by a shared-memory key
 * rather than by an address.
 *
 * Handles the special case when va was obtained using shmget(IPC_PRIVATE):
 *  - key is set as (type|shmid);
 *  - va_base is set as 0;
 *
 * On success mkey->va_base is overwritten with the local address of the
 * attachment. An invalid shmid returns without doing anything; an
 * unrecognised type or a failed attach calls oshmem_shmem_abort(-1).
 */
static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id)
{
    assert(mkey->va_base == 0);

    if (MEMHEAP_SHM_INVALID == (int) MEMHEAP_SHM_GET_ID(mkey->u.key)) {
        return;
    }

    MEMHEAP_VERBOSE(5,
                    " shared memory usage tr_id: %d key %llx base_va %p shmid 0x%X|0x%X ",
                    tr_id,
                    (unsigned long long) mkey->u.key,
                    mkey->va_base,
                    MEMHEAP_SHM_GET_TYPE(mkey->u.key),
                    MEMHEAP_SHM_GET_ID(mkey->u.key));

    if (MAP_SEGMENT_ALLOC_SHM == MEMHEAP_SHM_GET_TYPE(mkey->u.key)) {
        /* System V segment: let the system choose the address. */
        mkey->va_base = shmat(MEMHEAP_SHM_GET_ID(mkey->u.key), 0, 0);
    } else if (MAP_SEGMENT_ALLOC_IBV == MEMHEAP_SHM_GET_TYPE(mkey->u.key)) {
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        openib_device_t *device = NULL;
        struct ibv_mr *ib_mr;
        void *addr;
        static int mr_count;
        int access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ |
                          IBV_ACCESS_NO_RDMA;

        device = (openib_device_t *) memheap_map->mem_segs[HEAP_SEG_INDEX].context;
        assert(device);

        /* workaround mtt problem - request aligned addresses */
        ++mr_count;
        addr = (void *) ((uintptr_t) mca_memheap_base_start_address
                         + mca_memheap_base_mr_interleave_factor * 1024ULL * 1024ULL * 1024ULL * mr_count);
        ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->u.key),
                                  device->ib_pd, addr, access_flag);
        if (NULL == ib_mr) {
            /* Same sentinel shmat() uses, so the check at the end catches it. */
            mkey->va_base = (void *) -1;
            MEMHEAP_ERROR(" error to ibv_reg_shared_mr() errno says %d: %s ",
                          errno, strerror(errno));
        } else {
            if (ib_mr->addr != addr) {
                MEMHEAP_WARN(" Failed to map shared region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d ", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor);
            }
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            mkey->va_base = ib_mr->addr;
        }
#endif /* MPAGE_ENABLE */
    } else {
        MEMHEAP_ERROR(" tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X ",
                      tr_id,
                      (unsigned long long) mkey->u.key,
                      MEMHEAP_SHM_GET_TYPE(mkey->u.key),
                      MEMHEAP_SHM_GET_ID(mkey->u.key));
        oshmem_shmem_abort(-1);
    }

    /* (void *)-1 is the failure value of both attach paths above. */
    if ((void *) -1 == (void *) mkey->va_base) {
        MEMHEAP_ERROR(" tr_id: %d key %llx attach failed: errno = %d ",
                      tr_id, (unsigned long long) mkey->u.key, errno);
        oshmem_shmem_abort(-1);
    }
}
2013-12-23 13:20:42 +04:00
/*
 * Decode one segment's worth of mkeys, as written by pack_local_mkeys(),
 * from `msg` into memheap_oob.mkeys[] (indexed by transport id).
 *
 * For the key-only form (va_base == 0) the segment is attached through
 * memheap_attach_segment() when `remote_pe` is on the local node. For the
 * other form the mkey payload is copied into freshly malloc()ed memory.
 *
 * The message comes from a remote process, so every field is checked: a
 * failed unpack, a transport id outside [0, num_transports), an unknown
 * proc or an allocation failure is reported and followed by
 * oshmem_shmem_abort(-1).
 *
 * Callers in this file set memheap_oob.mkeys to an array of
 * memheap_map->num_transports entries before calling.
 */
static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe)
{
    int32_t cnt;
    int32_t n = 0;
    int32_t tr_id;
    int i;
    oshmem_proc_t *proc;
    mca_spml_mkey_t *mkey;

    proc = oshmem_proc_group_find(oshmem_group_all, remote_pe);

    cnt = 1;
    if (OPAL_SUCCESS != opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32)) {
        goto unpack_failed;
    }

    for (i = 0; i < n; i++) {
        cnt = 1;
        if (OPAL_SUCCESS != opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32)) {
            goto unpack_failed;
        }
        /* tr_id indexes the destination array; never trust the wire value */
        if (tr_id < 0 || tr_id >= memheap_map->num_transports) {
            MEMHEAP_ERROR("pe %d sent out of range transport id %d",
                          remote_pe, (int) tr_id);
            oshmem_shmem_abort(-1);
            return;
        }
        mkey = &memheap_oob.mkeys[tr_id];

        cnt = 1;
        if (OPAL_SUCCESS != opal_dss.unpack(msg, &mkey->va_base, &cnt, OPAL_UINT64)) {
            goto unpack_failed;
        }

        if (0 == mkey->va_base) {
            cnt = 1;
            if (OPAL_SUCCESS != opal_dss.unpack(msg, &mkey->u.key, &cnt, OPAL_UINT64)) {
                goto unpack_failed;
            }
            if (NULL == proc) {
                MEMHEAP_ERROR("failed to find proc for pe %d", remote_pe);
                oshmem_shmem_abort(-1);
                return;
            }
            if (OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
                memheap_attach_segment(mkey, tr_id);
            }
        } else {
            cnt = 1;
            if (OPAL_SUCCESS != opal_dss.unpack(msg, &mkey->len, &cnt, OPAL_UINT16)) {
                goto unpack_failed;
            }
            if (0 < mkey->len) {
                mkey->u.data = malloc(mkey->len);
                if (NULL == mkey->u.data) {
                    MEMHEAP_ERROR(" Failed allocate %d bytes ", mkey->len);
                    oshmem_shmem_abort(-1);
                    return;
                }
                cnt = mkey->len;
                if (OPAL_SUCCESS != opal_dss.unpack(msg, mkey->u.data, &cnt, OPAL_BYTE)) {
                    goto unpack_failed;
                }
            }
        }

        MEMHEAP_VERBOSE(5,
                        " tr_id: %d %s ",
                        tr_id, mca_spml_base_mkey2str(mkey));
    }
    return;

unpack_failed:
    MEMHEAP_ERROR("malformed mkey message from pe %d", remote_pe);
    oshmem_shmem_abort(-1);
}
2013-12-23 13:20:42 +04:00
/*
 * Handle one out-of-band message received from `source_pe`.
 *
 * The first byte of `buffer` selects the action:
 *  - MEMHEAP_RKEY_REQ:       unpack the segment number, pack the local mkeys
 *                            for that segment and send them back as
 *                            MEMHEAP_RKEY_RESP;
 *  - MEMHEAP_RKEY_RESP:      unpack the mkeys into memheap_oob.mkeys and wake
 *                            up the waiter;
 *  - MEMHEAP_RKEY_RESP_FAIL: record the failure and wake up the waiter.
 *
 * Any decoding or send problem (and any unknown message type) is answered
 * with a MEMHEAP_RKEY_RESP_FAIL message to `source_pe`.
 *
 * `buffer` stays owned by the caller. Reply buffers are released by
 * send_buffer().
 */
static void do_recv(int source_pe, opal_buffer_t *buffer)
{
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, " unpacking %d of %d ", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (OPAL_SUCCESS != rc) {
            MEMHEAP_ERROR(" bad RKEY_REQ msg ");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, " *** RKEY REQ ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR(" failed to get msg buffer ");
            ORTE_ERROR_LOG(rc);
            return;
        }

        msg_type = MEMHEAP_RKEY_RESP;
        opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

        if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        /* send_buffer() releases msg whether or not the send succeeds */
        rc = send_buffer(source_pe, msg);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR(" FAILED to send rml message %d ", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, " *** RKEY RESP ");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        unpack_remote_mkeys(buffer, source_pe);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, " *** RKEY RESP FAIL ");
        /* Take the lock before touching the shared state, exactly as the
         * MEMHEAP_RKEY_RESP case does; the unlock below needs a matching
         * lock. */
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, " Unknown message type %x ", msg_type);
        goto send_fail;
    }
    return;

send_fail:
    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR(" failed to get msg buffer ");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = send_buffer(source_pe, msg);
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR(" FAILED to send rml message %d ", rc);
        ORTE_ERROR_LOG(rc);
    }
}
2013-12-23 13:20:42 +04:00
/**
 * simple/fast version of MPI_Test that
 * - only works with persistent request
 * - does not do any progress
 * - can be safely called from within opal_progress()
 *
 * When the request has completed, *completed is set to true, the request's
 * status is copied to *status, the request is marked inactive and the
 * request's MPI_ERROR is returned. Otherwise *completed is set to false
 * and OMPI_SUCCESS is returned; *status is left untouched.
 */
static inline int my_MPI_Test(ompi_request_t **rptr,
                              int *completed,
                              ompi_status_public_t *status)
{
    ompi_request_t *request = *rptr;

    assert(request->req_persistent);

    if (request->req_complete) {
        *completed = true;
        /* The whole status, MPI_ERROR included, is taken from the request.
         * The earlier save/restore of status->MPI_ERROR ran after this copy
         * and therefore changed nothing. */
        *status = request->req_status;
        request->req_state = OMPI_REQUEST_INACTIVE;
        return request->req_status.MPI_ERROR;
    }

    *completed = false;
    return OMPI_SUCCESS;
}
2013-12-24 20:18:48 +04:00
static int oshmem_mkey_recv_cb ( void )
2013-12-23 13:20:42 +04:00
{
MPI_Status status ;
int flag ;
int n ;
int rc ;
opal_buffer_t * msg ;
int32_t size ;
void * tmp_buf ;
n = 0 ;
while ( 1 ) {
my_MPI_Test ( & memheap_oob . recv_req , & flag , & status ) ;
if ( OPAL_LIKELY ( 0 = = flag ) ) {
return n ;
}
MPI_Get_count ( & status , MPI_BYTE , & size ) ;
MEMHEAP_VERBOSE ( 5 , " OOB request from PE: %d, size %d " , status . MPI_SOURCE , size ) ;
n + + ;
/* to avoid deadlock we must start request
* before processing it . Data are copied to
* the tmp buffer
*/
tmp_buf = malloc ( size ) ;
if ( NULL = = tmp_buf ) {
MEMHEAP_ERROR ( " not enough memory " ) ;
ORTE_ERROR_LOG ( 0 ) ;
return n ;
}
memcpy ( tmp_buf , ( void * ) memheap_oob . buf , size ) ;
msg = OBJ_NEW ( opal_buffer_t ) ;
if ( NULL = = msg ) {
MEMHEAP_ERROR ( " not enough memory " ) ;
ORTE_ERROR_LOG ( 0 ) ;
return n ;
}
opal_dss . load ( msg , ( void * ) tmp_buf , size ) ;
rc = MPI_Start ( & memheap_oob . recv_req ) ;
if ( MPI_SUCCESS ! = rc ) {
MEMHEAP_ERROR ( " Failed to post recv request %d " , rc ) ;
ORTE_ERROR_LOG ( rc ) ;
return n ;
}
do_recv ( status . MPI_SOURCE , msg ) ;
OBJ_RELEASE ( msg ) ;
}
return 1 ;
}
2013-09-10 19:34:09 +04:00
/*
 * Set up the out-of-band mkey exchange for `map`.
 *
 * Remembers the map, constructs the lock and condition variable, creates
 * and starts a persistent receive on memheap_oob.buf (any source, tag 0,
 * oshmem_comm_world) and registers oshmem_mkey_recv_cb with opal_progress.
 *
 * Returns the MPI return code of the last MPI call made; the progress
 * callback is registered only when both MPI calls succeed.
 */
int memheap_oob_init(mca_memheap_map_t *map)
{
    int rc;

    memheap_map = map;

    OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t);
    OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t);

    rc = MPI_Recv_init(memheap_oob.buf, sizeof(memheap_oob.buf), MPI_BYTE,
                       MPI_ANY_SOURCE, 0,
                       oshmem_comm_world,
                       &memheap_oob.recv_req);
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR(" Failed to created recv request %d ", rc);
        return rc;
    }

    rc = MPI_Start(&memheap_oob.recv_req);
    if (MPI_SUCCESS == rc) {
        opal_progress_register(oshmem_mkey_recv_cb);
        return rc;
    }

    MEMHEAP_ERROR(" Failed to post recv request %d ", rc);
    return rc;
}
void memheap_oob_destruct ( void )
{
2013-12-23 13:20:42 +04:00
opal_progress_unregister ( oshmem_mkey_recv_cb ) ;
MPI_Cancel ( & memheap_oob . recv_req ) ;
MPI_Request_free ( & memheap_oob . recv_req ) ;
2013-09-10 19:34:09 +04:00
OBJ_DESTRUCT ( & memheap_oob . lck ) ;
OBJ_DESTRUCT ( & memheap_oob . cond ) ;
}
2013-12-23 13:20:42 +04:00
/*
 * Send the contents of `msg` to `pe` (tag 0, oshmem_comm_world) with a
 * blocking MPI_Send.
 *
 * The payload is detached from `msg` and freed, and `msg` itself is
 * released, regardless of the outcome; the caller must not touch `msg`
 * afterwards. Returns the MPI_Send return code.
 */
static int send_buffer(int pe, opal_buffer_t *msg)
{
    void *payload;
    int32_t nbytes;
    int rc;

    opal_dss.unload(msg, &payload, &nbytes);
    rc = MPI_Send(payload, nbytes, MPI_BYTE, pe, 0, oshmem_comm_world);

    free(payload);
    OBJ_RELEASE(msg);

    MEMHEAP_VERBOSE(5, " message sent: dst=%d, rc=%d, %d bytes! ", pe, rc, nbytes);
    return rc;
}
2013-09-10 19:34:09 +04:00
/*
 * Obtain the mkeys of segment `seg` on `pe` and store them in `mkeys`
 * (one entry per transport).
 *
 * The SPML is asked first. If it can produce the keys itself, each entry's
 * va_base is set to the local base address of the segment and no message
 * is sent. Otherwise a MEMHEAP_RKEY_REQ is sent to `pe` and the call waits
 * on memheap_oob.cond until the receive handler records a response.
 *
 * Returns OSHMEM_SUCCESS when the keys were obtained, OSHMEM_ERROR when
 * the request could not be built or sent or the peer answered with a
 * failure.
 */
static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
{
    opal_buffer_t *msg;
    uint8_t cmd;
    int i;
    int rc;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = __seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            " MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s ",
                            pe,
                            i,
                            mca_spml_base_mkey2str(&mkeys[i]));
        }
        return OSHMEM_SUCCESS;
    }

    /* The lock is taken exactly once here and is held until the response
     * has been evaluated; every exit below releases it once. */
    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR(" failed to get msg buffer ");
        return OSHMEM_ERROR;
    }

    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);

    /* send_buffer() releases msg */
    rc = send_buffer(pe, msg);
    if (MPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR(" FAILED to send rml message %d ", rc);
        return OSHMEM_ERROR;
    }

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR(" failed to get rkey seg#%d pe=%d ", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
void mca_memheap_modex_recv_all ( void )
{
int i ;
int j ;
int nprocs , my_pe ;
2013-12-23 13:20:42 +04:00
opal_buffer_t * msg ;
void * send_buffer ;
char * rcv_buffer ;
void * dummy_buffer ;
int32_t size , dummy_size ;
int rc ;
2013-09-10 19:34:09 +04:00
2013-12-23 13:20:42 +04:00
if ( ! mca_memheap_base_key_exchange ) {
2013-12-26 14:53:48 +04:00
oshmem_shmem_barrier ( ) ;
2013-09-10 19:34:09 +04:00
return ;
2013-12-23 13:20:42 +04:00
}
2013-09-10 19:34:09 +04:00
nprocs = oshmem_num_procs ( ) ;
my_pe = oshmem_my_proc_id ( ) ;
2013-12-23 13:20:42 +04:00
/* serialize our own mkeys */
msg = OBJ_NEW ( opal_buffer_t ) ;
if ( NULL = = msg ) {
MEMHEAP_ERROR ( " failed to get msg buffer " ) ;
oshmem_shmem_abort ( - 1 ) ;
return ;
}
for ( j = 0 ; j < memheap_map - > n_segments ; j + + ) {
pack_local_mkeys ( msg , 0 , j , 1 ) ;
}
/* Do allgather */
opal_dss . unload ( msg , & send_buffer , & size ) ;
MEMHEAP_VERBOSE ( 1 , " local keys packed into %d bytes, %d segments " , size , memheap_map - > n_segments ) ;
rcv_buffer = malloc ( size * nprocs ) ;
if ( NULL = = msg ) {
MEMHEAP_ERROR ( " failed to allocate recieve buffer " ) ;
oshmem_shmem_abort ( - 1 ) ;
}
2013-12-26 14:53:48 +04:00
rc = oshmem_shmem_allgather ( send_buffer , rcv_buffer , size ) ;
2013-12-23 13:20:42 +04:00
if ( MPI_SUCCESS ! = rc ) {
MEMHEAP_ERROR ( " allgather failed " ) ;
oshmem_shmem_abort ( - 1 ) ;
}
/* deserialize mkeys */
OPAL_THREAD_LOCK ( & memheap_oob . lck ) ;
2013-09-10 19:34:09 +04:00
for ( i = 0 ; i < nprocs ; i + + ) {
2013-12-23 13:20:42 +04:00
if ( i = = my_pe ) {
2013-09-10 19:34:09 +04:00
continue ;
2013-12-23 13:20:42 +04:00
}
2013-09-10 19:34:09 +04:00
2013-12-23 13:20:42 +04:00
opal_dss . load ( msg , rcv_buffer + i * size , size ) ;
2013-09-10 19:34:09 +04:00
for ( j = 0 ; j < memheap_map - > n_segments ; j + + ) {
2013-12-23 13:20:42 +04:00
map_segment_t * s ;
s = & memheap_map - > mem_segs [ j ] ;
if ( NULL ! = s - > mkeys_cache [ i ] ) {
MEMHEAP_VERBOSE ( 10 , " PE%d: segment%d already exists, mkey will be replaced " , i , j ) ;
} else {
s - > mkeys_cache [ i ] = ( mca_spml_mkey_t * ) calloc ( memheap_map - > num_transports ,
sizeof ( mca_spml_mkey_t ) ) ;
if ( NULL = = s - > mkeys_cache [ i ] ) {
MEMHEAP_ERROR ( " PE%d: segment%d: Failed to allocate mkeys cache entry " , i , j ) ;
oshmem_shmem_abort ( - 1 ) ;
}
2013-09-10 19:34:09 +04:00
}
2013-12-23 13:20:42 +04:00
memheap_oob . mkeys = s - > mkeys_cache [ i ] ;
unpack_remote_mkeys ( msg , i ) ;
2013-09-10 19:34:09 +04:00
}
2013-12-23 13:20:42 +04:00
opal_dss . unload ( msg , & dummy_buffer , & dummy_size ) ;
2013-09-10 19:34:09 +04:00
}
2013-12-23 13:20:42 +04:00
OPAL_THREAD_UNLOCK ( & memheap_oob . lck ) ;
free ( send_buffer ) ;
free ( rcv_buffer ) ;
OBJ_RELEASE ( msg ) ;
2013-09-10 19:34:09 +04:00
2013-12-23 13:20:42 +04:00
if ( 3 = = mca_memheap_base_alloc_type | | 4 = = mca_memheap_base_alloc_type ) {
2013-09-10 19:34:09 +04:00
/* unfortunately we must do barrier here to assure that everyone are attached to our segment
2013-12-23 13:20:42 +04:00
* good thing that this code path only invoked on older linuxes ( - mca memheap_base_alloc_type 3 | 4 )
* that does not support IPC_RMID op on attached segments .
2013-09-10 19:34:09 +04:00
*/
shmem_barrier_all ( ) ;
/* keys exchanged, segments attached, now we can safely cleanup */
if ( memheap_map - > mem_segs [ HEAP_SEG_INDEX ] . type
= = MAP_SEGMENT_ALLOC_SHM ) {
shmctl ( memheap_map - > mem_segs [ HEAP_SEG_INDEX ] . shmid ,
IPC_RMID ,
NULL ) ;
}
}
}
2013-09-14 00:37:30 +04:00
/*
 * Translate the local address `va` into the matching address on a peer.
 *
 * @param va           address inside the local copy of a segment
 * @param local_base   start of that segment locally
 * @param remote_base  start of the same segment on the peer
 * @return va moved by the distance between the two bases
 *
 * The computation is done entirely in uintptr_t. Unsigned arithmetic wraps,
 * so one expression covers both "remote above local" and "remote below
 * local" and no relational comparison of unrelated pointers is needed.
 */
static inline void *va2rva(void *va,
                           void *local_base,
                           void *remote_base)
{
    return (void *) ((uintptr_t) va
                     + (uintptr_t) remote_base
                     - (uintptr_t) local_base);
}
mca_spml_mkey_t * mca_memheap_base_get_cached_mkey ( int pe ,
2013-09-14 00:37:30 +04:00
void * va ,
2013-09-10 19:34:09 +04:00
int btl_id ,
2013-09-14 00:37:30 +04:00
void * * rva )
2013-09-10 19:34:09 +04:00
{
map_segment_t * s ;
int rc ;
mca_spml_mkey_t * mkey ;
2013-09-14 00:37:30 +04:00
MEMHEAP_VERBOSE_FASTPATH ( 10 , " rkey: pe=%d va=%p " , pe , va ) ;
2013-09-10 19:34:09 +04:00
s = __find_va ( va ) ;
if ( NULL = = s )
return NULL ;
if ( ! s - > is_active )
return NULL ;
if ( pe = = oshmem_my_proc_id ( ) ) {
* rva = va ;
2013-09-14 00:37:30 +04:00
MEMHEAP_VERBOSE_FASTPATH ( 10 , " rkey: pe=%d va=%p -> (local) %lx %p " , pe , va ,
2014-01-07 15:56:36 +04:00
s - > mkeys [ btl_id ] . u . key , * rva ) ;
2013-09-10 19:34:09 +04:00
return & s - > mkeys [ btl_id ] ;
}
if ( OPAL_LIKELY ( s - > mkeys_cache [ pe ] ) ) {
mkey = & s - > mkeys_cache [ pe ] [ btl_id ] ;
* rva = va2rva ( va , s - > start , mkey - > va_base ) ;
2014-01-07 15:56:36 +04:00
MEMHEAP_VERBOSE_FASTPATH ( 10 , " rkey: pe=%d va=%p -> (cached) %lx %p " , pe , ( void * ) va , mkey - > u . key , ( void * ) * rva ) ;
2013-09-10 19:34:09 +04:00
return mkey ;
}
s - > mkeys_cache [ pe ] = ( mca_spml_mkey_t * ) calloc ( memheap_map - > num_transports ,
sizeof ( mca_spml_mkey_t ) ) ;
if ( ! s - > mkeys_cache [ pe ] )
return NULL ;
rc = memheap_oob_get_mkeys ( pe ,
s - memheap_map - > mem_segs ,
s - > mkeys_cache [ pe ] ) ;
if ( OSHMEM_SUCCESS ! = rc )
return NULL ;
mkey = & s - > mkeys_cache [ pe ] [ btl_id ] ;
* rva = va2rva ( va , s - > start , mkey - > va_base ) ;
2014-01-07 15:56:36 +04:00
MEMHEAP_VERBOSE_FASTPATH ( 5 , " rkey: pe=%d va=%p -> (remote lookup) %lx %p " , pe , ( void * ) va , mkey - > u . key , ( void * ) * rva ) ;
2013-09-10 19:34:09 +04:00
return mkey ;
}
2013-09-14 00:37:30 +04:00
mca_spml_mkey_t * mca_memheap_base_get_mkey ( void * va , int tr_id )
2013-09-10 19:34:09 +04:00
{
map_segment_t * s ;
s = __find_va ( va ) ;
return ( ( s & & s - > is_active ) ? & s - > mkeys [ tr_id ] : NULL ) ;
}
uint64_t mca_memheap_base_find_offset ( int pe ,
int tr_id ,
2013-09-14 00:37:30 +04:00
void * va ,
void * rva )
2013-09-10 19:34:09 +04:00
{
map_segment_t * s ;
s = __find_va ( va ) ;
2013-12-11 10:35:03 +04:00
return ( ( s & & s - > is_active ) ? ( ( uintptr_t ) rva - ( uintptr_t ) ( s - > mkeys_cache [ pe ] [ tr_id ] . va_base ) ) : 0 ) ;
2013-09-10 19:34:09 +04:00
}
2013-09-14 00:37:30 +04:00
/* Return 1 when `va` lies inside one of the known segments, 0 otherwise. */
int mca_memheap_base_is_symmetric_addr(const void *va)
{
    return NULL != __find_va(va);
}
2013-09-14 00:37:30 +04:00
int mca_memheap_base_detect_addr_type ( void * va )
2013-09-10 19:34:09 +04:00
{
int addr_type = ADDR_INVALID ;
map_segment_t * s ;
s = __find_va ( va ) ;
if ( s ) {
if ( s - > type = = MAP_SEGMENT_STATIC ) {
addr_type = ADDR_STATIC ;
2013-09-14 00:37:30 +04:00
} else if ( ( uintptr_t ) va > = ( uintptr_t ) s - > start
2013-12-11 10:35:03 +04:00
& & ( uintptr_t ) va < ( uintptr_t ) ( ( uintptr_t ) s - > start + mca_memheap . memheap_size ) ) {
2013-09-10 19:34:09 +04:00
addr_type = ADDR_USER ;
} else {
2013-12-11 10:35:03 +04:00
assert ( ( uintptr_t ) va > = ( uintptr_t ) ( ( uintptr_t ) s - > start + mca_memheap . memheap_size ) & & ( uintptr_t ) va < ( uintptr_t ) s - > end ) ;
2013-09-10 19:34:09 +04:00
addr_type = ADDR_PRIVATE ;
}
}
return addr_type ;
}