2005-05-24 02:06:50 +04:00
|
|
|
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*/
|
|
|
|
|
2005-05-24 02:22:20 +04:00
|
|
|
#ifndef MCA_PML_OB1_H
|
|
|
|
#define MCA_PML_OB1_H
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2005-06-10 00:16:33 +04:00
|
|
|
#include "ompi_config.h"
|
2005-08-15 15:02:01 +04:00
|
|
|
#include "opal/threads/threads.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/class/ompi_free_list.h"
|
|
|
|
#include "ompi/request/request.h"
|
|
|
|
#include "ompi/mca/pml/pml.h"
|
|
|
|
#include "ompi/mca/pml/base/pml_base_request.h"
|
|
|
|
#include "ompi/mca/pml/base/pml_base_bsend.h"
|
|
|
|
#include "ompi/mca/pml/base/pml_base_sendreq.h"
|
|
|
|
#include "ompi/mca/btl/btl.h"
|
2005-08-14 07:14:20 +04:00
|
|
|
#include "ompi/datatype/datatype.h"
|
2006-07-20 18:44:35 +04:00
|
|
|
#include "pml_ob1_hdr.h"
|
|
|
|
#include "ompi/mca/bml/base/base.h"
|
2006-08-14 19:43:03 +04:00
|
|
|
#include "ompi/proc/proc.h"
|
2005-05-24 02:06:50 +04:00
|
|
|
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
/**
|
2005-05-24 02:22:20 +04:00
|
|
|
* OB1 PML module
|
2005-05-24 02:06:50 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
struct mca_pml_ob1_t {
|
|
|
|
mca_pml_base_module_t super;
|
|
|
|
|
|
|
|
int priority;
|
2005-06-09 00:37:19 +04:00
|
|
|
int free_list_num; /* initial size of free list */
|
|
|
|
int free_list_max; /* maximum size of free list */
|
|
|
|
int free_list_inc; /* number of elements to grow free list */
|
2005-06-30 09:50:55 +04:00
|
|
|
size_t eager_limit; /* maximum eager limit size - overrides btl setting */
|
2005-06-01 18:34:22 +04:00
|
|
|
size_t send_pipeline_depth;
|
|
|
|
size_t recv_pipeline_depth;
|
2005-06-23 23:24:44 +04:00
|
|
|
bool leave_pinned;
|
2006-02-09 18:49:51 +03:00
|
|
|
int leave_pinned_pipeline;
|
|
|
|
|
2005-05-24 02:06:50 +04:00
|
|
|
/* lock queue access */
|
2005-07-04 02:45:48 +04:00
|
|
|
opal_mutex_t lock;
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2005-06-07 18:12:47 +04:00
|
|
|
/* free lists */
|
2005-05-24 02:06:50 +04:00
|
|
|
ompi_free_list_t send_requests;
|
|
|
|
ompi_free_list_t recv_requests;
|
2005-06-10 00:16:33 +04:00
|
|
|
ompi_free_list_t rdma_frags;
|
|
|
|
ompi_free_list_t recv_frags;
|
2006-07-20 18:44:35 +04:00
|
|
|
ompi_free_list_t pending_pckts;
|
2005-06-07 18:12:47 +04:00
|
|
|
ompi_free_list_t buffers;
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2005-06-10 00:16:33 +04:00
|
|
|
/* list of pending operations */
|
2006-07-20 18:44:35 +04:00
|
|
|
opal_list_t pckt_pending;
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_t send_pending;
|
|
|
|
opal_list_t recv_pending;
|
|
|
|
opal_list_t rdma_pending;
|
2005-08-12 06:41:14 +04:00
|
|
|
bool enabled;
|
2005-05-24 02:06:50 +04:00
|
|
|
};
|
|
|
|
typedef struct mca_pml_ob1_t mca_pml_ob1_t;
|
|
|
|
|
|
|
|
/* The OB1 PML module instance; only declared here, defined outside this header. */
extern mca_pml_ob1_t mca_pml_ob1;
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PML module functions.
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_component_open(void);
|
|
|
|
extern int mca_pml_ob1_component_close(void);
|
|
|
|
|
|
|
|
extern mca_pml_base_module_t* mca_pml_ob1_component_init(
|
|
|
|
int *priority,
|
|
|
|
bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_component_fini(void);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PML interface functions.
|
|
|
|
*/
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_add_comm(
|
|
|
|
struct ompi_communicator_t* comm
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_del_comm(
|
|
|
|
struct ompi_communicator_t* comm
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_add_procs(
|
|
|
|
struct ompi_proc_t **procs,
|
|
|
|
size_t nprocs
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_del_procs(
|
|
|
|
struct ompi_proc_t **procs,
|
|
|
|
size_t nprocs
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_enable(
|
|
|
|
bool enable
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_progress(void);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_iprobe(
|
|
|
|
int dst,
|
|
|
|
int tag,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
int *matched,
|
|
|
|
ompi_status_public_t* status
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_probe(
|
|
|
|
int dst,
|
|
|
|
int tag,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
ompi_status_public_t* status
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_isend_init(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int dst,
|
|
|
|
int tag,
|
|
|
|
mca_pml_base_send_mode_t mode,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
struct ompi_request_t **request
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_isend(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int dst,
|
|
|
|
int tag,
|
|
|
|
mca_pml_base_send_mode_t mode,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
struct ompi_request_t **request
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_send(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int dst,
|
|
|
|
int tag,
|
|
|
|
mca_pml_base_send_mode_t mode,
|
|
|
|
struct ompi_communicator_t* comm
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_irecv_init(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int src,
|
|
|
|
int tag,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
struct ompi_request_t **request
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_irecv(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int src,
|
|
|
|
int tag,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
struct ompi_request_t **request
|
|
|
|
);
|
|
|
|
|
|
|
|
extern int mca_pml_ob1_recv(
|
|
|
|
void *buf,
|
|
|
|
size_t count,
|
|
|
|
ompi_datatype_t *datatype,
|
|
|
|
int src,
|
|
|
|
int tag,
|
|
|
|
struct ompi_communicator_t* comm,
|
|
|
|
ompi_status_public_t* status
|
|
|
|
);
|
|
|
|
|
2006-03-20 18:41:45 +03:00
|
|
|
/*
 * Dump PML information for a communicator.
 *
 * comm     communicator to report on
 * verbose  verbosity selector (meaning of the values is defined by the
 *          implementation, which is not visible here)
 */
int mca_pml_ob1_dump(struct ompi_communicator_t *comm, int verbose);
|
|
|
|
|
2005-05-24 02:06:50 +04:00
|
|
|
extern int mca_pml_ob1_start(
|
|
|
|
size_t count,
|
|
|
|
ompi_request_t** requests
|
|
|
|
);
|
|
|
|
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-08-18 21:06:35 +04:00
|
|
|
/*
 * Forward a descriptor allocation to MCA_BML_BASE_BTL_DES_ALLOC.
 *
 * bml_btl, des and size are passed through unchanged.  The third argument
 * given to the BML macro is the size of one OB1 header plus the size of
 * 16 BTL segment descriptors.
 */
#define MCA_PML_OB1_DES_ALLOC(bml_btl, des, size)                           \
    MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des,                                \
        sizeof(mca_pml_ob1_hdr_t) + (16 * sizeof(mca_btl_base_segment_t)),  \
        size)
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2005-09-13 02:28:23 +04:00
|
|
|
|
|
|
|
/**
 * Pairs an RDMA BTL with a memory registration.
 */
typedef struct mca_pml_ob1_rdma_reg_t {
    struct mca_bml_base_btl_t *bml_btl;              /* the BTL */
    struct mca_mpool_base_registration_t *btl_reg;   /* registration associated with it */
} mca_pml_ob1_rdma_reg_t;
|
|
|
|
|
|
|
|
/* NOTE(review): presumably the upper bound on mca_pml_ob1_rdma_reg_t entries
 * kept per request; the constant is not used inside this header -- confirm
 * against its users. */
#define MCA_PML_OB1_MAX_REGISTRATIONS 4
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
struct mca_pml_ob1_pckt_pending_t {
|
|
|
|
ompi_free_list_item_t super;
|
|
|
|
ompi_proc_t* proc;
|
|
|
|
mca_pml_ob1_hdr_t hdr;
|
2006-12-03 13:12:09 +03:00
|
|
|
struct mca_bml_base_btl_t *bml_btl;
|
2006-07-20 18:44:35 +04:00
|
|
|
};
|
|
|
|
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
|
|
|
|
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
|
|
|
|
|
|
|
|
/*
 * Take a pending-packet item from the mca_pml_ob1.pending_pckts free list.
 *
 * pckt  lvalue that receives the item as a mca_pml_ob1_pckt_pending_t pointer
 * rc    lvalue handed to OMPI_FREE_LIST_WAIT as its status output
 *
 * The scratch pointer has a macro-private name so that a caller variable
 * called "item" is not shadowed by it, and pckt is parenthesized so any
 * lvalue expression can be passed.
 */
#define MCA_PML_OB1_PCKT_PENDING_ALLOC(pckt,rc)                         \
do {                                                                    \
    ompi_free_list_item_t* _pckt_alloc_item;                            \
    OMPI_FREE_LIST_WAIT(&mca_pml_ob1.pending_pckts,                     \
                        _pckt_alloc_item, rc);                          \
    (pckt) = (mca_pml_ob1_pckt_pending_t*)_pckt_alloc_item;             \
} while (0)
|
|
|
|
|
|
|
|
/*
 * Give a pending-packet item back to the mca_pml_ob1.pending_pckts free list.
 *
 * pckt  pointer to the mca_pml_ob1_pckt_pending_t being returned
 *
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. "base + 1") is converted as a whole rather than having the
 * cast bind to its first operand only.
 */
#define MCA_PML_OB1_PCKT_PENDING_RETURN(pckt)                           \
do {                                                                    \
    /* return packet */                                                 \
    OMPI_FREE_LIST_RETURN(&mca_pml_ob1.pending_pckts,                   \
                          (ompi_free_list_item_t*)(pckt));              \
} while(0)
|
|
|
|
|
2006-12-03 13:12:09 +03:00
|
|
|
/*
 * Queue a FIN control packet on mca_pml_ob1.pckt_pending.
 *
 * P  process stored in the packet's proc field
 * D  pointer stored in hdr.hdr_fin.hdr_des.pval
 * B  BTL stored in the packet's bml_btl field
 *
 * A packet is taken with MCA_PML_OB1_PCKT_PENDING_ALLOC, filled in as a
 * MCA_PML_OB1_HDR_TYPE_FIN header and appended to the list while holding
 * mca_pml_ob1.lock.  The status written by the allocation macro is stored
 * in a local and not examined.
 */
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B)                         \
do {                                                                    \
    mca_pml_ob1_pckt_pending_t *_fin_pckt;                              \
    int _fin_rc;                                                        \
                                                                        \
    MCA_PML_OB1_PCKT_PENDING_ALLOC(_fin_pckt, _fin_rc);                 \
                                                                        \
    _fin_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;      \
    _fin_pckt->hdr.hdr_fin.hdr_des.pval = (D);                          \
    _fin_pckt->proc = (P);                                              \
    _fin_pckt->bml_btl = (B);                                           \
                                                                        \
    OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                                \
    opal_list_append(&mca_pml_ob1.pckt_pending,                         \
                     (opal_list_item_t*)_fin_pckt);                     \
    OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);                              \
} while(0)
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Attempt to send a FIN for hdr_des to proc over one specific BTL.
 * mca_pml_ob1_send_fin() treats any return value other than OMPI_SUCCESS
 * as "not sent".  Defined outside this header.
 *
 * NOTE(review): this prototype follows the closing of the extern "C"
 * block earlier in the header -- confirm that is intended if the header is
 * ever included from C++.
 */
int mca_pml_ob1_send_fin_btl(ompi_proc_t *proc,
                             mca_bml_base_btl_t *bml_btl,
                             void *hdr_des);
|
2006-12-03 13:12:09 +03:00
|
|
|
static inline int mca_pml_ob1_send_fin(ompi_proc_t* proc, void *hdr_des,
|
|
|
|
mca_bml_base_btl_t* bml_btl)
|
2006-07-20 18:44:35 +04:00
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
mca_bml_base_endpoint_t* endpoint =
|
|
|
|
(mca_bml_base_endpoint_t*)proc->proc_bml;
|
|
|
|
|
2006-12-03 13:12:09 +03:00
|
|
|
/* Some BTLs (e.g TCP) can report put/get completion before data actually
|
|
|
|
* hits the buffer on the other side. For this kind of BTLs we need to send
|
|
|
|
* FIN through the same BTL PUT was performed with so network will handle
|
|
|
|
* ordering for us. If we will use another BTL, receiver can get FIN before
|
|
|
|
* data will hit the buffer and complete request prematurely. We mark such
|
|
|
|
* problematic BTLs with MCA_BTL_FLAGS_FAKE_RDMA flag (this kind of RDMA
|
|
|
|
* is really fake, because the real one guaranties that sender will see the
|
|
|
|
* completion only after receiver's NIC confirmed that all the data was
|
|
|
|
* received)
|
|
|
|
*/
|
|
|
|
if(bml_btl->btl_flags & MCA_BTL_FLAGS_FAKE_RDMA) {
|
2006-07-20 18:44:35 +04:00
|
|
|
if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
|
|
|
|
return OMPI_SUCCESS;
|
2006-12-03 13:12:09 +03:00
|
|
|
} else {
|
|
|
|
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager);
|
|
|
|
i++) {
|
|
|
|
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
|
|
|
if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
bml_btl = NULL;
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
|
|
|
|
2006-12-03 13:12:09 +03:00
|
|
|
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl);
|
2006-07-20 18:44:35 +04:00
|
|
|
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
2006-10-29 12:12:24 +03:00
|
|
|
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
|
|
|
|
* Packets are added to the queue when sending of FIN or ACK is failed due to
|
|
|
|
* resource unavailability. bml_btl passed to the function doesn't represents
|
|
|
|
* packet's destination, it represents BTL on which resource was freed, so only
|
|
|
|
* this BTL should be considered for resending packets */
|
2006-07-20 18:44:35 +04:00
|
|
|
void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl);
|
2006-10-29 12:12:24 +03:00
|
|
|
|
|
|
|
/*
 * Retry failed PUT/GET operations.
 *
 * When an RDMA operation cannot be accomplished for some reason, its frag
 * is put on the rdma_pending list and the operation is retried later.  The
 * destination of the RDMA operation is stored inside the frag structure,
 * which is why no argument is needed.
 */
void mca_pml_ob1_process_pending_rdma(void);
|
|
|
|
|
|
|
|
/*
 * Drive every non-empty pending list of mca_pml_ob1.
 *
 * The lists are checked in the order pckt_pending, recv_pending,
 * send_pending, rdma_pending; for each one that holds at least one element
 * the matching processing routine is called.  bml_btl is forwarded to the
 * packet and send-request routines only.
 *
 * mca_pml_ob1_recv_request_process_pending() and
 * mca_pml_ob1_send_request_process_pending() are not declared in this
 * header, so their declarations must be in scope wherever the macro is
 * expanded.
 */
#define MCA_PML_OB1_PROGRESS_PENDING(bml_btl)                           \
do {                                                                    \
    if (opal_list_get_size(&mca_pml_ob1.pckt_pending) > 0) {            \
        mca_pml_ob1_process_pending_packets(bml_btl);                   \
    }                                                                   \
    if (opal_list_get_size(&mca_pml_ob1.recv_pending) > 0) {            \
        mca_pml_ob1_recv_request_process_pending();                     \
    }                                                                   \
    if (opal_list_get_size(&mca_pml_ob1.send_pending) > 0) {            \
        mca_pml_ob1_send_request_process_pending(bml_btl);              \
    }                                                                   \
    if (opal_list_get_size(&mca_pml_ob1.rdma_pending) > 0) {            \
        mca_pml_ob1_process_pending_rdma();                             \
    }                                                                   \
} while (0)
|
2006-05-08 00:54:44 +04:00
|
|
|
/*
 * Compute the total number of payload bytes on the supplied descriptor.
 *
 * segments  array (or pointer expression) whose elements have a seg_len
 *           member
 * count     number of elements of segments to add up
 * hdrlen    number of header bytes to subtract from the sum
 * length    lvalue accumulator: the seg_len values are ADDED to its current
 *           contents and hdrlen is then subtracted, so the caller must
 *           initialize it (normally to 0) before using the macro
 *
 * Every argument is parenthesized in the expansion and the loop index has a
 * macro-private name, so expression arguments and a caller variable named
 * "i" are handled correctly.
 */
#define MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
do {                                                                        \
    size_t _seg_idx;                                                        \
                                                                            \
    for( _seg_idx = 0; _seg_idx < (count); _seg_idx++ ) {                   \
        (length) += (segments)[_seg_idx].seg_len;                           \
    }                                                                       \
    (length) -= (hdrlen);                                                   \
} while(0)
|
|
|
|
|
2005-05-24 02:06:50 +04:00
|
|
|
#endif
|