1. The send path gets shorter. A BTL is now allowed to return > 0 to indicate that the
   descriptor was pushed to the network, and that the memory attached to it is
   available again for the upper layer. The MCA_BTL_DES_SEND_ALWAYS_CALLBACK flag
   can be used by the PML to force the BTL to always trigger the callback.
   Unmodified BTLs will continue to work as expected, as they return OMPI_SUCCESS,
   which forces the PML to behave exactly as before (see the sketch after this
   list). The following BTLs have been modified: self, sm, tcp, mx.
2. Add a send-immediate interface to the BTL.
   The idea is to provide a mechanism that allows a BTL to take advantage of
   send optimizations such as the ability to deliver data "inline". Some
   network APIs such as Portals allow data to be sent using a "thin" event
   without packing data into a memory descriptor. This interface change
   allows the BTL to use such capabilities and leaves room for other
   optimizations in the future. All existing BTLs except Portals and sm leave
   this interface set to NULL.
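
A minimal sketch of how a PML consumes the new tri-state return code (the
helper below is hypothetical, modeled on the ob1 changes in this commit):

static int pml_sketch_send( mca_bml_base_btl_t* bml_btl,
                            mca_btl_base_descriptor_t* des,
                            mca_btl_base_tag_t tag )
{
    int rc = mca_bml_base_send( bml_btl, des, tag );

    if( OPAL_LIKELY(rc >= 0) ) {
        if( 1 == rc ) {
            /* rc > 0: the data was pushed to the network and the memory
             * attached to the descriptor is reusable right away; the send
             * callback only fires if MCA_BTL_DES_SEND_ALWAYS_CALLBACK was
             * set on the descriptor. */
            MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
        }
        /* rc == OMPI_SUCCESS: pre-existing behavior, completion will be
         * signaled later through des->des_cbfunc. */
        return OMPI_SUCCESS;
    }
    /* rc < 0: hard failure, the descriptor is still owned by the caller. */
    mca_bml_base_free( bml_btl, des );
    return rc;
}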

This commit was SVN r18551.
This commit is contained in:
George Bosilca 2008-05-30 03:58:39 +00:00
parent 4da4c44210
commit e361bcb64c
49 changed files: 927 additions and 408 deletions


@ -69,6 +69,7 @@ struct mca_bml_base_btl_t {
mca_btl_base_module_alloc_fn_t btl_alloc;
mca_btl_base_module_free_fn_t btl_free;
mca_btl_base_module_send_fn_t btl_send;
mca_btl_base_module_sendi_fn_t btl_sendi;
mca_btl_base_module_prepare_fn_t btl_prepare_src;
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_put_fn_t btl_put;
@ -304,6 +305,22 @@ static inline int mca_bml_base_send_status(
return bml_btl->btl_send(bml_btl->btl, bml_btl->btl_endpoint, des, tag);
}
static inline int mca_bml_base_sendi(
mca_bml_base_btl_t* bml_btl,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** descriptor)
{
return bml_btl->btl_sendi(bml_btl->btl, bml_btl->btl_endpoint,
convertor, header, header_size,
payload_size, order, flags, tag, descriptor);
}
static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) {
des->des_context = (void*) bml_btl;
return bml_btl->btl_put(


@ -296,6 +296,7 @@ int mca_bml_r2_add_procs(
bml_btl->btl_prepare_src = btl->btl_prepare_src;
bml_btl->btl_prepare_dst = btl->btl_prepare_dst;
bml_btl->btl_send = btl->btl_send;
bml_btl->btl_sendi = btl->btl_sendi;
bml_btl->btl_flags = btl->btl_flags;
bml_btl->btl_put = btl->btl_put;
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == bml_btl->btl_put) ) {


@ -118,9 +118,7 @@
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
BEGIN_C_DECLS
/*
* BTL types
@ -274,7 +272,14 @@ typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
#define MCA_BTL_DES_FLAGS_PRIORITY 0x0001
/* Allow the BTL to dispose of the descriptor once the
 * associated callback has been triggered.
 */
#define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP 0x0002
/* Allow the BTL to avoid calling the descriptor callback
 * if the send succeeded in btl_send (i.e. in the fast path).
 */
#define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004
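
Upper layers OR these flags together when allocating a descriptor; for
instance (mirroring the ob1 changes later in this commit):

    mca_bml_base_alloc( bml_btl, &fin, MCA_BTL_NO_ORDER,
                        sizeof(mca_pml_ob1_fin_hdr_t),
                        MCA_BTL_DES_FLAGS_PRIORITY |
                        MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                        MCA_BTL_DES_SEND_ALWAYS_CALLBACK );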
/**
* Maximum number of allowed segments in src/dst fields of a descriptor.
@ -624,6 +629,45 @@ typedef int (*mca_btl_base_module_send_fn_t)(
mca_btl_base_tag_t tag
);
/**
* Initiate an immediate blocking send.
* Completion Semantics: the BTL will make a best effort
* to send the header and "payload_size" bytes from the datatype using the convertor.
* The header is guaranteed to be delivered entirely in the first segment.
* Should the BTL be unable to deliver the data due to resource constraints,
* the BTL will return a descriptor (via the OUT param)
* of size "payload_size + header_size".
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param convertor (IN) Data type convertor
* @param header (IN) Pointer to header.
* @param header_size (IN) Size of header.
* @param payload_size (IN) Size of payload (from convertor).
* @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER)
* @param tag (IN) The tag value used to notify the peer.
* @param descriptor (OUT) The descriptor returned if the data could not be sent immediately
* @retval OMPI_SUCCESS The send was successfully queued
* @retval OMPI_ERROR The send failed
* @retval OMPI_ERR_UNREACH The endpoint is not reachable
* @retval OMPI_ERR_RESOURCE_BUSY The BTL is busy; a descriptor will be returned
*                                (via the OUT param) if descriptors are available
*/
typedef int (*mca_btl_base_module_sendi_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** descriptor
);
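
As a usage illustration, a minimal hypothetical caller of the new interface
(the helper name and the fallback comment are illustrative, not part of this
commit) could look like this:

static int pml_sketch_isend_small( mca_bml_base_btl_t* bml_btl,
                                   struct ompi_convertor_t* convertor,
                                   void* header, size_t header_size,
                                   size_t payload_size,
                                   mca_btl_base_tag_t tag )
{
    mca_btl_base_descriptor_t* des = NULL;
    int rc;

    if( NULL == bml_btl->btl_sendi ) {
        return OMPI_ERROR;  /* BTL left the interface NULL: use btl_send */
    }
    rc = mca_bml_base_sendi( bml_btl, convertor, header, header_size,
                             payload_size, MCA_BTL_NO_ORDER, 0, tag, &des );
    if( OMPI_SUCCESS == rc ) {
        return OMPI_SUCCESS;  /* the data is already on the wire */
    }
    if( OMPI_ERR_RESOURCE_BUSY == rc && NULL != des ) {
        /* The BTL could not send inline but returned a descriptor of size
         * header_size + payload_size: pack header and payload into
         * des->des_src and hand it to mca_bml_base_send(). */
    }
    return rc;
}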
/**
* Initiate an asynchronous put.
@ -730,6 +774,7 @@ struct mca_btl_base_module_t {
mca_btl_base_module_prepare_fn_t btl_prepare_src;
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_send_fn_t btl_send;
mca_btl_base_module_sendi_fn_t btl_sendi;
mca_btl_base_module_put_fn_t btl_put;
mca_btl_base_module_get_fn_t btl_get;
mca_btl_base_module_dump_fn_t btl_dump;
@ -763,8 +808,6 @@ typedef struct mca_btl_base_module_t mca_btl_base_module_t;
/* btl v1.0 */ \
"btl", 1, 0, 0
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
END_C_DECLS
#endif /* OMPI_MCA_BTL_H */


@ -661,6 +661,7 @@ mca_btl_elan_module_t mca_btl_elan_module = {
mca_btl_elan_prepare_src,
mca_btl_elan_prepare_dst,
mca_btl_elan_send,
NULL, /* send immediate */
mca_btl_elan_put,
mca_btl_elan_get,
mca_btl_elan_dump,


@ -79,6 +79,7 @@ mca_btl_gm_module_t mca_btl_gm_module = {
mca_btl_gm_prepare_dst,
#if OMPI_ENABLE_MPI_THREADS || OMPI_ENABLE_PROGRESS_THREADS
mca_btl_gm_send,
NULL, /* send immediate */
mca_btl_gm_put,
mca_btl_gm_get,
#else


@ -457,17 +457,22 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
}
if( mx_result ) {
mx_return = mx_forget( mx_btl->mx_endpoint, &(frag->mx_request) );
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS);
if( btl_ownership ) {
MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
}
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
orte_output( 0, "mx_forget failed with error %d (%s)\n",
mx_return, mx_strerror(mx_return) );
return OMPI_ERROR;
}
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return OMPI_SUCCESS;
}
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS);
}
if( btl_ownership ) {
MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
}
return 1;
}
}
#endif
if( 4096 > total_length ) {
@ -475,18 +480,22 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
uint32_t mx_result;
/* let's check for completeness */
mx_return = mx_test( mx_btl->mx_endpoint, &(frag->mx_request), &mx_status, &mx_result );
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) )
return OMPI_SUCCESS;
/* call the completion callback */
mx_return = mx_test( mx_btl->mx_endpoint, &(frag->mx_request),
&mx_status, &mx_result );
if( OPAL_LIKELY(MX_SUCCESS == mx_return) ) {
if( mx_result ) {
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS);
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS);
}
if( btl_ownership ) {
MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
}
return OMPI_SUCCESS;
return 1;
}
}
}
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return OMPI_SUCCESS;
}
@ -552,6 +561,7 @@ mca_btl_mx_module_t mca_btl_mx_module = {
mca_btl_mx_prepare_src,
mca_btl_mx_prepare_dst,
mca_btl_mx_send,
NULL, /* send immediate */
mca_btl_mx_put, /* put */
NULL, /* get */
mca_btl_base_dump,


@ -620,8 +620,10 @@ int mca_btl_mx_component_progress(void)
if( MCA_BTL_MX_SEND == frag->type ) { /* it's a send */
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
/* call the completion callback */
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS );
}
if( btl_ownership ) {
MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
}


@ -60,6 +60,7 @@ mca_btl_ud_module_t mca_btl_ofud_module = {
mca_btl_ud_prepare_src,
NULL, /*mca_btl_ud_prepare_dst */
mca_btl_ud_send,
NULL, /* send immediate */
NULL, /*mca_btl_ud_put */
NULL, /*mca_btl_ud_get */
mca_btl_base_dump,


@ -82,6 +82,7 @@ mca_btl_openib_module_t mca_btl_openib_module = {
mca_btl_openib_prepare_src,
mca_btl_openib_prepare_dst,
mca_btl_openib_send,
NULL, /* send immediate */
mca_btl_openib_put,
mca_btl_openib_get,
mca_btl_base_dump,


@ -65,6 +65,7 @@ mca_btl_portals_module_t mca_btl_portals_module = {
mca_btl_portals_prepare_src,
mca_btl_portals_prepare_dst,
mca_btl_portals_send,
mca_btl_portals_sendi,
mca_btl_portals_put,
mca_btl_portals_get,
mca_btl_base_dump,
@ -251,7 +252,7 @@ mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base,
}
frag->base.des_src_cnt = 1;
frag->base.des_flags = flags;
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;


@ -204,6 +204,18 @@ int mca_btl_portals_send(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_descriptor_t* descriptor,
mca_btl_base_tag_t tag);
int mca_btl_portals_sendi(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* endpoint,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** des);
int mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor);


@ -70,7 +70,7 @@ mca_btl_portals_component_t mca_btl_portals_component = {
};
static orte_output_stream_t portals_output_stream;
static opal_output_stream_t portals_output_stream;
int
mca_btl_portals_component_open(void)
@ -84,7 +84,7 @@ mca_btl_portals_component_open(void)
*/
/* start up debugging output */
OBJ_CONSTRUCT(&portals_output_stream, orte_output_stream_t);
OBJ_CONSTRUCT(&portals_output_stream, opal_output_stream_t);
portals_output_stream.lds_is_debugging = true;
portals_output_stream.lds_want_stdout = true;
portals_output_stream.lds_file_suffix = "btl-portals";
@ -368,10 +368,11 @@ mca_btl_portals_component_progress(void)
}
break;
case PTL_EVENT_PUT_START:
tag = ((unsigned char*) (&ev.hdr_data))[7];
/* generated on destination (target) when a put into memory starts */
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PTL_EVENT_PUT_START for 0x%lx, %d",
(unsigned long) frag, (int) ev.hdr_data));
(unsigned long) frag, (int) tag));
#if OMPI_ENABLE_DEBUG
if (ev.ni_fail_type != PTL_NI_OK) {
@ -381,7 +382,7 @@ mca_btl_portals_component_progress(void)
}
#endif
/* if it's a pending unexpected receive, do bookkeeping. */
if (ev.hdr_data < MCA_BTL_TAG_MAX) {
if (tag < MCA_BTL_TAG_MAX) {
block = ev.md.user_ptr;
OPAL_THREAD_ADD32(&(block->pending), 1);
}
@ -389,10 +390,11 @@ mca_btl_portals_component_progress(void)
break;
case PTL_EVENT_PUT_END:
tag = ((unsigned char*) (&ev.hdr_data))[7];
/* generated on destination (target) when a put into memory ends */
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PTL_EVENT_PUT_END for 0x%lx, %d",
(unsigned long) frag, (int) ev.hdr_data));
(unsigned long) frag, (int) tag));
#if OMPI_ENABLE_DEBUG
if (ev.ni_fail_type != PTL_NI_OK) {
@ -404,13 +406,43 @@ mca_btl_portals_component_progress(void)
}
#endif
/* if it's an unexpected receive, do bookkeeping and send to PML */
if (ev.hdr_data < MCA_BTL_TAG_MAX) {
if (tag < MCA_BTL_TAG_MAX) {
block = ev.md.user_ptr;
tag = ev.hdr_data;
frag = &mca_btl_portals_module.portals_recv_frag;
if(ev.match_bits) {
uint8_t header_size = ((uint8_t*) (&ev.hdr_data))[6];
memcpy(frag->data, &ev.match_bits, header_size > 8 ? 8 : header_size);
if(header_size > 8) {
memcpy(frag->data+8, &ev.hdr_data, header_size - 8);
}
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,"received %x %x %x %x %x %x %x %x %x %x : header_size %x : tag %x \n",
frag->data[0],
frag->data[1],
frag->data[2],
frag->data[3],
frag->data[4],
frag->data[5],
frag->data[6],
frag->data[7],
frag->data[8],
frag->data[9],
header_size,
tag
));
OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,"received %d bytes \n", ev.mlength));
frag->segments[0].seg_addr.pval = &frag->data;
frag->segments[0].seg_len = header_size;
if(ev.mlength) {
frag->segments[1].seg_addr.pval = ((((char*) ev.md.start) + ev.offset));
frag->segments[1].seg_len = ev.mlength;
frag->base.des_dst_cnt = 2;
} else {
frag->base.des_dst_cnt = 1;
}
} else {
/* if we ever make this thread hot, need to do
something with the receive fragments */
frag = &mca_btl_portals_module.portals_recv_frag;
frag->segments[0].seg_addr.pval = (((char*) ev.md.start) + ev.offset);
frag->segments[0].seg_len = ev.mlength;
@ -418,7 +450,7 @@ mca_btl_portals_component_progress(void)
"received send fragment 0x%lx (thresh: %d, length %d)",
(unsigned long) frag,
ev.md.threshold, (int) ev.mlength));
}
if (ev.md.length - (ev.offset + ev.mlength) < (ptl_size_t) ev.md.max_size ||
ev.md.threshold == 1) {
/* the block is full. It's deactivated automagically, but we
@ -458,11 +490,11 @@ mca_btl_portals_component_progress(void)
"PTL_EVENT_REPLY_END for 0x%lx",
(unsigned long) frag));
/* let the PML know we're done */
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
OMPI_SUCCESS);
if( btl_ownership ) {
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"in PTL_EVENT_REPLY_END received a frag with btl_ownership!"));
@ -505,10 +537,12 @@ mca_btl_portals_component_progress(void)
if (ev.ni_fail_type != PTL_NI_OK) {
orte_output(mca_btl_portals_component.portals_output,
"Failure to end send event\n");
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
OMPI_ERROR);
}
if( btl_ownership ) {
mca_btl_portals_free(&mca_btl_portals_module.super,
&frag->base);
@ -517,10 +551,12 @@ mca_btl_portals_component_progress(void)
#endif
if(!mca_btl_portals_component.portals_need_ack) {
/* my part's done, in portals we trust! */
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
OMPI_SUCCESS);
}
if( btl_ownership ) {
mca_btl_portals_free(&mca_btl_portals_module.super,
&frag->base);
@ -553,10 +589,12 @@ mca_btl_portals_component_progress(void)
if (ev.ni_fail_type != PTL_NI_OK) {
orte_output(mca_btl_portals_component.portals_output,
"Failure to ack event\n");
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
OMPI_ERROR);
}
if( btl_ownership ) {
mca_btl_portals_free(&mca_btl_portals_module.super,
&frag->base);
@ -564,7 +602,7 @@ mca_btl_portals_component_progress(void)
} else
#endif
if (0 == ev.mlength) {
if (ev.rlength != ev.mlength) {
/* other side received message but truncated to 0.
This should only happen for unexpected
messages, and only when the other side has no
@ -581,10 +619,12 @@ mca_btl_portals_component_progress(void)
/* other side received the message. should have
received entire thing */
/* let the PML know we're done */
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
frag->base.des_cbfunc(&mca_btl_portals_module.super,
frag->endpoint,
&frag->base,
OMPI_SUCCESS);
}
if( btl_ownership ) {
mca_btl_portals_free(&mca_btl_portals_module.super,
&frag->base);
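
For reference, a sketch of the 64-bit packing that the receive path above
decodes and that mca_btl_portals_sendi (later in this commit) produces; it
assumes both peers index the raw bytes of match_bits/hdr_data identically
(i.e. the same endianness):

/* header bytes 0..7  -> match_bits[0..7]
 * header bytes 8..13 -> hdr_data[0..5]   (hence header_size <= 14)
 * hdr_data[6]        <- header_size
 * hdr_data[7]        <- BTL tag */
static void portals_sketch_pack_hdr( const unsigned char* header,
                                     size_t header_size, uint8_t tag,
                                     unsigned char match_bits[8],
                                     unsigned char hdr_data[8] )
{
    memcpy( match_bits, header, header_size > 8 ? 8 : header_size );
    if( header_size > 8 ) {
        memcpy( hdr_data, header + 8, header_size - 8 );
    }
    hdr_data[6] = (unsigned char)header_size;
    hdr_data[7] = tag;
}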


@ -42,6 +42,7 @@ struct mca_btl_portals_frag_t {
enum { BTL_PORTALS_FRAG_TYPE_EAGER,
BTL_PORTALS_FRAG_TYPE_MAX,
BTL_PORTALS_FRAG_TYPE_USER } type;
unsigned char data[16];
};
typedef struct mca_btl_portals_frag_t mca_btl_portals_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t);


@ -35,6 +35,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
{
mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor;
int ret;
unsigned char hdr_data[8];
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PtlPut (rdma) fragment %lx, bits %" PRIx64,
@ -45,7 +46,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
assert(frag->md_h != PTL_INVALID_HANDLE);
frag->endpoint = btl_peer;
frag->hdr.tag = MCA_BTL_TAG_MAX;
hdr_data[7] = frag->hdr.tag = MCA_BTL_TAG_MAX;
/* setup the send */
assert(1 == frag->base.des_src_cnt);
@ -57,7 +58,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base,
0, /* ac_index - not used*/
frag->base.des_dst[0].seg_key.key64, /* match bits */
0, /* remote offset - not used */
MCA_BTL_TAG_MAX); /* hdr_data - invalid tag */
*((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */
if (ret != PTL_OK) {
orte_output(mca_btl_portals_component.portals_output,
"PtlPut failed with error %d", ret);


@ -39,6 +39,7 @@ mca_btl_portals_recv_enable(mca_btl_portals_module_t *btl)
ptl_process_id_t any_proc = {PTL_NID_ANY, PTL_PID_ANY};
int ret;
int i;
uint64_t ignore_bits = ~((uint64_t) 0);
/* create the reject entry */
md.start = NULL;
@ -55,7 +56,7 @@ mca_btl_portals_recv_enable(mca_btl_portals_module_t *btl)
OMPI_BTL_PORTALS_SEND_TABLE_ID,
any_proc,
0, /* match */
0, /* ignore */
ignore_bits, /* ignore */
PTL_RETAIN,
PTL_INS_BEFORE,
&(btl->portals_recv_reject_me_h));


@ -74,6 +74,7 @@ mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block)
int ret;
ptl_process_id_t any_proc = { PTL_NID_ANY, PTL_PID_ANY };
ptl_md_t md;
uint64_t ignore_bits = ~((uint64_t) 0);
/* if we have pending operations, something very, very, very bad
has happened... */
@ -85,7 +86,7 @@ mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block)
ret = PtlMEInsert(block->btl->portals_recv_reject_me_h,
any_proc,
0, /* match bits */
0, /* ignore bits */
ignore_bits, /* ignore bits */
PTL_UNLINK,
PTL_INS_BEFORE,
&(block->me_h));


@ -23,6 +23,8 @@
#include <assert.h>
#include "ompi/constants.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "orte/util/output.h"
#include "btl_portals.h"
@ -37,11 +39,12 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base,
{
mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor;
int ret;
unsigned char hdr_data[8];
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
frag->endpoint = endpoint;
frag->hdr.tag = tag;
hdr_data[7] = frag->hdr.tag = tag;
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PtlPut (send) fragment %lx",
@ -103,7 +106,146 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base,
0, /* ac_index - not used */
0, /* match bits */
0, /* remote offset - not used */
frag->hdr.tag); /* hdr_data: tag */
*((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */
if (ret != PTL_OK) {
orte_output(mca_btl_portals_component.portals_output,
"send: PtlPut failed with error %d", ret);
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
int mca_btl_portals_sendi(struct mca_btl_base_module_t* btl_base,
struct mca_btl_base_endpoint_t* endpoint,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** des)
{
mca_btl_portals_frag_t *frag = NULL;
struct iovec iov;
unsigned int iov_count;
unsigned char match_bits[8];
unsigned char hdr_data[8];
int ret;
size_t max_data;
assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
*des = NULL;
if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) >
mca_btl_portals_module.portals_max_outstanding_ops) {
/* no space - queue and continue */
orte_output_verbose(50, mca_btl_portals_component.portals_output,
"no resources left for send inline");
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
*des = mca_btl_portals_alloc(btl_base, endpoint, order,
payload_size + header_size, flags);
return OMPI_ERR_RESOURCE_BUSY;
} else if(14 < header_size) {
/* header is too big */
*des = mca_btl_portals_alloc(btl_base, endpoint, order,
payload_size + header_size, flags);
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return OMPI_ERR_RESOURCE_BUSY;
}
assert (payload_size <= mca_btl_portals_module.super.btl_eager_limit);
OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret);
if (OMPI_SUCCESS != ret) {
OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1);
return OMPI_ERR_RESOURCE_BUSY;
}
frag->segments[0].seg_len = payload_size;
frag->base.des_src_cnt = 1;
frag->base.des_flags = flags;
frag->base.order = MCA_BTL_NO_ORDER;
frag->endpoint = endpoint;
if(payload_size) {
/* pack the data into the supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segments[0].seg_addr.pval);
iov.iov_len = payload_size;
iov_count = 1;
(void)ompi_convertor_pack( convertor,
&iov, &iov_count, &max_data);
assert(max_data == payload_size);
}
if(header_size) {
memcpy(match_bits, header, header_size > 8 ? 8 : header_size);
}
if(header_size > 8 ) {
memcpy(hdr_data, ((unsigned char*) header) + 8, header_size - 8);
}
hdr_data[6] = header_size;
hdr_data[7] = frag->hdr.tag = tag;
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"PtlPut (send) fragment %lx",
(unsigned long) frag));
if (frag->md_h == PTL_INVALID_HANDLE) {
/* setup the send - always describe entire fragment */
mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval;
mca_btl_portals_module.md_send.length =
0 == frag->size ? frag->segments[0].seg_len : frag->size;
#if OMPI_ENABLE_DEBUG
mca_btl_portals_module.md_send.options =
PTL_MD_EVENT_START_DISABLE;
#else
/* optimized build, we can get rid of the END event */
/* if we are using portals ACK's for completion */
mca_btl_portals_module.md_send.options =
(PTL_MD_EVENT_START_DISABLE |
(mca_btl_portals_component.portals_need_ack ? PTL_MD_EVENT_END_DISABLE : 0));
#endif
mca_btl_portals_module.md_send.user_ptr = frag; /* keep a pointer to ourselves */
/* make a free-floater */
ret = PtlMDBind(mca_btl_portals_module.portals_ni_h,
mca_btl_portals_module.md_send,
PTL_UNLINK,
&frag->md_h);
if (ret != PTL_OK) {
orte_output(mca_btl_portals_component.portals_output,
"PtlMDBind failed with error %d", ret);
return OMPI_ERROR;
}
}
ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,
"fragment info:\n"
"\tstart: 0x%lx\n"
"\tlen: %d",
(unsigned long) frag->segments[0].seg_addr.pval,
frag->segments[0].seg_len));
ret = PtlPutRegion(frag->md_h, /* memory descriptor */
0, /* fragment offset */
frag->segments[0].seg_len, /* fragment length */
(mca_btl_portals_component.portals_need_ack ? PTL_ACK_REQ : PTL_NO_ACK_REQ),
*((mca_btl_base_endpoint_t*) endpoint),
OMPI_BTL_PORTALS_SEND_TABLE_ID,
0, /* ac_index - not used */
*((ptl_match_bits_t*) match_bits), /* match bits */
0, /* remote offset - not used */
*((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */
if (ret != PTL_OK) {
orte_output(mca_btl_portals_component.portals_output,
"send: PtlPut failed with error %d", ret);


@ -55,6 +55,7 @@ mca_btl_sctp_module_t mca_btl_sctp_module = {
mca_btl_sctp_prepare_src,
mca_btl_sctp_prepare_dst,
mca_btl_sctp_send,
NULL, /* send immediate */
mca_btl_sctp_put,
NULL, /* get */
mca_btl_base_dump,


@ -63,6 +63,7 @@ mca_btl_base_module_t mca_btl_self = {
mca_btl_self_prepare_src,
mca_btl_self_prepare_dst,
mca_btl_self_send,
NULL, /* send immediate */
mca_btl_self_rdma, /* put */
mca_btl_self_rdma, /* get */
mca_btl_base_dump,
@ -305,11 +306,13 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
des->des_dst = NULL;
des->des_dst_cnt = 0;
/* send completion */
if( des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) {
des->des_cbfunc( btl, endpoint, des, OMPI_SUCCESS );
}
if( btl_ownership ) {
mca_btl_self_free( btl, des );
}
return OMPI_SUCCESS;
return 1;
}
/**


@ -67,6 +67,7 @@ mca_btl_sm_t mca_btl_sm = {
mca_btl_sm_prepare_src,
NULL,
mca_btl_sm_send,
mca_btl_sm_sendi, /* send immediate */
NULL, /* put */
NULL, /* get */
mca_btl_base_dump,
@ -252,7 +253,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
length = sizeof(mca_btl_sm_frag1_t);
length_payload =
sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
ompi_free_list_init_new(&mca_btl_sm_component.sm_frags1, length,
ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_eager, length,
CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_sm_frag1_t),
length_payload, CACHE_LINE_SIZE,
mca_btl_sm_component.sm_free_list_num,
@ -263,7 +264,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
length = sizeof(mca_btl_sm_frag2_t);
length_payload =
sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
ompi_free_list_init_new(&mca_btl_sm_component.sm_frags2, length,
ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_max, length,
CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_sm_frag2_t),
length_payload, CACHE_LINE_SIZE,
mca_btl_sm_component.sm_free_list_num,
@ -271,14 +272,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
mca_btl_sm_component.sm_free_list_inc,
mca_btl_sm_component.sm_mpool);
ompi_free_list_init_new(&mca_btl_sm_component.sm_frags,
sizeof(mca_btl_sm_frag_t), CACHE_LINE_SIZE,
OBJ_CLASS(mca_btl_sm_frag_t), 0, CACHE_LINE_SIZE,
mca_btl_sm_component.sm_free_list_num,
-1,
mca_btl_sm_component.sm_free_list_inc,
NULL);
opal_free_list_init(&mca_btl_sm_component.pending_send_fl,
sizeof(btl_sm_pending_send_item_t),
OBJ_CLASS(opal_free_list_item_t),
@ -467,7 +460,7 @@ int mca_btl_sm_add_procs(
mca_btl_sm_component.num_smp_procs += n_local_procs;
/* make sure we have enough eager fragments for each process */
return_code = ompi_free_list_resize(&mca_btl_sm_component.sm_frags1,
return_code = ompi_free_list_resize(&mca_btl_sm_component.sm_frags_eager,
mca_btl_sm_component.num_smp_procs * 2);
if (OMPI_SUCCESS != return_code)
goto CLEANUP;
@ -610,6 +603,99 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
return &frag->base;
}
#if 0
#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \
do { \
char* _memory = (char*)(sm_frag)->segment.seg_addr.pval + \
(sm_frag)->segment.seg_len; \
int* _intmem; \
size_t align = (intptr_t)_memory & 0xFUL; \
switch( align & 0x3 ) { \
case 3: *_memory = 0; _memory++; \
case 2: *_memory = 0; _memory++; \
case 1: *_memory = 0; _memory++; \
} \
align >>= 2; \
_intmem = (int*)_memory; \
switch( align ) { \
case 3: *_intmem = 0; _intmem++; \
case 2: *_intmem = 0; _intmem++; \
case 1: *_intmem = 0; _intmem++; \
} \
} while(0)
#else
#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)
#endif
#if 0
if( OPAL_LIKELY(align > 0) ) { \
align = 0xFUL - align; \
memset( _memory, 0, align ); \
} \
#endif
/**
* Initiate an inline send to the peer. On failure, return a descriptor.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** descriptor )
{
size_t max_data, length = (header_size + payload_size);
mca_btl_sm_frag_t* frag;
int rc;
if( length < mca_btl_sm_component.eager_limit ) {
MCA_BTL_SM_FRAG_ALLOC1(frag, rc);
if( OPAL_UNLIKELY(NULL == frag) ) {
*descriptor = NULL;
return rc;
}
frag->segment.seg_len = length;
frag->hdr->len = length;
assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) );
frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
frag->hdr->tag = tag;
frag->endpoint = endpoint;
memcpy( frag->segment.seg_addr.pval, header, header_size );
if( payload_size ) {
struct iovec iov;
unsigned int iov_count;
/* pack the data into the supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size);
iov.iov_len = payload_size;
iov_count = 1;
(void)ompi_convertor_pack( convertor,
&iov, &iov_count, &max_data);
assert(max_data == payload_size);
}
MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
/*
* post the descriptor in the queue - post with the relative
* address
*/
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
endpoint->peer_smp_rank, frag->hdr, false, rc);
return rc;
}
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
payload_size + header_size, flags);
return OMPI_ERR_RESOURCE_BUSY;
}
/**
* Initiate a send to the peer.
@ -617,7 +703,6 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
int mca_btl_sm_send(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
@ -632,6 +717,8 @@ int mca_btl_sm_send(
/* type of message, pt-2-pt, one-sided, etc */
frag->hdr->tag = tag;
MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
frag->endpoint = endpoint;
/*
@ -640,7 +727,7 @@ int mca_btl_sm_send(
*/
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
endpoint->peer_smp_rank, frag->hdr, false, rc);
return rc;
return (rc < 0 ? rc : 1);
}
int mca_btl_sm_ft_event(int state) {


@ -95,9 +95,8 @@ struct mca_btl_sm_component_t {
int32_t num_smp_procs; /**< current number of smp procs on this host */
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specific data structures. */
ompi_free_list_t sm_frags1; /**< free list of sm first */
ompi_free_list_t sm_frags2; /**< free list of sm second */
ompi_free_list_t sm_frags; /**< free list of frags without data */
ompi_free_list_t sm_frags_eager; /**< free list of sm first */
ompi_free_list_t sm_frags_max; /**< free list of sm second */
ompi_free_list_t sm_first_frags_to_progress; /**< list of first
fragments that are
awaiting resources */
@ -270,6 +269,23 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
);
/**
* Initiate an inlined send to the peer or return a descriptor.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
extern int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** descriptor );
/**
* Initiate a send to the peer.
*


@ -177,9 +177,8 @@ int mca_btl_sm_component_open(void)
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags1, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags2, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_eager, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_max, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t);
return OMPI_SUCCESS;
}
@ -199,8 +198,8 @@ int mca_btl_sm_component_close(void)
* directly into the mmapped file, they will auto-magically disappear
* when the file gets unmapped.
*/
/*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags1);*/
/*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags2);*/
/*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags_eager);*/
/*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags_max);*/
/* unmap the shared memory control structure */
if(mca_btl_sm_component.mmap_file != NULL) {
@ -407,30 +406,9 @@ int mca_btl_sm_component_progress(void)
continue;
}
rc++;
/* dispatch fragment by type */
switch(((uintptr_t)hdr) & MCA_BTL_SM_FRAG_TYPE_MASK) {
case MCA_BTL_SM_FRAG_ACK:
{
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
struct mca_btl_base_endpoint_t* endpoint;
int btl_ownership;
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
(~(MCA_BTL_SM_FRAG_TYPE_MASK |
MCA_BTL_SM_FRAG_STATUS_MASK))));
endpoint = frag->endpoint;
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
/* completion callback */
frag->base.des_cbfunc(&mca_btl_sm.super, frag->endpoint,
&frag->base, status?OMPI_ERROR:OMPI_SUCCESS);
if( btl_ownership ) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
if(opal_list_get_size(&endpoint->pending_sends)) {
process_pending_send(endpoint);
}
break;
}
case MCA_BTL_SM_FRAG_SEND:
{
mca_btl_active_message_callback_t* reg;
@ -452,6 +430,30 @@ int mca_btl_sm_component_progress(void)
my_smp_rank, peer_smp_rank, hdr->frag, false, rc);
break;
}
case MCA_BTL_SM_FRAG_ACK:
{
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
struct mca_btl_base_endpoint_t* endpoint;
int btl_ownership;
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
(~(MCA_BTL_SM_FRAG_TYPE_MASK |
MCA_BTL_SM_FRAG_STATUS_MASK))));
endpoint = frag->endpoint;
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
/* completion callback */
frag->base.des_cbfunc(&mca_btl_sm.super, frag->endpoint,
&frag->base, status?OMPI_ERROR:OMPI_SUCCESS);
}
if( btl_ownership ) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
if(opal_list_get_size(&endpoint->pending_sends)) {
process_pending_send(endpoint);
}
break;
}
default:
/* unknown */
hdr = (mca_btl_sm_hdr_t*)((uintptr_t)hdr->frag |
@ -461,7 +463,6 @@ int mca_btl_sm_component_progress(void)
my_smp_rank, peer_smp_rank, hdr, false, rc);
break;
}
rc++;
}
return rc;
}


@ -36,24 +36,17 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
frag->base.des_flags = 0;
}
static void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag)
{
frag->size = 0;
frag->my_list = &mca_btl_sm_component.sm_frags;
mca_btl_sm_frag_common_constructor(frag);
}
static void mca_btl_sm_frag1_constructor(mca_btl_sm_frag_t* frag)
{
frag->size = mca_btl_sm_component.eager_limit;
frag->my_list = &mca_btl_sm_component.sm_frags1;
frag->my_list = &mca_btl_sm_component.sm_frags_eager;
mca_btl_sm_frag_common_constructor(frag);
}
static void mca_btl_sm_frag2_constructor(mca_btl_sm_frag_t* frag)
{
frag->size = mca_btl_sm_component.max_frag_size;
frag->my_list = &mca_btl_sm_component.sm_frags2;
frag->my_list = &mca_btl_sm_component.sm_frags_max;
mca_btl_sm_frag_common_constructor(frag);
}
@ -62,12 +55,6 @@ static void mca_btl_sm_frag_destructor(mca_btl_sm_frag_t* frag)
}
OBJ_CLASS_INSTANCE(
mca_btl_sm_frag_t,
mca_btl_base_descriptor_t,
mca_btl_sm_frag_constructor,
mca_btl_sm_frag_destructor);
OBJ_CLASS_INSTANCE(
mca_btl_sm_frag1_t,
mca_btl_base_descriptor_t,


@ -69,24 +69,17 @@ OBJ_CLASS_DECLARATION(mca_btl_sm_frag_t);
OBJ_CLASS_DECLARATION(mca_btl_sm_frag1_t);
OBJ_CLASS_DECLARATION(mca_btl_sm_frag2_t);
#define MCA_BTL_SM_FRAG_ALLOC(frag, rc) \
{ \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT(&mca_btl_sm_component.sm_frags, item, rc); \
frag = (mca_btl_sm_frag_t*)item; \
}
#define MCA_BTL_SM_FRAG_ALLOC1(frag, rc) \
{ \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags1, item, rc); \
OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags_eager, item, rc); \
frag = (mca_btl_sm_frag_t*)item; \
}
#define MCA_BTL_SM_FRAG_ALLOC2(frag, rc) \
{ \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags2, item, rc); \
OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags_max, item, rc); \
frag = (mca_btl_sm_frag_t*)item; \
}


@ -58,6 +58,7 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
mca_btl_tcp_prepare_src,
mca_btl_tcp_prepare_dst,
mca_btl_tcp_send,
NULL, /* send immediate */
mca_btl_tcp_put,
NULL, /* get */
mca_btl_base_dump,
@ -428,7 +429,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl,
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
frag->hdr.count = frag->base.des_dst_cnt;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return mca_btl_tcp_endpoint_send(endpoint,frag);
return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : i);
}
@ -448,6 +449,7 @@ int mca_btl_tcp_get(
{
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
int rc;
frag->btl = tcp_btl;
frag->endpoint = endpoint;
@ -464,7 +466,7 @@ int mca_btl_tcp_get(
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
frag->hdr.count = frag->base.des_src_cnt;
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
return mca_btl_tcp_endpoint_send(endpoint,frag);
return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : rc);
}


@ -245,6 +245,7 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp
case MCA_BTL_TCP_CONNECT_ACK:
case MCA_BTL_TCP_CLOSED:
opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag);
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(btl_endpoint->endpoint_state == MCA_BTL_TCP_CLOSED)
rc = mca_btl_tcp_endpoint_start_connect(btl_endpoint);
break;
@ -258,16 +259,20 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
if( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) {
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc);
}
if( btl_ownership ) {
MCA_BTL_TCP_FRAG_RETURN(frag);
}
return OMPI_SUCCESS;
return 1;
} else {
btl_endpoint->endpoint_send_frag = frag;
opal_event_add(&btl_endpoint->endpoint_send_event, 0);
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
}
} else {
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag);
}
break;
@ -762,6 +767,7 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
/* if required - update request status and release fragment */
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
assert( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK );
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc);
if( btl_ownership ) {
MCA_BTL_TCP_FRAG_RETURN(frag);


@ -54,6 +54,7 @@ mca_btl_template_module_t mca_btl_template_module = {
mca_btl_template_prepare_src,
mca_btl_template_prepare_dst,
mca_btl_template_send,
NULL, /* send immediate */
mca_btl_template_put,
NULL, /* get */
NULL, /*dump */


@ -71,6 +71,7 @@ mca_btl_udapl_module_t mca_btl_udapl_module = {
mca_btl_udapl_prepare_src,
mca_btl_udapl_prepare_dst,
mca_btl_udapl_send,
NULL, /* send immediate */
mca_btl_udapl_put,
NULL, /* get */
mca_btl_base_dump,


@ -285,7 +285,7 @@ static int progress()
/* we're done, so remove us from the pending list */
opal_list_remove_item(&mca_io_romio_pending_requests, item);
/* mark as complete (and make sure to wake up any waiters) */
ompi_request_complete((ompi_request_t*) item);
ompi_request_complete((ompi_request_t*) item, true);
mca_io_base_request_progress_del();
/* if the request has been freed already, the user isn't
* going to call test or wait on us, so we need to do it


@ -92,6 +92,9 @@ struct mca_mtl_portals_module_t {
/* use rendezvous for long messages */
bool ptl_use_rendezvous;
/* output channel for debugging */
int portals_output;
};
typedef struct mca_mtl_portals_module_t mca_mtl_portals_module_t;


@ -20,6 +20,7 @@
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/output.h"
#include "ompi/datatype/convertor.h"
#include "ompi/mca/common/portals/common_portals.h"
@ -61,6 +62,8 @@ mca_mtl_base_component_1_0_0_t mca_mtl_portals_component = {
ompi_mtl_portals_component_init, /* component init */
};
static opal_output_stream_t mtl_portals_output_stream;
static int
ompi_mtl_portals_component_open(void)
{
@ -98,6 +101,24 @@ ompi_mtl_portals_component_open(void)
15 * 1024 * 1024,
&ompi_mtl_portals.ptl_recv_short_mds_size);
OBJ_CONSTRUCT(&mtl_portals_output_stream, opal_output_stream_t);
mtl_portals_output_stream.lds_is_debugging = true;
mtl_portals_output_stream.lds_want_stdout = true;
mtl_portals_output_stream.lds_file_suffix = "btl-portals";
mca_base_param_reg_int(&mca_mtl_portals_component.mtl_version,
"debug_level",
"Debugging verbosity (0 - 100)",
false,
false,
0,
&(mtl_portals_output_stream.lds_verbose_level));
asprintf(&(mtl_portals_output_stream.lds_prefix),
"btl: portals (%s): ", ompi_common_portals_nodeid());
ompi_mtl_portals.portals_output =
orte_output_open(&mtl_portals_output_stream, "MTL", "PORTALS", "DEBUG", NULL);
ompi_mtl_portals.ptl_ni_h = PTL_INVALID_HANDLE;
mca_base_param_reg_int(&mca_mtl_portals_component.mtl_version,


@ -18,6 +18,8 @@
#include "ompi_config.h"
#include "orte/util/output.h"
#include "mtl_portals.h"
#include "mtl_portals_request.h"
#include "mtl_portals_send_short.h"


@ -1047,7 +1047,7 @@ rdma_send_info_send(ompi_osc_rdma_module_t *module,
bml_btl = peer_send_info->bml_btl;
mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER,
sizeof(ompi_osc_rdma_rdma_info_header_t),
MCA_BTL_DES_FLAGS_PRIORITY);
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (NULL == descriptor) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;


@ -448,7 +448,7 @@ ompi_osc_rdma_sendreq_send(ompi_osc_rdma_module_t *module,
mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER,
module->m_use_buffers ? bml_btl->btl_eager_limit :
needed_len < bml_btl->btl_eager_limit ? needed_len :
bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY);
bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (NULL == descriptor) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
@ -690,7 +690,7 @@ ompi_osc_rdma_replyreq_send(ompi_osc_rdma_module_t *module,
endpoint = (mca_bml_base_endpoint_t*) replyreq->rep_origin_proc->proc_bml;
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER,
bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY);
bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (NULL == descriptor) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
@ -1295,7 +1295,7 @@ ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER,
sizeof(ompi_osc_rdma_control_header_t),
MCA_BTL_DES_FLAGS_PRIORITY);
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (NULL == descriptor) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
@ -1357,7 +1357,7 @@ ompi_osc_rdma_rdma_ack_send(ompi_osc_rdma_module_t *module,
/* Get a BTL and a fragment to go with it */
mca_bml_base_alloc(bml_btl, &descriptor, rdma_btl->rdma_order,
sizeof(ompi_osc_rdma_control_header_t),
MCA_BTL_DES_FLAGS_PRIORITY);
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (NULL == descriptor) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;


@ -214,7 +214,7 @@ do { \
*/
#define MCA_PML_CM_THIN_RECV_REQUEST_MPI_COMPLETE( recvreq ) \
do { \
ompi_request_complete( &(recvreq->req_base.req_ompi) ); \
ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \
} while (0)
@ -233,7 +233,7 @@ do { \
MCA_PML_CM_THIN_RECV_REQUEST_RETURN( recvreq ); \
} else { \
recvreq->req_base.req_pml_complete = true; \
ompi_request_complete( &(recvreq->req_base.req_ompi) ); \
ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \
} \
OPAL_THREAD_UNLOCK(&ompi_request_lock); \
} while(0)
@ -262,7 +262,7 @@ do { \
ompi_convertor_set_position(&recvreq->req_base.req_convertor, &offset); \
} \
recvreq->req_base.req_pml_complete = true; \
ompi_request_complete( &(recvreq->req_base.req_ompi) ); \
ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \
} \
OPAL_THREAD_UNLOCK(&ompi_request_lock); \
} while(0)


@ -260,7 +260,7 @@ do { \
if(OMPI_SUCCESS == ret && \
sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \
sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR = 0; \
ompi_request_complete(&(sendreq)->req_send.req_base.req_ompi); \
ompi_request_complete(&(sendreq)->req_send.req_base.req_ompi, true); \
} \
} \
} while (0)
@ -284,7 +284,7 @@ do { \
OPAL_THREAD_LOCK(&ompi_request_lock); \
if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \
/* Should only be called for long messages (maybe synchronous) */ \
ompi_request_complete(&(sendreq->req_send.req_base.req_ompi)); \
ompi_request_complete(&(sendreq->req_send.req_base.req_ompi), true); \
} \
sendreq->req_send.req_base.req_pml_complete = true; \
\
@ -330,7 +330,7 @@ do { \
OPAL_THREAD_LOCK(&ompi_request_lock); \
if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \
/* Should only be called for long messages (maybe synchronous) */ \
ompi_request_complete(&(sendreq->req_send.req_base.req_ompi)); \
ompi_request_complete(&(sendreq->req_send.req_base.req_ompi), true); \
} \
sendreq->req_send.req_base.req_pml_complete = true; \
\


@ -112,7 +112,7 @@ static int mca_pml_dr_recv_request_cancel(struct ompi_request_t* ompi_request, i
* on this request will be able to complete. As the status is marked as
* cancelled the cancel state will be detected.
*/
ompi_request_complete(ompi_request);
ompi_request_complete(ompi_request, true);
OPAL_THREAD_UNLOCK(&ompi_request_lock);
return OMPI_SUCCESS;
}


@ -136,7 +136,7 @@ do {
recvreq->req_recv.req_base.req_ompi.req_status._count = \
recvreq->req_bytes_received; \
} \
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi) ); \
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
\
if( true == recvreq->req_recv.req_base.req_free_called ) { \
MCA_PML_DR_RECV_REQUEST_RETURN( recvreq ); \


@ -220,7 +220,7 @@ do {
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
(sendreq)->req_send.req_base.req_ompi.req_status._count = \
(sendreq)->req_send.req_bytes_packed; \
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi) ); \
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), true ); \
} while(0)
/*


@ -365,7 +365,8 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
int rc;
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if(NULL == fin) {
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
@ -387,15 +388,17 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
rc = mca_bml_base_send( bml_btl,
fin,
MCA_PML_OB1_HDR_TYPE_FIN );
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, fin);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
return OMPI_SUCCESS;
}
void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
{
mca_pml_ob1_pckt_pending_t *pckt;


@ -75,6 +75,12 @@ struct mca_pml_ob1_match_hdr_t {
uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */
#endif
};
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
#define OMPI_PML_OB1_MATCH_HDR_LEN 16
#else
#define OMPI_PML_OB1_MATCH_HDR_LEN 14
#endif
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \


@ -68,6 +68,7 @@ int mca_pml_ob1_isend(void *buf,
{
int rc;
mca_pml_ob1_send_request_t *sendreq = NULL;
MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc);
if (rc != OMPI_SUCCESS)
return rc;
@ -99,6 +100,7 @@ int mca_pml_ob1_send(void *buf,
{
int rc;
mca_pml_ob1_send_request_t *sendreq;
MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc);
if (rc != OMPI_SUCCESS)
return rc;


@ -89,8 +89,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
size_t num_segments = des->des_dst_cnt;
size_t bytes_received = 0;
if( OPAL_UNLIKELY(segment->seg_len < sizeof(mca_pml_ob1_match_hdr_t))) {
if( OPAL_UNLIKELY(segment->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) {
return;
}
ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH);
@ -169,7 +168,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) {
bytes_received = segment->seg_len - sizeof(mca_pml_ob1_match_hdr_t);
bytes_received = segment->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN;
match->req_recv.req_bytes_packed = bytes_received;
MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr);
@ -189,7 +188,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
iov[0].iov_len = bytes_received;
iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
sizeof(mca_pml_ob1_match_hdr_t));
OMPI_PML_OB1_MATCH_HDR_LEN);
if(num_segments > 1) {
bytes_received += segment[1].seg_len;
iov[1].iov_len = segment[1].seg_len;


@ -235,12 +235,15 @@ int mca_pml_ob1_recv_request_ack_send_btl(
des->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_ACK);
if( OPAL_UNLIKELY(rc != OMPI_SUCCESS) ) {
mca_bml_base_free(bml_btl, des);
return OMPI_ERR_OUT_OF_RESOURCE;
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des);
return OMPI_ERR_OUT_OF_RESOURCE;
}
static int mca_pml_ob1_recv_request_ack(
mca_pml_ob1_recv_request_t* recvreq,
@ -614,7 +617,7 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( segments, num_segments,
0, bytes_received );
bytes_received -= sizeof(mca_pml_ob1_match_hdr_t);
bytes_received -= OMPI_PML_OB1_MATCH_HDR_LEN;
recvreq->req_recv.req_bytes_packed = bytes_received;
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match);
@ -630,7 +633,7 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre
MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
segments,
num_segments,
sizeof(mca_pml_ob1_match_hdr_t),
OMPI_PML_OB1_MATCH_HDR_LEN,
data_offset,
bytes_received,
bytes_delivered);
@ -666,7 +669,7 @@ void mca_pml_ob1_recv_request_matched_probe( mca_pml_ob1_recv_request_t* recvreq
case MCA_PML_OB1_HDR_TYPE_MATCH:
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( segments, num_segments,
sizeof(mca_pml_ob1_match_hdr_t),
OMPI_PML_OB1_MATCH_HDR_LEN,
bytes_packed );
break;
@ -819,16 +822,19 @@ int mca_pml_ob1_recv_request_schedule_once(
/* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
if(OPAL_LIKELY(OMPI_SUCCESS == rc)) {
if( OPAL_LIKELY( rc >= 0 ) ) {
/* update request state */
recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
recvreq->req_rdma[rdma_idx].length -= size;
bytes_remaining -= size;
if( OPAL_LIKELY( 1 == rc ) ) {
/* The send is completed, trigger the callback */
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
} else {
mca_bml_base_free(bml_btl,ctl);
mca_bml_base_free(bml_btl,dst);
continue;
}
}


@ -119,7 +119,7 @@ do { \
do { \
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
&(recvreq->req_recv.req_base), PERUSE_RECV ); \
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi) ); \
ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
} while (0)
/*

Просмотреть файл

@ -146,25 +146,43 @@ OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
* Completion of a short message - nothing left to schedule.
*/
static void
mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
static inline void
mca_pml_ob1_match_fast_completion_free( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
int status )
struct mca_btl_base_descriptor_t* des )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
if( sendreq->req_send.req_bytes_packed > 0 ) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
&(sendreq->req_send.req_base), PERUSE_SEND );
}
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
if( sendreq->req_send.req_bytes_packed > 0 ) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
&(sendreq->req_send.req_base), PERUSE_SEND);
}
/*
* We are on the fast path, so there is no need to lock the request, as at
* this point there is only one reference to it. Moreover, there is no
* need to signal anything, as nobody is waiting on it.
*/
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, false);
sendreq->req_send.req_base.req_pml_complete = true;
/* check for pending requests */
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
static inline void
mca_pml_ob1_match_completion_free_request( mca_bml_base_btl_t* bml_btl,
mca_pml_ob1_send_request_t* sendreq )
{
if( sendreq->req_send.req_bytes_packed > 0 ) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
&(sendreq->req_send.req_base), PERUSE_SEND );
}
/* signal request completion */
@ -174,24 +192,14 @@ mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
/*
* Completion of the first fragment of a long message that
* requires an acknowledgement
*/
static void
mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
static inline void
mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
struct mca_btl_base_descriptor_t* des,
int status )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)descriptor->des_context;
size_t req_bytes_delivered = 0;
if( sendreq->req_send.req_bytes_packed > 0 ) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
&(sendreq->req_send.req_base), PERUSE_SEND );
}
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
@ -199,15 +207,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
}
mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
}
/* count bytes of user data actually delivered. As the rndv completion only
* happens in one thread, the increase of the req_bytes_delivered does not
* have to be atomic.
*/
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src,
descriptor->des_src_cnt,
sizeof(mca_pml_ob1_rendezvous_hdr_t),
req_bytes_delivered );
static inline void
mca_pml_ob1_rndv_completion_request( mca_bml_base_btl_t* bml_btl,
mca_pml_ob1_send_request_t* sendreq,
size_t req_bytes_delivered )
{
if( sendreq->req_send.req_bytes_packed > 0 ) {
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
&(sendreq->req_send.req_base), PERUSE_SEND );
}
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
@ -220,12 +231,45 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
/*
* Completion of the first fragment of a long message that
* requires an acknowledgement
*/
static inline void
mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
size_t req_bytes_delivered = 0;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
}
/* count the bytes of user data actually delivered. As the rndv completion
* only happens in one thread, the update of req_bytes_delivered does not
* have to be atomic.
*/
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src,
des->des_src_cnt,
sizeof(mca_pml_ob1_rendezvous_hdr_t),
req_bytes_delivered );
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
}
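
The refactoring above follows one pattern: the request-level completion logic moves into an inline *_completion_request() helper that both the asynchronous BTL callback and the fast path in the sender can call. A minimal, self-contained sketch of that pattern (all names are hypothetical stand-ins, not code from this commit):

#include <stdio.h>

typedef struct { size_t bytes_delivered; } request_t;

/* shared completion logic, callable from any context */
static inline void completion_request(request_t *req, size_t delivered)
{
    req->bytes_delivered += delivered;
}

/* slow path: fired later by the BTL once the descriptor drains */
static void completion_callback(void *cbdata, size_t delivered)
{
    completion_request((request_t *)cbdata, delivered);
}

int main(void)
{
    request_t req = { 0 };
    completion_request(&req, 128);   /* fast path: called inline by the sender */
    completion_callback(&req, 64);   /* slow path: called as a BTL callback */
    printf("%zu bytes delivered\n", req.bytes_delivered);
    return 0;
}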
/**
* Completion of a get request.
*/
static void
static inline void
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
@ -251,13 +295,13 @@ mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
* Completion of a control message - return resources.
*/
static void
static inline void
mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
struct mca_btl_base_descriptor_t* des,
int status )
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* check for pending requests */
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -268,14 +312,14 @@ mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl,
* to schedule additional fragments.
*/
static void
static inline void
mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
struct mca_btl_base_descriptor_t* des,
int status )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
size_t req_bytes_delivered = 0;
/* check completion status */
@ -286,8 +330,8 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
}
/* count bytes of user data actually delivered */
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src,
descriptor->des_src_cnt,
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src,
des->des_src_cnt,
sizeof(mca_pml_ob1_frag_hdr_t),
req_bytes_delivered );
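
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH is used here to derive the user-payload byte count from the descriptor's source segments. A plausible model of that computation (a sketch with hypothetical types, not the actual macro definition):

#include <stdio.h>

typedef struct { size_t seg_len; } segment_t;

/* sum the source segments, then discount the PML header carried
 * in the first segment: what remains is user payload */
static size_t compute_segment_length(const segment_t *segs, size_t cnt,
                                     size_t hdr_size)
{
    size_t total = 0;
    for (size_t i = 0; i < cnt; i++)
        total += segs[i].seg_len;
    return total - hdr_size;
}

int main(void)
{
    segment_t segs[2] = { { 96 }, { 4096 } };
    printf("%zu bytes of user data\n", compute_segment_length(segs, 2, 32));
    return 0;
}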
@ -310,23 +354,23 @@ int mca_pml_ob1_send_request_start_buffered(
mca_bml_base_btl_t* bml_btl,
size_t size)
{
mca_btl_base_descriptor_t* descriptor;
mca_btl_base_descriptor_t* des;
mca_btl_base_segment_t* segment;
mca_pml_ob1_hdr_t* hdr;
struct iovec iov;
unsigned int iov_count;
size_t max_data;
size_t max_data, req_bytes_delivered;
int rc;
/* allocate descriptor */
mca_bml_base_alloc(bml_btl, &descriptor,
mca_bml_base_alloc(bml_btl, &des,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_rendezvous_hdr_t) + size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( OPAL_UNLIKELY(NULL == descriptor) ) {
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = descriptor->des_src;
segment = des->des_src;
/* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -338,9 +382,10 @@ int mca_pml_ob1_send_request_start_buffered(
&iov,
&iov_count,
&max_data)) < 0) {
mca_bml_base_free(bml_btl, descriptor);
mca_bml_base_free(bml_btl, des);
return rc;
}
req_bytes_delivered = max_data;
/* build rendezvous header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
@ -359,13 +404,13 @@ int mca_pml_ob1_send_request_start_buffered(
/* update lengths */
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
descriptor->des_cbfunc = mca_pml_ob1_rndv_completion;
descriptor->des_cbdata = sendreq;
des->des_cbfunc = mca_pml_ob1_rndv_completion;
des->des_cbdata = sendreq;
/* buffer the remainder of the message */
rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_bml_base_free(bml_btl, descriptor);
mca_bml_base_free(bml_btl, des);
return rc;
}
@ -376,7 +421,7 @@ int mca_pml_ob1_send_request_start_buffered(
&iov,
&iov_count,
&max_data)) < 0) {
mca_bml_base_free(bml_btl, descriptor);
mca_bml_base_free(bml_btl, des);
return rc;
}
@ -391,14 +436,18 @@ int mca_pml_ob1_send_request_start_buffered(
/* request is complete at mpi level */
OPAL_THREAD_LOCK(&ompi_request_lock);
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
OPAL_THREAD_UNLOCK(&ompi_request_lock);
/* send */
rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_RNDV);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_bml_base_free(bml_btl, descriptor );
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered);
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des );
return rc;
}
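
The send paths above all follow the new return-code contract from the commit message: a negative value is an error, zero means the fragment was queued and the completion callback will fire later, and one means the BTL pushed the data inline so the caller completes the request itself. A self-contained sketch of that contract (hypothetical names, not the Open MPI API):

#include <stdio.h>

typedef struct { int freed; } descriptor_t;

static int  btl_send(descriptor_t *des, int outcome) { (void)des; return outcome; }
static void complete_request(void) { printf("completed inline\n"); }
static void free_descriptor(descriptor_t *des) { des->freed = 1; }

static int start_send(descriptor_t *des, int outcome)
{
    int rc = btl_send(des, outcome);
    if (rc >= 0) {
        if (rc == 1)               /* data is already on the wire */
            complete_request();
        return 0;                  /* rc == 0: the callback completes it later */
    }
    free_descriptor(des);          /* error: the caller still owns the descriptor */
    return rc;
}

int main(void)
{
    descriptor_t des = { 0 };
    return start_send(&des, 1);
}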
@ -413,33 +462,75 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size )
{
mca_btl_base_descriptor_t* descriptor;
mca_btl_base_descriptor_t* des = NULL;
mca_btl_base_segment_t* segment;
mca_pml_ob1_hdr_t* hdr;
struct iovec iov;
unsigned int iov_count;
size_t max_data = size;
int rc;
#if 0
if(NULL != bml_btl->btl_sendi) {
mca_pml_ob1_match_hdr_t match;
match.hdr_common.hdr_flags = 0;
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
match.hdr_tag = sendreq->req_send.req_base.req_tag;
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
/* allocate descriptor */
mca_bml_base_alloc( bml_btl, &descriptor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_match_hdr_t) + size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( OPAL_UNLIKELY(NULL == descriptor) ) {
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
/* try to send immediately */
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
&match, OMPI_PML_OB1_MATCH_HDR_LEN,
size, MCA_BTL_NO_ORDER,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
MCA_PML_OB1_HDR_TYPE_MATCH,
&des);
if( OMPI_SUCCESS == rc ) {
/* signal request completion */
send_request_pml_complete(sendreq);
/* check for pending requests */
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
return OMPI_SUCCESS;
}
switch(rc) {
case OMPI_ERR_RESOURCE_BUSY:
if(OPAL_UNLIKELY(NULL == des)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = descriptor->des_src;
break;
default:
return rc;
break;
}
} else
#endif
{
/* allocate descriptor */
mca_bml_base_alloc( bml_btl, &des,
MCA_BTL_NO_ORDER,
OMPI_PML_OB1_MATCH_HDR_LEN + size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
segment = des->des_src;
if(size > 0) {
/* pack the data into the supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
sizeof(mca_pml_ob1_match_hdr_t));
OMPI_PML_OB1_MATCH_HDR_LEN);
iov.iov_len = size;
iov_count = 1;
/*
* Before copying the user buffer, make the target part
* accessable.
* accessible.
*/
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_defined,
@ -474,28 +565,28 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_send.req_base.req_proc);
/* update lengths */
segment->seg_len = sizeof(mca_pml_ob1_match_hdr_t) + max_data;
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
/* short message */
descriptor->des_cbdata = sendreq;
descriptor->des_cbfunc = mca_pml_ob1_match_completion_free;
des->des_cbdata = sendreq;
des->des_cbfunc = mca_pml_ob1_match_completion_free;
/* send */
rc = mca_bml_base_send_status(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH);
rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH);
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
}
return OMPI_SUCCESS;
}
switch(rc) {
case OMPI_SUCCESS:
/* packet is on wire; signal request completion */
OPAL_THREAD_LOCK(&ompi_request_lock);
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
OPAL_THREAD_UNLOCK(&ompi_request_lock);
break;
case OMPI_ERR_RESOURCE_BUSY:
/* don't signal request completion; will be completed in wait() */
rc = OMPI_SUCCESS;
break;
default:
mca_bml_base_free(bml_btl, descriptor);
mca_bml_base_free(bml_btl, des);
break;
}
return rc;
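
The #if 0 block above previews the btl_sendi negotiation: the BTL either delivers the message inline (OMPI_SUCCESS), or returns OMPI_ERR_RESOURCE_BUSY together with a pre-allocated descriptor so the caller can fall back to the regular packing path. A reduced sketch of that handshake (stand-in names and error codes, not the Open MPI API):

#include <stdio.h>
#include <stddef.h>

enum { SKETCH_SUCCESS = 0, SKETCH_RESOURCE_BUSY = 1, SKETCH_ERROR = -1 };

typedef struct { char payload[64]; } descriptor_t;

/* models btl_sendi: send inline, or hand back a descriptor for the
 * caller to pack into (the 'outcome' argument drives the simulation) */
static int btl_sendi(size_t size, descriptor_t **des_out, int outcome)
{
    static descriptor_t des;
    (void)size;
    if (outcome == SKETCH_RESOURCE_BUSY)
        *des_out = &des;
    return outcome;
}

int main(void)
{
    descriptor_t *des = NULL;
    int rc = btl_sendi(32, &des, SKETCH_RESOURCE_BUSY);
    if (rc == SKETCH_SUCCESS) {
        printf("sent inline, request complete\n");
    } else if (rc == SKETCH_RESOURCE_BUSY && des != NULL) {
        printf("inline send declined, packing into the returned descriptor\n");
        rc = SKETCH_SUCCESS;       /* continue on the regular copy path */
    }
    return rc;
}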
@ -510,7 +601,7 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size )
{
mca_btl_base_descriptor_t* descriptor;
mca_btl_base_descriptor_t* des;
mca_btl_base_segment_t* segment;
mca_pml_ob1_hdr_t* hdr;
int rc;
@ -520,14 +611,14 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_match_hdr_t),
OMPI_PML_OB1_MATCH_HDR_LEN,
&size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
&descriptor );
if( OPAL_UNLIKELY(NULL == descriptor) ) {
&des );
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = descriptor->des_src;
segment = des->des_src;
/* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
@ -542,15 +633,18 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_send.req_base.req_proc);
/* short message */
descriptor->des_cbfunc = mca_pml_ob1_match_completion_free;
descriptor->des_cbdata = sendreq;
des->des_cbfunc = mca_pml_ob1_match_completion_free;
des->des_cbdata = sendreq;
/* send */
rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_bml_base_free(bml_btl, descriptor );
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH);
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des );
return rc;
}
@ -706,10 +800,14 @@ int mca_pml_ob1_send_request_start_rdma(
des->des_cbdata = sendreq;
/* send */
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_bml_base_free(bml_btl, des);
rc = mca_bml_base_send(bml_btl, des, hdr->hdr_common.hdr_type);
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, 0 );
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des);
return rc;
}
@ -787,9 +885,13 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
/* send */
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_bml_base_free(bml_btl, des );
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, size );
}
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des );
return rc;
}
@ -958,7 +1060,7 @@ cannot_pack:
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_frag_hdr_t),
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &des);
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des);
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
sendreq->req_send.req_base.req_addr,
@ -998,8 +1100,7 @@ cannot_pack:
/* initiate send - note that this may complete before the call returns */
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
if( OPAL_LIKELY(rc == OMPI_SUCCESS) ) {
if( OPAL_LIKELY(rc >= 0) ) {
/* update state */
range->range_btls[btl_idx].length -= size;
range->range_send_length -= size;
@ -1011,7 +1112,6 @@ cannot_pack:
}
} else {
mca_bml_base_free(bml_btl,des);
continue;
}
}

View file

@ -180,7 +180,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
* Mark a send request as completed at the MPI level.
*/
#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq) \
#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \
do { \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \
(sendreq)->req_send.req_base.req_comm->c_my_rank; \
@ -189,7 +189,7 @@ do {
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
(sendreq)->req_send.req_base.req_ompi.req_status._count = \
(int)(sendreq)->req_send.req_bytes_packed; \
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi) ); \
ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \
\
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
&(sendreq->req_send.req_base), PERUSE_SEND); \
@ -237,7 +237,7 @@ send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq)
OPAL_THREAD_LOCK(&ompi_request_lock);
if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
/* Should only be called for long messages (maybe synchronous) */
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
}
sendreq->req_send.req_base.req_pml_complete = true;
@ -459,5 +459,4 @@ void mca_pml_ob1_send_request_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
END_C_DECLS
#endif /* !defined(OMPI_PML_OB1_SEND_REQUEST_H) */
#endif /* OMPI_PML_OB1_SEND_REQUEST_H */

View file

@ -180,7 +180,7 @@ int ompi_grequest_complete(ompi_request_t *req)
int rc;
OPAL_THREAD_LOCK(&ompi_request_lock);
rc = ompi_request_complete(req);
rc = ompi_request_complete(req, true);
OPAL_THREAD_UNLOCK(&ompi_request_lock);
OBJ_RELEASE(req);
return rc;

View file

@ -386,19 +386,23 @@ static inline void ompi_request_wait_completion(ompi_request_t *req)
}
/**
* Signal a request as complete. Note this will
* wake any thread pending on the request.
* ompi_request_lock should be held while calling this function
* Signal or mark a request as complete. If with_signal is true this will
* wake any thread pending on the request, and ompi_request_lock should be
* held while calling this function. If with_signal is false, no signal is
* generated and no lock is required. This is a special case for when the
* function is called from the critical path for small messages, where we
* know that the current execution flow created the request and is still
* in the _START macro.
*/
static inline int ompi_request_complete(ompi_request_t* request)
static inline int ompi_request_complete(ompi_request_t* request, bool with_signal)
{
if( NULL != request->req_complete_cb ) {
request->req_complete_cb( request );
}
request->req_complete = true;
ompi_request_completed++;
request->req_complete = true;
if(ompi_request_waiting) {
if(with_signal && ompi_request_waiting) {
/* Broadcast the condition; a single signal could be consumed by a
* thread that is waiting on a different request.
*/
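
A reduced model of the two completion modes introduced by the with_signal argument (hypothetical names; the real code uses ompi_request_lock, ompi_request_cond and ompi_request_waiting): when with_signal is true the caller holds the lock and waiters are woken, and when it is false the request is still private to the calling thread, so only the flag is set:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t request_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  request_cond = PTHREAD_COND_INITIALIZER;
static int request_waiting = 0;

typedef struct { bool complete; } request_t;

static void request_complete(request_t *req, bool with_signal)
{
    req->complete = true;
    if (with_signal && request_waiting) {
        /* broadcast so every waiter rechecks its own request */
        pthread_cond_broadcast(&request_cond);
    }
}

int main(void)
{
    request_t fast = { false }, slow = { false };
    request_complete(&fast, false);      /* fast path: no lock, no wakeup */
    pthread_mutex_lock(&request_lock);   /* slow path: caller holds the lock */
    request_complete(&slow, true);
    pthread_mutex_unlock(&request_lock);
    return !(fast.complete && slow.complete);
}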