2005-05-24 02:06:50 +04:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2008-02-18 20:39:30 +03:00
|
|
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2008-02-12 11:46:27 +03:00
|
|
|
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
2005-05-24 02:06:50 +04:00
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2008-05-30 05:29:09 +04:00
|
|
|
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
2005-05-24 02:06:50 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2006-10-28 03:16:13 +04:00
|
|
|
|
2008-02-12 11:46:27 +03:00
|
|
|
#include "ompi_config.h"
|
2006-10-28 03:16:13 +04:00
|
|
|
#include "opal/prefetch.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/constants.h"
|
|
|
|
#include "ompi/mca/pml/pml.h"
|
|
|
|
#include "ompi/mca/btl/btl.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "ompi/mca/mpool/mpool.h"
|
2005-05-24 02:06:50 +04:00
|
|
|
#include "pml_ob1.h"
|
|
|
|
#include "pml_ob1_hdr.h"
|
|
|
|
#include "pml_ob1_sendreq.h"
|
2005-06-10 00:16:33 +04:00
|
|
|
#include "pml_ob1_rdmafrag.h"
|
2005-05-24 02:06:50 +04:00
|
|
|
#include "pml_ob1_recvreq.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/mca/bml/base/base.h"
|
2008-02-12 21:01:17 +03:00
|
|
|
#include "ompi/memchecker.h"
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2007-06-03 12:30:07 +04:00
|
|
|
OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, ompi_free_list_item_t,
|
|
|
|
NULL, NULL);
|
2006-10-26 17:21:47 +04:00
|
|
|
|
|
|
|
/**
 * Retry the send requests queued on mca_pml_ob1.send_pending.
 *
 * The length of the pending list is sampled once on entry and at most that
 * many requests are dequeued, so a request that is put back on the list
 * below cannot keep this call looping.
 *
 * @param bml_btl  BML BTL whose btl module is searched for in each request's
 *                 eager BTL array when a pending "start" is retried.
 */
void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
{
    int remaining = opal_list_get_size(&mca_pml_ob1.send_pending);

    /* advance pending requests */
    while(remaining-- > 0) {
        mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
        mca_pml_ob1_send_request_t *sendreq =
            get_request_from_send_pending(&pending_type);

        if(OPAL_UNLIKELY(NULL == sendreq)) {
            /* nothing could be dequeued: we are done */
            return;
        }

        if(MCA_PML_OB1_SEND_PENDING_SCHEDULE == pending_type) {
            /* stop as soon as scheduling reports that resources ran out */
            if(OMPI_ERR_OUT_OF_RESOURCE ==
               mca_pml_ob1_send_request_schedule_exclusive(sendreq)) {
                return;
            }
        } else if(MCA_PML_OB1_SEND_PENDING_START == pending_type) {
            mca_bml_base_btl_t *send_dst = mca_bml_base_btl_array_find(
                    &sendreq->req_endpoint->btl_eager, bml_btl->btl);

            /* the start is only attempted when this btl is one of the
             * request's eager BTLs; on success move on to the next request */
            if(NULL != send_dst &&
               OMPI_ERR_OUT_OF_RESOURCE !=
               mca_pml_ob1_send_request_start_btl(sendreq, send_dst)) {
                continue;
            }

            /* Put the request back on the pending list.  The last argument
             * is true only when no destination was found; per the original
             * note the request is prepended (to minimize reordering) when a
             * destination did exist -- NOTE(review): confirm the flag's
             * meaning against add_request_to_send_pending(). */
            add_request_to_send_pending(sendreq,
                    MCA_PML_OB1_SEND_PENDING_START, NULL == send_dst);

            /* with no destination, try the next request; otherwise give
             * up, there are no more resources on this btl */
            if(NULL != send_dst) {
                return;
            }
        } else {
            orte_output(0, "[%s:%d] wrong send request type\n",
                        __FILE__, __LINE__);
        }
    }
}
|
|
|
|
|
2006-03-16 01:53:41 +03:00
|
|
|
/*
|
|
|
|
* The free call mark the final stage in a request life-cycle. Starting from this
|
|
|
|
* point the request is completed at both PML and user level, and can be used
|
|
|
|
* for others p2p communications. Therefore, in the case of the OB1 PML it should
|
|
|
|
* be added to the free request list.
|
|
|
|
*/
|
2006-03-24 07:21:30 +03:00
|
|
|
static int mca_pml_ob1_send_request_free(struct ompi_request_t** request)
|
2005-05-24 02:06:50 +04:00
|
|
|
{
|
2006-03-16 01:53:41 +03:00
|
|
|
mca_pml_ob1_send_request_t* sendreq = *(mca_pml_ob1_send_request_t**)request;
|
|
|
|
|
|
|
|
assert( false == sendreq->req_send.req_base.req_free_called );
|
|
|
|
|
2006-03-02 03:39:07 +03:00
|
|
|
OPAL_THREAD_LOCK(&ompi_request_lock);
|
2006-03-16 01:53:41 +03:00
|
|
|
sendreq->req_send.req_base.req_free_called = true;
|
2006-03-31 21:09:09 +04:00
|
|
|
|
|
|
|
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY,
|
|
|
|
&(sendreq->req_send.req_base), PERUSE_SEND );
|
|
|
|
|
2006-09-29 03:54:38 +04:00
|
|
|
if( true == sendreq->req_send.req_base.req_pml_complete ) {
|
2007-08-30 16:08:33 +04:00
|
|
|
MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
|
2006-09-29 03:54:38 +04:00
|
|
|
}
|
|
|
|
|
2006-03-02 03:39:07 +03:00
|
|
|
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
2008-04-07 21:46:50 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_defined,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 21:46:50 +04:00
|
|
|
);
|
2006-02-14 12:09:05 +03:00
|
|
|
*request = MPI_REQUEST_NULL;
|
2005-05-24 02:06:50 +04:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mca_pml_ob1_send_request_cancel(struct ompi_request_t* request, int complete)
|
|
|
|
{
|
|
|
|
/* we dont cancel send requests by now */
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Constructor for mca_pml_ob1_send_request_t: tags the request as a send,
 * installs the free / cancel hooks, resets the RDMA count and the send
 * throttle flag, and constructs the send-range list together with its lock.
 */
static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
{
    struct ompi_request_t* ompi_req = &req->req_send.req_base.req_ompi;

    /* hooks on the embedded ompi request */
    ompi_req->req_free   = mca_pml_ob1_send_request_free;
    ompi_req->req_cancel = mca_pml_ob1_send_request_cancel;

    req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND;

    /* ob1 specific state */
    req->req_rdma_cnt       = 0;
    req->req_throttle_sends = false;

    OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
    OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
}
|
|
|
|
|
|
|
|
/*
 * Destructor for mca_pml_ob1_send_request_t: tears down the two members
 * that the constructor set up with OBJ_CONSTRUCT (the send-range list and
 * the lock that accompanies it).
 */
static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
{
    /* list of send ranges */
    OBJ_DESTRUCT( &req->req_send_ranges );

    /* lock associated with the send ranges */
    OBJ_DESTRUCT( &req->req_send_range_lock );
}
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
/*
 * Class instance for mca_pml_ob1_send_request_t, registered on top of
 * mca_pml_base_send_request_t with the constructor / destructor defined
 * above.
 */
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t, mca_pml_base_send_request_t,
                    mca_pml_ob1_send_request_construct,
                    mca_pml_ob1_send_request_destruct );
|
2005-05-24 02:06:50 +04:00
|
|
|
|
2005-09-14 21:08:08 +04:00
|
|
|
/**
|
|
|
|
* Completion of a short message - nothing left to schedule.
|
|
|
|
*/
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
static void
|
|
|
|
mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
int status )
|
2005-09-14 21:08:08 +04:00
|
|
|
{
|
|
|
|
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
|
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
|
|
|
|
|
2006-07-06 03:39:13 +04:00
|
|
|
if( sendreq->req_send.req_bytes_packed > 0 ) {
|
|
|
|
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
|
|
|
|
&(sendreq->req_send.req_base), PERUSE_SEND );
|
|
|
|
}
|
2006-03-31 21:09:09 +04:00
|
|
|
|
2005-09-14 21:08:08 +04:00
|
|
|
/* check completion status */
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
2005-09-14 21:08:08 +04:00
|
|
|
/* TSW - FIX */
|
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not thei opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so you if explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent view the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-14 00:00:55 +04:00
|
|
|
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2005-09-14 21:08:08 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* signal request completion */
|
2007-08-30 16:08:33 +04:00
|
|
|
send_request_pml_complete(sendreq);
|
2006-07-20 18:44:35 +04:00
|
|
|
|
|
|
|
/* check for pending requests */
|
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-09-14 21:08:08 +04:00
|
|
|
}
|
|
|
|
|
2005-07-18 22:54:25 +04:00
|
|
|
/*
|
|
|
|
* Completion of the first fragment of a long message that
|
|
|
|
* requires an acknowledgement
|
2005-06-09 00:37:19 +04:00
|
|
|
*/
|
2007-07-11 03:45:23 +04:00
|
|
|
static void
|
|
|
|
mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
int status )
|
2005-06-09 00:37:19 +04:00
|
|
|
{
|
|
|
|
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
|
2007-01-03 18:19:34 +03:00
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)descriptor->des_context;
|
|
|
|
size_t req_bytes_delivered = 0;
|
2006-02-08 09:03:54 +03:00
|
|
|
|
2007-07-06 19:02:36 +04:00
|
|
|
if( sendreq->req_send.req_bytes_packed > 0 ) {
|
2006-07-06 03:39:13 +04:00
|
|
|
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
|
|
|
|
&(sendreq->req_send.req_base), PERUSE_SEND );
|
|
|
|
}
|
2006-03-31 21:09:09 +04:00
|
|
|
|
2005-06-10 00:16:33 +04:00
|
|
|
/* check completion status */
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
2005-06-10 00:16:33 +04:00
|
|
|
/* TSW - FIX */
|
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not thei opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so you if explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent view the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-14 00:00:55 +04:00
|
|
|
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2005-06-10 00:16:33 +04:00
|
|
|
}
|
2005-06-09 00:37:19 +04:00
|
|
|
|
2006-02-08 09:03:54 +03:00
|
|
|
/* count bytes of user data actually delivered. As the rndv completion only
|
|
|
|
* happens in one thread, the increase of the req_bytes_delivered does not
|
|
|
|
* have to be atomic.
|
|
|
|
*/
|
|
|
|
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src,
|
|
|
|
descriptor->des_src_cnt,
|
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
2007-01-03 18:19:34 +03:00
|
|
|
req_bytes_delivered );
|
2005-06-09 00:37:19 +04:00
|
|
|
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
|
|
|
|
2005-07-18 22:54:25 +04:00
|
|
|
/* advance the request */
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
|
|
|
|
|
|
|
|
send_request_pml_complete_check(sendreq);
|
|
|
|
|
2005-07-19 01:22:55 +04:00
|
|
|
/* check for pending requests */
|
2006-07-20 18:44:35 +04:00
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-07-18 22:54:25 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-08-17 22:23:38 +04:00
|
|
|
/**
|
|
|
|
* Completion of a get request.
|
|
|
|
*/
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
static void
|
|
|
|
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* des,
|
|
|
|
int status )
|
2005-08-17 22:23:38 +04:00
|
|
|
{
|
|
|
|
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
|
2006-07-20 18:44:35 +04:00
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
2006-02-08 09:03:54 +03:00
|
|
|
size_t req_bytes_delivered = 0;
|
2005-08-17 22:23:38 +04:00
|
|
|
|
|
|
|
/* count bytes of user data actually delivered and check for request completion */
|
2006-02-08 09:03:54 +03:00
|
|
|
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
|
|
|
|
0, req_bytes_delivered );
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
|
|
|
|
|
|
|
send_request_pml_complete_check(sendreq);
|
2008-03-25 04:43:41 +03:00
|
|
|
/* free the descriptor */
|
|
|
|
mca_bml_base_free(bml_btl, des);
|
2006-07-20 18:44:35 +04:00
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-08-17 22:23:38 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Completion of a control message - return resources.
|
|
|
|
*/
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
static void
|
|
|
|
mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
int status )
|
2005-08-17 22:23:38 +04:00
|
|
|
{
|
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
|
2006-07-20 18:44:35 +04:00
|
|
|
|
|
|
|
/* check for pending requests */
|
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-08-17 22:23:38 +04:00
|
|
|
}
|
|
|
|
|
2005-07-18 22:54:25 +04:00
|
|
|
/**
|
|
|
|
* Completion of additional fragments of a large message - may need
|
|
|
|
* to schedule additional fragments.
|
|
|
|
*/
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
static void
|
|
|
|
mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
int status )
|
2005-07-18 22:54:25 +04:00
|
|
|
{
|
|
|
|
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata;
|
2007-01-03 17:44:20 +03:00
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context;
|
2006-02-08 09:03:54 +03:00
|
|
|
size_t req_bytes_delivered = 0;
|
2005-07-18 22:54:25 +04:00
|
|
|
|
|
|
|
/* check completion status */
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
2005-07-18 22:54:25 +04:00
|
|
|
/* TSW - FIX */
|
This commit represents a bunch of work on a Mercurial side branch. As
such, the commit message back to the master SVN repository is fairly
long.
= ORTE Job-Level Output Messages =
Add two new interfaces that should be used for all new code throughout
the ORTE and OMPI layers (we already make the search-and-replace on
the existing ORTE / OMPI layers):
* orte_output(): (and corresponding friends ORTE_OUTPUT,
orte_output_verbose, etc.) This function sends the output directly
to the HNP for processing as part of a job-specific output
channel. It supports all the same outputs as opal_output()
(syslog, file, stdout, stderr), but for stdout/stderr, the output
is sent to the HNP for processing and output. More on this below.
* orte_show_help(): This function is a drop-in-replacement for
opal_show_help(), with two differences in functionality:
1. the rendered text help message output is sent to the HNP for
display (rather than outputting directly into the process' stderr
stream)
1. the HNP detects duplicate help messages and does not display them
(so that you don't see the same error message N times, once from
each of your N MPI processes); instead, it counts "new" instances
of the help message and displays a message every ~5 seconds when
there are new ones ("I got X new copies of the help message...")
opal_show_help and opal_output still exist, but they only output in
the current process. The intent for the new orte_* functions is that
they can apply job-level intelligence to the output. As such, we
recommend that all new ORTE and OMPI code use the new orte_*
functions, not thei opal_* functions.
=== New code ===
For ORTE and OMPI programmers, here's what you need to do differently
in new code:
* Do not include opal/util/show_help.h or opal/util/output.h.
Instead, include orte/util/output.h (this one header file has
declarations for both the orte_output() series of functions and
orte_show_help()).
* Effectively s/opal_output/orte_output/gi throughout your code.
Note that orte_output_open() takes a slightly different argument
list (as a way to pass data to the filtering stream -- see below),
so you if explicitly call opal_output_open(), you'll need to
slightly adapt to the new signature of orte_output_open().
* Literally s/opal_show_help/orte_show_help/. The function signature
is identical.
=== Notes ===
* orte_output'ing to stream 0 will do similar to what
opal_output'ing did, so leaving a hard-coded "0" as the first
argument is safe.
* For systems that do not use ORTE's RML or the HNP, the effect of
orte_output_* and orte_show_help will be identical to their opal
counterparts (the additional information passed to
orte_output_open() will be lost!). Indeed, the orte_* functions
simply become trivial wrappers to their opal_* counterparts. Note
that we have not tested this; the code is simple but it is quite
possible that we mucked something up.
= Filter Framework =
Messages sent view the new orte_* functions described above and
messages output via the IOF on the HNP will now optionally be passed
through a new "filter" framework before being output to
stdout/stderr. The "filter" OPAL MCA framework is intended to allow
preprocessing to messages before they are sent to their final
destinations. The first component that was written in the filter
framework was to create an XML stream, segregating all the messages
into different XML tags, etc. This will allow 3rd party tools to read
the stdout/stderr from the HNP and be able to know exactly what each
text message is (e.g., a help message, another OMPI infrastructure
message, stdout from the user process, stderr from the user process,
etc.).
Filtering is not active by default. Filter components must be
specifically requested, such as:
{{{
$ mpirun --mca filter xml ...
}}}
There can only be one filter component active.
= New MCA Parameters =
The new functionality described above introduces two new MCA
parameters:
* '''orte_base_help_aggregate''': Defaults to 1 (true), meaning that
help messages will be aggregated, as described above. If set to 0,
all help messages will be displayed, even if they are duplicates
(i.e., the original behavior).
* '''orte_base_show_output_recursions''': An MCA parameter to help
debug one of the known issues, described below. It is likely that
this MCA parameter will disappear before v1.3 final.
= Known Issues =
* The XML filter component is not complete. The current output from
this component is preliminary and not real XML. A bit more work
needs to be done to configure.m4 search for an appropriate XML
library/link it in/use it at run time.
* There are possible recursion loops in the orte_output() and
orte_show_help() functions -- e.g., if RML send calls orte_output()
or orte_show_help(). We have some ideas how to fix these, but
figured that it was ok to commit before feature freeze with known
issues. The code currently contains sub-optimal workarounds so
that this will not be a problem, but it would be good to actually
solve the problem rather than have hackish workarounds before v1.3 final.
This commit was SVN r18434.
2008-05-14 00:00:55 +04:00
|
|
|
orte_output(0, "%s:%d FATAL", __FILE__, __LINE__);
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2005-06-09 00:37:19 +04:00
|
|
|
}
|
|
|
|
|
2005-08-17 22:23:38 +04:00
|
|
|
/* count bytes of user data actually delivered */
|
2006-02-08 09:03:54 +03:00
|
|
|
MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src,
|
|
|
|
descriptor->des_src_cnt,
|
|
|
|
sizeof(mca_pml_ob1_frag_hdr_t),
|
|
|
|
req_bytes_delivered );
|
2006-07-20 18:44:35 +04:00
|
|
|
|
2007-01-03 17:44:20 +03:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
|
|
|
|
|
|
|
if(send_request_pml_complete_check(sendreq) == false)
|
2005-07-18 22:54:25 +04:00
|
|
|
mca_pml_ob1_send_request_schedule(sendreq);
|
2005-06-10 00:16:33 +04:00
|
|
|
|
2005-07-19 01:22:55 +04:00
|
|
|
/* check for pending requests */
|
2006-07-20 18:44:35 +04:00
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-05-24 02:06:50 +04:00
|
|
|
}
|
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/**
|
|
|
|
* Buffer the entire message and mark as complete.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_pml_ob1_send_request_start_buffered(
|
|
|
|
mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_bml_base_btl_t* bml_btl,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
mca_btl_base_descriptor_t* descriptor;
|
|
|
|
mca_btl_base_segment_t* segment;
|
|
|
|
mca_pml_ob1_hdr_t* hdr;
|
|
|
|
struct iovec iov;
|
|
|
|
unsigned int iov_count;
|
|
|
|
size_t max_data;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* allocate descriptor */
|
2007-05-24 23:51:26 +04:00
|
|
|
mca_bml_base_alloc(bml_btl, &descriptor,
|
2007-12-09 17:08:01 +03:00
|
|
|
MCA_BTL_NO_ORDER,
|
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t) + size,
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == descriptor) ) {
|
2005-09-15 22:47:59 +04:00
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
segment = descriptor->des_src;
|
|
|
|
|
|
|
|
/* pack the data into the BTL supplied buffer */
|
2007-01-05 01:07:37 +03:00
|
|
|
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
|
2006-08-24 20:38:08 +04:00
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t));
|
2005-09-15 22:47:59 +04:00
|
|
|
iov.iov_len = size;
|
|
|
|
iov_count = 1;
|
|
|
|
max_data = size;
|
2007-07-11 02:16:38 +04:00
|
|
|
if((rc = ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
|
|
|
|
&iov,
|
|
|
|
&iov_count,
|
|
|
|
&max_data)) < 0) {
|
2005-09-15 22:47:59 +04:00
|
|
|
mca_bml_base_free(bml_btl, descriptor);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* build rendezvous header */
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
2005-09-15 22:47:59 +04:00
|
|
|
hdr->hdr_common.hdr_flags = 0;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
2006-08-24 20:38:08 +04:00
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
2005-09-15 22:47:59 +04:00
|
|
|
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
2005-09-15 22:47:59 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/* update lengths */
|
|
|
|
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
|
|
|
|
|
|
|
|
descriptor->des_cbfunc = mca_pml_ob1_rndv_completion;
|
|
|
|
descriptor->des_cbdata = sendreq;
|
|
|
|
|
|
|
|
/* buffer the remainder of the message */
|
|
|
|
rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
2005-09-15 22:47:59 +04:00
|
|
|
mca_bml_base_free(bml_btl, descriptor);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2007-06-03 12:30:07 +04:00
|
|
|
iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data);
|
|
|
|
iov.iov_len = max_data = sendreq->req_send.req_bytes_packed - max_data;
|
2005-09-15 22:47:59 +04:00
|
|
|
|
2007-07-11 02:16:38 +04:00
|
|
|
if((rc = ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
|
2006-04-01 11:42:43 +04:00
|
|
|
&iov,
|
|
|
|
&iov_count,
|
2006-10-27 03:11:26 +04:00
|
|
|
&max_data)) < 0) {
|
2006-04-01 11:42:43 +04:00
|
|
|
mca_bml_base_free(bml_btl, descriptor);
|
|
|
|
return rc;
|
2005-09-15 22:47:59 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* re-init convertor for packed data */
|
2007-07-11 02:16:38 +04:00
|
|
|
ompi_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor,
|
2006-07-06 21:58:25 +04:00
|
|
|
MPI_BYTE,
|
|
|
|
sendreq->req_send.req_bytes_packed,
|
2006-04-01 11:42:43 +04:00
|
|
|
sendreq->req_send.req_addr );
|
2007-08-30 16:08:33 +04:00
|
|
|
|
|
|
|
/* wait for ack and completion */
|
|
|
|
sendreq->req_state = 2;
|
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/* request is complete at mpi level */
|
|
|
|
OPAL_THREAD_LOCK(&ompi_request_lock);
|
|
|
|
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
|
|
|
|
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
|
|
|
|
|
|
|
/* send */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_RNDV);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
2005-09-15 22:47:59 +04:00
|
|
|
mca_bml_base_free(bml_btl, descriptor );
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2006-12-02 00:52:07 +03:00
|
|
|
* We work on a buffered request with a size smaller than the eager size
|
|
|
|
* or the BTL is not able to send the data IN_PLACE. Request a segment
|
|
|
|
* that is used for initial hdr and any eager data. This is used only
|
|
|
|
* from the _START macro.
|
2005-09-15 22:47:59 +04:00
|
|
|
*/
|
2006-12-02 00:52:07 +03:00
|
|
|
int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_bml_base_btl_t* bml_btl,
|
|
|
|
size_t size )
|
2005-09-15 22:47:59 +04:00
|
|
|
{
|
|
|
|
mca_btl_base_descriptor_t* descriptor;
|
|
|
|
mca_btl_base_segment_t* segment;
|
|
|
|
mca_pml_ob1_hdr_t* hdr;
|
|
|
|
struct iovec iov;
|
|
|
|
unsigned int iov_count;
|
2006-10-28 03:16:13 +04:00
|
|
|
size_t max_data = size;
|
2005-09-15 22:47:59 +04:00
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* allocate descriptor */
|
2007-12-09 16:58:17 +03:00
|
|
|
mca_bml_base_alloc( bml_btl, &descriptor,
|
|
|
|
MCA_BTL_NO_ORDER,
|
2007-12-09 17:08:01 +03:00
|
|
|
sizeof(mca_pml_ob1_match_hdr_t) + size,
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
2007-12-09 16:58:17 +03:00
|
|
|
if( OPAL_UNLIKELY(NULL == descriptor) ) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
segment = descriptor->des_src;
|
2006-10-29 11:12:44 +03:00
|
|
|
|
2007-12-09 16:58:17 +03:00
|
|
|
if(size > 0) {
|
2006-10-29 11:12:44 +03:00
|
|
|
/* pack the data into the supplied buffer */
|
2007-12-09 16:58:17 +03:00
|
|
|
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
|
|
|
|
sizeof(mca_pml_ob1_match_hdr_t));
|
2008-02-12 11:46:27 +03:00
|
|
|
iov.iov_len = size;
|
|
|
|
iov_count = 1;
|
|
|
|
/*
|
|
|
|
* Before copy the user buffer, make the target part
|
|
|
|
* accessable.
|
|
|
|
*/
|
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_defined,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-02-12 11:46:27 +03:00
|
|
|
);
|
2007-07-11 02:16:38 +04:00
|
|
|
(void)ompi_convertor_pack( &sendreq->req_send.req_base.req_convertor,
|
2008-02-12 11:46:27 +03:00
|
|
|
&iov, &iov_count, &max_data );
|
|
|
|
/*
|
|
|
|
* Packing finished, make the user buffer unaccessable.
|
|
|
|
*/
|
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-02-12 11:46:27 +03:00
|
|
|
);
|
2005-09-15 22:47:59 +04:00
|
|
|
}
|
2008-02-12 11:46:27 +03:00
|
|
|
|
2006-02-08 20:39:33 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/* build match header */
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
2005-09-15 22:47:59 +04:00
|
|
|
hdr->hdr_common.hdr_flags = 0;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
2006-08-24 20:38:08 +04:00
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
2005-09-15 22:47:59 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/* update lengths */
|
|
|
|
segment->seg_len = sizeof(mca_pml_ob1_match_hdr_t) + max_data;
|
|
|
|
|
|
|
|
/* short message */
|
|
|
|
descriptor->des_cbdata = sendreq;
|
2007-12-09 16:58:17 +03:00
|
|
|
descriptor->des_cbfunc = mca_pml_ob1_match_completion_free;
|
2005-09-15 22:47:59 +04:00
|
|
|
|
|
|
|
|
|
|
|
/* send */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send_status(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH);
|
2007-12-09 17:13:24 +03:00
|
|
|
switch(rc) {
|
|
|
|
case OMPI_SUCCESS:
|
|
|
|
/* packet is on wire; signal request completion */
|
|
|
|
OPAL_THREAD_LOCK(&ompi_request_lock);
|
|
|
|
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
|
|
|
|
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
|
|
|
break;
|
|
|
|
case OMPI_ERR_RESOURCE_BUSY:
|
|
|
|
/* don't signal request completion; will be completed in wait() */
|
|
|
|
rc = OMPI_SUCCESS;
|
|
|
|
break;
|
|
|
|
default:
|
2007-12-09 16:58:17 +03:00
|
|
|
mca_bml_base_free(bml_btl, descriptor);
|
2007-12-09 17:13:24 +03:00
|
|
|
break;
|
|
|
|
}
|
2005-09-15 22:47:59 +04:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* BTL can send directly from user buffer so allow the BTL
|
2006-03-16 01:53:41 +03:00
|
|
|
* to prepare the segment list. Start sending a small message.
|
2005-09-15 22:47:59 +04:00
|
|
|
*/
|
|
|
|
|
2006-12-02 00:52:07 +03:00
|
|
|
int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_bml_base_btl_t* bml_btl,
|
|
|
|
size_t size )
|
2005-09-15 22:47:59 +04:00
|
|
|
{
|
|
|
|
mca_btl_base_descriptor_t* descriptor;
|
|
|
|
mca_btl_base_segment_t* segment;
|
|
|
|
mca_pml_ob1_hdr_t* hdr;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* prepare descriptor */
|
2006-12-29 10:40:02 +03:00
|
|
|
mca_bml_base_prepare_src( bml_btl,
|
|
|
|
NULL,
|
2007-07-11 02:16:38 +04:00
|
|
|
&sendreq->req_send.req_base.req_convertor,
|
2007-05-24 23:51:26 +04:00
|
|
|
MCA_BTL_NO_ORDER,
|
2006-12-29 10:40:02 +03:00
|
|
|
sizeof(mca_pml_ob1_match_hdr_t),
|
|
|
|
&size,
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
|
2007-12-09 17:08:01 +03:00
|
|
|
&descriptor );
|
2006-12-29 10:40:02 +03:00
|
|
|
if( OPAL_UNLIKELY(NULL == descriptor) ) {
|
2005-09-15 22:47:59 +04:00
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
2006-03-16 01:53:41 +03:00
|
|
|
}
|
2005-09-15 22:47:59 +04:00
|
|
|
segment = descriptor->des_src;
|
|
|
|
|
|
|
|
/* build match header */
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
2005-09-15 22:47:59 +04:00
|
|
|
hdr->hdr_common.hdr_flags = 0;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
2006-08-24 20:38:08 +04:00
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
2005-09-15 22:47:59 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
/* short message */
|
|
|
|
descriptor->des_cbfunc = mca_pml_ob1_match_completion_free;
|
2006-03-16 01:53:41 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
descriptor->des_cbdata = sendreq;
|
|
|
|
|
|
|
|
/* send */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH);
|
2006-12-29 10:40:02 +03:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
2005-09-15 22:47:59 +04:00
|
|
|
mca_bml_base_free(bml_btl, descriptor );
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-09-13 02:28:23 +04:00
|
|
|
/**
|
|
|
|
* We have contigous data that is registered - schedule across
|
|
|
|
* available nics.
|
|
|
|
*/
|
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
int mca_pml_ob1_send_request_start_rdma(
|
2007-07-11 03:45:23 +04:00
|
|
|
mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_bml_base_btl_t* bml_btl,
|
|
|
|
size_t size)
|
2005-09-13 02:28:23 +04:00
|
|
|
{
|
|
|
|
/*
|
2007-01-18 12:15:18 +03:00
|
|
|
* When req_rdma array is constructed the firs element of the array always
|
|
|
|
* assigned different btl in round robin fashion (if there are more than
|
|
|
|
* one RDMA capable BTLs). This way round robin distribution of RDMA
|
|
|
|
* operation is achieved.
|
2005-09-13 02:28:23 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
|
|
|
|
mca_btl_base_descriptor_t* src;
|
|
|
|
mca_btl_base_descriptor_t* des;
|
|
|
|
mca_btl_base_segment_t* segment;
|
|
|
|
mca_pml_ob1_hdr_t* hdr;
|
|
|
|
size_t i;
|
|
|
|
int rc;
|
|
|
|
|
2005-11-03 23:52:56 +03:00
|
|
|
|
2005-09-15 22:47:59 +04:00
|
|
|
bml_btl = sendreq->req_rdma[0].bml_btl;
|
2005-11-03 23:52:56 +03:00
|
|
|
if(sendreq->req_rdma_cnt == 1 &&
|
|
|
|
bml_btl->btl_flags & MCA_BTL_FLAGS_GET) {
|
2007-07-11 02:16:38 +04:00
|
|
|
size_t old_position = sendreq->req_send.req_base.req_convertor.bConverted;
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_defined,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2006-12-17 15:26:41 +03:00
|
|
|
/* prepare source descriptor/segment(s) */
|
2008-03-25 04:43:41 +03:00
|
|
|
/* PML owns this descriptor and will free it in */
|
|
|
|
/* get_completion */
|
2007-07-11 02:16:38 +04:00
|
|
|
mca_bml_base_prepare_src( bml_btl,
|
|
|
|
reg,
|
|
|
|
&sendreq->req_send.req_base.req_convertor,
|
|
|
|
MCA_BTL_NO_ORDER,
|
|
|
|
0,
|
|
|
|
&size,
|
2008-03-25 04:43:41 +03:00
|
|
|
0,
|
2007-07-11 02:16:38 +04:00
|
|
|
&src );
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == src) ) {
|
|
|
|
ompi_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
|
|
|
|
&old_position);
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
src->des_cbfunc = mca_pml_ob1_rget_completion;
|
|
|
|
src->des_cbdata = sendreq;
|
|
|
|
|
|
|
|
/* allocate space for get hdr + segment list */
|
|
|
|
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
2007-12-09 17:08:01 +03:00
|
|
|
sizeof(mca_pml_ob1_rget_hdr_t) +
|
|
|
|
(sizeof(mca_btl_base_segment_t) * (src->des_src_cnt-1)),
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
2008-03-18 06:03:33 +03:00
|
|
|
ompi_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
|
|
|
|
&old_position );
|
2007-07-11 03:45:23 +04:00
|
|
|
mca_bml_base_free(bml_btl, src);
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
segment = des->des_src;
|
|
|
|
|
|
|
|
/* build match header */
|
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
|
|
|
hdr->hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
|
|
|
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
|
|
|
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
|
|
|
hdr->hdr_rget.hdr_des.pval = src;
|
|
|
|
hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt;
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
for( i = 0; i < src->des_src_cnt; i++ ) {
|
|
|
|
hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval);
|
|
|
|
hdr->hdr_rget.hdr_segs[i].seg_len = src->des_src[i].seg_len;
|
|
|
|
hdr->hdr_rget.hdr_segs[i].seg_key.key64 = src->des_src[i].seg_key.key64;
|
|
|
|
}
|
|
|
|
|
|
|
|
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Well, it's a get so we will not know when the peer get the data anyway.
|
|
|
|
* If we generate the PERUSE event here, at least we will know when do we
|
|
|
|
* sent the GET message ...
|
|
|
|
*/
|
|
|
|
if( sendreq->req_send.req_bytes_packed > 0 ) {
|
|
|
|
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN,
|
|
|
|
&(sendreq->req_send.req_base), PERUSE_SEND );
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
/* allocate a rendezvous header - dont eager send any data
|
|
|
|
* receiver will schedule rdma put(s) of the entire message
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_bml_base_alloc(bml_btl, &des,
|
2007-12-09 17:08:01 +03:00
|
|
|
MCA_BTL_NO_ORDER,
|
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == des)) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
segment = des->des_src;
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
/* build hdr */
|
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
|
|
|
hdr->hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
|
|
|
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
|
|
|
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
/* update lengths with number of bytes actually packed */
|
|
|
|
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t);
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
/* first fragment of a long message */
|
|
|
|
des->des_cbfunc = mca_pml_ob1_rndv_completion;
|
2007-08-30 16:08:33 +04:00
|
|
|
|
|
|
|
/* wait for ack and completion */
|
|
|
|
sendreq->req_state = 2;
|
2005-09-13 02:28:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
des->des_cbdata = sendreq;
|
|
|
|
|
|
|
|
/* send */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
2005-09-13 02:28:23 +04:00
|
|
|
mca_bml_base_free(bml_btl, des);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Rendezvous is required. Not doing rdma so eager send up to
|
|
|
|
* the btls eager limit.
|
|
|
|
*/
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_bml_base_btl_t* bml_btl,
|
|
|
|
size_t size,
|
|
|
|
int flags )
|
2005-09-13 02:28:23 +04:00
|
|
|
{
|
|
|
|
mca_btl_base_descriptor_t* des;
|
|
|
|
mca_btl_base_segment_t* segment;
|
|
|
|
mca_pml_ob1_hdr_t* hdr;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* prepare descriptor */
|
2005-09-14 06:17:04 +04:00
|
|
|
if(size == 0) {
|
2007-07-11 02:16:38 +04:00
|
|
|
mca_bml_base_alloc( bml_btl,
|
|
|
|
&des,
|
|
|
|
MCA_BTL_NO_ORDER,
|
2007-12-09 17:08:01 +03:00
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
|
2005-09-14 06:17:04 +04:00
|
|
|
} else {
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_defined,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2007-07-11 02:16:38 +04:00
|
|
|
mca_bml_base_prepare_src( bml_btl,
|
|
|
|
NULL,
|
|
|
|
&sendreq->req_send.req_base.req_convertor,
|
|
|
|
MCA_BTL_NO_ORDER,
|
|
|
|
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
|
|
|
&size,
|
2008-02-18 20:39:30 +03:00
|
|
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
|
2007-07-11 02:16:38 +04:00
|
|
|
&des );
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2005-09-14 06:17:04 +04:00
|
|
|
}
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
2005-09-13 02:28:23 +04:00
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
segment = des->des_src;
|
|
|
|
|
|
|
|
/* build hdr */
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
2005-11-11 18:33:25 +03:00
|
|
|
hdr->hdr_common.hdr_flags = flags;
|
2005-09-13 02:28:23 +04:00
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
|
|
|
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
|
|
|
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
|
|
|
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
2006-08-24 20:38:08 +04:00
|
|
|
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
2005-09-13 02:28:23 +04:00
|
|
|
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
2007-01-05 01:07:37 +03:00
|
|
|
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2005-09-13 02:28:23 +04:00
|
|
|
/* first fragment of a long message */
|
|
|
|
des->des_cbdata = sendreq;
|
|
|
|
des->des_cbfunc = mca_pml_ob1_rndv_completion;
|
|
|
|
|
2007-08-30 16:08:33 +04:00
|
|
|
/* wait for ack and completion */
|
|
|
|
sendreq->req_state = 2;
|
|
|
|
|
2005-09-13 02:28:23 +04:00
|
|
|
/* send */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV);
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
2005-09-13 02:28:23 +04:00
|
|
|
mca_bml_base_free(bml_btl, des );
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
/**
 * Queue a new send range on the request.
 *
 * A range item is taken from the mca_pml_ob1.send_ranges free list,
 * filled with the given offset and length, given up to
 * mca_pml_ob1.max_send_per_range BTLs from the endpoint's btl_send array
 * (with the length split between them by weight), and appended to the
 * request's req_send_ranges list under req_send_range_lock.
 *
 * @param sendreq      request the range belongs to
 * @param send_offset  offset of the range within the message
 * @param send_length  length of the range; a zero length is a no-op
 */
void mca_pml_ob1_send_request_copy_in_out( mca_pml_ob1_send_request_t *sendreq,
                                           uint64_t send_offset,
                                           uint64_t send_length )
{
    mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
    mca_pml_ob1_send_range_t *sr;
    ompi_free_list_item_t *item;
    double weight_total = 0;
    int rc = OMPI_SUCCESS;
    int n = 0;

    /* nothing to schedule */
    if( OPAL_UNLIKELY(0 == send_length) ) {
        return;
    }

    /* grab a range item from the free list */
    OMPI_FREE_LIST_WAIT(&mca_pml_ob1.send_ranges, item, rc);
    sr = (mca_pml_ob1_send_range_t*)item;

    sr->range_btl_idx = 0;
    sr->range_send_offset = send_offset;
    sr->range_send_length = send_length;

    /* pick the BTLs for this range and add up their weights */
    while( n < num_btls && n < mca_pml_ob1.max_send_per_range ) {
        sr->range_btls[n].bml_btl =
            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
        weight_total += sr->range_btls[n].bml_btl->btl_weight;
        n++;
    }
    sr->range_btl_cnt = n;

    /* split the range length between the chosen BTLs */
    mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length,
                                     weight_total);

    /* publish the range on the request */
    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
    opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr);
    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
}
|
2005-09-13 02:28:23 +04:00
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
static inline mca_pml_ob1_send_range_t *
|
|
|
|
get_send_range_nolock(mca_pml_ob1_send_request_t* sendreq)
|
|
|
|
{
|
|
|
|
opal_list_item_t *item;
|
|
|
|
|
|
|
|
item = opal_list_get_first(&sendreq->req_send_ranges);
|
|
|
|
|
|
|
|
if(opal_list_get_end(&sendreq->req_send_ranges) == item)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return (mca_pml_ob1_send_range_t*)item;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline mca_pml_ob1_send_range_t *
|
|
|
|
get_send_range(mca_pml_ob1_send_request_t* sendreq)
|
|
|
|
{
|
|
|
|
mca_pml_ob1_send_range_t *range;
|
|
|
|
|
|
|
|
OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
|
|
|
|
range = get_send_range_nolock(sendreq);
|
|
|
|
OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
|
|
|
|
|
|
|
|
return range;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline mca_pml_ob1_send_range_t *
|
|
|
|
get_next_send_range(mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_pml_ob1_send_range_t *range)
|
|
|
|
{
|
|
|
|
OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
|
|
|
|
opal_list_remove_item(&sendreq->req_send_ranges, (opal_list_item_t *)range);
|
|
|
|
OMPI_FREE_LIST_RETURN(&mca_pml_ob1.send_ranges, &range->base);
|
|
|
|
range = get_send_range_nolock(sendreq);
|
|
|
|
OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
|
|
|
|
|
|
|
|
return range;
|
|
|
|
}
|
|
|
|
|
2005-09-14 21:08:08 +04:00
|
|
|
/**
|
2005-09-15 22:47:59 +04:00
|
|
|
* Schedule pipeline of send descriptors for the given request.
|
|
|
|
* Up to the rdma threshold. If this is a send based protocol,
|
|
|
|
* the rdma threshold is the end of the message. Otherwise, schedule
|
|
|
|
* fragments up to the threshold to overlap initial registration/setup
|
2006-07-20 18:44:35 +04:00
|
|
|
* costs of the rdma. Only one thread can be inside this function.
|
2005-06-01 18:34:22 +04:00
|
|
|
*/
|
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
int
|
2007-09-12 11:08:38 +04:00
|
|
|
mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
|
2005-06-01 18:34:22 +04:00
|
|
|
{
|
2007-08-30 16:10:04 +04:00
|
|
|
size_t prev_bytes_remaining = 0;
|
|
|
|
mca_pml_ob1_send_range_t *range;
|
|
|
|
int num_fail = 0;
|
|
|
|
|
|
|
|
/* check pipeline_depth here before attempting to get any locks */
|
|
|
|
if(true == sendreq->req_throttle_sends &&
|
|
|
|
sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
|
|
|
|
range = get_send_range(sendreq);
|
|
|
|
|
|
|
|
while(range && (false == sendreq->req_throttle_sends ||
|
|
|
|
sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
|
|
|
|
mca_pml_ob1_frag_hdr_t* hdr;
|
|
|
|
mca_btl_base_descriptor_t* des;
|
|
|
|
int rc, btl_idx;
|
2007-10-18 16:07:37 +04:00
|
|
|
size_t size, offset, data_remaining = 0;
|
2007-08-30 16:10:04 +04:00
|
|
|
mca_bml_base_btl_t* bml_btl;
|
|
|
|
|
|
|
|
assert(range->range_send_length != 0);
|
2005-06-01 18:34:22 +04:00
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
if(prev_bytes_remaining == range->range_send_length)
|
|
|
|
num_fail++;
|
|
|
|
else
|
|
|
|
num_fail = 0;
|
|
|
|
|
|
|
|
prev_bytes_remaining = range->range_send_length;
|
|
|
|
|
|
|
|
if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) {
|
|
|
|
assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE);
|
|
|
|
add_request_to_send_pending(sendreq,
|
|
|
|
MCA_PML_OB1_SEND_PENDING_SCHEDULE, true);
|
|
|
|
/* Note that request remains locked. send_request_process_pending()
|
|
|
|
* function will call shedule_exclusive() directly without taking
|
|
|
|
* the lock */
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2007-10-18 16:07:37 +04:00
|
|
|
cannot_pack:
|
2007-08-30 16:10:04 +04:00
|
|
|
do {
|
|
|
|
btl_idx = range->range_btl_idx;
|
|
|
|
if(++range->range_btl_idx == range->range_btl_cnt)
|
|
|
|
range->range_btl_idx = 0;
|
2007-10-18 16:07:37 +04:00
|
|
|
} while(!range->range_btls[btl_idx].length);
|
|
|
|
|
|
|
|
bml_btl = range->range_btls[btl_idx].bml_btl;
|
|
|
|
/* If there is a remaining data from another BTL that was too small
|
|
|
|
* for converter to pack then send it through another BTL */
|
|
|
|
range->range_btls[btl_idx].length += data_remaining;
|
|
|
|
size = range->range_btls[btl_idx].length;
|
2007-08-30 16:10:04 +04:00
|
|
|
|
|
|
|
/* makes sure that we don't exceed BTL max send size */
|
|
|
|
if(bml_btl->btl_max_send_size != 0)
|
|
|
|
{
|
|
|
|
size_t max_send_size = bml_btl->btl_max_send_size -
|
|
|
|
sizeof(mca_pml_ob1_frag_hdr_t);
|
|
|
|
|
|
|
|
if (size > max_send_size) {
|
|
|
|
size = max_send_size;
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
2007-08-30 16:10:04 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* pack into a descriptor */
|
|
|
|
offset = (size_t)range->range_send_offset;
|
|
|
|
ompi_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
|
|
|
|
&offset);
|
|
|
|
range->range_send_offset = (uint64_t)offset;
|
|
|
|
|
2007-10-18 16:07:37 +04:00
|
|
|
data_remaining = size;
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_defined,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2007-08-30 16:10:04 +04:00
|
|
|
mca_bml_base_prepare_src(bml_btl, NULL,
|
|
|
|
&sendreq->req_send.req_base.req_convertor,
|
|
|
|
MCA_BTL_NO_ORDER,
|
2007-12-09 17:08:01 +03:00
|
|
|
sizeof(mca_pml_ob1_frag_hdr_t),
|
2008-02-18 20:39:30 +03:00
|
|
|
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &des);
|
2008-04-07 11:52:04 +04:00
|
|
|
MEMCHECKER(
|
2008-05-07 16:28:51 +04:00
|
|
|
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
|
|
|
sendreq->req_send.req_base.req_addr,
|
|
|
|
sendreq->req_send.req_base.req_count,
|
|
|
|
sendreq->req_send.req_base.req_datatype);
|
2008-04-07 11:52:04 +04:00
|
|
|
);
|
2007-10-18 16:07:37 +04:00
|
|
|
|
|
|
|
if( OPAL_UNLIKELY(des == NULL || size == 0) ) {
|
|
|
|
if(des) {
|
|
|
|
/* Converter can't pack this chunk. Append to another chunk
|
|
|
|
* from other BTL */
|
|
|
|
mca_bml_base_free(bml_btl, des);
|
|
|
|
range->range_btls[btl_idx].length -= data_remaining;
|
|
|
|
goto cannot_pack;
|
|
|
|
}
|
2007-08-30 16:10:04 +04:00
|
|
|
continue;
|
|
|
|
}
|
2007-10-18 16:07:37 +04:00
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
des->des_cbfunc = mca_pml_ob1_frag_completion;
|
|
|
|
des->des_cbdata = sendreq;
|
2006-07-20 18:44:35 +04:00
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
/* setup header */
|
|
|
|
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_src->seg_addr.pval;
|
|
|
|
hdr->hdr_common.hdr_flags = 0;
|
|
|
|
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
|
|
|
|
hdr->hdr_frag_offset = range->range_send_offset;
|
|
|
|
hdr->hdr_src_req.pval = sendreq;
|
|
|
|
hdr->hdr_dst_req = sendreq->req_recv;
|
2005-06-01 18:34:22 +04:00
|
|
|
|
2007-12-16 11:45:44 +03:00
|
|
|
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
|
|
|
|
sendreq->req_send.req_base.req_proc);
|
2006-02-26 03:45:54 +03:00
|
|
|
|
2006-03-31 21:09:09 +04:00
|
|
|
#if OMPI_WANT_PERUSE
|
2007-08-30 16:10:04 +04:00
|
|
|
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
|
|
|
|
&(sendreq->req_send.req_base), size, PERUSE_SEND);
|
2006-03-31 21:09:09 +04:00
|
|
|
#endif /* OMPI_WANT_PERUSE */
|
|
|
|
|
2007-08-30 16:10:04 +04:00
|
|
|
/* initiate send - note that this may complete before the call returns */
|
2008-05-30 05:29:09 +04:00
|
|
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
|
2007-08-30 16:10:04 +04:00
|
|
|
|
|
|
|
if( OPAL_LIKELY(rc == OMPI_SUCCESS) ) {
|
|
|
|
/* update state */
|
|
|
|
range->range_btls[btl_idx].length -= size;
|
|
|
|
range->range_send_length -= size;
|
|
|
|
range->range_send_offset += size;
|
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
|
|
|
|
if(range->range_send_length == 0) {
|
|
|
|
range = get_next_send_range(sendreq, range);
|
|
|
|
prev_bytes_remaining = 0;
|
2005-06-01 18:34:22 +04:00
|
|
|
}
|
2007-08-30 16:10:04 +04:00
|
|
|
} else {
|
|
|
|
mca_bml_base_free(bml_btl,des);
|
|
|
|
continue;
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
2007-08-30 16:10:04 +04:00
|
|
|
}
|
2006-07-20 18:44:35 +04:00
|
|
|
|
2005-06-01 18:34:22 +04:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-06-10 00:16:33 +04:00
|
|
|
/**
|
|
|
|
* An RDMA put operation has completed:
|
|
|
|
* (1) Update request status and if required set completed
|
|
|
|
* (2) Send FIN control message to the destination
|
|
|
|
*/
|
|
|
|
|
2006-12-02 00:52:07 +03:00
|
|
|
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* ep,
|
|
|
|
struct mca_btl_base_descriptor_t* des,
|
|
|
|
int status )
|
2005-06-10 00:16:33 +04:00
|
|
|
{
|
|
|
|
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
|
2006-08-24 20:38:08 +04:00
|
|
|
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
|
2006-07-20 18:44:35 +04:00
|
|
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
|
2005-06-10 00:16:33 +04:00
|
|
|
|
|
|
|
/* check completion status */
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
2005-06-10 00:16:33 +04:00
|
|
|
/* TSW - FIX */
|
|
|
|
ORTE_ERROR_LOG(status);
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2005-06-10 00:16:33 +04:00
|
|
|
}
|
2006-12-03 11:55:59 +03:00
|
|
|
|
|
|
|
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
2007-05-24 23:51:26 +04:00
|
|
|
bml_btl,
|
|
|
|
frag->rdma_hdr.hdr_rdma.hdr_des.pval,
|
2007-06-03 12:31:58 +04:00
|
|
|
des->order, 0);
|
2007-05-24 23:51:26 +04:00
|
|
|
|
2006-03-16 01:53:41 +03:00
|
|
|
/* check for request completion */
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
|
|
|
|
|
|
|
|
send_request_pml_complete_check(sendreq);
|
2005-06-10 00:16:33 +04:00
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
2005-06-10 00:16:33 +04:00
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
2005-06-10 00:16:33 +04:00
|
|
|
}
|
|
|
|
|
2006-12-02 00:52:07 +03:00
|
|
|
/**
 * Prepare and post the RDMA put described by an RDMA fragment.
 *
 * If no descriptor can be prepared, the fragment is either queued on
 * mca_pml_ob1.rdma_pending for a later retry (while frag->retries is below
 * mca_pml_ob1.rdma_put_retries_limit) or the put is abandoned: a FIN is sent
 * to the receiver and the data is rescheduled through copy in/out sends.
 * If the put itself reports OMPI_ERR_OUT_OF_RESOURCE the fragment is queued
 * on rdma_pending as well; any other put error is logged and aborts the job.
 *
 * @param frag  RDMA fragment; rdma_bml, rdma_length, rdma_segs, rdma_hdr,
 *              rdma_req and convertor are read here
 * @return OMPI_SUCCESS if the put was posted, OMPI_ERR_OUT_OF_RESOURCE if it
 *         was deferred or replaced by the copy in/out fallback
 */
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
{
    mca_mpool_base_registration_t* reg = NULL;
    mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
    mca_btl_base_descriptor_t* des;
    /* prepare_src receives &frag->rdma_length and may overwrite it, so keep
     * the requested length around for the failure paths */
    size_t save_size = frag->rdma_length;
    int rc;

    /* setup descriptor */
    mca_bml_base_prepare_src( bml_btl,
                              reg,
                              &frag->convertor,
                              MCA_BTL_NO_ORDER,
                              0,
                              &frag->rdma_length,
                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                              &des );

    if( OPAL_UNLIKELY(NULL == des) ) {
        if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
            /* retry later: restore the length, rewind the convertor to the
             * RDMA offset and park the fragment on the pending list */
            size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
            frag->rdma_length = save_size;
            ompi_convertor_set_position(&frag->convertor, &offset);
            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
        } else {
            mca_pml_ob1_send_request_t *sendreq =
                (mca_pml_ob1_send_request_t*)frag->rdma_req;

            /* tell receiver to unregister memory */
            mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
                                 bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des.pval,
                                 MCA_BTL_NO_ORDER, 1);

            /* send fragment by copy in/out; nothing was transferred, so the
             * whole originally requested length has to be resent. Use
             * save_size rather than frag->rdma_length, which the failed
             * prepare_src call may have modified. */
            mca_pml_ob1_send_request_copy_in_out(sendreq,
                    frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, save_size);
            /* if a pointer to a receive request is not set it means that
             * ACK was not yet received. Don't schedule sends before ACK */
            if(NULL != sendreq->req_recv.pval)
                mca_pml_ob1_send_request_schedule(sendreq);
            /* NOTE(review): frag is neither queued nor returned to its free
             * list on this path -- looks like a leak unless a caller releases
             * it; confirm against the callers */
        }
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* describe the remote side of the put and hook up the completion */
    des->des_dst = frag->rdma_segs;
    des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
    des->des_cbfunc = mca_pml_ob1_put_completion;
    des->des_cbdata = frag;

    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
                                  &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );

    rc = mca_bml_base_put(bml_btl, des);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
        mca_bml_base_free(bml_btl, des);
        frag->rdma_length = save_size;
        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
            /* transient shortage: queue the fragment for a later retry */
            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
            opal_list_append(&mca_pml_ob1.rdma_pending,
                             (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        } else {
            /* TSW - FIX */
            ORTE_ERROR_LOG(rc);
            orte_errmgr.abort(-1, NULL);
        }
    }
    return OMPI_SUCCESS;
}
|
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
/**
|
|
|
|
* Receiver has scheduled an RDMA operation:
|
|
|
|
* (1) Allocate an RDMA fragment to maintain the state of the operation
|
|
|
|
* (2) Call BTL prepare_src to pin/prepare source buffers
|
|
|
|
* (3) Queue the RDMA put
|
|
|
|
*/
|
|
|
|
|
2007-07-01 20:19:13 +04:00
|
|
|
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
|
|
|
mca_btl_base_module_t* btl,
|
|
|
|
mca_pml_ob1_rdma_hdr_t* hdr )
|
2006-07-20 18:44:35 +04:00
|
|
|
{
|
|
|
|
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
|
|
|
|
mca_pml_ob1_rdma_frag_t* frag;
|
|
|
|
int rc;
|
|
|
|
size_t i, size = 0;
|
2005-06-09 07:34:33 +04:00
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
|
2007-08-30 16:08:33 +04:00
|
|
|
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc);
|
|
|
|
|
2007-07-11 03:45:23 +04:00
|
|
|
if( OPAL_UNLIKELY(NULL == frag) ) {
|
2006-07-20 18:44:35 +04:00
|
|
|
/* TSW - FIX */
|
|
|
|
ORTE_ERROR_LOG(rc);
|
2008-02-28 04:57:57 +03:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* setup fragment */
|
2007-01-07 04:48:57 +03:00
|
|
|
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
|
|
|
|
frag->rdma_segs[i].seg_addr.lval = hdr->hdr_segs[i].seg_addr.lval;
|
|
|
|
frag->rdma_segs[i].seg_len = hdr->hdr_segs[i].seg_len;
|
|
|
|
frag->rdma_segs[i].seg_key.key64 = hdr->hdr_segs[i].seg_key.key64;
|
2007-08-29 01:23:44 +04:00
|
|
|
|
|
|
|
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
2008-04-18 00:43:56 +04:00
|
|
|
if ((sendreq->req_send.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
|
|
|
|
(ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
2007-08-29 01:23:44 +04:00
|
|
|
size += opal_swap_bytes4(frag->rdma_segs[i].seg_len);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
2007-08-29 17:28:47 +04:00
|
|
|
size += frag->rdma_segs[i].seg_len;
|
2007-08-29 01:23:44 +04:00
|
|
|
}
|
2006-07-20 18:44:35 +04:00
|
|
|
}
|
|
|
|
|
2007-05-09 16:11:51 +04:00
|
|
|
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
2006-07-20 18:44:35 +04:00
|
|
|
frag->rdma_hdr.hdr_rdma = *hdr;
|
|
|
|
frag->rdma_req = sendreq;
|
|
|
|
frag->rdma_ep = bml_endpoint;
|
|
|
|
frag->rdma_length = size;
|
|
|
|
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
|
2007-05-09 16:11:51 +04:00
|
|
|
frag->reg = NULL;
|
2007-06-03 12:31:58 +04:00
|
|
|
frag->retries = 0;
|
2007-05-09 16:11:51 +04:00
|
|
|
|
|
|
|
/* lookup the corresponding registration */
|
|
|
|
for(i=0; i<sendreq->req_rdma_cnt; i++) {
|
|
|
|
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
|
|
|
|
frag->reg = sendreq->req_rdma[i].btl_reg;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2006-07-20 18:44:35 +04:00
|
|
|
|
2007-05-03 13:13:17 +04:00
|
|
|
/* RDMA writes may proceed in parallel to send and to each other, so
|
|
|
|
* create clone of the convertor for each RDMA fragment
|
|
|
|
*/
|
2007-05-09 14:02:06 +04:00
|
|
|
size = hdr->hdr_rdma_offset;
|
2007-07-11 02:16:38 +04:00
|
|
|
ompi_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
|
2007-05-09 14:02:06 +04:00
|
|
|
&frag->convertor, 0, &size);
|
2007-05-03 13:13:17 +04:00
|
|
|
|
2006-07-20 18:44:35 +04:00
|
|
|
mca_pml_ob1_send_request_put_frag(frag);
|
|
|
|
}
|
2007-07-11 02:16:38 +04:00
|
|
|
|