From 274f8d608c6b498edafc2155066c22a3e10b600d Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Thu, 21 Jul 2016 11:06:45 +0200 Subject: [PATCH 1/4] coll-portals4: Change output format and change variable names (minor changes). --- ompi/mca/coll/portals4/coll_portals4_allreduce.c | 4 ++-- ompi/mca/coll/portals4/coll_portals4_bcast.c | 14 ++++++++------ ompi/mca/coll/portals4/coll_portals4_reduce.c | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ompi/mca/coll/portals4/coll_portals4_allreduce.c b/ompi/mca/coll/portals4/coll_portals4_allreduce.c index 57d681d9f6..61f140cd0f 100644 --- a/ompi/mca/coll/portals4/coll_portals4_allreduce.c +++ b/ompi/mca/coll/portals4/coll_portals4_allreduce.c @@ -265,7 +265,7 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count, ompi_coll_portals4_get_peer(comm, child[i]), mca_coll_portals4_component.pt_idx, match_bits_rtr, 0, NULL, 0)) != PTL_OK) - return opal_stderr("Put RTR failed", __FILE__, __LINE__, ret); + return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret); } } } @@ -408,7 +408,7 @@ int ompi_coll_portals4_iallreduce_intra(const void* sendbuf, void* recvbuf, int allreduce_kary_tree_top(sendbuf, recvbuf, count, dtype, op, comm, request, portals4_module); - puts("iallreduce"); + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "iallreduce"); return (OMPI_SUCCESS); } diff --git a/ompi/mca/coll/portals4/coll_portals4_bcast.c b/ompi/mca/coll/portals4/coll_portals4_bcast.c index fe0431d903..db23a21029 100644 --- a/ompi/mca/coll/portals4/coll_portals4_bcast.c +++ b/ompi/mca/coll/portals4/coll_portals4_bcast.c @@ -409,9 +409,9 @@ bcast_kary_tree_top(void *buff, int count, */ if (rank != root) { - ack_thr = segment_nb; + trig_thr = segment_nb; if (is_sync) { - if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, ack_thr, &ct)) != 0) { + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) { opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); } } @@ -421,7 +421,7 @@ bcast_kary_tree_top(void *buff, int count, mca_coll_portals4_component.finish_pt_idx, 0, 0, NULL, (uintptr_t) request, request->u.bcast.trig_ct_h, - ack_thr)) != 0) { + trig_thr)) != 0) { return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); } @@ -696,8 +696,9 @@ bcast_pipeline_top(void *buff, int count, */ if (rank != root) { + trig_thr = segment_nb; if (is_sync) { - if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, segment_nb, &ct)) != 0) { + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) { opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); } } @@ -707,7 +708,7 @@ bcast_pipeline_top(void *buff, int count, mca_coll_portals4_component.finish_pt_idx, 0, 0, NULL, (uintptr_t) request, request->u.bcast.trig_ct_h, - segment_nb)) != 0) { + trig_thr)) != 0) { return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); } } @@ -831,7 +832,7 @@ ompi_coll_portals4_ibcast_intra(void *buff, int count, return OMPI_ERROR; } - puts("ibcast"); + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra"); return (OMPI_SUCCESS); } @@ -860,5 +861,6 @@ ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request) ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra_fini"); return (OMPI_SUCCESS); } diff --git a/ompi/mca/coll/portals4/coll_portals4_reduce.c b/ompi/mca/coll/portals4/coll_portals4_reduce.c index 39928cfb85..92933ebeec 100644 --- a/ompi/mca/coll/portals4/coll_portals4_reduce.c +++ b/ompi/mca/coll/portals4/coll_portals4_reduce.c @@ -428,7 +428,7 @@ ompi_coll_portals4_ireduce_intra(const void* sendbuf, void* recvbuf, int count, OMPI_COLL_PORTALS4_REQUEST_RETURN(request); } - puts("ireduce"); + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ireduce"); return (OMPI_SUCCESS); } From df59d6cdd4407e57657de4036fcdc18b70fc65b4 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Thu, 21 Jul 2016 11:07:46 +0200 Subject: [PATCH 2/4] coll-portals4: Correct and simplify how the data are cut in segment_nb segments (bcast) --- ompi/mca/coll/portals4/coll_portals4.h | 2 +- ompi/mca/coll/portals4/coll_portals4_bcast.c | 50 ++++++++++++------- ompi/mca/coll/portals4/coll_portals4_reduce.c | 8 +-- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/ompi/mca/coll/portals4/coll_portals4.h b/ompi/mca/coll/portals4/coll_portals4.h index 9d3386db4d..6ab62f752e 100644 --- a/ompi/mca/coll/portals4/coll_portals4.h +++ b/ompi/mca/coll/portals4/coll_portals4.h @@ -314,7 +314,7 @@ is_reduce_optimizable(struct ompi_datatype_t *dtype, size_t length, struct ompi_ } *ptl_dtype = ompi_coll_portals4_atomic_datatype[dtype->id]; - if (*ptl_dtype == COLL_PORTALS4_NO_DTYPE){ + if (*ptl_dtype == COLL_PORTALS4_NO_DTYPE) { opal_output_verbose(50, ompi_coll_base_framework.framework_output, "datatype %d not supported\n", dtype->id); diff --git a/ompi/mca/coll/portals4/coll_portals4_bcast.c b/ompi/mca/coll/portals4/coll_portals4_bcast.c index db23a21029..b54e0e1e84 100644 --- a/ompi/mca/coll/portals4/coll_portals4_bcast.c +++ b/ompi/mca/coll/portals4/coll_portals4_bcast.c @@ -137,9 +137,9 @@ bcast_kary_tree_top(void *buff, int count, mca_coll_portals4_module_t *portals4_module) { bool is_sync = request->is_sync; - int ret, seg; - unsigned int i; - int segment_nb = request->u.bcast.segment_nb; + int ret; + unsigned int i, seg, seg_size, nb_long; + unsigned int segment_nb = request->u.bcast.segment_nb; unsigned int child_nb; int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); @@ -201,15 +201,22 @@ bcast_kary_tree_top(void *buff, int count, COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, COLL_PORTALS4_BCAST, 0, internal_count); + /* The data will be cut in segment_nb segments. + * nb_long segments will have a size of (seg_size + 1) + * and (segment_nb - nb_long) segments will have a size of seg_size + */ + seg_size = request->u.bcast.tmpsize / segment_nb; + nb_long = request->u.bcast.tmpsize % segment_nb; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d segment_nb=%d", seg_size, nb_long, segment_nb); + if (rank != root) { for (seg = 1, offset = 0, length = 0 ; seg <= segment_nb ; seg++, offset += length) { /* Divide buffer into segments */ - length = (seg < segment_nb) ? - (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : - request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + if (seg <= nb_long) length = seg_size + 1; + else length = seg_size; /* ** Prepare Data ME @@ -352,9 +359,10 @@ bcast_kary_tree_top(void *buff, int count, seg++, offset += length) { /* Divide buffer into segments */ - length = (seg < segment_nb) ? - (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : - request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + if (seg <= nb_long) length = seg_size + 1; + else length = seg_size; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "bcast with k-ary tree : segment of size %ld", length); /* compute the triggering threshold to send data to the children */ trig_thr = (rank == root) ? (segment_nb) : @@ -440,8 +448,9 @@ bcast_pipeline_top(void *buff, int count, mca_coll_portals4_module_t *portals4_module) { bool is_sync = request->is_sync; - int ret, seg; - int segment_nb = request->u.bcast.segment_nb; + int ret; + unsigned int seg, seg_size, nb_long; + unsigned int segment_nb = request->u.bcast.segment_nb; int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); ptl_rank_t parent, child; @@ -492,6 +501,13 @@ bcast_pipeline_top(void *buff, int count, COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, COLL_PORTALS4_BCAST, 0, internal_count); + /* The data will be cut in segment_nb segments. + * nb_long segments will have a size of (seg_size + 1) + * and (segment_nb - nb_long) segments will have a size of seg_size + */ + seg_size = request->u.bcast.tmpsize / segment_nb; + nb_long = request->u.bcast.tmpsize % segment_nb; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d", seg_size, nb_long); if (rank != root) { for (seg = 1, offset = 0, length = 0 ; @@ -499,9 +515,8 @@ bcast_pipeline_top(void *buff, int count, seg++, offset += length) { /* Divide buffer into segments */ - length = (seg < segment_nb) ? - (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : - request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + if (seg <= nb_long) length = seg_size + 1; + else length = seg_size; /* ** Prepare Data ME @@ -642,9 +657,10 @@ bcast_pipeline_top(void *buff, int count, seg++, offset += length) { /* Divide buffer into segments */ - length = (seg < segment_nb) ? - (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : - request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + if (seg <= nb_long) length = seg_size + 1; + else length = seg_size; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "bcast with pipeline : segment of size %ld \n", length); /* compute the triggering threshold to send data to the children */ trig_thr = (rank == root) ? (segment_nb) : diff --git a/ompi/mca/coll/portals4/coll_portals4_reduce.c b/ompi/mca/coll/portals4/coll_portals4_reduce.c index 92933ebeec..88a1464880 100644 --- a/ompi/mca/coll/portals4/coll_portals4_reduce.c +++ b/ompi/mca/coll/portals4/coll_portals4_reduce.c @@ -385,10 +385,10 @@ ompi_coll_portals4_reduce_intra(const void *sendbuf, void *recvbuf, int count, ret = reduce_kary_tree_top(sendbuf, recvbuf, count, dtype, op, root, comm, request, portals4_module); if (OMPI_SUCCESS != ret) - return ret; + return ret; ret = reduce_kary_tree_bottom(request); if (OMPI_SUCCESS != ret) - return ret; + return ret; OMPI_COLL_PORTALS4_REQUEST_RETURN(request); return (OMPI_SUCCESS); @@ -422,7 +422,7 @@ ompi_coll_portals4_ireduce_intra(const void* sendbuf, void* recvbuf, int count, ret = reduce_kary_tree_top(sendbuf, recvbuf, count, dtype, op, root, comm, request, portals4_module); if (OMPI_SUCCESS != ret) - return ret; + return ret; if (!request->u.reduce.is_optim) { OMPI_COLL_PORTALS4_REQUEST_RETURN(request); @@ -439,7 +439,7 @@ ompi_coll_portals4_ireduce_intra_fini(ompi_coll_portals4_request_t *request) ret = reduce_kary_tree_bottom(request); if (OMPI_SUCCESS != ret) - return ret; + return ret; OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_complete(&request->super, true); From 175e6aa38555d989f891792151d8383c1dfc3d24 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Thu, 21 Jul 2016 11:08:42 +0200 Subject: [PATCH 3/4] coll-portals4: Before calling PtlCTWait, call PtlTriggeredInc twice so be sure all pending PtlTriggredPut are triggered --- .../mca/coll/portals4/coll_portals4_barrier.c | 26 ++++++- ompi/mca/coll/portals4/coll_portals4_bcast.c | 76 ++++++++++++++++--- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/ompi/mca/coll/portals4/coll_portals4_barrier.c b/ompi/mca/coll/portals4/coll_portals4_barrier.c index 76b54fd923..9e18dd811a 100644 --- a/ompi/mca/coll/portals4/coll_portals4_barrier.c +++ b/ompi/mca/coll/portals4/coll_portals4_barrier.c @@ -147,9 +147,31 @@ barrier_hypercube_top(struct ompi_communicator_t *comm, } if (is_sync) { - /* Send a put to self when we've received all our messages... */ - ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs, &event); + /* Each process has a pending PtlTriggeredPut. To be sure this request will be triggered, we must + call PtlTriggeredCTInc twice. Otherwise, we could free the CT too early and the Put wouldn't be triggered */ + ptl_ct_event_t ct_inc; + + ct_inc.success = 1; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc, + request->u.barrier.rtr_ct_h, num_msgs)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc, + request->u.barrier.rtr_ct_h, num_msgs + 1)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs + 2, &event); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlCTWait failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } } else { /* Send a put to self when we've received all our messages... */ diff --git a/ompi/mca/coll/portals4/coll_portals4_bcast.c b/ompi/mca/coll/portals4/coll_portals4_bcast.c index b54e0e1e84..2042ae9eb6 100644 --- a/ompi/mca/coll/portals4/coll_portals4_bcast.c +++ b/ompi/mca/coll/portals4/coll_portals4_bcast.c @@ -361,12 +361,10 @@ bcast_kary_tree_top(void *buff, int count, /* Divide buffer into segments */ if (seg <= nb_long) length = seg_size + 1; else length = seg_size; - opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "bcast with k-ary tree : segment of size %ld", length); /* compute the triggering threshold to send data to the children */ - trig_thr = (rank == root) ? (segment_nb) : - (segment_nb + seg); + trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */ + if (rank != root) trig_thr ++; /* ** Send Data to children @@ -389,6 +387,17 @@ bcast_kary_tree_top(void *buff, int count, } } + if (rank == root) { + trig_thr = segment_nb; + ct_inc.success = segment_nb; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + } + ack_thr = child_nb; if (is_sync) { @@ -419,7 +428,26 @@ bcast_kary_tree_top(void *buff, int count, if (rank != root) { trig_thr = segment_nb; if (is_sync) { - if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) { + /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice. + Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered. + + This is necessary because portals4 does not insure the order in the triggered operations associated + with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */ + + ct_inc.success = 1; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) { opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); } } @@ -659,12 +687,10 @@ bcast_pipeline_top(void *buff, int count, /* Divide buffer into segments */ if (seg <= nb_long) length = seg_size + 1; else length = seg_size; - opal_output_verbose(10, ompi_coll_base_framework.framework_output, - "bcast with pipeline : segment of size %ld \n", length); /* compute the triggering threshold to send data to the children */ - trig_thr = (rank == root) ? (segment_nb) : - (segment_nb + seg); + trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */ + if (rank != root) trig_thr ++; /* ** Send Data to children @@ -684,6 +710,16 @@ bcast_pipeline_top(void *buff, int count, } } } + if (rank == root) { + trig_thr = segment_nb; + ct_inc.success = segment_nb; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + } if (is_sync) { if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, 1, &ct)) != 0) { @@ -713,8 +749,28 @@ bcast_pipeline_top(void *buff, int count, if (rank != root) { trig_thr = segment_nb; + if (is_sync) { - if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) { + /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice. + Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered. + + This is necessary because portals4 does not insure the order in the triggered operations associated + with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */ + + ct_inc.success = 1; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) { opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); } } From a7e3de6c4f107aef3d1d7fca25537209ad60456c Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Thu, 21 Jul 2016 13:40:25 +0200 Subject: [PATCH 4/4] coll-portals4: No more messages passed to Portals4 bigger than the limit given by PtlNIInit --- ompi/mca/coll/portals4/coll_portals4.h | 1 + ompi/mca/coll/portals4/coll_portals4_bcast.c | 20 ++- .../coll/portals4/coll_portals4_component.c | 16 +++ ompi/mca/coll/portals4/coll_portals4_gather.c | 125 ++++++++++++++---- .../mca/coll/portals4/coll_portals4_scatter.c | 95 ++++++++----- 5 files changed, 191 insertions(+), 66 deletions(-) diff --git a/ompi/mca/coll/portals4/coll_portals4.h b/ompi/mca/coll/portals4/coll_portals4.h index 6ab62f752e..3e898eab99 100644 --- a/ompi/mca/coll/portals4/coll_portals4.h +++ b/ompi/mca/coll/portals4/coll_portals4.h @@ -65,6 +65,7 @@ struct mca_coll_portals4_component_t { opal_free_list_t requests; /* request free list for the i collectives */ ptl_ni_limits_t ni_limits; + ptl_size_t portals_max_msg_size; int use_binomial_gather_algorithm; diff --git a/ompi/mca/coll/portals4/coll_portals4_bcast.c b/ompi/mca/coll/portals4/coll_portals4_bcast.c index 2042ae9eb6..3356bc05a4 100644 --- a/ompi/mca/coll/portals4/coll_portals4_bcast.c +++ b/ompi/mca/coll/portals4/coll_portals4_bcast.c @@ -89,12 +89,20 @@ static int prepare_bcast_data (struct ompi_communicator_t *comm, } /* Number of segments */ - request->u.bcast.segment_nb = (request->u.bcast.tmpsize > COLL_PORTALS4_MAX_BW) ? - (((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) < COLL_PORTALS4_MAX_SEGMENT ? - ((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) : - COLL_PORTALS4_MAX_SEGMENT) : + { + size_t max_msg_size = (COLL_PORTALS4_MAX_BW > mca_coll_portals4_component.ni_limits.max_msg_size) ? + mca_coll_portals4_component.ni_limits.max_msg_size : + COLL_PORTALS4_MAX_BW; + + //TODO : Either make compatible Portals size limits and COLL_PORTALS4_MAX_SEGMENT or remove COLL_PORTALS4_MAX_SEGMENT + request->u.bcast.segment_nb = (request->u.bcast.tmpsize > max_msg_size) ? + (((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) < COLL_PORTALS4_MAX_SEGMENT ? + ((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) : COLL_PORTALS4_MAX_SEGMENT) : 1; + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, + "seg_number=%d , seg_size_max=%lu", request->u.bcast.segment_nb, max_msg_size)); + } if (request->u.bcast.segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD) { request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO; } @@ -361,6 +369,8 @@ bcast_kary_tree_top(void *buff, int count, /* Divide buffer into segments */ if (seg <= nb_long) length = seg_size + 1; else length = seg_size; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "bcast with k-ary tree : segment of size %ld", length); /* compute the triggering threshold to send data to the children */ trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */ @@ -687,6 +697,8 @@ bcast_pipeline_top(void *buff, int count, /* Divide buffer into segments */ if (seg <= nb_long) length = seg_size + 1; else length = seg_size; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "bcast with pipeline : segment of size %ld \n", length); /* compute the triggering threshold to send data to the children */ trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */ diff --git a/ompi/mca/coll/portals4/coll_portals4_component.c b/ompi/mca/coll/portals4/coll_portals4_component.c index 4c3bb7be40..72149790fa 100644 --- a/ompi/mca/coll/portals4/coll_portals4_component.c +++ b/ompi/mca/coll/portals4/coll_portals4_component.c @@ -211,6 +211,16 @@ portals4_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_portals4_component.use_binomial_gather_algorithm); + mca_coll_portals4_component.portals_max_msg_size = PTL_SIZE_MAX; + (void) mca_base_component_var_register(&mca_coll_portals4_component.super.collm_version, + "max_msg_size", + "Max size supported by portals4 (above that, a message is cut into messages less than that size)", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG, + NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_portals4_component.portals_max_msg_size); + return OMPI_SUCCESS; } @@ -369,7 +379,13 @@ portals4_init_query(bool enable_progress_threads, __FILE__, __LINE__, ret); return OMPI_ERROR; } + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "ni_limits.max_atomic_size=%ld", mca_coll_portals4_component.ni_limits.max_atomic_size); + if (mca_coll_portals4_component.portals_max_msg_size < mca_coll_portals4_component.ni_limits.max_msg_size) + mca_coll_portals4_component.ni_limits.max_msg_size = mca_coll_portals4_component.portals_max_msg_size; + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "ni_limits.max_msg_size=%lu", mca_coll_portals4_component.ni_limits.max_msg_size); ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id); if (PTL_OK != ret) { diff --git a/ompi/mca/coll/portals4/coll_portals4_gather.c b/ompi/mca/coll/portals4/coll_portals4_gather.c index c2aca74493..9ac88f2b5f 100644 --- a/ompi/mca/coll/portals4/coll_portals4_gather.c +++ b/ompi/mca/coll/portals4/coll_portals4_gather.c @@ -21,6 +21,7 @@ #include "coll_portals4.h" #include "coll_portals4_request.h" +#include // included for ffs in get_tree_numdescendants_of #undef RTR_USES_TRIGGERED_PUT @@ -55,6 +56,22 @@ * | * 15 */ + +static int32_t get_tree_numdescendants_of(struct ompi_communicator_t* comm, + int vrank) +{ + int max; + int size = ompi_comm_size(comm); + + if (0 == vrank) { + return size - 1; + } else { + max = 1 << ffs(vrank - 1); + return ((vrank + max <= size ) ? max : size - vrank) -1; + } + +} + static ompi_coll_portals4_tree_t* ompi_coll_portals4_build_in_order_bmtree( struct ompi_communicator_t* comm, int root ) @@ -506,8 +523,10 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc int32_t expected_ops =0; int32_t expected_acks=0; + ptl_size_t number_of_fragment_gathered = 0; + ptl_size_t number_of_fragment_send = 1; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_binomial_top enter rank %d", request->u.gather.my_rank)); request->type = OMPI_COLL_PORTALS4_TYPE_GATHER; @@ -579,6 +598,23 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc ret = setup_sync_handles(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, + "%s:%d: packed_size=%lu, fragment_size=%lu", + __FILE__, __LINE__, request->u.gather.packed_size, mca_coll_portals4_component.ni_limits.max_msg_size)); + + for (int i =0; i < bmtree->tree_nextsize; i++) { + int child_vrank = VRANK(bmtree->tree_next[i], request->u.gather.root_rank, request->u.gather.size); + int sub_tree_size = get_tree_numdescendants_of(comm, child_vrank) + 1; + ptl_size_t local_number_of_fragment = ((sub_tree_size * request->u.gather.packed_size) + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size; + + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, + "%s:%d: %d is child of %d(%d) with %d descendants (nb_frag += %lu)", + __FILE__, __LINE__, bmtree->tree_next[i], vrank, request->u.gather.root_rank , sub_tree_size, local_number_of_fragment)); + number_of_fragment_gathered += local_number_of_fragment; + } + + number_of_fragment_send = (request->u.gather.gather_bytes + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size; + /***********************************************/ /* Chain the RTR and Recv-ACK to the Gather CT */ /***********************************************/ @@ -603,7 +639,7 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc if (vrank == 0) { /* root, so do nothing */ - expected_ops=bmtree->tree_nextsize; /* gather put from each child */ + expected_ops=number_of_fragment_gathered ; /* gather put from each child */ expected_acks=0; } else { @@ -617,22 +653,32 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc __FILE__, __LINE__, vrank, remote_offset, vrank, vparent, request->u.gather.packed_size); - expected_ops=bmtree->tree_nextsize + 1; /* gather put from each child + a chained RTR */ + expected_ops=number_of_fragment_gathered + 1; /* gather puts from each child + a chained RTR */ expected_acks=1; /* Recv-ACK from parent */ - ret = PtlTriggeredPut(request->u.gather.gather_mdh, - request->u.gather.gather_offset, - request->u.gather.gather_bytes, + ptl_size_t size_sent = 0; + ptl_size_t size_left = request->u.gather.gather_bytes; + + for (ptl_size_t i = 0 ; i < number_of_fragment_send; i++) { + ptl_size_t frag_size = (size_left > mca_coll_portals4_component.ni_limits.max_msg_size) ? + mca_coll_portals4_component.ni_limits.max_msg_size: + size_left; + ret = PtlTriggeredPut(request->u.gather.gather_mdh, + request->u.gather.gather_offset + size_sent, + frag_size, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, parent), mca_coll_portals4_component.pt_idx, request->u.gather.gather_match_bits, - remote_offset, + remote_offset + size_sent, NULL, 0, request->u.gather.gather_cth, expected_ops); - if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + size_left -= frag_size; + size_sent += frag_size; + } } /************************************/ @@ -734,7 +780,7 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc ompi_coll_portals4_destroy_tree(&(portals4_module->cached_in_order_bmtree)); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_binomial_top exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -773,8 +819,9 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct int32_t expected_ops =0; int32_t expected_acks=0; + ptl_size_t number_of_fragment = 1; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_linear_top enter rank %d", request->u.gather.my_rank)); request->type = OMPI_COLL_PORTALS4_TYPE_GATHER; @@ -843,6 +890,13 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct ret = setup_sync_handles(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } + number_of_fragment = (request->u.gather.packed_size > mca_coll_portals4_component.ni_limits.max_msg_size) ? + (request->u.gather.packed_size + mca_coll_portals4_component.ni_limits.max_msg_size - 1) / mca_coll_portals4_component.ni_limits.max_msg_size : + 1; + opal_output_verbose(90, ompi_coll_base_framework.framework_output, + "%s:%d:rank %d:number_of_fragment = %lu", + __FILE__, __LINE__, request->u.gather.my_rank, number_of_fragment); + /***********************************************/ /* Chain the RTR and Recv-ACK to the Gather CT */ /***********************************************/ @@ -867,11 +921,13 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct if (i_am_root) { /* root, so do nothing */ - expected_ops=request->u.gather.size-1; /* gather put from all other ranks */ + expected_ops=(request->u.gather.size-1) * number_of_fragment; /* gather put from all other ranks */ expected_acks=0; } else { ptl_size_t remote_offset=request->u.gather.my_rank * request->u.gather.packed_size; + ptl_size_t split_offset = 0; + ptl_size_t size_left = request->u.gather.gather_bytes; opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): remote_offset(%lu)=rank(%d) * packed_size(%ld)", @@ -881,19 +937,34 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct expected_ops=1; /* chained RTR */ expected_acks=1; /* Recv-ACK from root */ - ret = PtlTriggeredPut(request->u.gather.gather_mdh, - request->u.gather.gather_offset, - request->u.gather.gather_bytes, + for (ptl_size_t j=0; j mca_coll_portals4_component.ni_limits.max_msg_size) ? + mca_coll_portals4_component.ni_limits.max_msg_size : + size_left; + + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "%s:%d:rank(%d): frag(%lu),offset_frag (%lu) frag_size(%lu)", + __FILE__, __LINE__, request->u.gather.my_rank, + j, split_offset, frag_size); + + ret = PtlTriggeredPut(request->u.gather.gather_mdh, + request->u.gather.gather_offset + split_offset, + frag_size, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, request->u.gather.root_rank), mca_coll_portals4_component.pt_idx, request->u.gather.gather_match_bits, - remote_offset, + remote_offset + split_offset, NULL, 0, request->u.gather.gather_cth, expected_ops); - if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + + size_left -= frag_size; + split_offset += frag_size; + } } /*****************************************/ @@ -997,7 +1068,7 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct "completed CTWait(expected_ops=%d)\n", expected_ops); } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_linear_top exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -1020,7 +1091,7 @@ ompi_coll_portals4_gather_intra_binomial_bottom(struct ompi_communicator_t *comm int ret, line; int i; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_binomial_bottom enter rank %d", request->u.gather.my_rank)); ret = cleanup_gather_handles(request); @@ -1065,7 +1136,7 @@ ompi_coll_portals4_gather_intra_binomial_bottom(struct ompi_communicator_t *comm ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_binomial_bottom exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -1090,7 +1161,7 @@ ompi_coll_portals4_gather_intra_linear_bottom(struct ompi_communicator_t *comm, int ret, line; int i; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_linear_bottom enter rank %d", request->u.gather.my_rank)); ret = cleanup_gather_handles(request); @@ -1128,7 +1199,7 @@ ompi_coll_portals4_gather_intra_linear_bottom(struct ompi_communicator_t *comm, ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra_linear_bottom exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -1157,7 +1228,7 @@ ompi_coll_portals4_gather_intra(const void *sbuf, int scount, struct ompi_dataty ompi_coll_portals4_request_t *request; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra enter rank %d", ompi_comm_rank(comm))); /* @@ -1204,7 +1275,7 @@ ompi_coll_portals4_gather_intra(const void *sbuf, int scount, struct ompi_dataty */ OMPI_COLL_PORTALS4_REQUEST_RETURN(request); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:gather_intra exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -1230,7 +1301,7 @@ ompi_coll_portals4_igather_intra(const void *sbuf, int scount, struct ompi_datat ompi_coll_portals4_request_t *request; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:igather_intra enter rank %d", ompi_comm_rank(comm))); /* @@ -1267,7 +1338,7 @@ ompi_coll_portals4_igather_intra(const void *sbuf, int scount, struct ompi_datat if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:igather_intra exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; @@ -1286,7 +1357,7 @@ ompi_coll_portals4_igather_intra_fini(ompi_coll_portals4_request_t *request) { int ret, line; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:igather_intra_fini enter rank %d", request->u.gather.my_rank)); /* @@ -1300,7 +1371,7 @@ ompi_coll_portals4_igather_intra_fini(ompi_coll_portals4_request_t *request) if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:igather_intra_fini exit rank %d", request->u.gather.my_rank)); return OMPI_SUCCESS; diff --git a/ompi/mca/coll/portals4/coll_portals4_scatter.c b/ompi/mca/coll/portals4/coll_portals4_scatter.c index c1ec41dd84..fd1d134690 100644 --- a/ompi/mca/coll/portals4/coll_portals4_scatter.c +++ b/ompi/mca/coll/portals4/coll_portals4_scatter.c @@ -127,7 +127,7 @@ setup_scatter_handles(struct ompi_communicator_t *comm, ptl_me_t me; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_scatter_handles enter rank %d", request->u.scatter.my_rank)); /**********************************/ @@ -136,7 +136,7 @@ setup_scatter_handles(struct ompi_communicator_t *comm, COLL_PORTALS4_SET_BITS(request->u.scatter.scatter_match_bits, ompi_comm_get_cid(comm), 0, 0, COLL_PORTALS4_SCATTER, 0, request->u.scatter.coll_count); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_scatter_handles rank(%d) scatter_match_bits(0x%016lX)", request->u.scatter.my_rank, request->u.scatter.scatter_match_bits)); @@ -166,7 +166,7 @@ setup_scatter_handles(struct ompi_communicator_t *comm, &request->u.scatter.scatter_meh); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_scatter_handles exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -188,7 +188,7 @@ setup_sync_handles(struct ompi_communicator_t *comm, ptl_me_t me; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_sync_handles enter rank %d", request->u.scatter.my_rank)); /**********************************/ @@ -197,7 +197,7 @@ setup_sync_handles(struct ompi_communicator_t *comm, COLL_PORTALS4_SET_BITS(request->u.scatter.sync_match_bits, ompi_comm_get_cid(comm), 0, 1, COLL_PORTALS4_SCATTER, 0, request->u.scatter.coll_count); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_sync_handles rank(%d) sync_match_bits(0x%016lX)", request->u.scatter.my_rank, request->u.scatter.sync_match_bits)); @@ -227,7 +227,7 @@ setup_sync_handles(struct ompi_communicator_t *comm, &request->u.scatter.sync_meh); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:setup_sync_handles exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -245,7 +245,7 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request) { int ret, line; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:cleanup_scatter_handles enter rank %d", request->u.scatter.my_rank)); /**********************************/ @@ -265,7 +265,7 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request) ret = PtlCTFree(request->u.scatter.scatter_cth); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:cleanup_scatter_handles exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -284,7 +284,7 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request) int ret, line; int ptl_ret; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:cleanup_sync_handles enter rank %d", request->u.scatter.my_rank)); /**********************************/ @@ -304,7 +304,7 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request) ret = PtlCTFree(request->u.scatter.sync_cth); if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:cleanup_sync_handles exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -341,8 +341,9 @@ ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct int32_t expected_chained_rtrs = 0; int32_t expected_chained_acks = 0; + ptl_size_t number_of_fragment = 1; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_top enter rank %d", request->u.scatter.my_rank)); request->type = OMPI_COLL_PORTALS4_TYPE_SCATTER; @@ -409,6 +410,13 @@ ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct ret = setup_sync_handles(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } + number_of_fragment = (request->u.scatter.packed_size > mca_coll_portals4_component.ni_limits.max_msg_size) ? + (request->u.scatter.packed_size + mca_coll_portals4_component.ni_limits.max_msg_size - 1) / mca_coll_portals4_component.ni_limits.max_msg_size : + 1; + opal_output_verbose(90, ompi_coll_base_framework.framework_output, + "%s:%d:rank %d:number_of_fragment = %lu", + __FILE__, __LINE__, request->u.scatter.my_rank, number_of_fragment); + /**********************************/ /* do the scatter */ /**********************************/ @@ -445,25 +453,42 @@ ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct } ptl_size_t offset = request->u.scatter.packed_size * i; + ptl_size_t size_sent = 0; + ptl_size_t size_left = request->u.scatter.packed_size; - opal_output_verbose(30, ompi_coll_base_framework.framework_output, + opal_output_verbose(10, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): offset(%lu)=rank(%d) * packed_size(%ld)", __FILE__, __LINE__, request->u.scatter.my_rank, offset, i, request->u.scatter.packed_size); - ret = PtlTriggeredPut(request->u.scatter.scatter_mdh, - (ptl_size_t)request->u.scatter.scatter_buf + offset, - request->u.scatter.packed_size, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, i), - mca_coll_portals4_component.pt_idx, - request->u.scatter.scatter_match_bits, - 0, - NULL, - 0, - request->u.scatter.scatter_cth, - expected_chained_rtrs); - if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + for (ptl_size_t j=0; j mca_coll_portals4_component.ni_limits.max_msg_size) ? + mca_coll_portals4_component.ni_limits.max_msg_size : + size_left; + + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, + "%s:%d:rank(%d): frag(%lu),offset_frag (%lu) frag_size(%lu)", + __FILE__, __LINE__, request->u.scatter.my_rank, + j, size_sent, frag_size)); + + ret = PtlTriggeredPut(request->u.scatter.scatter_mdh, + (ptl_size_t)request->u.scatter.scatter_buf + offset + size_sent, + frag_size, + PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, i), + mca_coll_portals4_component.pt_idx, + request->u.scatter.scatter_match_bits, + size_sent, + NULL, + 0, + request->u.scatter.scatter_cth, + expected_chained_rtrs); + if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } + + size_left -= frag_size; + size_sent += frag_size; + } } } else { /* non-root, so do nothing */ @@ -473,7 +498,7 @@ ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct expected_acks = 0; /* operations on the scatter counter */ - expected_puts = 1; /* scatter put from root */ + expected_puts = number_of_fragment; /* scatter put from root */ expected_chained_rtrs = 0; expected_chained_acks = 0; } @@ -552,7 +577,7 @@ ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct "completed CTWait(expected_ops=%d)\n", expected_ops); } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_top exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -574,7 +599,7 @@ ompi_coll_portals4_scatter_intra_linear_bottom(struct ompi_communicator_t *comm, { int ret, line; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_bottom enter rank %d", request->u.scatter.my_rank)); ret = cleanup_scatter_handles(request); @@ -616,7 +641,7 @@ ompi_coll_portals4_scatter_intra_linear_bottom(struct ompi_communicator_t *comm, ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_bottom exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -645,7 +670,7 @@ ompi_coll_portals4_scatter_intra(const void *sbuf, int scount, struct ompi_datat ompi_coll_portals4_request_t *request; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra enter rank %d", ompi_comm_rank(comm))); /* @@ -679,7 +704,7 @@ ompi_coll_portals4_scatter_intra(const void *sbuf, int scount, struct ompi_datat */ OMPI_COLL_PORTALS4_REQUEST_RETURN(request); - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -705,7 +730,7 @@ ompi_coll_portals4_iscatter_intra(const void *sbuf, int scount, struct ompi_data ompi_coll_portals4_request_t *request; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:iscatter_intra enter rank %d", ompi_comm_rank(comm))); /* @@ -732,7 +757,7 @@ ompi_coll_portals4_iscatter_intra(const void *sbuf, int scount, struct ompi_data module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:iscatter_intra exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; @@ -751,7 +776,7 @@ ompi_coll_portals4_iscatter_intra_fini(ompi_coll_portals4_request_t *request) { int ret, line; - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:iscatter_intra_fini enter rank %d", request->u.scatter.my_rank)); /* @@ -760,7 +785,7 @@ ompi_coll_portals4_iscatter_intra_fini(ompi_coll_portals4_request_t *request) ret = ompi_coll_portals4_scatter_intra_linear_bottom(request->super.req_mpi_object.comm, request); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } - OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "coll:portals4:iscatter_intra_fini exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS;