From a274d2795c796da1ad11eeb1a3a7c41ad4ceaef8 Mon Sep 17 00:00:00 2001
From: Todd Kordenbrock
Date: Tue, 7 Apr 2015 15:42:11 -0500
Subject: [PATCH] coll-portals4: implement collective operations using Portals4 triggered operations

This commit implements the reduce, allreduce, barrier and bcast collective operations using Portals4 triggered operations.
---
 ompi/mca/coll/portals4/Makefile.am            |   6 +-
 ompi/mca/coll/portals4/coll_portals4.h        | 347 +++++--
 .../coll/portals4/coll_portals4_allreduce.c   | 424 +++++++++
 .../mca/coll/portals4/coll_portals4_barrier.c | 372 ++++----
 ompi/mca/coll/portals4/coll_portals4_bcast.c  | 864 ++++++++++++++++++
 .../coll/portals4/coll_portals4_component.c   | 601 ++++++++----
 ompi/mca/coll/portals4/coll_portals4_reduce.c | 436 +++++++++
 .../mca/coll/portals4/coll_portals4_request.c |  11 +-
 .../mca/coll/portals4/coll_portals4_request.h |  99 +-
 9 files changed, 2657 insertions(+), 503 deletions(-)
 create mode 100644 ompi/mca/coll/portals4/coll_portals4_allreduce.c
 create mode 100644 ompi/mca/coll/portals4/coll_portals4_bcast.c
 create mode 100644 ompi/mca/coll/portals4/coll_portals4_reduce.c

diff --git a/ompi/mca/coll/portals4/Makefile.am b/ompi/mca/coll/portals4/Makefile.am
index 874ce8ff65..2434bfee74 100644
--- a/ompi/mca/coll/portals4/Makefile.am
+++ b/ompi/mca/coll/portals4/Makefile.am
@@ -1,5 +1,6 @@
 #
-# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
+# Copyright (c) 2013-2015 Sandia National Laboratories. All rights reserved.
+# Copyright (c) 2015 Bull SAS. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -9,8 +10,11 @@
 local_sources = \
        coll_portals4.h \
+       coll_portals4_allreduce.c \
        coll_portals4_component.c \
        coll_portals4_barrier.c \
+       coll_portals4_bcast.c \
+       coll_portals4_reduce.c \
        coll_portals4_request.h \
        coll_portals4_request.c
diff --git a/ompi/mca/coll/portals4/coll_portals4.h b/ompi/mca/coll/portals4/coll_portals4.h
index ff81346dd2..e4808dc121 100644
--- a/ompi/mca/coll/portals4/coll_portals4.h
+++ b/ompi/mca/coll/portals4/coll_portals4.h
@@ -1,12 +1,12 @@
-/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
+ * Copyright (c) 2013-2015 Sandia National Laboratories. All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
  *                    reserved.
+ * Copyright (c) 2015 Bull SAS. All rights reserved.
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -16,16 +16,24 @@ #include "ompi_config.h" #include - #include "mpi.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/op/op.h" #include "ompi/mca/mca.h" #include "ompi/mca/coll/coll.h" #include "ompi/request/request.h" #include "ompi/communicator/communicator.h" -#include "ompi/mca/mtl/portals4/mtl_portals4_endpoint.h" +#include "ompi/mca/coll/base/base.h" BEGIN_C_DECLS +#define COLL_PORTALS4_NO_OP ((ptl_op_t)-1) +extern ptl_op_t ompi_coll_portals4_atomic_op[]; + +#define COLL_PORTALS4_NO_DTYPE ((ptl_datatype_t)-1) +extern ptl_datatype_t ompi_coll_portals4_atomic_datatype[]; struct mca_coll_portals4_component_t { mca_coll_base_component_t super; @@ -33,111 +41,322 @@ struct mca_coll_portals4_component_t { /** Network interface handle for matched interface */ ptl_handle_ni_t ni_h; ptl_uid_t uid; + ptl_process_t id; ptl_pt_index_t pt_idx; ptl_pt_index_t finish_pt_idx; ptl_handle_eq_t eq_h; - ptl_handle_me_t barrier_unex_me_h; + ptl_handle_me_t unex_me_h; ptl_handle_me_t finish_me_h; - /** Send MD handle(s). Use ompi_coll_portals4_get_md() to get the right md */ -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - ptl_handle_md_t *md_hs; -#else - ptl_handle_md_t md_h; -#endif - + bool ev_link; + opal_mutex_t lock; + opal_condition_t cond; + int nb_links; + ptl_handle_md_t zero_md_h; + ptl_handle_md_t data_md_h; opal_free_list_t requests; /* request free list for the i collectives */ + + ptl_ni_limits_t ni_limits; + }; typedef struct mca_coll_portals4_component_t mca_coll_portals4_component_t; OMPI_MODULE_DECLSPEC extern mca_coll_portals4_component_t mca_coll_portals4_component; struct mca_coll_portals4_module_t { mca_coll_base_module_t super; + size_t coll_count; - size_t barrier_count; + /* record handlers dedicated to fallback if offloaded operations are not supported */ + mca_coll_base_module_reduce_fn_t previous_reduce; + mca_coll_base_module_t *previous_reduce_module; + mca_coll_base_module_ireduce_fn_t previous_ireduce; + mca_coll_base_module_t *previous_ireduce_module; + + mca_coll_base_module_allreduce_fn_t previous_allreduce; + mca_coll_base_module_t *previous_allreduce_module; + mca_coll_base_module_iallreduce_fn_t previous_iallreduce; + mca_coll_base_module_t *previous_iallreduce_module; }; typedef struct mca_coll_portals4_module_t mca_coll_portals4_module_t; OBJ_CLASS_DECLARATION(mca_coll_portals4_module_t); struct ompi_coll_portals4_request_t; +#define COLL_PORTALS4_MAX_BW 4096 +#define COLL_PORTALS4_MAX_SEGMENT 32 + + /* match/ignore bit manipulation * - * 01234567 0123 4 567 012 34567 01234567 01234567 01234567 01234567 01234567 - * | | | - * context id |^| type | op count - * ||| | - * +- eager switch -*/ + * 01234567 01234567 012 3 4 567 012 3 4567 01234567 01234567 01234567 01234567 + | | | | | + * context id |^|^| type | int | op count + * ||||| | | + * |||+--------------- is a RTR message + * |+----------------- is a data ACK message + */ -#define COLL_PORTALS4_CID_MASK 0xFFF0000000000000ULL -#define COLL_PORTALS4_OP_COUNT_MASK 0x00001FFFFFFFFFFFULL +#define COLL_PORTALS4_CID_MASK 0xFFE0000000000000ULL +#define COLL_PORTALS4_ACK_MASK 0x0010000000000000ULL +#define COLL_PORTALS4_RTR_MASK 0x0008000000000000ULL +#define COLL_PORTALS4_TYPE_MASK 0x0007E00000000000ULL +#define COLL_PORTALS4_INTERNAL_MASK 0x00001F0000000000ULL +#define COLL_PORTALS4_OP_COUNT_MASK 0x000000FFFFFFFFFFULL -#define 
COLL_PORTALS4_BARRIER 0x01 +#define COLL_PORTALS4_BARRIER 0x01 +#define COLL_PORTALS4_BCAST 0x02 +#define COLL_PORTALS4_SCATTER 0x03 +#define COLL_PORTALS4_GATHER 0x04 +#define COLL_PORTALS4_REDUCE 0x05 +#define COLL_PORTALS4_ALLREDUCE 0x06 -#define COLL_PORTALS4_SET_BITS(match_bits, contextid, eager, type, op_count) \ - { \ - match_bits = contextid; \ - match_bits = (match_bits << 1); \ - match_bits |= (eager & 0x1); \ - match_bits = (match_bits << 6); \ - match_bits |= (type & 0x3F); \ - match_bits = (match_bits << 45); \ - match_bits |= (op_count & 0x1FFFFFFFFFFF); \ - } +#define PTL_INVALID_RANK ((ptl_rank_t)-1) +#define PTL_FIRST_RANK ((ptl_rank_t)0) + +#define COLL_PORTALS4_SET_BITS(match_bits, contextid, ack, rtr, type, internal, op_count) \ +{ \ + match_bits = contextid; \ + match_bits = (match_bits << 1); \ + match_bits |= (ack & 0x1); \ + match_bits = (match_bits << 1); \ + match_bits |= (rtr & 0x1); \ + match_bits = (match_bits << 6); \ + match_bits |= (type & 0x3F); \ + match_bits = (match_bits << 5); \ + match_bits |= (internal & 0x1F); \ + match_bits = (match_bits << 40); \ + match_bits |= (op_count & 0xFFFFFFFFFF); \ +} + +int +opal_stderr(const char *msg, const char *file, + const int line, const int ret); int ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -int ompi_coll_portals4_ibarrier_intra(struct ompi_communicator_t *comm, - ompi_request_t ** request, - mca_coll_base_module_t *module); + mca_coll_base_module_t *module); +int ompi_coll_portals4_ibarrier_intra(struct ompi_communicator_t *comm, + ompi_request_t ** request, + mca_coll_base_module_t *module); int ompi_coll_portals4_ibarrier_intra_fini(struct ompi_coll_portals4_request_t *request); +int ompi_coll_portals4_bcast_intra(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm,mca_coll_base_module_t *module); +int ompi_coll_portals4_ibcast_intra(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + ompi_request_t **request, + mca_coll_base_module_t *module); +int ompi_coll_portals4_ibcast_intra_fini(struct ompi_coll_portals4_request_t *request); + +int ompi_coll_portals4_reduce_intra(void *sbuf, void *rbuf, int count, + MPI_Datatype dtype, MPI_Op op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); +int ompi_coll_portals4_ireduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dype, MPI_Op op, + int root, + struct ompi_communicator_t *comm, + ompi_request_t ** ompi_request, + struct mca_coll_base_module_2_1_0_t *module); +int ompi_coll_portals4_ireduce_intra_fini(struct ompi_coll_portals4_request_t *request); + +int ompi_coll_portals4_allreduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + struct ompi_communicator_t *comm, + struct mca_coll_base_module_2_1_0_t *module); +int ompi_coll_portals4_iallreduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + struct ompi_communicator_t *comm, + ompi_request_t ** ompi_request, + struct mca_coll_base_module_2_1_0_t *module); +int +ompi_coll_portals4_iallreduce_intra_fini(struct ompi_coll_portals4_request_t *request); static inline ptl_process_t ompi_coll_portals4_get_peer(struct ompi_communicator_t *comm, int rank) { ompi_proc_t *proc = ompi_comm_peer_lookup(comm, rank); + if (proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] == NULL) { + printf("ompi_coll_portals4_get_peer failure\n"); + } return 
*((ptl_process_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]); } + +static inline bool +is_reduce_optimizable(struct ompi_datatype_t *dtype, size_t length, struct ompi_op_t *op, + ptl_datatype_t *ptl_dtype, ptl_op_t *ptl_op) { + + /* first check the type of operation and + * map it to the corresponding portals4 one */ + + if (!(op->o_flags & OMPI_OP_FLAGS_COMMUTE)) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "atomic op %d is not commutative, deactivate the optimization\n", + op->op_type); + return false; + } + + if (!(op->o_flags & OMPI_OP_FLAGS_ASSOC)) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "atomic op %d is not float associative, deactivate the optimization\n", + op->op_type); + return false; + } + + if (op->op_type >= OMPI_OP_NUM_OF_TYPES) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "unknown atomic op %d\n", + op->op_type); + return false; + } + + *ptl_op = ompi_coll_portals4_atomic_op[op->op_type]; + if (*ptl_op == COLL_PORTALS4_NO_OP) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "unsupported atomic op %d\n", + op->op_type); + return false; + } + + /* then check the data type and map it + * to the corresponding portals4 one */ + + if (!ompi_datatype_is_valid(dtype)) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "not a valid datatype %d\n", + dtype->id); + return false; + } + + if (dtype->id >= OMPI_DATATYPE_MPI_MAX_PREDEFINED) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "not a valid datatype %d\n", + dtype->id); + return false; + } + + if (length > mca_coll_portals4_component.ni_limits.max_atomic_size) { + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "length (%ld) > ni.max_atomic_size (%ld)\n", + length, mca_coll_portals4_component.ni_limits.max_atomic_size); + return false; + } + + *ptl_dtype = ompi_coll_portals4_atomic_datatype[dtype->id]; + if (*ptl_dtype == COLL_PORTALS4_NO_DTYPE){ + opal_output_verbose(50, ompi_coll_base_framework.framework_output, + "datatype %d not supported\n", + dtype->id); + return false; + } + + return true; +} + + static inline int -ompi_coll_portals4_get_nchildren(int cube_dim, int hibit, int rank, int size) +get_nchildren(int cube_dim, int hibit, int rank, int size) { int guess = cube_dim - (hibit + 1); - if ((rank | (1 << (cube_dim - 1))) >= size) guess--; - if (guess < 0) return 0; + + if ((rank | (1 << (cube_dim - 1))) >= size) { + guess--; + } + if (guess < 0) { + return 0; + } + return guess; } -/* - * See note in mtl/portals4/mtl_portals4.h for why this exists. - */ -static inline void -ompi_coll_portals4_get_md(const void *ptr, ptl_handle_md_t *md_h, void **base_ptr) +static inline +void get_pipeline(ptl_rank_t rank, ptl_rank_t np, ptl_rank_t root, + ptl_rank_t *prev, ptl_rank_t *next) { -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - int mask = (1ULL << (OPAL_PORTALS4_MAX_VA_SIZE - OPAL_PORTALS4_MAX_MD_SIZE + 1)) - 1; - int which = (((uintptr_t) ptr) >> (OPAL_PORTALS4_MAX_MD_SIZE - 1)) & mask; - *md_h = mca_coll_portals4_component.md_hs[which]; - *base_ptr = (void*) (which * (1ULL << (OPAL_PORTALS4_MAX_MD_SIZE - 1))); -#else - *md_h = mca_coll_portals4_component.md_h; - *base_ptr = 0; -#endif + *prev = (rank == root) ? + PTL_INVALID_RANK: + ((rank == PTL_FIRST_RANK) ? (np - 1) : (rank - 1)); + *next = (rank == (np - 1)) ? + ((root == PTL_FIRST_RANK) ? PTL_INVALID_RANK : PTL_FIRST_RANK): + ((rank == (root - 1)) ? 
PTL_INVALID_RANK : (rank + 1)); + return; } +#define div(a,b) (((a)+(b)-1) / (b)) +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#define min_zero(a) (((a) < 0) ? 0 : (a)) -static inline int -ompi_coll_portals4_get_num_mds(void) -{ -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - return (1 << (OPAL_PORTALS4_MAX_VA_SIZE - OPAL_PORTALS4_MAX_MD_SIZE + 1)); -#else - return 1; -#endif +static inline +void get_k_ary_tree(const unsigned int k_ary, + ptl_rank_t rank, ptl_rank_t np, ptl_rank_t root, + ptl_rank_t *father, ptl_rank_t *children, unsigned int *child_nb) { + + bool should_continue = true; + unsigned int cnt; + ptl_rank_t first, last, dist, up, my; + + if ((!father) || + (!children) || + (!child_nb)) { + return; + } + + /* initialization and checks */ + *father = PTL_INVALID_RANK; + *child_nb = 0; + + if (!k_ary) { + return; + } + + for (cnt = 0 ; cnt < k_ary ; cnt++) { + children[cnt] = PTL_INVALID_RANK; + } + + if ((np <= 0) || + (rank < 0) || + (rank >= np) || + (root < 0 ) || + (root >= np)) { + return; + } + + my = (np + rank - root) % np; + + /* start the loop */ + up = PTL_INVALID_RANK; + first = PTL_FIRST_RANK; + last = np - 1; + + while (should_continue) { + if (my == first) { + first++; + dist = div(last - first + 1, k_ary); + should_continue = false; + } + else { + up = first; + first++; + dist = div(last - first + 1, k_ary); + while (my >= (first + dist)) { + first += dist; + } + last = min(first + dist - 1, last); + } + } + *father = (up == PTL_INVALID_RANK) ? PTL_INVALID_RANK : ((up + root) % np); + *child_nb = min(k_ary, min_zero(last - first + 1)); + + for (cnt = 0 ; cnt < *child_nb ; cnt++) { + children[cnt] = (root + + first + cnt * dist) % np; + } + + return; } - END_C_DECLS #endif /* MCA_COLL_PORTALS4_EXPORT_H */ diff --git a/ompi/mca/coll/portals4/coll_portals4_allreduce.c b/ompi/mca/coll/portals4/coll_portals4_allreduce.c new file mode 100644 index 0000000000..3cbe9cbedd --- /dev/null +++ b/ompi/mca/coll/portals4/coll_portals4_allreduce.c @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2015 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_portals4.h" +#include "coll_portals4_request.h" + +#include + +#include "mpi.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/op/op.h" +#include "opal/util/bit_ops.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + + +#define COLL_PORTALS4_ALLREDUCE_MAX_SEGMENT 128 +#define COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN 2 + +static int +allreduce_kary_tree_top(void *sendbuf, void *recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + struct ompi_communicator_t *comm, + ompi_coll_portals4_request_t *request, + mca_coll_portals4_module_t *module) +{ + bool is_sync = request->is_sync; + int ret, i; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + ptl_rank_t parent, child[COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN]; + unsigned int child_nb; + size_t internal_count, length; + ptl_handle_md_t zero_md_h, data_md_h; + ptl_handle_me_t me_h; + ptl_me_t me; + ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits; + ptl_ct_event_t ct; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dtype; + + request->type = OMPI_COLL_PORTALS4_TYPE_ALLREDUCE; + + /* + ** Initialization + */ + + for (i = 0 ; i < COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN ; i++) { + child[i] = PTL_INVALID_RANK; + } + + parent = PTL_INVALID_RANK; + + zero_md_h = mca_coll_portals4_component.zero_md_h; + data_md_h = mca_coll_portals4_component.data_md_h; + + internal_count = opal_atomic_add_size_t(&module->coll_count, 1); + + /* + ** DATATYPE and SIZES + */ + ret = ompi_datatype_type_size(dtype, &length); + length *= count; + + request->u.allreduce.is_optim = is_reduce_optimizable(dtype, length, op, &ptl_dtype, &ptl_op); + + if (request->u.allreduce.is_optim) { + /* + * TOPOLOGY + */ + + /* this function is dependent on the number of segments, + * if we use segmentation pipe-line is preferred, and + * binary tree otherwise */ + + get_k_ary_tree(COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN, + rank, size, PTL_FIRST_RANK, &parent, child, &child_nb); + request->u.allreduce.child_nb = child_nb; + + /* + * PORTALS4 RESOURCE ALLOCATION + */ + + /* Compute match bits */ + COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0, + COLL_PORTALS4_ALLREDUCE, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1, + COLL_PORTALS4_ALLREDUCE, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, + COLL_PORTALS4_ALLREDUCE, 0, internal_count); + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.allreduce.trig_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + if (sendbuf != MPI_IN_PLACE) { + /* + * copy the data from sendbuf to recvbuf + */ + memcpy(recvbuf, sendbuf, length); + } + + /* + ** Prepare Data ME + */ + memset(&me, 0, sizeof(ptl_me_t)); + me.start = recvbuf; + me.length = length; + me.ct_handle = request->u.allreduce.trig_ct_h; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits; + me.ignore_bits = 0; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, 
PTL_PRIORITY_LIST, NULL, + &request->u.allreduce.data_me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + if (child_nb) { + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.allreduce.ack_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + for (i = 0 ; i < COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN ; i++) { + if (child[i] != PTL_INVALID_RANK) { + + /* + ** Prepare ME for data ACK Put + ** Priority List + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_ack; + me.ignore_bits = 0; + me.ct_handle = request->u.allreduce.ack_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + /* + * Prepare Triggered Put to send the result to children + * + */ + + if ((ret = PtlTriggeredPut (data_md_h, + (uint64_t)recvbuf, + length, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child[i]), + mca_coll_portals4_component.pt_idx, + match_bits, 0, NULL, 0, + request->u.allreduce.trig_ct_h, + (rank != PTL_FIRST_RANK) ? child_nb + 2 : child_nb)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + + } + } + + if (rank != PTL_FIRST_RANK) { + + /* Send Atomic operation to the parent */ + if ((ret = PtlTriggeredAtomic(data_md_h, + (uint64_t)recvbuf, + length, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits, 0, NULL, 0, + ptl_op, ptl_dtype, request->u.allreduce.trig_ct_h, + child_nb + 1)) != 0) { + return opal_stderr("PtlTriggeredAtomic failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits_ack, 0, NULL, 0, + request->u.allreduce.trig_ct_h, + child_nb + 2)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for receving RTR + ** Priority List, match also with "Overflow list Me" in coll_portals4_component + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_rtr; + me.ignore_bits = 0; + me.ct_handle = request->u.allreduce.trig_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + + /* + * OK, everything is ready for data and acknowledgement reception + * + */ + + if (child_nb) { + for (i = 0 ; i < COLL_PORTALS4_ALLREDUCE_MAX_CHILDREN ; i++) { + if (child[i] != PTL_INVALID_RANK) { + /* + * Send RTR to children + * + */ + + /* and there, 
we only send the RTR when all the MEs are ready */ + if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child[i]), + mca_coll_portals4_component.pt_idx, + match_bits_rtr, 0, NULL, 0)) != PTL_OK) + return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret); + } + } + } + + if (child_nb) { + if (is_sync) { + if ((ret = PtlCTWait(request->u.allreduce.ack_ct_h, + child_nb, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.allreduce.ack_ct_h, + child_nb)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + else { + if (is_sync) { + if ((ret = PtlCTWait(request->u.allreduce.trig_ct_h, + (rank != PTL_FIRST_RANK) ? 2 : 0, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.allreduce.trig_ct_h, + (rank != PTL_FIRST_RANK) ? 2 : 0)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + + } + else { + opal_output_verbose(100, ompi_coll_base_framework.framework_output, + "rank %d - optimization not supported, falling back to previous handler\n", rank); + + if (request->is_sync) { + if ((module->previous_allreduce) && (module->previous_allreduce_module)) { + ret = module->previous_allreduce(sendbuf, recvbuf, count, dtype, op, + comm, module->previous_allreduce_module); + } + else { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "rank %d - no previous allreduce handler is available, aborting\n", rank); + return (OMPI_ERROR); + } + } + else { + if ((module->previous_iallreduce) && (module->previous_iallreduce_module)) { + ret = module->previous_iallreduce(sendbuf, recvbuf, count, dtype, op, + comm, request->fallback_request, module->previous_iallreduce_module); + } + else { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "rank %d - no previous iallreduce handler is available, aborting\n", rank); + return (OMPI_ERROR); + } + } + return ret; + } + return (OMPI_SUCCESS); +} + +static int +allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request) +{ + if (request->u.allreduce.is_optim) { + PtlAtomicSync(); + + if (request->u.allreduce.child_nb) { + PtlCTFree(request->u.allreduce.ack_ct_h); + } + + PtlMEUnlink(request->u.allreduce.data_me_h); + PtlCTFree(request->u.allreduce.trig_ct_h); + } + + return (OMPI_SUCCESS); +} + +int ompi_coll_portals4_allreduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + struct ompi_communicator_t *comm, + struct mca_coll_base_module_2_1_0_t *module) +{ + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + request->is_sync = true; + request->fallback_request = NULL; + + allreduce_kary_tree_top(sendbuf, recvbuf, count, + dtype, op, comm, request, portals4_module); + + 
allreduce_kary_tree_bottom(request); + + OMPI_COLL_PORTALS4_REQUEST_RETURN(request); + return (OMPI_SUCCESS); +} + + +int ompi_coll_portals4_iallreduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + struct ompi_communicator_t *comm, + ompi_request_t ** ompi_request, + struct mca_coll_base_module_2_1_0_t *module) +{ + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + *ompi_request = &request->super; + request->fallback_request = ompi_request; + request->is_sync = false; + + allreduce_kary_tree_top(sendbuf, recvbuf, count, + dtype, op, comm, request, portals4_module); + + puts("iallreduce"); + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_iallreduce_intra_fini(struct ompi_coll_portals4_request_t *request) +{ + allreduce_kary_tree_bottom(request); + + OPAL_THREAD_LOCK(&ompi_request_lock); + ompi_request_complete(&request->super, true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + return (OMPI_SUCCESS); +} diff --git a/ompi/mca/coll/portals4/coll_portals4_barrier.c b/ompi/mca/coll/portals4/coll_portals4_barrier.c index ffaf15e5e0..76b54fd923 100644 --- a/ompi/mca/coll/portals4/coll_portals4_barrier.c +++ b/ompi/mca/coll/portals4/coll_portals4_barrier.c @@ -1,11 +1,12 @@ /* - * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2013-2015 Sandia National Laboratories. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -23,92 +24,96 @@ #include "ompi/mca/coll/base/base.h" -int -ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) +static int +barrier_hypercube_top(struct ompi_communicator_t *comm, + ompi_coll_portals4_request_t *request, + mca_coll_portals4_module_t *portals4_module) { - mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + bool is_sync = request->is_sync; int ret, i, dim, hibit, mask, num_msgs; int size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); - ptl_ct_event_t ct; - ptl_handle_ct_t ct_h; - ptl_handle_me_t me_h; ptl_me_t me; size_t count; - ptl_match_bits_t match_bits; + ptl_match_bits_t match_bits_rtr, match_bits; + ptl_ct_event_t event; ptl_handle_md_t md_h; - void *base; - ompi_coll_portals4_get_md(0, &md_h, &base); + md_h = mca_coll_portals4_component.zero_md_h; - count = opal_atomic_add_size_t(&portals4_module->barrier_count, 1); + request->type = OMPI_COLL_PORTALS4_TYPE_BARRIER; + + count = opal_atomic_add_size_t(&portals4_module->coll_count, 1); ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, - &ct_h); + &request->u.barrier.rtr_ct_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlCTAlloc failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlCTAlloc failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } - COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), - 0, COLL_PORTALS4_BARRIER, count); + COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), + 0, 1, COLL_PORTALS4_BARRIER, 0, count); + + COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), + 0, 0, COLL_PORTALS4_BARRIER, 0, count); /* Build "tree" out of hypercube */ dim = comm->c_cube_dim; hibit = opal_hibit(rank, dim); --dim; + /* calculate number of children to receive from */ + num_msgs = get_nchildren(dim + 1, hibit, rank, size); /* receive space */ + memset(&me, 0, sizeof(ptl_me_t)); me.start = NULL; me.length = 0; - me.ct_handle = ct_h; + me.ct_handle = request->u.barrier.rtr_ct_h; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | - PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | - PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = match_bits; - me.ignore_bits = 0; + me.ignore_bits = COLL_PORTALS4_RTR_MASK; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, - mca_coll_portals4_component.pt_idx, - &me, - PTL_PRIORITY_LIST, - NULL, - &me_h); + mca_coll_portals4_component.pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &request->u.barrier.data_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEAppend failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEAppend failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } - /* calculate number of children to receive from */ - num_msgs = ompi_coll_portals4_get_nchildren(dim + 1, hibit, rank, size); - /* send to parent when children have sent to us */ if (rank > 0) { int parent = rank & ~(1 << hibit); + ret = PtlTriggeredPut(md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, parent), - mca_coll_portals4_component.pt_idx, - match_bits, - 0, - 
NULL, - 0, - ct_h, - num_msgs); + 0, + 0, + PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits_rtr, + 0, + NULL, + 0, + request->u.barrier.rtr_ct_h, + num_msgs); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlTriggeredPut failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlTriggeredPut failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } @@ -121,48 +126,76 @@ ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm, int peer = rank | mask; if (peer < size) { ret = PtlTriggeredPut(md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, peer), - mca_coll_portals4_component.pt_idx, - match_bits, - 0, - NULL, - 0, - ct_h, - num_msgs); + 0, + 0, + PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, peer), + mca_coll_portals4_component.pt_idx, + match_bits, + 0, + NULL, + 0, + request->u.barrier.rtr_ct_h, + num_msgs); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlTriggeredPut failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlTriggeredPut failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } } } - /* Wait for all incoming messages */ - ret = PtlCTWait(ct_h, num_msgs, &ct); + if (is_sync) { + /* Send a put to self when we've received all our messages... */ + ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs, &event); + + } + else { + /* Send a put to self when we've received all our messages... */ + ret = PtlTriggeredPut(md_h, + 0, + 0, + PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, + 0, + NULL, + (uintptr_t) request, + request->u.barrier.rtr_ct_h, + num_msgs); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlTriggeredPut failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + } + + return OMPI_SUCCESS; +} + + +static int +barrier_hypercube_bottom(ompi_coll_portals4_request_t *request) +{ + int ret; + + /* cleanup */ + ret = PtlMEUnlink(request->u.barrier.data_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlCTWait failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEUnlink failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } - /* cleanup */ - ret = PtlMEUnlink(me_h); + ret = PtlCTFree(request->u.barrier.rtr_ct_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEUnlink failed: %d\n", - __FILE__, __LINE__, ret); - return OMPI_ERROR; - } - ret = PtlCTFree(ct_h); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlCTFree failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlCTFree failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } @@ -171,150 +204,71 @@ ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm, int -ompi_coll_portals4_ibarrier_intra(struct ompi_communicator_t *comm, - ompi_request_t **ompi_req, - struct mca_coll_base_module_2_1_0_t *module) +ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { + int ret; mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; - int ret, i, dim, hibit, mask, num_msgs; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); - ptl_me_t me; - size_t count; - ptl_match_bits_t match_bits; 
ompi_coll_portals4_request_t *request; - ptl_handle_md_t md_h; - void *base; - ompi_coll_portals4_get_md(0, &md_h, &base); OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); if (NULL == request) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: request alloc failed\n", - __FILE__, __LINE__); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - *ompi_req = &request->super; - request->type = OMPI_COLL_PORTALS4_TYPE_BARRIER; - - count = opal_atomic_add_size_t(&portals4_module->barrier_count, 1); - - ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, - &request->ct_h); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlCTAlloc failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } - COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), - 0, COLL_PORTALS4_BARRIER, count); + request->is_sync = true; - /* Build "tree" out of hypercube */ - dim = comm->c_cube_dim; - hibit = opal_hibit(rank, dim); - --dim; - /* calculate number of children to receive from */ - num_msgs = ompi_coll_portals4_get_nchildren(dim + 1, hibit, rank, size); - - /* receive space */ - me.start = NULL; - me.length = 0; - me.ct_handle = request->ct_h; - me.min_free = 0; - me.uid = mca_coll_portals4_component.uid; - me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | - PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | - PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; - me.match_id.phys.nid = PTL_NID_ANY; - me.match_id.phys.pid = PTL_PID_ANY; - me.match_bits = match_bits; - me.ignore_bits = 0; - ret = PtlMEAppend(mca_coll_portals4_component.ni_h, - mca_coll_portals4_component.pt_idx, - &me, - PTL_PRIORITY_LIST, - NULL, - &request->me_h); - if (PTL_OK != ret) { + ret = barrier_hypercube_top(comm, request, portals4_module); + if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEAppend failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: ompi_coll_portals4_barrier_hypercube_top failed %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } - - /* send to parent when children have sent to us */ - if (rank > 0) { - int parent = rank & ~(1 << hibit); - ret = PtlTriggeredPut(md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, parent), - mca_coll_portals4_component.pt_idx, - match_bits, - 0, - NULL, - 0, - request->ct_h, - num_msgs); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlTriggeredPut failed: %d\n", - __FILE__, __LINE__, ret); - return OMPI_ERROR; - } - - /* we'll need to wait for the parent response before the next set of comms */ - num_msgs++; - } - - /* send to children when parent (or all children if root) has sent to us */ - for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) { - int peer = rank | mask; - if (peer < size) { - ret = PtlTriggeredPut(md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, peer), - mca_coll_portals4_component.pt_idx, - match_bits, - 0, - NULL, - 0, - request->ct_h, - num_msgs); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlTriggeredPut failed: %d\n", - __FILE__, __LINE__, ret); - return OMPI_ERROR; - } - } - } - - /* Send a put to self when we've received all our messages... 
*/ - ret = PtlTriggeredPut(md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ompi_coll_portals4_get_peer(comm, rank), - mca_coll_portals4_component.finish_pt_idx, - 0, - 0, - NULL, - (uintptr_t) request, - request->ct_h, - num_msgs); - - if (PTL_OK != ret) { + ret = barrier_hypercube_bottom(request); + if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlTriggeredPut failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: ompi_coll_portals4_barrier_hypercube_bottom failed %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + + OMPI_COLL_PORTALS4_REQUEST_RETURN(request); + return OMPI_SUCCESS; +} + + +int +ompi_coll_portals4_ibarrier_intra(struct ompi_communicator_t *comm, + ompi_request_t **ompi_req, + struct mca_coll_base_module_2_1_0_t *module) +{ + int ret; + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + *ompi_req = &request->super; + request->is_sync = false; + + ret = barrier_hypercube_top(comm, request, portals4_module); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: ompi_coll_portals4_barrier_hypercube_top failed %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } @@ -327,19 +281,11 @@ ompi_coll_portals4_ibarrier_intra_fini(ompi_coll_portals4_request_t *request) { int ret; - /* cleanup */ - ret = PtlMEUnlink(request->me_h); - if (PTL_OK != ret) { + ret = barrier_hypercube_bottom(request); + if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEUnlink failed: %d\n", - __FILE__, __LINE__, ret); - return OMPI_ERROR; - } - ret = PtlCTFree(request->ct_h); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlCTFree failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: ompi_coll_portals4_barrier_hypercube_bottom failed %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } diff --git a/ompi/mca/coll/portals4/coll_portals4_bcast.c b/ompi/mca/coll/portals4/coll_portals4_bcast.c new file mode 100644 index 0000000000..fe0431d903 --- /dev/null +++ b/ompi/mca/coll/portals4/coll_portals4_bcast.c @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2015 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "coll_portals4.h" +#include "coll_portals4_request.h" + +#include "mpi.h" +#include "ompi/constants.h" +#include "opal/util/bit_ops.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/datatype/ompi_datatype.h" + +/* + * the bcast communication is based on 1 to N scheme + * + */ + +#define COLL_PORTALS4_BCAST_MAX_CHILDREN 2 +#define COLL_PORTALS4_BCAST_ALGO_THRESHOLD 4 + + +static int prepare_bcast_data (struct ompi_communicator_t *comm, + void *buff, int count, + struct ompi_datatype_t *datatype, int root, + ompi_coll_portals4_request_t *request) { + int rank = ompi_comm_rank(comm); + int ret; + size_t max_data; + unsigned int iov_count; + struct iovec iovec; + + request->u.bcast.is_root = (rank == root); + request->u.bcast.needs_pack = !ompi_datatype_is_contiguous_memory_layout(datatype, count); + + if (request->u.bcast.needs_pack) { + if (request->u.bcast.is_root) { + OBJ_CONSTRUCT(&request->u.bcast.convertor, opal_convertor_t); + opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, + &(datatype->super), count, + buff, 0, &request->u.bcast.convertor); + opal_convertor_get_packed_size(&request->u.bcast.convertor, &request->u.bcast.tmpsize); + request->u.bcast.tmpbuf = malloc(request->u.bcast.tmpsize); + if (OPAL_UNLIKELY(NULL == request->u.bcast.tmpbuf)) { + OBJ_DESTRUCT(&request->u.bcast.convertor); + return opal_stderr("malloc failed", __FILE__, __LINE__, OMPI_ERR_OUT_OF_RESOURCE); + } + + iovec.iov_base = request->u.bcast.tmpbuf; + iovec.iov_len = request->u.bcast.tmpsize; + iov_count = 1; + max_data = request->u.bcast.tmpsize; + ret = opal_convertor_pack(&request->u.bcast.convertor, &iovec, &iov_count, &max_data); + OBJ_DESTRUCT(&request->u.bcast.convertor); + if (OPAL_UNLIKELY(ret < 0)) { + return opal_stderr("opal_convertor_pack failed", __FILE__, __LINE__, ret); } + } + else { + OBJ_CONSTRUCT(&request->u.bcast.convertor, opal_convertor_t); + opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, + &(datatype->super), count, + buff, 0, &request->u.bcast.convertor); + + max_data = request->u.bcast.tmpsize; + opal_convertor_get_packed_size(&request->u.bcast.convertor, &max_data); + + request->u.bcast.tmpbuf = malloc(request->u.bcast.tmpsize); + if (OPAL_UNLIKELY(NULL == request->u.bcast.tmpbuf)) { + OBJ_DESTRUCT(&request->u.bcast.convertor); + return opal_stderr("malloc failed", __FILE__, __LINE__, OMPI_ERR_OUT_OF_RESOURCE); + } + } + } + else { + request->u.bcast.tmpbuf = buff; + + ompi_datatype_type_size(datatype, &request->u.bcast.tmpsize); + request->u.bcast.tmpsize *= count; + } + + /* Number of segments */ + request->u.bcast.segment_nb = (request->u.bcast.tmpsize > COLL_PORTALS4_MAX_BW) ? + (((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) < COLL_PORTALS4_MAX_SEGMENT ? 
+ ((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) : + COLL_PORTALS4_MAX_SEGMENT) : + 1; + + if (request->u.bcast.segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD) { + request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO; + } + else { + request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO; + } + return (OMPI_SUCCESS); +} + +static int post_bcast_data( ompi_coll_portals4_request_t *request) { + + int ret; + size_t max_data; + unsigned int iov_count; + struct iovec iovec; + + if (request->u.bcast.needs_pack) { + if (!request->u.bcast.is_root) { + opal_convertor_get_packed_size(&request->u.bcast.convertor, &request->u.bcast.tmpsize); + + iovec.iov_base = request->u.bcast.tmpbuf; + iovec.iov_len = request->u.bcast.tmpsize; + iov_count = 1; + ret = opal_convertor_unpack(&request->u.bcast.convertor, &iovec, &iov_count, &max_data); + OBJ_DESTRUCT(&request->u.bcast.convertor); + if (OPAL_UNLIKELY(ret < 0)) { + return opal_stderr("opal_convertor_unpack failed", __FILE__, __LINE__, ret); + } + } + free(request->u.bcast.tmpbuf); + } + return (OMPI_SUCCESS); +} + +static int +bcast_kary_tree_top(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + ompi_coll_portals4_request_t *request, + mca_coll_portals4_module_t *portals4_module) +{ + bool is_sync = request->is_sync; + int ret, seg; + unsigned int i; + int segment_nb = request->u.bcast.segment_nb; + unsigned int child_nb; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + ptl_rank_t parent, child[COLL_PORTALS4_BCAST_MAX_CHILDREN]; + size_t internal_count, length, offset; + ptl_handle_md_t zero_md_h, data_md_h; + ptl_handle_me_t me_h; + ptl_ct_event_t ct_inc; + ptl_me_t me; + ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits; + ptl_ct_event_t ct; + ptl_size_t trig_thr, ack_thr; + + /* + ** Initialization + */ + + request->type = OMPI_COLL_PORTALS4_TYPE_BCAST; + + for (i = 0 ; i < COLL_PORTALS4_BCAST_MAX_CHILDREN ; i++) { + child[i] = PTL_INVALID_RANK; + } + + parent = PTL_INVALID_RANK; + + zero_md_h = mca_coll_portals4_component.zero_md_h; + data_md_h = mca_coll_portals4_component.data_md_h; + + internal_count = opal_atomic_add_size_t(&portals4_module->coll_count, 1); + + + /* + ** DATATYPE and SIZES + */ + + get_k_ary_tree(COLL_PORTALS4_BCAST_MAX_CHILDREN, + rank, size, root, &parent, child, &child_nb); + request->u.bcast.u.child_nb = child_nb; + + /* + * TOPOLOGY + */ + + /* + * PORTALS4 RESOURCE ALLOCATION + */ + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.trig_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* Compute match bits */ + COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0, + COLL_PORTALS4_BCAST, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1, + COLL_PORTALS4_BCAST, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, + COLL_PORTALS4_BCAST, 0, internal_count); + + if (rank != root) { + for (seg = 1, offset = 0, length = 0 ; + seg <= segment_nb ; + seg++, offset += length) { + + /* Divide buffer into segments */ + length = (seg < segment_nb) ? 
+ (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : + request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + + /* + ** Prepare Data ME + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = ((uint8_t*) request->u.bcast.tmpbuf) + offset; + me.length = length; + me.ct_handle = request->u.bcast.trig_ct_h; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits; + me.ignore_bits = 0; + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + /* + * Send RTR to parent + * + * the root does not to have to do it, since it does not have parent. + * WE can do such an operation by now, since we are able to receive data, + * even if we are not able to receive the others. + * + */ + + /* and there, we only send the RTR when all the MEs are ready */ + if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, match_bits_rtr, + 0, NULL, 0)) != PTL_OK) { + return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret); + } + + /* + * Prepare Triggered Put to ACK Data to parent + * + */ + + trig_thr = child_nb ? (segment_nb * 2) : + segment_nb; + + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits_ack, 0, NULL, 0, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + + if (child_nb) { + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.rtr_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + ct_inc.success = segment_nb; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.rtr_ct_h, child_nb)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.ack_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for receiving data ACK Put + ** Priority List + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_ack; + me.ignore_bits = 0; + me.ct_handle = request->u.bcast.ack_ct_h; + + for (i = 0 ; i < child_nb ; i++) { + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + /* + ** Prepare ME for sending RTR Put + ** Priority List, match also with "Overflow list Me" in coll_portals4_component + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + 
me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_rtr; + me.ignore_bits = 0; + me.ct_handle = request->u.bcast.rtr_ct_h; + + for (i = 0 ; i < child_nb ; i++) { + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + for (seg = 1, offset = 0, length = 0 ; + seg <= segment_nb ; + seg++, offset += length) { + + /* Divide buffer into segments */ + length = (seg < segment_nb) ? + (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : + request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + + /* compute the triggering threshold to send data to the children */ + trig_thr = (rank == root) ? (segment_nb) : + (segment_nb + seg); + + /* + ** Send Data to children + */ + + for (i = 0 ; i < COLL_PORTALS4_BCAST_MAX_CHILDREN ; i++) { + if (child[i] != PTL_INVALID_RANK) { + + if ((ret = PtlTriggeredPut (data_md_h, + (uint64_t) request->u.bcast.tmpbuf + offset, + length, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child[i]), + mca_coll_portals4_component.pt_idx, + match_bits, 0, + NULL, + 0, request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + } + + ack_thr = child_nb; + + if (is_sync) { + if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, ack_thr, &ct)) != 0) + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.bcast.ack_ct_h, + ack_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + else { + /* A leaf of the tree does not need to send data to its children */ + request->u.bcast.rtr_ct_h = PTL_INVALID_HANDLE; + request->u.bcast.ack_ct_h = PTL_INVALID_HANDLE; + + /* a leaf does not expect counting events from its children, + * the threshold is computed using the number of segments received + * from the parent + */ + + if (rank != root) { + ack_thr = segment_nb; + if (is_sync) { + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, ack_thr, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.bcast.trig_ct_h, + ack_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + + } + } + } + return (OMPI_SUCCESS); +} + + +static int +bcast_pipeline_top(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + ompi_coll_portals4_request_t *request, + mca_coll_portals4_module_t *portals4_module) +{ + bool is_sync = request->is_sync; + int ret, seg; + int segment_nb = request->u.bcast.segment_nb; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + ptl_rank_t parent, child; + size_t internal_count, length, 
offset; + ptl_handle_md_t zero_md_h, data_md_h; + ptl_handle_me_t me_h; + ptl_ct_event_t ct_inc; + ptl_me_t me; + ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits; + ptl_ct_event_t ct; + ptl_size_t trig_thr; + + /* + ** Initialization + */ + + request->type = OMPI_COLL_PORTALS4_TYPE_BCAST; + + child = PTL_INVALID_RANK; + parent = PTL_INVALID_RANK; + + zero_md_h = mca_coll_portals4_component.zero_md_h; + data_md_h = mca_coll_portals4_component.data_md_h; + + internal_count = opal_atomic_add_size_t(&portals4_module->coll_count, 1); + + /* + ** DATATYPE and SIZES + */ + + get_pipeline(rank, size, root, &parent, &child); + request->u.bcast.u.child = child; + + /* + * PORTALS4 RESOURCE ALLOCATION + */ + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.trig_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* Compute match bits */ + COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0, + COLL_PORTALS4_BCAST, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1, + COLL_PORTALS4_BCAST, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, + COLL_PORTALS4_BCAST, 0, internal_count); + + if (rank != root) { + for (seg = 1, offset = 0, length = 0 ; + seg <= segment_nb ; + seg++, offset += length) { + + /* Divide buffer into segments */ + length = (seg < segment_nb) ? + (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : + request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + + /* + ** Prepare Data ME + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = ((uint8_t*) request->u.bcast.tmpbuf) + offset; + me.length = length; + me.ct_handle = request->u.bcast.trig_ct_h; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits; + me.ignore_bits = 0; + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + /* + * Send RTR to parent + * + * the root does not to have to do it, since it does not have parent. + * WE can do such an operation by now, since we are able to receive data, + * even if we are not able to receive the others. + * + */ + + /* and there, we only send the RTR when all the MEs are ready */ + if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, match_bits_rtr, + 0, NULL, 0)) != PTL_OK) { + return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret); + } + + /* + * Prepare Triggered Put to ACK Data to parent + * + */ + + trig_thr = (child != PTL_INVALID_RANK) ? 
+ (segment_nb * 2) : + segment_nb; + + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits_ack, 0, NULL, 0, + request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + + if (child != PTL_INVALID_RANK) { + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.rtr_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + ct_inc.success = segment_nb; + ct_inc.failure = 0; + + if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc, + request->u.bcast.rtr_ct_h, 1)) != 0) { + return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret); + } + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.bcast.ack_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for receiving data ACK Put + ** Priority List + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_ack; + me.ignore_bits = 0; + me.ct_handle = request->u.bcast.ack_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for sending RTR Put + ** Priority List, match also with "Overflow list Me" in coll_portals4_component + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_rtr; + me.ignore_bits = 0; + me.ct_handle = request->u.bcast.rtr_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + for (seg = 1, offset = 0, length = 0 ; + seg <= segment_nb ; + seg++, offset += length) { + + /* Divide buffer into segments */ + length = (seg < segment_nb) ? + (request->u.bcast.tmpsize + segment_nb - 1) / segment_nb : + request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1); + + /* compute the triggering threshold to send data to the children */ + trig_thr = (rank == root) ? 
(segment_nb) : + (segment_nb + seg); + + /* + ** Send Data to children + */ + + if (child != PTL_INVALID_RANK) { + + if ((ret = PtlTriggeredPut (data_md_h, + (uint64_t) request->u.bcast.tmpbuf + offset, + length, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child), + mca_coll_portals4_component.pt_idx, + match_bits, 0, + NULL, + 0, request->u.bcast.trig_ct_h, trig_thr)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + + if (is_sync) { + if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, 1, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.bcast.ack_ct_h, + 1)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + else { + /* A leaf of the tree does not need to send data to its children */ + request->u.bcast.rtr_ct_h = PTL_INVALID_HANDLE; + request->u.bcast.ack_ct_h = PTL_INVALID_HANDLE; + + /* a leaf does not expect counting events from its children, + * the threshold is computed using the number of segments received + * from the parent + */ + + if (rank != root) { + if (is_sync) { + if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, segment_nb, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.bcast.trig_ct_h, + segment_nb)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + } + + return (OMPI_SUCCESS); +} + + + +static int +bcast_kary_tree_bottom(ompi_coll_portals4_request_t *request) +{ + /* release all Portals4 resources for this request */ + if (request->u.bcast.u.child_nb) { + PtlCTFree(request->u.bcast.rtr_ct_h); + PtlCTFree(request->u.bcast.ack_ct_h); + } + + PtlCTFree(request->u.bcast.trig_ct_h); + + return (OMPI_SUCCESS); +} + + +static int +bcast_pipeline_bottom(ompi_coll_portals4_request_t *request) +{ + /* release all Portals4 resources for this request */ + if (request->u.bcast.u.child != PTL_INVALID_RANK) { + PtlCTFree(request->u.bcast.rtr_ct_h); + PtlCTFree(request->u.bcast.ack_ct_h); + } + + PtlCTFree(request->u.bcast.trig_ct_h); + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_bcast_intra(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + request->is_sync = true; + + prepare_bcast_data(comm, buff, count, datatype, root, request); + + switch (request->u.bcast.algo) { + case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO: + bcast_kary_tree_top(buff, count, datatype, root, + comm, request, portals4_module); + bcast_kary_tree_bottom(request); + break; + case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO: + bcast_pipeline_top(buff, count, datatype, root, + comm, request, portals4_module); + bcast_pipeline_bottom(request); 
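+            /* blocking path: bcast_pipeline_top() has already waited on its counting events, so the Portals resources can be released right away */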
+ break; + default: + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: unknown bcast algorithm %d\n", + __FILE__, __LINE__, request->u.bcast.algo); + return OMPI_ERROR; + } + post_bcast_data(request); + + OMPI_COLL_PORTALS4_REQUEST_RETURN(request); + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_ibcast_intra(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + ompi_request_t **ompi_request, + mca_coll_base_module_t *module) +{ + + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + *ompi_request = &request->super; + request->is_sync = false; + + prepare_bcast_data(comm, buff, count, datatype, root, request); + + switch (request->u.bcast.algo) { + case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO: + bcast_kary_tree_top(buff, count, datatype, root, + comm, request, portals4_module); + break; + case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO: + bcast_pipeline_top(buff, count, datatype, root, + comm, request, portals4_module); + break; + default: + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: unknown bcast algorithm %d\n", + __FILE__, __LINE__, request->u.bcast.algo); + return OMPI_ERROR; + } + + puts("ibcast"); + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request) +{ + + switch (request->u.bcast.algo) { + case OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO: + bcast_kary_tree_bottom(request); + break; + case OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO: + bcast_pipeline_bottom(request); + break; + default: + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: unknown bcast algorithm %d\n", + __FILE__, __LINE__, request->u.bcast.algo); + return OMPI_ERROR; + } + + post_bcast_data(request); + + OPAL_THREAD_LOCK(&ompi_request_lock); + ompi_request_complete(&request->super, true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + return (OMPI_SUCCESS); +} diff --git a/ompi/mca/coll/portals4/coll_portals4_component.c b/ompi/mca/coll/portals4/coll_portals4_component.c index 1f42eb349c..cb29348ed3 100644 --- a/ompi/mca/coll/portals4/coll_portals4_component.c +++ b/ompi/mca/coll/portals4/coll_portals4_component.c @@ -1,4 +1,3 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -6,18 +5,19 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2013-2015 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ * */ @@ -28,23 +28,129 @@ #include "coll_portals4_request.h" #include "mpi.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype_internal.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" +#define REQ_COLL_TABLE_ID 15 +#define REQ_COLL_FINISH_TABLE_ID 16 + + +ptl_op_t ompi_coll_portals4_atomic_op [OMPI_OP_NUM_OF_TYPES] = +{ + [OMPI_OP_NULL] = COLL_PORTALS4_NO_OP, + [OMPI_OP_MAX] = PTL_MAX, + [OMPI_OP_MIN] = PTL_MIN, + [OMPI_OP_SUM] = PTL_SUM, + [OMPI_OP_PROD] = PTL_PROD, + [OMPI_OP_LAND] = PTL_LAND, + [OMPI_OP_BAND] = PTL_BAND, + [OMPI_OP_LOR] = PTL_LOR, + [OMPI_OP_BOR] = PTL_BOR, + [OMPI_OP_LXOR] = PTL_LXOR, + [OMPI_OP_BXOR] = PTL_BXOR, + [OMPI_OP_MAXLOC] = COLL_PORTALS4_NO_OP, + [OMPI_OP_MINLOC] = COLL_PORTALS4_NO_OP, + [OMPI_OP_REPLACE] = PTL_CSWAP, +}; + +ptl_datatype_t ompi_coll_portals4_atomic_datatype [OMPI_DATATYPE_MPI_MAX_PREDEFINED] = +{ + [OMPI_DATATYPE_MPI_EMPTY] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_UINT8_T] = PTL_UINT8_T, + [OMPI_DATATYPE_MPI_INT16_T] = PTL_INT16_T, + [OMPI_DATATYPE_MPI_UINT16_T] = PTL_UINT16_T, + [OMPI_DATATYPE_MPI_INT32_T] = PTL_INT32_T, + [OMPI_DATATYPE_MPI_UINT32_T] = PTL_UINT32_T, + [OMPI_DATATYPE_MPI_INT64_T] = PTL_INT64_T, + [OMPI_DATATYPE_MPI_UINT64_T] = PTL_UINT64_T, + [OMPI_DATATYPE_MPI_FLOAT] = PTL_FLOAT, + [OMPI_DATATYPE_MPI_DOUBLE] = PTL_DOUBLE, + [OMPI_DATATYPE_MPI_LONG_DOUBLE] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_COMPLEX8] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_COMPLEX16] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_COMPLEX32] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_WCHAR] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_PACKED] = COLL_PORTALS4_NO_DTYPE, + + /* C++ / C99 datatypes */ + [OMPI_DATATYPE_MPI_BOOL] = COLL_PORTALS4_NO_DTYPE, + + /* Fortran datatypes */ + [OMPI_DATATYPE_MPI_LOGICAL] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_CHARACTER] = PTL_INT8_T, + [OMPI_DATATYPE_MPI_INTEGER] = PTL_INT64_T, + [OMPI_DATATYPE_MPI_REAL] = PTL_FLOAT, + [OMPI_DATATYPE_MPI_DOUBLE_PRECISION] = PTL_DOUBLE, + + [OMPI_DATATYPE_MPI_COMPLEX] = PTL_FLOAT_COMPLEX, + [OMPI_DATATYPE_MPI_DOUBLE_COMPLEX] = PTL_DOUBLE_COMPLEX, + [OMPI_DATATYPE_MPI_LONG_DOUBLE_COMPLEX] = PTL_LONG_DOUBLE_COMPLEX, + [OMPI_DATATYPE_MPI_2INT] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_2INTEGER] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_2REAL] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_2DBLPREC] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_2COMPLEX] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_2DOUBLE_COMPLEX] = COLL_PORTALS4_NO_DTYPE, + + [OMPI_DATATYPE_MPI_FLOAT_INT] = COLL_PORTALS4_NO_DTYPE, + + [OMPI_DATATYPE_MPI_DOUBLE_INT] = PTL_INT64_T, + [OMPI_DATATYPE_MPI_LONG_DOUBLE_INT] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_LONG_INT] = PTL_INT32_T, + [OMPI_DATATYPE_MPI_SHORT_INT] = PTL_INT16_T, + + /* MPI 2.2 types */ + [OMPI_DATATYPE_MPI_AINT] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_OFFSET] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_C_BOOL] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_C_COMPLEX] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_C_FLOAT_COMPLEX] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_C_DOUBLE_COMPLEX] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_C_LONG_DOUBLE_COMPLEX] = COLL_PORTALS4_NO_DTYPE, + + [OMPI_DATATYPE_MPI_LB] = COLL_PORTALS4_NO_DTYPE, + [OMPI_DATATYPE_MPI_UB] = COLL_PORTALS4_NO_DTYPE, + + /* MPI 3.0 types */ + [OMPI_DATATYPE_MPI_COUNT] = COLL_PORTALS4_NO_DTYPE, + + 
[OMPI_DATATYPE_MPI_UNAVAILABLE] = COLL_PORTALS4_NO_DTYPE, + +}; + + +#define PORTALS4_SAVE_PREV_COLL_API(__module, __comm, __api) \ + do { \ + __module->previous_ ## __api = __comm->c_coll.coll_ ## __api; \ + __module->previous_ ## __api ## _module = __comm->c_coll.coll_ ## __api ## _module; \ + if (!comm->c_coll.coll_ ## __api || !comm->c_coll.coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + __comm->c_contextid, __comm->c_name); \ + return OMPI_ERROR; \ + } \ + OBJ_RETAIN(__module->previous_ ## __api ## _module); \ + } while(0) + + const char *mca_coll_portals4_component_version_string = - "Open MPI Portals 4 collective MCA component version " OMPI_VERSION; + "Open MPI Portals 4 collective MCA component version " OMPI_VERSION; int mca_coll_portals4_priority = 10; +#define MCA_COLL_PORTALS4_EQ_SIZE 4096 + static int portals4_open(void); static int portals4_close(void); static int portals4_register(void); static int portals4_init_query(bool enable_progress_threads, - bool enable_mpi_threads); + bool enable_mpi_threads); static mca_coll_base_module_t* portals4_comm_query(struct ompi_communicator_t *comm, - int *priority); + int *priority); static int portals4_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm); + struct ompi_communicator_t *comm); static int portals4_progress(void); @@ -77,17 +183,25 @@ mca_coll_portals4_component_t mca_coll_portals4_component = { }, }; +int +opal_stderr(const char *msg, const char *file, + const int line, const int ret) +{ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: %s: %d\n", file, line, msg, ret); + return (OMPI_ERR_TEMP_OUT_OF_RESOURCE); +} static int portals4_register(void) { mca_coll_portals4_priority = 100; (void) mca_base_component_var_register(&mca_coll_portals4_component.super.collm_version, "priority", - "Priority of the portals4 coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_portals4_priority); + "Priority of the portals4 coll component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_portals4_priority); return OMPI_SUCCESS; } @@ -103,24 +217,21 @@ portals4_open(void) mca_coll_portals4_component.pt_idx = -1; mca_coll_portals4_component.finish_pt_idx = -1; mca_coll_portals4_component.eq_h = PTL_INVALID_HANDLE; - mca_coll_portals4_component.barrier_unex_me_h = PTL_INVALID_HANDLE; + mca_coll_portals4_component.unex_me_h = PTL_INVALID_HANDLE; mca_coll_portals4_component.finish_me_h = PTL_INVALID_HANDLE; -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - mca_coll_portals4_component.md_hs = NULL; -#else - mca_coll_portals4_component.md_h = PTL_INVALID_HANDLE; -#endif - + mca_coll_portals4_component.zero_md_h = PTL_INVALID_HANDLE; + mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE; + OBJ_CONSTRUCT(&mca_coll_portals4_component.requests, opal_free_list_t); - ret = opal_free_list_init (&mca_coll_portals4_component.requests, - sizeof(ompi_coll_portals4_request_t), - opal_cache_line_size, - OBJ_CLASS(ompi_coll_portals4_request_t), - 0, 0, 8, 0, 8, NULL, 0, NULL, NULL, NULL); + ret = opal_free_list_init(&mca_coll_portals4_component.requests, + sizeof(ompi_coll_portals4_request_t), + opal_cache_line_size, + OBJ_CLASS(ompi_coll_portals4_request_t), + 0, 0, 8, 0, 8, NULL, 0, NULL, NULL, NULL); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, 
ompi_coll_base_framework.framework_output, - "%s:%d: opal_free_list_init failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: ompi_free_list_init failed: %d\n", + __FILE__, __LINE__, ret); return ret; } @@ -135,80 +246,72 @@ portals4_close(void) OBJ_DESTRUCT(&mca_coll_portals4_component.requests); -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - if (NULL != mca_coll_portals4_component.md_hs) { - int i; - int num_mds = ompi_coll_portals4_get_num_mds(); - - for (i = 0 ; i < num_mds ; ++i) { - if (!PtlHandleIsEqual(mca_coll_portals4_component.md_hs[i], PTL_INVALID_HANDLE)) { - ret = PtlMDRelease(mca_coll_portals4_component.md_hs[i]); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMDRelease failed: %d\n", - __FILE__, __LINE__, ret); - } - } - } - - free(mca_coll_portals4_component.md_hs); - } -#else - if (!PtlHandleIsEqual(mca_coll_portals4_component.md_h, PTL_INVALID_HANDLE)) { - ret = PtlMDRelease(mca_coll_portals4_component.md_h); + if (!PtlHandleIsEqual(mca_coll_portals4_component.zero_md_h, PTL_INVALID_HANDLE)) { + ret = PtlMDRelease(mca_coll_portals4_component.zero_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMDRelease failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMDRelease failed: %d\n", + __FILE__, __LINE__, ret); } } -#endif + mca_coll_portals4_component.zero_md_h = PTL_INVALID_HANDLE; + + if (!PtlHandleIsEqual(mca_coll_portals4_component.data_md_h, PTL_INVALID_HANDLE)) { + ret = PtlMDRelease(mca_coll_portals4_component.data_md_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlMDRelease failed: %d\n", + __FILE__, __LINE__, ret); + } + } + mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE; + if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) { ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEUnlink failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEUnlink failed: %d\n", + __FILE__, __LINE__, ret); } } - if (!PtlHandleIsEqual(mca_coll_portals4_component.barrier_unex_me_h, PTL_INVALID_HANDLE)) { - ret = PtlMEUnlink(mca_coll_portals4_component.barrier_unex_me_h); + if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) { + ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEUnlink failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEUnlink failed: %d\n", + __FILE__, __LINE__, ret); } } if (mca_coll_portals4_component.finish_pt_idx >= 0) { ret = PtlPTFree(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.finish_pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlPTFree failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlPTFree failed: %d\n", + __FILE__, __LINE__, ret); } } if (mca_coll_portals4_component.pt_idx >= 0) { ret = PtlPTFree(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlPTFree failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlPTFree failed: %d\n", + __FILE__, __LINE__, ret); } } if (!PtlHandleIsEqual(mca_coll_portals4_component.eq_h, PTL_INVALID_HANDLE)) { ret = 
PtlEQFree(mca_coll_portals4_component.eq_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlEQFree failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlEQFree failed: %d\n", + __FILE__, __LINE__, ret); } } if (!PtlHandleIsEqual(mca_coll_portals4_component.ni_h, PTL_INVALID_HANDLE)) { ret = PtlNIFini(mca_coll_portals4_component.ni_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlNIFini failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlNIFini failed: %d\n", + __FILE__, __LINE__, ret); } PtlFini(); @@ -226,129 +329,125 @@ portals4_close(void) * this component to disqualify itself if it doesn't support the * required level of thread support. */ +/* + /!\ Called for each processes /!\ + */ static int portals4_init_query(bool enable_progress_threads, - bool enable_mpi_threads) + bool enable_mpi_threads) { int ret; ptl_md_t md; ptl_me_t me; - /* Make sure someone is populating the proc table, since we're not - in a really good position to do so */ - if (NULL == ompi_proc_local()->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: Proc table not previously populated", - __FILE__, __LINE__); - return OMPI_ERROR; - } - /* Initialize Portals and create a physical, matching interface */ ret = PtlInit(); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlInit failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlInit failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlNIInit(PTL_IFACE_DEFAULT, - PTL_NI_PHYSICAL | PTL_NI_MATCHING, - PTL_PID_ANY, - NULL, - NULL, - &mca_coll_portals4_component.ni_h); + PTL_NI_PHYSICAL | PTL_NI_MATCHING, + PTL_PID_ANY, + NULL, + &mca_coll_portals4_component.ni_limits, + &mca_coll_portals4_component.ni_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlNIInit failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlNIInit failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } - /* FIX ME: Need to make sure our ID matches with the MTL... */ + ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlGetid failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + /* FIX ME: Need to make sure our ID matches with the MTL... 
*/ ret = PtlGetUid(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.uid); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlGetUid failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlGetUid failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlEQAlloc(mca_coll_portals4_component.ni_h, - 4096, - &mca_coll_portals4_component.eq_h); + MCA_COLL_PORTALS4_EQ_SIZE, + &mca_coll_portals4_component.eq_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlEQAlloc failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlEQAlloc failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, - 0, - mca_coll_portals4_component.eq_h, - 15, - &mca_coll_portals4_component.pt_idx); + 0, + mca_coll_portals4_component.eq_h, + REQ_COLL_TABLE_ID, + &mca_coll_portals4_component.pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlPTAlloc failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlPTAlloc failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + + if (mca_coll_portals4_component.pt_idx != REQ_COLL_TABLE_ID) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", + __FILE__, __LINE__, + mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, - 0, - mca_coll_portals4_component.eq_h, - 16, - &mca_coll_portals4_component.finish_pt_idx); + 0, + mca_coll_portals4_component.eq_h, + REQ_COLL_FINISH_TABLE_ID, + &mca_coll_portals4_component.finish_pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlPTAlloc failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlPTAlloc failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + + if (mca_coll_portals4_component.finish_pt_idx != REQ_COLL_FINISH_TABLE_ID) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", + __FILE__, __LINE__, + mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } /* Bind MD/MDs across all memory. We prefer (for obvious reasons) to have a single MD across all of memory */ -#if OPAL_PORTALS4_MAX_MD_SIZE < OPAL_PORTALS4_MAX_VA_SIZE - { - int i; - int num_mds = ompi_coll_portals4_get_num_mds(); - ptl_size_t size = (1ULL << OPAL_PORTALS4_MAX_MD_SIZE) - 1; - ptl_size_t offset_unit = (1ULL << OPAL_PORTALS4_MAX_MD_SIZE) / 2; + memset(&md, 0, sizeof(ptl_md_t)); + md.start = 0; + md.length = 0; + md.options = 0; + md.eq_handle = PTL_EQ_NONE; + md.ct_handle = PTL_CT_NONE; - mca_coll_portals4_component.md_hs = malloc(sizeof(ptl_handle_md_t) * num_mds); - if (NULL == mca_coll_portals4_component.md_hs) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: Error allocating MD array", - __FILE__, __LINE__); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - for (i = 0 ; i < num_mds ; ++i) { - mca_coll_portals4_component.md_hs[i] = PTL_INVALID_HANDLE; - } - - for (i = 0 ; i < num_mds ; ++i) { - md.start = (char*) (offset_unit * i); - md.length = (i - 1 == num_mds) ? 
size / 2 : size; - md.options = 0; - md.eq_handle = PTL_EQ_NONE; - md.ct_handle = PTL_CT_NONE; - - ret = PtlMDBind(mca_coll_portals4_component.ni_h, - &md, - &mca_coll_portals4_component.md_hs[i]); - if (PTL_OK != ret) { - opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMDBind failed: %d\n", - __FILE__, __LINE__, ret); - return OMPI_ERROR; - } - } + ret = PtlMDBind(mca_coll_portals4_component.ni_h, + &md, + &mca_coll_portals4_component.zero_md_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; } -#else + md.start = 0; md.length = PTL_SIZE_MAX; md.options = 0; @@ -356,15 +455,15 @@ portals4_init_query(bool enable_progress_threads, md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(mca_coll_portals4_component.ni_h, - &md, - &mca_coll_portals4_component.md_h); + &md, + &mca_coll_portals4_component.data_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMDBind failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } -#endif + OPAL_OUTPUT_VERBOSE((90, ompi_coll_base_framework.framework_output, "PtlMDBind start=%p length=%x\n", md.start, md.length)); /* setup finish ack ME */ me.start = NULL; @@ -372,49 +471,55 @@ portals4_init_query(bool enable_progress_threads, me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; - me.options = PTL_ME_OP_PUT | - PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; + me.options = PTL_ME_OP_PUT | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = 0; me.ignore_bits = 0; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, - mca_coll_portals4_component.finish_pt_idx, - &me, - PTL_PRIORITY_LIST, - NULL, - &mca_coll_portals4_component.finish_me_h); + mca_coll_portals4_component.finish_pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &mca_coll_portals4_component.finish_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } - /* Setup Barrier unexpected arena, which is not per-communicator specific. 
*/ + /* This ME is used for RTR exchange only */ me.start = NULL; me.length = 0; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; - me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | - PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; + me.options = PTL_ME_OP_PUT | + PTL_ME_EVENT_SUCCESS_DISABLE | PTL_ME_EVENT_OVER_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; - COLL_PORTALS4_SET_BITS(me.match_bits, 0, 0, COLL_PORTALS4_BARRIER, 0); - me.ignore_bits = COLL_PORTALS4_CID_MASK | COLL_PORTALS4_OP_COUNT_MASK; + + /* Note : the RTR bit must be set to match this ME, + * this allows to discriminate the RTR from data flow + * (especially for the Barrier operations) + */ + COLL_PORTALS4_SET_BITS(me.match_bits, 0, 0, 1, 0, 0, 0); + me.ignore_bits = ~COLL_PORTALS4_RTR_MASK; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, - mca_coll_portals4_component.pt_idx, - &me, - PTL_OVERFLOW_LIST, - NULL, - &mca_coll_portals4_component.barrier_unex_me_h); + mca_coll_portals4_component.pt_idx, + &me, + PTL_OVERFLOW_LIST, + NULL, + &mca_coll_portals4_component.unex_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; } @@ -422,14 +527,14 @@ portals4_init_query(bool enable_progress_threads, ret = opal_progress_register(portals4_progress); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, - "%s:%d: opal_progress_register failed: %d\n", - __FILE__, __LINE__, ret); + "%s:%d: opal_progress_register failed: %d\n", + __FILE__, __LINE__, ret); return OMPI_ERROR; + } - return OMPI_SUCCESS; -} +} /* * Invoked when there's a new communicator that has been created. @@ -437,10 +542,11 @@ portals4_init_query(bool enable_progress_threads, * priority we want to return. */ mca_coll_base_module_t * -portals4_comm_query(struct ompi_communicator_t *comm, - int *priority) +portals4_comm_query(struct ompi_communicator_t *comm, + int *priority) { mca_coll_portals4_module_t *portals4_module; + ptl_process_t *proc; /* For now, we don't support intercommunicators and we probably never should handle the single proc case, since there's the @@ -449,16 +555,43 @@ portals4_comm_query(struct ompi_communicator_t *comm, return NULL; } + /* Make sure someone is populating the proc table, since we're not + in a really good position to do so */ + proc = ompi_proc_local()->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]; + if (NULL == proc) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: Proc table not previously populated", + __FILE__, __LINE__); + return NULL; + } + + /* check for logical addressing mode in the MTL */ + if (0 == proc->phys.pid) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: proc->phys.pid==0, so mtl-portals4 is using logical addressing which coll-portals4 doesn't support. 
Disqualifying myself.", + __FILE__, __LINE__); + return NULL; + } + portals4_module = OBJ_NEW(mca_coll_portals4_module_t); if (NULL == portals4_module) return NULL; *priority = mca_coll_portals4_priority; + portals4_module->coll_count = 0; portals4_module->super.coll_module_enable = portals4_module_enable; portals4_module->super.ft_event = NULL; + portals4_module->super.coll_barrier = ompi_coll_portals4_barrier_intra; portals4_module->super.coll_ibarrier = ompi_coll_portals4_ibarrier_intra; - portals4_module->barrier_count = 0; + portals4_module->super.coll_bcast = ompi_coll_portals4_bcast_intra; + portals4_module->super.coll_ibcast = ompi_coll_portals4_ibcast_intra; + + portals4_module->super.coll_allreduce = ompi_coll_portals4_allreduce_intra; + portals4_module->super.coll_iallreduce = ompi_coll_portals4_iallreduce_intra; + + portals4_module->super.coll_reduce = ompi_coll_portals4_reduce_intra; + portals4_module->super.coll_ireduce = ompi_coll_portals4_ireduce_intra; return &(portals4_module->super); } @@ -469,12 +602,54 @@ portals4_comm_query(struct ompi_communicator_t *comm, */ static int portals4_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm) + struct ompi_communicator_t *comm) { + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + + PORTALS4_SAVE_PREV_COLL_API(portals4_module, comm, allreduce); + PORTALS4_SAVE_PREV_COLL_API(portals4_module, comm, iallreduce); + PORTALS4_SAVE_PREV_COLL_API(portals4_module, comm, reduce); + PORTALS4_SAVE_PREV_COLL_API(portals4_module, comm, ireduce); + return OMPI_SUCCESS; } +static char *failtype[] = { + "PTL_NI_OK", + "PTL_NI_PERM_VIOLATION", + "PTL_NI_SEGV", + "PTL_NI_PT_DISABLED", + "PTL_NI_DROPPED", + "PTL_NI_UNDELIVERABLE", + "PTL_FAIL", + "PTL_ARG_INVALID", + "PTL_IN_USE", + "PTL_ME_NO_MATCH", + "PTL_NI_TARGET_INVALID", + "PTL_NI_OP_VIOLATION" +}; + +static char *evname[] = { + "PTL_EVENT_GET", + "PTL_EVENT_GET_OVERFLOW", + "PTL_EVENT_PUT", + "PTL_EVENT_PUT_OVERFLOW", + "PTL_EVENT_ATOMIC", + "PTL_EVENT_ATOMIC_OVERFLOW", + "PTL_EVENT_FETCH_ATOMIC", + "PTL_EVENT_FETCH_ATOMIC_OVERFLOW", + "PTL_EVENT_REPLY", + "PTL_EVENT_SEND", + "PTL_EVENT_ACK", + "PTL_EVENT_PT_DISABLED", + "PTL_EVENT_AUTO_UNLINK", + "PTL_EVENT_AUTO_FREE", + "PTL_EVENT_SEARCH", + "PTL_EVENT_LINK" +}; + +/* Target EQ */ static int portals4_progress(void) { @@ -483,42 +658,70 @@ portals4_progress(void) ompi_coll_portals4_request_t *ptl_request; while (true) { - ret = PtlEQGet(mca_coll_portals4_component.eq_h, &ev); + ret = PtlEQGet(mca_coll_portals4_component.eq_h, &ev); if (PTL_OK == ret) { - OPAL_OUTPUT_VERBOSE((60, ompi_coll_base_framework.framework_output, - "Found event of type %d\n", ev.type)); + + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "event type=%s\n", evname[ev.type])); count++; - if (PTL_OK == ev.ni_fail_type) { - assert(0 != ev.hdr_data); - ptl_request = (ompi_coll_portals4_request_t*) ev.hdr_data; - assert(NULL != ptl_request); - switch (ptl_request->type) { - case OMPI_COLL_PORTALS4_TYPE_BARRIER: - ompi_coll_portals4_ibarrier_intra_fini(ptl_request); - break; + + switch (ev.type) { + case PTL_EVENT_PUT: + /* Non-Blocking / request */ + if (PTL_OK == ev.ni_fail_type) { + OPAL_OUTPUT_VERBOSE((50, ompi_coll_base_framework.framework_output, + "hdr_data %p, matchbits 0x%lx", + (void*) ev.hdr_data, ev.match_bits)); + assert(0 != ev.hdr_data); + ptl_request = (ompi_coll_portals4_request_t*) ev.hdr_data; + assert(NULL != ptl_request); + + switch (ptl_request->type) { + case 
OMPI_COLL_PORTALS4_TYPE_BARRIER: + ompi_coll_portals4_ibarrier_intra_fini(ptl_request); + break; + case OMPI_COLL_PORTALS4_TYPE_BCAST: + ompi_coll_portals4_ibcast_intra_fini(ptl_request); + break; + case OMPI_COLL_PORTALS4_TYPE_REDUCE: + ompi_coll_portals4_ireduce_intra_fini(ptl_request); + break; + case OMPI_COLL_PORTALS4_TYPE_ALLREDUCE: + ompi_coll_portals4_iallreduce_intra_fini(ptl_request); + break; + case OMPI_COLL_PORTALS4_TYPE_SCATTER: + case OMPI_COLL_PORTALS4_TYPE_GATHER: + opal_output(ompi_coll_base_framework.framework_output, + "allreduce is not supported yet\n"); + break; + } } - } else { + + if (PTL_OK != ev.ni_fail_type) { + OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output, "ni_fail_type=%s\n", failtype[ev.ni_fail_type])); + } + break; + default: opal_output(ompi_coll_base_framework.framework_output, - "Error reported in event: %d\n", ev.ni_fail_type); - abort(); + "Unexpected event of type %d", ev.type); + break; } - } else if (PTL_EQ_EMPTY == ret) { + + } + else if (PTL_EQ_EMPTY == ret) { break; - } else if (PTL_EQ_DROPPED == ret) { - opal_output(ompi_coll_base_framework.framework_output, - "Flow control situation without recovery (EQ_DROPPED)\n"); + } + else if (PTL_EQ_DROPPED == ret) { + opal_output(ompi_coll_base_framework.framework_output, "Flow control situation without recovery (EQ_DROPPED)\n"); abort(); - } else { - opal_output(ompi_coll_base_framework.framework_output, - "Error returned from PtlEQGet: %d", ret); + } + else { + opal_output(ompi_coll_base_framework.framework_output, "Error returned from PtlEQGet: %d", ret); break; } } - return count; } - OBJ_CLASS_INSTANCE(mca_coll_portals4_module_t, - mca_coll_base_module_t, - NULL, NULL); + mca_coll_base_module_t, + NULL, NULL); diff --git a/ompi/mca/coll/portals4/coll_portals4_reduce.c b/ompi/mca/coll/portals4/coll_portals4_reduce.c new file mode 100644 index 0000000000..3ab0bf727e --- /dev/null +++ b/ompi/mca/coll/portals4/coll_portals4_reduce.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2015 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_portals4.h" +#include "coll_portals4_request.h" + +#include + +#include "mpi.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/op/op.h" +#include "opal/util/bit_ops.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + +#define COLL_PORTALS4_REDUCE_MAX_CHILDREN 2 + + +static int +reduce_kary_tree_top(void *sendbuf, void *recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + int root, + struct ompi_communicator_t *comm, + ompi_coll_portals4_request_t *request, + mca_coll_portals4_module_t *module) +{ + bool is_sync = request->is_sync; + int ret; + unsigned int i; + int size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + ptl_rank_t parent, child[COLL_PORTALS4_REDUCE_MAX_CHILDREN]; + size_t internal_count, length; + ptl_handle_md_t zero_md_h, data_md_h; + ptl_handle_me_t me_h; + ptl_me_t me; + ptl_match_bits_t match_bits_ack, match_bits_rtr, match_bits; + ptl_ct_event_t ct; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dtype; + + + request->type = OMPI_COLL_PORTALS4_TYPE_REDUCE; + + /* + ** Initialization + */ + + for (i = 0 ; i < COLL_PORTALS4_REDUCE_MAX_CHILDREN ; i++) { + child[i] = PTL_INVALID_RANK; + } + + parent = PTL_INVALID_RANK; + + zero_md_h = mca_coll_portals4_component.zero_md_h; + data_md_h = mca_coll_portals4_component.data_md_h; + + internal_count = opal_atomic_add_size_t(&module->coll_count, 1); + + /* + ** DATATYPE and SIZES + */ + ret = ompi_datatype_type_size(dtype, &length); + length *= count; + + request->u.reduce.is_optim = is_reduce_optimizable(dtype, length, op, &ptl_dtype, &ptl_op); + + if (request->u.reduce.is_optim) { + + /* + * TOPOLOGY + */ + + /* this function is dependent on the number of segments, + * if we use segmentation pipe-line is preferred, and + * binary tree otherwise */ + + get_k_ary_tree(COLL_PORTALS4_REDUCE_MAX_CHILDREN, + rank, size, root, &parent, child, &request->u.reduce.child_nb); + + /* + * PORTALS4 RESOURCE ALLOCATION + */ + + /* Compute match bits */ + COLL_PORTALS4_SET_BITS(match_bits_ack, ompi_comm_get_cid(comm), 1, 0, + COLL_PORTALS4_REDUCE, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits_rtr, ompi_comm_get_cid(comm), 0, 1, + COLL_PORTALS4_REDUCE, 0, internal_count); + + COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0, + COLL_PORTALS4_REDUCE, 0, internal_count); + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.reduce.trig_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* warning : all the operations will be executed on the recvbuf */ + if (rank != root) { + request->u.reduce.free_buffer = malloc(length); + if (NULL == request->u.reduce.free_buffer) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + recvbuf = (void*)request->u.reduce.free_buffer; + + memcpy(recvbuf, sendbuf, length); + } + else { + request->u.reduce.free_buffer = NULL; + if (sendbuf != MPI_IN_PLACE) { + memcpy(recvbuf, sendbuf, length); + } + } + + if (request->u.reduce.child_nb) { + + /* + ** Prepare Data ME + */ + memset(&me, 0, sizeof(ptl_me_t)); + me.start = recvbuf; + me.length = length; + me.ct_handle = request->u.reduce.trig_ct_h; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + 
PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits; + me.ignore_bits = 0; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, NULL, + &request->u.reduce.data_me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + } + + if (rank != root) { + request->u.reduce.use_ack_ct_h = true; + + if ((ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.reduce.ack_ct_h)) != 0) { + return opal_stderr("PtlCTAlloc failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for data ACK Put + ** Priority List + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_ack; + me.ignore_bits = 0; + me.ct_handle = request->u.reduce.ack_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + /* + ** Prepare ME for sending RTR Put + ** Priority List, match also with "Overflow list Me" in coll_portals4_component + */ + + memset(&me, 0, sizeof(ptl_me_t)); + me.start = NULL; + me.length = 0; + me.min_free = 0; + me.uid = mca_coll_portals4_component.uid; + me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | + PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | + PTL_ME_USE_ONCE | + PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = match_bits_rtr; + me.ignore_bits = 0; + me.ct_handle = request->u.reduce.trig_ct_h; + + if ((ret = PtlMEAppend(mca_coll_portals4_component.ni_h, + mca_coll_portals4_component.pt_idx, + &me, PTL_PRIORITY_LIST, + NULL, + &me_h)) != 0) { + return opal_stderr("PtlMEAppend failed", __FILE__, __LINE__, ret); + } + + /* Send Atomic operation to the parent */ + if ((ret = PtlTriggeredAtomic(data_md_h, + (uint64_t)recvbuf, + length, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, parent), + mca_coll_portals4_component.pt_idx, + match_bits, 0, NULL, 0, + ptl_op, ptl_dtype, request->u.reduce.trig_ct_h, + request->u.reduce.child_nb + 1)) != 0) { + return opal_stderr("PtlTriggeredAtomic failed", __FILE__, __LINE__, ret); + } + } + else { + request->u.reduce.use_ack_ct_h = false; + } + + if (request->u.reduce.child_nb) { + for (i = 0 ; i < COLL_PORTALS4_REDUCE_MAX_CHILDREN ; i++) { + if (child[i] != PTL_INVALID_RANK) { + /* + * Prepare Triggered Put to ACK Data to children + * + */ + + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child[i]), + mca_coll_portals4_component.pt_idx, + match_bits_ack, 0, NULL, 0, + request->u.reduce.trig_ct_h, + (rank != root) ? 
+ request->u.reduce.child_nb + 1 : + request->u.reduce.child_nb)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + + /* + * Send RTR to children + * + */ + + /* and there, we only send the RTR when all the MEs are ready */ + if ((ret = PtlPut(zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, child[i]), + mca_coll_portals4_component.pt_idx, + match_bits_rtr, 0, NULL, 0)) != PTL_OK) { + return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret); + } + } + } + } + + if (rank != root) { + if (is_sync) { + if ((ret = PtlCTWait(request->u.reduce.ack_ct_h, 1, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.reduce.ack_ct_h, + 1)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + else { + if (is_sync) { + if ((ret = PtlCTWait(request->u.reduce.trig_ct_h, + request->u.reduce.child_nb, &ct)) != 0) { + opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret); + } + } + else { + if ((ret = PtlTriggeredPut (zero_md_h, 0, 0, PTL_NO_ACK_REQ, + ompi_coll_portals4_get_peer(comm, rank), + mca_coll_portals4_component.finish_pt_idx, + 0, 0, NULL, (uintptr_t) request, + request->u.reduce.trig_ct_h, + request->u.reduce.child_nb)) != 0) { + return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret); + } + } + } + } + else { + opal_output_verbose(100, ompi_coll_base_framework.framework_output, + "rank %d - optimization not supported, falling back to previous handler\n", rank); + + if (request->is_sync) { + if ((module->previous_reduce) && (module->previous_reduce_module)) { + ret = module->previous_reduce(sendbuf, recvbuf, count, dtype, op, root, + comm, module->previous_reduce_module); + } + else { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "rank %d - no previous reduce handler is available, aborting\n", rank); + return (OMPI_ERROR); + } + } + else { + if ((module->previous_ireduce) && (module->previous_ireduce_module)) { + ret = module->previous_ireduce(sendbuf, recvbuf, count, dtype, op, root, + comm, request->fallback_request, module->previous_ireduce_module); + } + else { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "rank %d - no previous ireduce handler is available, aborting\n", rank); + return (OMPI_ERROR); + } + } + return ret; + } + return (OMPI_SUCCESS); +} + + + + +static int +reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request) +{ + if (request->u.reduce.is_optim) { + PtlAtomicSync(); + + if (request->u.reduce.use_ack_ct_h) { + PtlCTFree(request->u.reduce.ack_ct_h); + } + + if (request->u.reduce.child_nb) { + PtlMEUnlink(request->u.reduce.data_me_h); + } + + PtlCTFree(request->u.reduce.trig_ct_h); + + if (request->u.reduce.free_buffer) { + free(request->u.reduce.free_buffer); + } + } + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_reduce_intra(void *sendbuf, void *recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: 
request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + request->is_sync = true; + request->fallback_request = NULL; + + reduce_kary_tree_top(sendbuf, recvbuf, count, + dtype, op, root, comm, request, portals4_module); + reduce_kary_tree_bottom(request); + + OMPI_COLL_PORTALS4_REQUEST_RETURN(request); + return (OMPI_SUCCESS); +} + + +int +ompi_coll_portals4_ireduce_intra(void* sendbuf, void* recvbuf, int count, + MPI_Datatype dtype, MPI_Op op, + int root, + struct ompi_communicator_t *comm, + ompi_request_t ** ompi_request, + struct mca_coll_base_module_2_1_0_t *module) +{ + mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; + ompi_coll_portals4_request_t *request; + + OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, request); + if (NULL == request) { + opal_output_verbose(1, ompi_coll_base_framework.framework_output, + "%s:%d: request alloc failed\n", + __FILE__, __LINE__); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + *ompi_request = &request->super; + request->fallback_request = ompi_request; + request->is_sync = false; + + + reduce_kary_tree_top(sendbuf, recvbuf, count, + dtype, op, root, comm, request, portals4_module); + + if (!request->u.reduce.is_optim) { + OMPI_COLL_PORTALS4_REQUEST_RETURN(request); + } + + puts("ireduce"); + return (OMPI_SUCCESS); +} + +int +ompi_coll_portals4_ireduce_intra_fini(ompi_coll_portals4_request_t *request) +{ + reduce_kary_tree_bottom(request); + + OPAL_THREAD_LOCK(&ompi_request_lock); + ompi_request_complete(&request->super, true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + return (OMPI_SUCCESS); +} diff --git a/ompi/mca/coll/portals4/coll_portals4_request.c b/ompi/mca/coll/portals4/coll_portals4_request.c index 4dc74fbabe..001594f5d5 100644 --- a/ompi/mca/coll/portals4/coll_portals4_request.c +++ b/ompi/mca/coll/portals4/coll_portals4_request.c @@ -1,9 +1,10 @@ /* * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -23,7 +24,7 @@ request_cancel(struct ompi_request_t *request, int complete) static int request_free(struct ompi_request_t **ompi_req) { - ompi_coll_portals4_request_t *request = + ompi_coll_portals4_request_t *request = (ompi_coll_portals4_request_t*) *ompi_req; if (true != request->super.req_complete) { @@ -41,12 +42,10 @@ static void request_construct(ompi_coll_portals4_request_t *request) { - request->super.req_type = OMPI_REQUEST_WIN; + request->super.req_type = OMPI_REQUEST_COLL; request->super.req_status._cancelled = 0; request->super.req_free = request_free; request->super.req_cancel = request_cancel; - request->ct_h = PTL_INVALID_HANDLE; - request->me_h = PTL_INVALID_HANDLE; } OBJ_CLASS_INSTANCE(ompi_coll_portals4_request_t, diff --git a/ompi/mca/coll/portals4/coll_portals4_request.h b/ompi/mca/coll/portals4/coll_portals4_request.h index e9b887d3dc..175835381e 100644 --- a/ompi/mca/coll/portals4/coll_portals4_request.h +++ b/ompi/mca/coll/portals4/coll_portals4_request.h @@ -1,12 +1,12 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2013-2015 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2015 Bull SAS. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -14,38 +14,97 @@ #define COLL_PORTALS4_REQUEST_H #include "ompi/request/request.h" +#include "coll_portals4.h" + + +enum ompi_coll_portals4_bcast_algo_t { + OMPI_COLL_PORTALS4_BCAST_KARY_TREE_ALGO, + OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO, +}; +typedef enum ompi_coll_portals4_bcast_algo_t ompi_coll_portals4_bcast_algo_t; + enum ompi_coll_portals4_request_type_t { OMPI_COLL_PORTALS4_TYPE_BARRIER, + OMPI_COLL_PORTALS4_TYPE_BCAST, + OMPI_COLL_PORTALS4_TYPE_SCATTER, + OMPI_COLL_PORTALS4_TYPE_GATHER, + OMPI_COLL_PORTALS4_TYPE_REDUCE, + OMPI_COLL_PORTALS4_TYPE_ALLREDUCE, }; typedef enum ompi_coll_portals4_request_type_t ompi_coll_portals4_request_type_t; + struct ompi_coll_portals4_request_t { ompi_request_t super; ompi_coll_portals4_request_type_t type; - ptl_handle_ct_t ct_h; - ptl_handle_me_t me_h; + bool is_sync; + + ompi_request_t **fallback_request; + union { + struct { + ptl_handle_me_t data_me_h; + ptl_handle_ct_t rtr_ct_h; + } barrier; + + struct { + bool needs_pack; + bool is_root; + opal_convertor_t convertor; + void *tmpbuf; + size_t tmpsize; + + union { + ptl_rank_t child; + unsigned int child_nb; + } u; + ompi_coll_portals4_bcast_algo_t algo; + int segment_nb; + ptl_handle_ct_t rtr_ct_h; + ptl_handle_ct_t trig_ct_h; + ptl_handle_ct_t ack_ct_h; + } bcast; + + struct { + bool is_optim; + bool use_ack_ct_h; + unsigned int child_nb; + void *free_buffer; + ptl_handle_me_t data_me_h; + ptl_handle_ct_t trig_ct_h; + ptl_handle_ct_t ack_ct_h; + } reduce; + + struct { + bool is_optim; + unsigned int child_nb; + ptl_handle_me_t data_me_h; + ptl_handle_ct_t trig_ct_h; + ptl_handle_ct_t ack_ct_h; + } allreduce; + + } u; }; typedef struct ompi_coll_portals4_request_t ompi_coll_portals4_request_t; OBJ_CLASS_DECLARATION(ompi_coll_portals4_request_t); -#define OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, req) \ - do { \ - opal_free_list_item_t *item; \ - item = opal_free_list_get (&mca_coll_portals4_component.requests); \ - req = (ompi_coll_portals4_request_t*) item; \ - OMPI_REQUEST_INIT(&req->super, false); \ - req->super.req_mpi_object.comm = comm; \ - req->super.req_complete = false; \ - req->super.req_state = OMPI_REQUEST_ACTIVE; \ +#define OMPI_COLL_PORTALS4_REQUEST_ALLOC(comm, req) \ + do { \ + opal_free_list_item_t *item; \ + item = opal_free_list_get(&mca_coll_portals4_component.requests); \ + req = (ompi_coll_portals4_request_t*) item; \ + OMPI_REQUEST_INIT(&req->super, false); \ + req->super.req_mpi_object.comm = comm; \ + req->super.req_complete = false; \ + req->super.req_state = OMPI_REQUEST_ACTIVE; \ } while (0) -#define OMPI_COLL_PORTALS4_REQUEST_RETURN(req) \ - do { \ - OMPI_REQUEST_FINI(&request->super); \ - opal_free_list_return (&mca_coll_portals4_component.requests, \ - (opal_free_list_item_t*) req); \ +#define OMPI_COLL_PORTALS4_REQUEST_RETURN(req) \ + do { \ + OMPI_REQUEST_FINI(&request->super); \ + opal_free_list_return(&mca_coll_portals4_component.requests, \ + (opal_free_list_item_t*) req); \ } while (0)
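
For reference, here is a minimal standalone sketch (not part of the patch) of the segmentation arithmetic that bcast_kary_tree_top() and bcast_pipeline_top() apply to the staging buffer: every segment but the last gets ceil(tmpsize / segment_nb) bytes and the final segment absorbs the remainder. The variable names tmpsize and segment_nb mirror request->u.bcast.tmpsize and request->u.bcast.segment_nb; the sample values and the main() driver are invented purely for illustration.

/* Illustrative sketch, not part of the patch: how the pipelined bcast
 * divides a tmpsize-byte buffer into segment_nb segments. */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t tmpsize = 10000;   /* assumed total payload size in bytes */
    int segment_nb = 3;       /* assumed number of pipeline segments */
    size_t offset = 0, length = 0;

    for (int seg = 1; seg <= segment_nb; seg++, offset += length) {
        /* same formula as in bcast_kary_tree_top() / bcast_pipeline_top() */
        length = (seg < segment_nb) ?
            (tmpsize + segment_nb - 1) / segment_nb :
            tmpsize - ((tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1);
        printf("segment %d: offset=%zu length=%zu\n", seg, offset, length);
    }
    /* with tmpsize=10000 and segment_nb=3 this prints 3334 + 3334 + 3332 = 10000 */
    return 0;
}

Because the first segment_nb - 1 segments are rounded up, the last segment is never larger than the others; the per-segment length also drives the triggered-put thresholds (segment_nb puts received before forwarding, plus one per already-forwarded segment on interior ranks).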