/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/bcol/bcol.h"
#include "bcol_ptpcoll_bcast.h"
#include "bcol_ptpcoll_utils.h"

/*
 * Post one non-blocking send of (data_buffer, count bytes, tag) to every
 * peer produced by the k-nomial step iterator `step_info`.
 *
 * - `*num_pending_sends` is RESET to 0 first and then incremented once per
 *   posted send; request i is stored in send_requests[i].
 * - `group_list` maps a group index to the rank used for the send.
 * - On an isend failure the macro executes `return OMPI_ERROR;`, i.e. it
 *   returns from the ENCLOSING function.
 * - Posted sends are not tested for completion here.
 */
#define K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, \
        my_group_index, group_list, \
        data_buffer, count, tag, comm, send_requests, num_pending_sends) \
do { \
    int rc = OMPI_SUCCESS; \
    int dst; \
    int comm_dst; \
    *num_pending_sends = 0; \
    \
    while(MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER_CHECK_LEVEL(step_info)) { \
        /* For each level of tree, do sends */ \
        MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER(my_group_index, \
                radix, step_info, dst); \
        comm_dst = group_list[dst]; \
        \
        /* Non blocking send .... */ \
        PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \
                    dst, comm_dst, count, tag, \
                    data_buffer)); \
        rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, \
                    comm_dst, tag, \
                    MCA_PML_BASE_SEND_STANDARD, comm, \
                    &(send_requests[*num_pending_sends]))); \
        PTPCOLL_VERBOSE(10, ("send request addr is %p", send_requests[*num_pending_sends])); \
        if( OMPI_SUCCESS != rc ) { \
            PTPCOLL_VERBOSE(10, ("Failed to isend data")); \
            return OMPI_ERROR; \
        } \
        ++(*num_pending_sends); \
    } \
} while(0)

/*
 * Post one non-blocking send of (data_buffer, count bytes, tag) to every
 * child listed in narray_node->children_ranks, after shifting the child
 * rank by `process_shift` and wrapping it at `group_size`.
 *
 * - Unlike K_NOMIAL_ROOT_BCAST_NB_NOTEST this macro does NOT reset
 *   `*num_pending_sends`; it only appends to it.
 * - NOTE: `group_list` is not a macro parameter; it must be a variable
 *   visible in the scope where the macro is expanded.
 * - On an isend failure the macro executes `return OMPI_ERROR;` in the
 *   enclosing function.
 */
#define NARRAY_BCAST_NB(narray_node, process_shift, group_size, \
        data_buffer, count, tag, comm, send_requests, \
        num_pending_sends) \
do { \
    int n, rc = OMPI_SUCCESS; \
    int dst; \
    int comm_dst; \
    \
    /* Send out data to all relevant children */ \
    for (n = 0; n < narray_node->n_children; n++) { \
        \
        dst = narray_node->children_ranks[n] + process_shift; \
        if (dst >= group_size) { \
            dst -= group_size; \
        } \
        comm_dst = group_list[dst]; \
        \
        /* Non blocking send .... */ \
        PTPCOLL_VERBOSE(9 , ("Bcast, Isend data to %d[%d], count %d, tag %d, addr %p", \
                    dst, comm_dst, count, tag, \
                    data_buffer)); \
        rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, \
                    comm_dst, tag, \
                    MCA_PML_BASE_SEND_STANDARD, comm, \
                    &(send_requests[*num_pending_sends]))); \
        if( OMPI_SUCCESS != rc ) { \
            PTPCOLL_VERBOSE(10, ("Failed to isend data")); \
            return OMPI_ERROR; \
        } \
        ++(*num_pending_sends); \
    } \
} while(0)

/*
 * Progress function for bcol_ptpcoll_bcast_k_nomial_anyroot().
 *
 * Tests the requests recorded for input_args->buffer_index.
 *
 * Returns BCOL_FN_COMPLETE when the test reports completion,
 * BCOL_FN_STARTED when it does not, and OMPI_ERROR when the test reports a
 * non-success return code.
 */
int bcol_ptpcoll_bcast_k_nomial_anyroot_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    int completed = 0;
    int rc;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    uint32_t buffer_index = input_args->buffer_index;
    ompi_request_t **send_requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);

    completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc);
    if (OMPI_SUCCESS != rc) {
        return OMPI_ERROR;
    }

    /* DONE */
    if(completed) {
        PTPCOLL_VERBOSE(10, ("bcast root is done"));
        return BCOL_FN_COMPLETE;
    } else {
        PTPCOLL_VERBOSE(10, ("bcast root is started"));
        return BCOL_FN_STARTED;
    }
}

/*
 * K-nomial tree ( with any root ) algorithm
 *
 * Root (input_args->root_flag set): posts sends to its k-nomial peers and,
 * on a proxy node, to its extra peers (with tag - 1).
 *
 * Non-root: probes the k-nomial peers (and, on a proxy node, the extra
 * peers) up to cm->num_to_probe times.  Once a message is found it is
 * received with a blocking recv and forwarded like a root would do,
 * starting from the tree level at which it was found.
 *
 * Returns:
 *   BCOL_FN_NOT_STARTED - non-root, no incoming message was found
 *   BCOL_FN_STARTED     - sends were posted but are not all complete
 *   BCOL_FN_COMPLETE    - nothing is left outstanding
 *   OMPI_ERROR          - a send, receive or request test failed
 */
int bcol_ptpcoll_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    int tag;
    int rc;
    int matched = 0;    /* not matched */
    int comm_root = 0;  /* no root */
    int i;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int radix = ptpcoll_module->k_nomial_radix;
    int root_radix_mask = ptpcoll_module->pow_knum;
    int peer = -1;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    int extra_root = -1;  /* GROUP index of the extra peer we received from */
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_status_public_t status;
    ompi_request_t **send_requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    netpatterns_knomial_step_info_t step_info = {0, 0, 0};

    PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    /* keep tag within the limit support by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    /* reset requests */
    *active_requests = 0;

    PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_k: %d %d "
                "buff: %p "
                "radix: %d",
                buffer_index, tag,
                ptpcoll_module->tag_mask, sequence_number,
                input_args->root_flag,
                ptpcoll_module->pow_k, ptpcoll_module->pow_knum,
                data_buffer, radix));

    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm root of the data"));
        /*
         * I'm root of the operation
         * send data to (k - 1) * log base k N neighbors
         */
        MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
                ptpcoll_module->pow_knum, my_group_index);
        K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, my_group_index,
                group_list, data_buffer, count, tag, comm, send_requests,
                active_requests);
        goto ANY_ROOT_KNOMIAL_EXTRA;
    }

    /*
     * I'm not root, and I don't know to calculate root, so just
     * wait for data from ANY_SOURCE, once you get it, proceed like a root
     */
    for (i = 0; i < cm->num_to_probe; i++) {
        MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
                ptpcoll_module->pow_knum, my_group_index);
        while(MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER_CHECK_LEVEL(step_info)) {
            MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_PEER(my_group_index, radix, step_info, peer);
            PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d rank %d",
                        tag, group_list[peer]));
            /* the iprobe return code is intentionally not checked here */
            MCA_PML_CALL(iprobe(group_list[peer], tag,
                        comm, &matched, &status));
            if (matched) {
                MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_UPDATE_LEVEL_FOR_BCAST(step_info, radix);
                break;
            }
        }

        /* A tree peer has the data: stop probing.  Falling through to the
         * extra-peer probes below would overwrite `matched`. */
        if (matched) {
            break;
        }

        /* Proxy node: the data may also come from one of the extra peers */
        if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) {
            int e;  /* own counter: must not disturb the outer probe counter */
            for (e = 0; e < ptpcoll_module->kn_proxy_extra_num; e++) {
                int extra_index = ptpcoll_module->kn_proxy_extra_index[e];
                PTPCOLL_VERBOSE(10, ("Bcast, iprobe tag %d rank %d",
                            tag, group_list[extra_index]));
                MCA_PML_CALL(iprobe(group_list[extra_index], tag,
                            comm, &matched, &status));
                if (matched) {
                    step_info.k_level = root_radix_mask;
                    /* keep the group index: it is compared against
                     * kn_proxy_extra_index[] when sending to extras */
                    extra_root = extra_index;
                    /* the receive below needs the rank of the sender */
                    comm_root = group_list[extra_index];
                    goto ANY_ROOT_KNOMIAL_BCAST;
                }
            }
        }
    }

    if (0 == matched) {
        PTPCOLL_VERBOSE(10, ("IPROBE was not matched"));
        /* No data was received, return no match error */
        return BCOL_FN_NOT_STARTED;
    }

    /* set the source of data */
    comm_root = status.MPI_SOURCE;

    PTPCOLL_VERBOSE(10, ("A. step info %d %d %d",
                step_info.k_level, step_info.k_step, step_info.k_tmp_peer));

    /* Bcast the data */
    PTPCOLL_VERBOSE(10, ("Starting data bcast"));

ANY_ROOT_KNOMIAL_BCAST:
    /* Post receive that will fetch the data */
    PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p",
                comm_root, (extra_root >= 0) ? extra_root : peer,
                count, tag, data_buffer));

    rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE, comm_root, tag, comm, MPI_STATUS_IGNORE));
    if( OMPI_SUCCESS != rc ) {
        PTPCOLL_VERBOSE(10, ("Failed to receive data"));
        return OMPI_ERROR;
    }
    PTPCOLL_VERBOSE(10, ("Bcast, Data was received"));

    /* Sending forward the data over K-nomial tree */
    MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
            step_info.k_level, my_group_index);

    PTPCOLL_VERBOSE(10, ("B. step info %d %d %d",
                step_info.k_level, step_info.k_step, step_info.k_tmp_peer));

    K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, my_group_index,
            group_list, data_buffer, count, tag, comm, send_requests,
            active_requests);

ANY_ROOT_KNOMIAL_EXTRA:
    /* Proxy node but NOT virtual root */
    if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) {
        for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) {
            /* do not send the data back to the extra peer it came from */
            if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root)
                continue;

            PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i]));
            rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                        group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1,
                        MCA_PML_BASE_SEND_STANDARD, comm,
                        &(send_requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }
    }

    if (*active_requests > 0) {
        matched =
            mca_bcol_ptpcoll_test_all_for_match
            (active_requests, send_requests, &rc);
        if (OMPI_SUCCESS != rc) {
            return OMPI_ERROR;
        }
    } else {
        /* nothing was posted, so nothing is left to wait for */
        matched = 1;
    }

    /* If it is last call, we have to recycle memory */
    if(matched) {
        PTPCOLL_VERBOSE(10, ("bcast root is done"));
        return BCOL_FN_COMPLETE;
    } else {
        PTPCOLL_VERBOSE(10, ("bcast root is started"));
        return BCOL_FN_STARTED;
    }
}

/*
 * K-nomial bcast entry point that talks only to the peer stored in
 * kn_proxy_extra_index[0].
 * NOTE(review): presumably this runs on an "extra" node and that peer is
 * its proxy - confirm against the module setup code.
 *
 * Root: posts a single isend of the whole buffer to that peer using `tag`.
 * Non-root: probes that peer for a message with `tag - 1` up to
 * cm->num_to_probe times and, if one is found, receives it (blocking).
 *
 * Returns:
 *   BCOL_FN_NOT_STARTED - non-root and no message was found
 *   BCOL_FN_STARTED     - root, the send is not complete yet
 *   BCOL_FN_COMPLETE    - send completed / data received
 *   OMPI_ERROR or the failing rc - a PML call or request test failed
 */
static int bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int tag;
    int rc;
    int i;
    int completed = 0; /* not completed */
    uint32_t buffer_index = input_args->buffer_index;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *iteration =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    ompi_status_public_t status;

    PTPCOLL_VERBOSE(3, ("Knomial Anyroot, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    /* keep tag within the limit support by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    /* reset active requests */
    *active_requests = 0;
    /* reset iteration counter */
    *iteration = -1;

    PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_k: %d %d "
                "buff: %p "
                ,buffer_index, tag,
                ptpcoll_module->tag_mask, input_args->sequence_num,
                input_args->root_flag,
                ptpcoll_module->pow_k, ptpcoll_module->pow_knum,
                data_buffer
                ));

    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data, v root %d",
                    ptpcoll_module->kn_proxy_extra_index[0]));
        /* send the all data to your proxy peer */
        rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[*active_requests])));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            return OMPI_ERROR;
        }

        ++(*active_requests);

        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            PTPCOLL_VERBOSE(10, ("Extra was started"));
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    } else {
        for (i = 0; i < cm->num_to_probe && 0 == completed; i++) {
            MCA_PML_CALL(iprobe(group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1,
                        comm, &completed, &status));
        }
        if (0 == completed) {
            /* No data was received */
            return BCOL_FN_NOT_STARTED;
        }

        /* the data is ready */
        rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1,
                    comm, MPI_STATUS_IGNORE));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to receive data"));
            return OMPI_ERROR;
        }
    }

    PTPCOLL_VERBOSE(10, ("Extra was done"));
    return BCOL_FN_COMPLETE;
}

/*
 * Progress function for bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot().
 *
 * Root: tests the outstanding requests of this buffer.
 * Non-root: probes kn_proxy_extra_index[0] for a `tag - 1` message again
 * and receives it (blocking) once it is found.
 *
 * Returns BCOL_FN_STARTED while the operation is still outstanding,
 * BCOL_FN_COMPLETE when done, OMPI_ERROR (or the failing rc) on failure.
 */
static int bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    int rc;
    int completed = 0; /* not completed */
    int i;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests;
    uint32_t buffer_index = input_args->buffer_index;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_status_public_t status;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    /* keep tag within the limit support by the pml */
    int tag = -((PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask));
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress extra, was called, tag %d\n", tag));

    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data"));
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    } else {
        for (i = 0; i < cm->num_to_probe && 0 == completed; i++) {
            MCA_PML_CALL(iprobe(group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1,
                        comm, &completed, &status));
        }
        if (0 == completed) {
            return BCOL_FN_STARTED;
        }

        /* the data is ready */
        rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->kn_proxy_extra_index[0]], tag - 1,
                    comm, MPI_STATUS_IGNORE));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to receive data"));
            return OMPI_ERROR;
        }
    }

    /* Done */
    return BCOL_FN_COMPLETE;
}

/*
 * Known root means that we know exactly the source of data and we do not
 * have to check multiple sources.
 *
 * K_NOMIAL_DATA_SRC computes the rank relative to `group_root` (wrapping at
 * `group_size`) and walks radix_mask = 1, radix, radix^2, ... while it is
 * below group_size.  At the first mask for which the relative rank is not a
 * multiple of (radix * radix_mask), `data_src` is set to the relative rank
 * rounded down to such a multiple, shifted back by group_root and wrapped
 * at group_size, and the walk stops.
 *
 * `data_src` and `radix_mask` are OUTPUT lvalues.  When the relative rank
 * is 0 no mask matches: `data_src` is left untouched and `radix_mask` ends
 * up >= group_size.
 */
#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \
do { \
    int relative_rank = (my_group_index >= group_root) ? my_group_index - group_root : \
        my_group_index - group_root + group_size; \
    \
    radix_mask = 1; \
    while (radix_mask < group_size) { \
        if (relative_rank % (radix * radix_mask)) { \
            data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \
            if (data_src >= group_size) data_src -= group_size; \
            break; \
        } \
        radix_mask *= radix; \
    } \
} while (0)

/*
 * Progress function for bcol_ptpcoll_bcast_k_nomial_known_root().
 *
 * Root: tests the outstanding sends.
 * Non-root with no active requests: the receive posted by the start
 * function (requests[0]) is still pending.  Once it matches, the data is
 * forwarded over the k-nomial tree using the radix mask cached in the
 * buffer descriptor and, on a proxy node, to the extra peers.
 * Non-root with active requests: the forwarding sends are tested.
 *
 * Returns BCOL_FN_STARTED while something is outstanding, BCOL_FN_COMPLETE
 * when done, OMPI_ERROR (or the failing rc) on failure.
 */
int bcol_ptpcoll_bcast_k_nomial_known_root_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int tag;
    int rc = OMPI_SUCCESS;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int radix = ptpcoll_module->k_nomial_radix;
    int radix_mask;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    int group_root_index = 0;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **send_requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    /* the receive shares slot 0 of the same request array */
    ompi_request_t **recv_request =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int completed = 0;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);

    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    PTPCOLL_VERBOSE(3, ("BCAST Know root, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_known_root_progress, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_k: %d %d "
                "buff: %p "
                "radix: %d",
                buffer_index, tag,
                ptpcoll_module->tag_mask, sequence_number,
                input_args->root_flag,
                ptpcoll_module->pow_k, ptpcoll_module->pow_knum,
                data_buffer, radix));

    if (input_args->root_flag) {
        /* Check for completion */
        assert(*active_requests > 0);
        PTPCOLL_VERBOSE(10, ("Requests %d", *active_requests));
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc);
        if (OMPI_SUCCESS != rc) {
            return OMPI_ERROR;
        }
    } else {
        /* No data was received. Waiting for data */
        if (0 == (*active_requests)) {
            int extra_root = -1;
            netpatterns_knomial_step_info_t step_info;

            /* We can not block. So run couple of test for data arrival */
            if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) {
                PTPCOLL_VERBOSE(10, ("Test was not matched (active request %d)",
                            *active_requests));
                /* No data was received, return no match error */
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }

            /* level cached by the start function when the receive was pending */
            radix_mask = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask;
            group_root_index = input_args->root_route->rank;

            PTPCOLL_VERBOSE(10, ("Test was matched - radix %d", radix_mask));
            /* Bcast the data */
            MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
                    radix_mask, my_group_index);
            K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, my_group_index,
                    group_list, data_buffer, count, tag, comm, send_requests,
                    active_requests);

            if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) {
                int i;
                /* a full-tree mask marks the case where the start function
                 * received from the root itself; skip that peer below */
                if (radix_mask == ptpcoll_module->pow_knum) {
                    extra_root = group_root_index;
                }
                for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) {
                    if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root)
                        continue;
                    PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i]));
                    rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                                group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &(send_requests[*active_requests])));
                    if( OMPI_SUCCESS != rc ) {
                        PTPCOLL_VERBOSE(10, ("Failed to send data"));
                        return OMPI_ERROR;
                    }
                    ++(*active_requests);
                }
            }

            if (*active_requests > 0) {
                completed = mca_bcol_ptpcoll_test_all_for_match
                    (active_requests, send_requests, &rc);
                if (OMPI_SUCCESS != rc) {
                    return OMPI_ERROR;
                }
            } else {
                /* nothing to forward */
                completed = 1;
            }
        } else {
            /* Data was received and sent out, check for completion */
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, send_requests, &rc);
            if (OMPI_SUCCESS != rc) {
                PTPCOLL_VERBOSE(10, ("Test was not matched (active request %d)",
                            *active_requests));
                return OMPI_ERROR;
            }
        }
    }

    /* DONE */
    if(completed) {
        return BCOL_FN_COMPLETE;
    } else {
        PTPCOLL_VERBOSE(10, ("bcast root is started"));
        return BCOL_FN_STARTED;
    }
}

/*
 * K-nomial tree bcast when the root is known (input_args->root_route->rank).
 *
 * Root: posts the k-nomial sends, then the extra-peer sends on a proxy.
 * Non-root: determines the single peer the data will come from, posts a
 * non-blocking receive into requests[0] and tests it.  If the data is
 * already there it is forwarded immediately; otherwise the tree level is
 * cached in the buffer descriptor for the progress function.
 *
 * When the root index is >= pow_knum, its "virtual root" is computed as
 * (root - pow_knum) / (radix - 1): the virtual root receives directly from
 * the root, every other rank treats the virtual root as the tree root.
 *
 * Returns BCOL_FN_STARTED while something is outstanding, BCOL_FN_COMPLETE
 * when done, OMPI_ERROR (or the failing rc) on failure.
 */
int bcol_ptpcoll_bcast_k_nomial_known_root(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int tag;
    int rc;
    int comm_root;
    int data_src = -1;
    int group_root_index;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    int radix = ptpcoll_module->k_nomial_radix;
    uint32_t buffer_index = input_args->buffer_index;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **send_requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    /* the receive shares slot 0 of the same request array */
    ompi_request_t **recv_request =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0];
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    int matched = 0;
    int k_level, logk_level;
    int extra_root = -1;
    netpatterns_knomial_step_info_t step_info;

    PTPCOLL_VERBOSE(3, ("BCAST Know root, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    /* reset active request counter */
    (*active_requests) = 0;

    /* keep tag within the limit support by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_known_root, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_k: %d %d "
                "buff: %p "
                "radix: %d",
                buffer_index, tag,
                ptpcoll_module->tag_mask, input_args->sequence_num,
                input_args->root_flag,
                ptpcoll_module->pow_k, ptpcoll_module->pow_knum,
                data_buffer, radix));

    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm root of the data"));
        /*
         * I'm root of the operation
         * send data to (k - 1) * log base k N neighbors
         */
        MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
                ptpcoll_module->pow_knum, my_group_index);
        K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, my_group_index,
                group_list, data_buffer, count, tag, comm, send_requests,
                active_requests);
        goto KNOWN_ROOT_KNOMIAL_BCAST_EXTRA;
    }

    /* I'm not root */
    group_root_index = input_args->root_route->rank;

    /* If Proxy node, check if extra node is root */
    PTPCOLL_VERBOSE(10, ("Check if I virtual root, groop root %d group_size_pow %d type %d\n",
                group_root_index, ptpcoll_module->pow_knum , ptpcoll_module->pow_ktype));
    if (group_root_index >= ptpcoll_module->pow_knum) {
        /* Check if the rank is virtual root */
        int virtual_root = (group_root_index - ptpcoll_module->pow_knum) / (radix - 1);

        if (my_group_index == virtual_root) {
            MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
                    ptpcoll_module->pow_knum, my_group_index);
            k_level = ptpcoll_module->pow_knum;
            comm_root = group_list[group_root_index];
            /* remember the root so it is skipped when sending to extras */
            extra_root = group_root_index;
            PTPCOLL_VERBOSE(10, ("Im virtual root klevel %d, comm_root %d vroot %d\n",
                        k_level, comm_root, virtual_root));
            goto KNOWN_ROOT_KNOMIAL_BCAST;
        } else {
            /* set virtual root as real root of the group */
            group_root_index = virtual_root;
            PTPCOLL_VERBOSE(10, ("My virtual root vroot %d\n", group_root_index));
        }
    }

    data_src = netpatterns_get_knomial_data_source(
            my_group_index, group_root_index, radix, ptpcoll_module->pow_knum,
            &k_level, &logk_level);

    comm_root = group_list[data_src];

KNOWN_ROOT_KNOMIAL_BCAST:
    PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p",
                comm_root, data_src,
                count, tag, data_buffer));

    rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, comm_root, tag, comm, recv_request));
    if( OMPI_SUCCESS != rc ) {
        PTPCOLL_VERBOSE(10, ("Failed to receive data"));
        return OMPI_ERROR;
    }

    /* We can not block. So run couple of test for data arrival */
    if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) {
        PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc));
        /* cache the radix mask for future progress */
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = k_level;
        /* No data was received, return no match error */
        return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
    }

    /* Bcast the data */
    MCA_COMMON_NETPATTERNS_GET_NEXT_KNOMIAL_INIT(step_info,
            k_level, my_group_index);

    K_NOMIAL_ROOT_BCAST_NB_NOTEST(step_info, radix, my_group_index,
            group_list, data_buffer, count, tag, comm, send_requests,
            active_requests);

KNOWN_ROOT_KNOMIAL_BCAST_EXTRA:
    /* Proxy node but NOT virtual root */
    if (PTPCOLL_KN_PROXY & ptpcoll_module->pow_ktype) {
        int i;
        for (i = 0 ; i < ptpcoll_module->kn_proxy_extra_num; i++) {
            if (ptpcoll_module->kn_proxy_extra_index[i] == extra_root)
                continue;

            PTPCOLL_VERBOSE(10, ("Extra_Isend to %d", ptpcoll_module->kn_proxy_extra_index[i]));
            rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                        group_list[ptpcoll_module->kn_proxy_extra_index[i]], tag - 1,
                        MCA_PML_BASE_SEND_STANDARD, comm,
                        &(send_requests[*active_requests])));
            if( OMPI_SUCCESS != rc ) {
                PTPCOLL_VERBOSE(10, ("Failed to send data"));
                return OMPI_ERROR;
            }
            ++(*active_requests);
        }
    }

    if (*active_requests > 0) {
        matched =
            mca_bcol_ptpcoll_test_all_for_match
            (active_requests, send_requests, &rc);
        if (OMPI_SUCCESS != rc) {
            return OMPI_ERROR;
        }
    } else {
        matched = 1;
    }

    /* If it is last call, we have to recycle memory */
    if(matched) {
        return BCOL_FN_COMPLETE;
    } else {
        PTPCOLL_VERBOSE(10, ("bcast root is started"));
        return BCOL_FN_STARTED;
    }
}

/*
 * Binomial scatter-gather bcast entry point that talks only to the peer
 * stored in ptpcoll_module->proxy_extra_index.
 * NOTE(review): presumably this runs on the "extra" rank of a
 * non-power-of-2 group and that peer is its proxy - confirm.
 *
 * Root: posts a single isend of the whole buffer to that peer using `tag`.
 * Non-root: probes that peer for a `tag - 1` message up to
 * cm->num_to_probe times and, if one is found, receives it (blocking).
 *
 * Returns BCOL_FN_NOT_STARTED (non-root, nothing found), BCOL_FN_STARTED
 * (root, send pending), BCOL_FN_COMPLETE, or OMPI_ERROR / the failing rc.
 */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int tag;
    int rc;
    int i;
    int completed = 0; /* not completed */
    uint32_t buffer_index = input_args->buffer_index;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *iteration =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    ompi_status_public_t status;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    /* keep tag within the limit support by the pml */
    tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a collective tag, to avoid conflict with user-level flags */
    tag = -tag;

    /* reset active requests */
    *active_requests = 0;
    /* reset iteration counter */
    *iteration = -1;

    PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_k: %d %d "
                "buff: %p "
                "radix: %d" ,
                buffer_index, tag,
                ptpcoll_module->tag_mask, input_args->sequence_num,
                input_args->root_flag,
                ptpcoll_module->pow_k, ptpcoll_module->pow_knum,
                data_buffer, 2
                ));

    /* we have a power 2 group */
    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data"));
        /* send the all data to your proxy peer */
        rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->proxy_extra_index], tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[*active_requests])));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to send data"));
            return OMPI_ERROR;
        }

        ++(*active_requests);

        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    } else {
        for (i = 0; i < cm->num_to_probe && 0 == completed; i++) {
            MCA_PML_CALL(iprobe(group_list[ptpcoll_module->proxy_extra_index], tag - 1,
                        comm, &completed, &status));
        }
        if (0 == completed) {
            /* No data was received */
            return BCOL_FN_NOT_STARTED;
        }

        /* the data is ready */
        rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->proxy_extra_index], tag - 1,
                    comm, MPI_STATUS_IGNORE));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to receive data"));
            return OMPI_ERROR;
        }
    }

    return BCOL_FN_COMPLETE;
}

/*
 * Progress function for
 * bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra().
 *
 * Root: tests the outstanding requests of this buffer.
 * Non-root: probes proxy_extra_index for a `tag - 1` message again and
 * receives it (blocking) once it is found.
 *
 * Returns BCOL_FN_STARTED while outstanding, BCOL_FN_COMPLETE when done,
 * OMPI_ERROR (or the failing rc) on failure.
 */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    int rc;
    int completed = 0; /* not completed */
    int i;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests;
    uint32_t buffer_index = input_args->buffer_index;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    ompi_status_public_t status;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    /* keep tag within the limit support by the pml */
    int tag = -((PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask));
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress extra, was called, tag %d\n", tag));

    if (input_args->root_flag) {
        PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data"));
        completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
        if (0 == completed) {
            return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
        }
    } else {
        for (i = 0; i < cm->num_to_probe && 0 == completed; i++) {
            MCA_PML_CALL(iprobe(group_list[ptpcoll_module->proxy_extra_index], tag - 1,
                        comm, &completed, &status));
        }
        if (0 == completed) {
            return BCOL_FN_STARTED;
        }

        /* the data is ready */
        rc = MCA_PML_CALL(recv(data_buffer, count, MPI_BYTE,
                    group_list[ptpcoll_module->proxy_extra_index], tag - 1,
                    comm, MPI_STATUS_IGNORE));
        if( OMPI_SUCCESS != rc ) {
            PTPCOLL_VERBOSE(10, ("Failed to receive data"));
            return OMPI_ERROR;
        }
    }

    /* Done */
    return BCOL_FN_COMPLETE;
}

/*
 * Progress function for bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot().
 *
 * Driven by the status stored in the buffer descriptor:
 *   PTPCOLL_GATHER_STARTED     - wait for the outstanding requests, bump
 *                                the iteration counter and continue the
 *                                gather phase
 *   PTPCOLL_EXTRA_SEND_STARTED - wait for the final send to the extra
 *                                peer; complete when it is done
 *   anything else              - OMPI_ERROR
 *
 * After the gather phase completes, a proxy that is neither root nor
 * virtual root sends the full message to its extra peer with `tag - 1`.
 * The tag is read back from the buffer descriptor, not recomputed.
 */
int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int rc;
    int completed = 0; /* not completed */
    uint32_t buffer_index = input_args->buffer_index;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *iteration =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    /* ceil(count / pow_2num) */
    size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) /
        ptpcoll_module->pow_2num;
    int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress, buffer index: %d \n"
                "tag: %d "
                "tag_mask: %d "
                "sn: %d "
                "root: %d "
                "pow_2: %d %d "
                "buff: %p "
                "radix: %d"
                "block_size: %d",
                buffer_index, tag,
                ptpcoll_module->tag_mask, 0,
                input_args->root_flag,
                ptpcoll_module->pow_2, ptpcoll_module->pow_2num,
                data_buffer, 2, base_block_size));

    switch(*status) {
        case PTPCOLL_GATHER_STARTED:
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc));
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
            ++(*iteration); /* start from next iteration */
            PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration));
            break;
        case PTPCOLL_EXTRA_SEND_STARTED:
            completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc);
            if (0 == completed) {
                PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc));
                return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED;
            }
            return BCOL_FN_COMPLETE;
        default:
            PTPCOLL_VERBOSE(10, ("Unknown status %d", *status));
            return OMPI_ERROR;
    }

    PTPCOLL_VERBOSE(10, ("Stating PR_GATHER"));
    /* Gather, continue the recursive doubling iterations */
    rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer,
            count, base_block_size);
    if (BCOL_FN_COMPLETE != rc) {
        assert(0 != *active_requests);
        PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc));
        return rc;
    }

    PTPCOLL_VERBOSE(10, ("PR_GATHER done"));

    /* if the process is proxy, it has to send full message to remote peer */
    if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) &&
            ! CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) {
        *status = PTPCOLL_EXTRA_SEND_STARTED;
        rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(
                ptpcoll_module,
                data_buffer, count, tag - 1,
                ptpcoll_module->proxy_extra_index,
                comm, active_requests,
                requests);
        if (BCOL_FN_COMPLETE != rc) {
            return rc;
        }
    }

    /* return */
    return BCOL_FN_COMPLETE;
}

int bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot(bcol_function_args_t *input_args,
        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
        (mca_bcol_ptpcoll_module_t *)const_args->bcol_module;
    int tag;
    int rc;
    int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index;
    int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list;
    uint64_t sequence_number = input_args->sequence_num;
    uint32_t buffer_index = input_args->buffer_index;
    ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm;
    ompi_request_t **requests =
        ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests;
    void *data_buffer = (void *) (
            (unsigned char *)input_args->sbuf +
            (size_t)input_args->sbuf_offset);
    int count = input_args->count * input_args->dtype->super.size;
    int *iteration =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration);
    int *radix_mask_pow =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow);
    int *active_requests =
        &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests);
    size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) /
        ptpcoll_module->pow_2num;
    int root_pow2 = ptpcoll_module->pow_2 - 1;
    int *status =
        &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status;

    PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d",
                const_args->index_of_this_type_in_collective + 1,
                const_args->n_of_this_type_in_collective));

    /* keep tag within the limit support by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    /* mark this as a
collective tag, to avoid conflict with user-level flags */ ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; /* reset active requests */ *active_requests = 0; /* reset iteration counter */ *iteration = -1; /* set initial status */ *status = PTPCOLL_NOT_STARTED; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot, buffer index: %d \n" "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "pow_2: %d %d " "buff: %p " "radix: %d" "block_size: %d", buffer_index, tag, ptpcoll_module->tag_mask, sequence_number, input_args->root_flag, ptpcoll_module->pow_2, ptpcoll_module->pow_2num, data_buffer, 2, base_block_size)); /* we have a power 2 group */ if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm root of the data")); /* for proxy we have little bit more work to do */ if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) { /* send the all data to your extra peer */ rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->proxy_extra_index], tag - 1, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[*active_requests]))); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to send data")); return OMPI_ERROR; } ++(*active_requests); } /* * I'm root of the operation * send data to (k - 1) * log base k N neighbors */ *radix_mask_pow = ptpcoll_module->pow_2; K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(root_pow2, my_group_index, group_size, group_list, data_buffer, base_block_size, count, tag, comm, requests, active_requests); goto GATHER; } /* <-- non root flow --> */ rc = bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } GATHER: *iteration = 0; *status = PTPCOLL_GATHER_STARTED; rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. 
Return %d", rc)); return rc; } ++(*iteration); /* I need it for progress */ /* proxy case */ if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && ! CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { *status = PTPCOLL_EXTRA_SEND_STARTED; rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra(ptpcoll_module, data_buffer, count, tag - 1, ptpcoll_module->proxy_extra_index, comm, active_requests, requests); if (BCOL_FN_COMPLETE != rc) { return rc; } } return BCOL_FN_COMPLETE; } int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int rc; int completed = 0; /* not completed */ uint32_t buffer_index = input_args->buffer_index; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / ptpcoll_module->pow_2num; int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_known_progress, buffer index: %d \n" "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "pow_2: %d %d " "buff: %p " "radix: %d" "block_size: %d", buffer_index, tag, ptpcoll_module->tag_mask, 0, input_args->root_flag, ptpcoll_module->pow_2, ptpcoll_module->pow_2num, data_buffer, 2, base_block_size)); switch(*status) { case PTPCOLL_WAITING_FOR_DATA: PTPCOLL_VERBOSE(10, 
("Probe for the data")); rc = bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } *iteration = 0; *status = PTPCOLL_GATHER_STARTED; break; case PTPCOLL_GATHER_STARTED: completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } ++(*iteration); /* start from next iteration */ PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration)); break; case PTPCOLL_EXTRA_SEND_STARTED: completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } return BCOL_FN_COMPLETE; default: PTPCOLL_VERBOSE(10, ("Unknown status %d", *status)); return OMPI_ERROR; } PTPCOLL_VERBOSE(10, ("Stating PR_GATHER")); /* Gather, continue the recoursive doubling iterations */ rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } PTPCOLL_VERBOSE(10, ("PR_GATHER done")); /* it the process is proxy , it has to send full message to remote peer */ if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && ! 
CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { *status = PTPCOLL_EXTRA_SEND_STARTED; rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( ptpcoll_module, data_buffer, count, tag - 1, ptpcoll_module->proxy_extra_index, comm, active_requests, requests); if (BCOL_FN_COMPLETE != rc) { return rc; } } /* return */ return BCOL_FN_COMPLETE; } int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag; int rc; int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; int group_src, comm_root; int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; int pow2_distance; void *curr_data_buffer; int recv_count; uint64_t sequence_number = input_args->sequence_num; uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *radix_mask_pow = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask_pow); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); size_t base_block_size = (count + ptpcoll_module->pow_2num - 1) / ptpcoll_module->pow_2num; int root_pow2 = ptpcoll_module->pow_2 - 1; int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", const_args->index_of_this_type_in_collective + 1, const_args->n_of_this_type_in_collective)); /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + 
sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; /* reset active requests */ *active_requests = 0; /* reset iteration counter */ *iteration = -1; /* set initial status */ *status = PTPCOLL_NOT_STARTED; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_binomial_scatter_gatther_known, buffer index: %d \n" "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "pow_2: %d %d " "buff: %p " "radix: %d" "block_size: %d", buffer_index, tag, ptpcoll_module->tag_mask, sequence_number, input_args->root_flag, ptpcoll_module->pow_2, ptpcoll_module->pow_2num, data_buffer, 2, base_block_size)); /* we have a power 2 group */ if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm root of the data")); /* for proxy we have little bit more work to do */ if (PTPCOLL_PROXY & ptpcoll_module->pow_2type) { /* send the all data to your extra peer */ rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->proxy_extra_index], tag - 1, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[*active_requests]))); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to send data")); return OMPI_ERROR; } *active_requests = 1; } /* * I'm root of the operation * send data to (k - 1) * log base k N neighbors */ K_NOMIAL_ROOT_BCAST_NB_BINOMIAL_SCATTER(root_pow2, my_group_index, group_size, group_list, data_buffer, base_block_size, count, tag, comm, requests, active_requests); /* EXIT OR GO TO Gather */ *iteration = 0; *radix_mask_pow = ptpcoll_module->pow_2; goto GATHER; } /* <-- non root flow --> */ /* prapare and post recv operation */ group_src = bcol_ptpcoll_binomial_root_to_src(input_args->root_route->rank, my_group_index, ptpcoll_module->pow_2num, ptpcoll_module->group_size, &pow2_distance); assert(group_src >= 0); if (0 > pow2_distance) { /* the rank is virtual root for this group, receive the data and scatter gather as root */ 
PTPCOLL_VERBOSE(10, ("Virtual root %d , set mask to %d", my_group_index, ptpcoll_module->pow_2)); *radix_mask_pow = ptpcoll_module->pow_2; curr_data_buffer = data_buffer; recv_count = count; } else { int my_left_boundary_rank; recv_count = base_block_size * (1 << pow2_distance); /* we may receive larger data */ my_left_boundary_rank = my_group_index & ((~(int)0) << pow2_distance ); curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t) base_block_size * my_left_boundary_rank); *radix_mask_pow = pow2_distance; } comm_root = group_list[group_src]; PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p", comm_root, group_src, count, tag, data_buffer)); rc = MCA_PML_CALL(irecv(curr_data_buffer, recv_count, MPI_BYTE, comm_root, tag, comm, &requests[*active_requests])); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to receive data")); return OMPI_ERROR; } ++(*active_requests); *status = PTPCOLL_WAITING_FOR_DATA; rc = bcol_ptpcoll_bcast_binomial_test_and_scatter_known_root(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } /* recv operation is done */ *iteration = 0; GATHER: *status = PTPCOLL_GATHER_STARTED; rc = bcol_ptpcoll_bcast_binomial_gather_anyroot(ptpcoll_module, buffer_index, data_buffer, count, base_block_size); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } ++(*iteration); /* I need it for progress */ /* proxy case */ if ((PTPCOLL_PROXY & ptpcoll_module->pow_2type) && ! 
CHECK_IF_ROOT_OR_VROOT(ptpcoll_module, buffer_index)) { *status = PTPCOLL_EXTRA_SEND_STARTED; rc = bcol_ptpcoll_bcast_binomial_scatter_gatther_send_extra( ptpcoll_module, data_buffer, count, tag - 1, ptpcoll_module->proxy_extra_index, comm, active_requests, requests); if (BCOL_FN_COMPLETE != rc) { return rc; } } return BCOL_FN_COMPLETE; } int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag; int rc; int completed = 0; /* not completed */ uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; PTPCOLL_VERBOSE(3, ("BCAST known root, index_this_type %d, num_of_this_type %d", const_args->index_of_this_type_in_collective + 1, const_args->n_of_this_type_in_collective)); /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ tag = -tag; /* reset active requests */ *active_requests = 0; /* reset iteration counter */ *iteration = -1; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_anyroot extra, buffer index: %d \n" "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "pow_k: %d %d " "buff: %p " "radix: %d" , buffer_index, tag, 
ptpcoll_module->tag_mask, input_args->sequence_num, input_args->root_flag, ptpcoll_module->pow_k, ptpcoll_module->pow_knum, data_buffer, 2 )); /* we have a power 2 group */ if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); /* send the all data to your proxy peer */ rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->proxy_extra_index], tag, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[*active_requests]))); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to send data")); return OMPI_ERROR; } ++(*active_requests); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { /* we have to store the iteration number somewhere */ return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } } else { rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->proxy_extra_index], tag - 1, comm, &requests[*active_requests])); ++(*active_requests); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } } return BCOL_FN_COMPLETE; } int bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { int rc; int completed = 0; /* not completed */ mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; uint32_t buffer_index = input_args->buffer_index; int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_known_root_extra_progress extra, was called\n")); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; } return BCOL_FN_COMPLETE; } static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress( bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int rc; int completed = 0; /* not completed */ int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; uint32_t buffer_index = input_args->buffer_index; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); int relative_group_index, group_root_index = 0; int group_size = ptpcoll_module->full_narray_tree_size; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress, buffer index: %d " "tag: %d " "tag_mask: %d " "root: %d " "buff: %p " "radix: %d" , buffer_index, tag, ptpcoll_module->tag_mask, input_args->root_flag, data_buffer, ptpcoll_module->narray_knomial_proxy_num )); if (input_args->root_flag || /* virtual root case */ (input_args->root_route->rank >= group_size && my_group_index == (input_args->root_route->rank - group_size) / mca_bcol_ptpcoll_component.narray_knomial_radix)) { relative_group_index = 0; group_root_index = my_group_index; } else { if (input_args->root_route->rank >= group_size) { group_root_index = (input_args->root_route->rank - group_size) / mca_bcol_ptpcoll_component.narray_knomial_radix; } else { 
group_root_index = input_args->root_route->rank; } relative_group_index = my_group_index - group_root_index; if (relative_group_index < 0) { relative_group_index += group_size; } } switch(*status) { case PTPCOLL_WAITING_FOR_DATA: PTPCOLL_VERBOSE(10, ("Probe for the data")); rc = bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(ptpcoll_module, buffer_index, data_buffer, count, group_root_index, relative_group_index); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } *iteration = 0; *status = PTPCOLL_GATHER_STARTED; break; case PTPCOLL_ROOT_SEND_STARTED: case PTPCOLL_GATHER_STARTED: completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } ++(*iteration); /* start from next iteration */ PTPCOLL_VERBOSE(10, ("Outstanding operation was comleted, starting next one ! %d", *iteration)); break; case PTPCOLL_EXTRA_SEND_STARTED: completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Not done, have to complete %d, Return %d", *active_requests, rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } return BCOL_FN_COMPLETE; default: PTPCOLL_VERBOSE(10, ("Unknown status %d", *status)); return OMPI_ERROR; } PTPCOLL_VERBOSE(10, ("Stating PR_GATHER")); /* Gather, continue the recoursive doubling iterations */ rc = bcol_ptpcoll_bcast_narray_knomial_gather(ptpcoll_module, buffer_index, data_buffer, count, relative_group_index); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. 
Return %d", rc)); return rc; } PTPCOLL_VERBOSE(10, ("PR_GATHER done")); /* it the process is proxy , it has to send full message to remote peer */ if ((PTPCOLL_PROXY & ptpcoll_module->narray_type) && !input_args->root_flag) { *status = PTPCOLL_EXTRA_SEND_STARTED; rc = bcol_ptpcoll_send_n_extra( ptpcoll_module, data_buffer, count, tag - 1, ptpcoll_module->narray_knomial_proxy_extra_index, ptpcoll_module->narray_knomial_proxy_num, input_args->root_route->rank, comm, active_requests, requests); if (BCOL_FN_COMPLETE != rc) { return rc; } } /* return */ return BCOL_FN_COMPLETE; } static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag, rc, i; int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; int data_src, offset, comm_root; int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; void *curr_data_buffer; int recv_count; uint64_t sequence_number = input_args->sequence_num; uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); size_t base_block_size = 0; int *status = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); int relative_group_index, group_root_index; int group_size = ptpcoll_module->full_narray_tree_size; int completed = 0; int virtual_root; netpatterns_narray_knomial_tree_node_t *narray_knomial_node = NULL; 
netpatterns_narray_knomial_tree_node_t *narray_node = NULL; PTPCOLL_VERBOSE(3, ("BCAST Anyroot, index_this_type %d, num_of_this_type %d", const_args->index_of_this_type_in_collective + 1, const_args->n_of_this_type_in_collective)); /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; /* reset radix mask, it used to keep last block size */ ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; /* reset active requests */ *active_requests = 0; /* reset iteration counter */ *iteration = -1; /* set initial status */ *status = PTPCOLL_NOT_STARTED; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root, buffer index: %d " "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "buff: %p " "radix: %d" ,buffer_index, tag, ptpcoll_module->tag_mask, sequence_number, input_args->root_flag, data_buffer, ptpcoll_module->narray_knomial_proxy_num )); /* we have a power 2 group */ if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm root of the data")); narray_knomial_node = &ptpcoll_module->narray_knomial_node[0]; relative_group_index = 0; group_root_index = my_group_index; /* for proxy we have little bit more work to do */ if (PTPCOLL_PROXY & ptpcoll_module->narray_type) { /* send the all data to your extra peer */ for (i = 0; i < ptpcoll_module->narray_knomial_proxy_num; ++i) { PTPCOLL_VERBOSE(9, ("Extra send %d, dst %d, tag %d", i, ptpcoll_module->narray_knomial_proxy_extra_index[i], tag - 1)); rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->narray_knomial_proxy_extra_index[i]], tag - 1, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[*active_requests]))); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to send data")); return OMPI_ERROR; } ++(*active_requests); } } /* * I'm 
root of the operation * send data to radix_k neighbors */ base_block_size = NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_knomial_node->level_size); NARRAY_SCATTER_B(narray_knomial_node, my_group_index, group_size, data_buffer, base_block_size, count, tag, comm, requests, active_requests, completed); if (0 == completed) { *status = PTPCOLL_ROOT_SEND_STARTED; return BCOL_FN_STARTED; } goto EXIT; } /* <-- non root flow --> */ group_root_index = input_args->root_route->rank; if (group_root_index >= group_size) { /* calculate virtual root */ virtual_root = (group_root_index - group_size) / mca_bcol_ptpcoll_component.narray_knomial_radix; if (my_group_index == virtual_root) { PTPCOLL_VERBOSE(10, ("I'm virtual root of the data")); rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, group_list[group_root_index], tag, comm, &requests[*active_requests])); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to receive data")); return OMPI_ERROR; } ++(*active_requests); /* act like a root */ relative_group_index = 0; group_root_index = my_group_index; goto SCATTER; } group_root_index = virtual_root; } relative_group_index = my_group_index - group_root_index; if (relative_group_index < 0) { relative_group_index += group_size; } narray_node = &ptpcoll_module->narray_knomial_node[relative_group_index]; data_src = narray_node->parent_rank + group_root_index; if (data_src >= group_size) { data_src -= group_size; } comm_root = group_list[data_src]; recv_count = NARRAY_BLOCK_SIZE(count, ptpcoll_module, narray_node->level_size); offset = recv_count * narray_node->rank_on_level; /* make sure that we do not overun memory */ if (OPAL_UNLIKELY(offset + recv_count > count)) { recv_count = count - offset; if (0 >= recv_count) { goto GATHER; } } curr_data_buffer = (void *)((unsigned char *)data_buffer + (size_t)offset); PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, tag %d, addr %p len %d offset %d", comm_root, data_src, count, tag, data_buffer, recv_count, 
offset)); rc = MCA_PML_CALL(irecv(curr_data_buffer, recv_count, MPI_BYTE, comm_root, tag, comm, &requests[*active_requests])); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to receive data")); return OMPI_ERROR; } ++(*active_requests); SCATTER: *status = PTPCOLL_WAITING_FOR_DATA; rc = bcol_ptpcoll_bcast_narray_test_and_scatter_known_root(ptpcoll_module, buffer_index, data_buffer, count, group_root_index, relative_group_index); if (BCOL_FN_COMPLETE != rc) { PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } GATHER: /* recv operation is done */ *iteration = 0; *status = PTPCOLL_GATHER_STARTED; rc = bcol_ptpcoll_bcast_narray_knomial_gather(ptpcoll_module, buffer_index, data_buffer, count, relative_group_index); if (BCOL_FN_COMPLETE != rc) { assert(0 != *active_requests); PTPCOLL_VERBOSE(10, ("Not done. Return %d", rc)); return rc; } ++(*iteration); /* I need it for progress */ /* proxy case */ if ((PTPCOLL_PROXY & ptpcoll_module->narray_type) && ! input_args->root_flag) { *status = PTPCOLL_EXTRA_SEND_STARTED; rc = bcol_ptpcoll_send_n_extra( ptpcoll_module, data_buffer, count, tag - 1, ptpcoll_module->narray_knomial_proxy_extra_index, ptpcoll_module->narray_knomial_proxy_num, input_args->root_route->rank, comm, active_requests, requests); if (BCOL_FN_COMPLETE != rc) { return rc; } } EXIT: return BCOL_FN_COMPLETE; } /* Pasha : need to move this code to some common function */ static int bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag; int rc; int completed = 0; /* not completed */ uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; void *data_buffer = (void *) ( (unsigned char 
*)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *iteration = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; PTPCOLL_VERBOSE(3, ("BCAST known root, index_this_type %d, num_of_this_type %d", const_args->index_of_this_type_in_collective + 1, const_args->n_of_this_type_in_collective)); /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ tag = -tag; /* reset active requests */ *active_requests = 0; /* reset iteration counter */ *iteration = -1; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra, buffer index: %d " "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "buff: %p " ,buffer_index, tag, ptpcoll_module->tag_mask, input_args->sequence_num, input_args->root_flag, data_buffer )); /* we have a power 2 group */ if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm EXTRA root of the data")); /* send the all data to your proxy peer */ rc = MCA_PML_CALL(isend(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->narray_knomial_proxy_extra_index[0]], tag, MCA_PML_BASE_SEND_STANDARD, comm, &(requests[*active_requests]))); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to send data")); return OMPI_ERROR; } ++(*active_requests); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { /* we have to store the iteration number somewhere */ return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; } } else { PTPCOLL_VERBOSE(9, ("Posting recive from %d tag %d", ptpcoll_module->narray_knomial_proxy_extra_index[0], tag - 1)); rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, group_list[ptpcoll_module->narray_knomial_proxy_extra_index[0]], tag - 1, comm, &requests[*active_requests])); ++(*active_requests); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; } } return BCOL_FN_COMPLETE; } static int bcol_ptpcoll_bcast_known_root_extra_progress(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { int rc; int completed = 0; /* not completed */ mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; ompi_request_t **requests = ptpcoll_module->ml_mem.ml_buf_desc[input_args->buffer_index].requests; uint32_t buffer_index = input_args->buffer_index; int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); PTPCOLL_VERBOSE(10, ("bcol_ptpcoll_bcast_binomial_known_root_extra_progress extra, was called\n")); completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); if (0 == completed) { return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; } PTPCOLL_VERBOSE(10, ("Test was matched - %d", rc)); return BCOL_FN_COMPLETE; } static int bcol_ptpcoll_bcast_narray_progress(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag = -1; int rc; int group_size = ptpcoll_module->group_size; int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **send_requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; ompi_request_t **recv_request = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); int matched = true; int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; int relative_group_index = 0; netpatterns_tree_node_t *narray_node = NULL; PTPCOLL_VERBOSE(3, ("Bcast, Narray tree Progress")); PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_k_nomial_known_root, buffer index: %d " "tag: %d " "tag_mask: %d " "sn: %d " "root: %d [%d]" "buff: %p ", buffer_index, tag, ptpcoll_module->tag_mask, input_args->sequence_num, input_args->root_flag, input_args->root_route->rank, data_buffer)); if (0 == *active_requests) { int group_root_index = input_args->root_route->rank; /* If the collective does not have any active requests, it means the initial data was not received from parent. Check if some data arrived */ if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); /* No data was received, return no match error */ return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; } /* set all paremetres */ relative_group_index = my_group_index - group_root_index; if (relative_group_index < 0) { relative_group_index +=group_size; } narray_node = &ptpcoll_module->narray_node[relative_group_index]; /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ tag = -tag; /* Bcast the data */ NARRAY_BCAST_NB(narray_node, group_root_index, group_size, data_buffer, count, tag, comm, send_requests, active_requests); } /* All data was received and sent out. Check if the completion arrived */ matched = mca_bcol_ptpcoll_test_all_for_match (active_requests, send_requests, &rc); if (OMPI_SUCCESS != rc) { return OMPI_ERROR; } /* If it is last call, we have to recycle memory */ if(matched) { return BCOL_FN_COMPLETE; } else { PTPCOLL_VERBOSE(10, ("bcast root is started")); return BCOL_FN_STARTED; } } static int bcol_ptpcoll_bcast_narray(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; int tag; int rc; int data_src; int group_size = ptpcoll_module->group_size; int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; uint32_t buffer_index = input_args->buffer_index; ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; ompi_request_t **send_requests = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; ompi_request_t **recv_request = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; void *data_buffer = (void *) ( (unsigned char *)input_args->sbuf + (size_t)input_args->sbuf_offset); int count = input_args->count * input_args->dtype->super.size; int *active_requests = &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); int matched = true; int my_group_index = 
ptpcoll_module->super.sbgp_partner_module->my_index; int group_root_index; int relative_group_index = 0; netpatterns_tree_node_t *narray_node = NULL; PTPCOLL_VERBOSE(3, ("Bcast, Narray tree")); /* reset active request counter */ (*active_requests) = 0; /* keep tag within the limit support by the pml */ tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); /* mark this as a collective tag, to avoid conflict with user-level flags */ tag = -tag; PTPCOLL_VERBOSE(8, ("bcol_ptpcoll_bcast_narray, buffer index: %d " "tag: %d " "tag_mask: %d " "sn: %d " "root: %d " "buff: %p ", buffer_index, tag, ptpcoll_module->tag_mask, input_args->sequence_num, input_args->root_flag, data_buffer)); if (input_args->root_flag) { PTPCOLL_VERBOSE(10, ("I'm root of the data")); narray_node = &ptpcoll_module->narray_node[0]; group_root_index = my_group_index; /* * I'm root of the operation * send data to N childrens */ goto NARRAY_BCAST_START; } /* I'm not root */ group_root_index = input_args->root_route->rank; relative_group_index = my_group_index - group_root_index; if (relative_group_index < 0) { relative_group_index += group_size; } data_src = ptpcoll_module->narray_node[relative_group_index].parent_rank + group_root_index; if (data_src >= group_size) { data_src -= group_size; } PTPCOLL_VERBOSE(10, ("Bcast, receive data from %d [%d], count %d, tag %d, addr %p", group_list[data_src], data_src, count, tag, data_buffer)); rc = MCA_PML_CALL(irecv(data_buffer, count, MPI_BYTE, group_list[data_src], tag, comm, recv_request)); if( OMPI_SUCCESS != rc ) { PTPCOLL_VERBOSE(10, ("Failed to receive data")); return OMPI_ERROR; } /* We can not block. So run couple of test for data arrival */ if (0 == mca_bcol_ptpcoll_test_for_match(recv_request, &rc)) { PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); /* No data was received, return no match error */ return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; } narray_node = &ptpcoll_module->narray_node[relative_group_index]; NARRAY_BCAST_START: /* Bcast the data */ NARRAY_BCAST_NB(narray_node, group_root_index, group_size, data_buffer, count, tag, comm, send_requests, active_requests); matched = mca_bcol_ptpcoll_test_all_for_match (active_requests, send_requests, &rc); if (OMPI_SUCCESS != rc) { return OMPI_ERROR; } /* If it is last call, we have to recycle memory */ if(matched) { return BCOL_FN_COMPLETE; } else { PTPCOLL_VERBOSE(10, ("bcast root is started")); return BCOL_FN_STARTED; } } int bcol_ptpcoll_bcast_init(mca_bcol_base_module_t *super) { mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) super; mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; comm_attribs.bcoll_type = BCOL_BCAST; comm_attribs.comm_size_min = 0; comm_attribs.comm_size_max = 1024 * 1024; comm_attribs.waiting_semantics = NON_BLOCKING; inv_attribs.bcol_msg_min = 0; inv_attribs.bcol_msg_max = 20000; /* range 1 */ inv_attribs.datatype_bitmap = 0xffffffff; inv_attribs.op_types_bitmap = 0xffffffff; comm_attribs.data_src = DATA_SRC_UNKNOWN; if(PTPCOLL_KN_EXTRA == ptpcoll_module->pow_ktype) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot, bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress); } else { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_k_nomial_anyroot, bcol_ptpcoll_bcast_k_nomial_anyroot_progress); } comm_attribs.data_src = DATA_SRC_KNOWN; switch(mca_bcol_ptpcoll_component.bcast_small_messages_known_root_alg) { case PTPCOLL_KNOMIAL: if(PTPCOLL_KN_EXTRA == ptpcoll_module->pow_ktype) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot, bcol_ptpcoll_bcast_k_nomial_extra_known_and_anyroot_progress); } else { mca_bcol_base_set_attributes(super, &comm_attribs, 
&inv_attribs, bcol_ptpcoll_bcast_k_nomial_known_root, bcol_ptpcoll_bcast_k_nomial_known_root_progress); } break; case PTPCOLL_NARRAY: mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_narray, bcol_ptpcoll_bcast_narray_progress); break; default: PTPCOLL_ERROR(("Unknown algorithm index was selected %", mca_bcol_ptpcoll_component.bcast_small_messages_known_root_alg)); return OMPI_ERROR; } comm_attribs.data_src = DATA_SRC_UNKNOWN; inv_attribs.bcol_msg_min = 10000000; inv_attribs.bcol_msg_max = 10485760; /* range 4 */ /* Anyroot large messages functions registration */ if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra, bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_extra_progress); } else { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot, bcol_ptpcoll_bcast_binomial_scatter_gatther_anyroot_progress); } /* Known-root large messages functions registration */ comm_attribs.data_src = DATA_SRC_KNOWN; switch(mca_bcol_ptpcoll_component.bcast_large_messages_known_root_alg) { case PTPCOLL_BINOMIAL_SG: if (PTPCOLL_EXTRA == ptpcoll_module->pow_2type) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra, bcol_ptpcoll_bcast_known_root_extra_progress); /* bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_extra_progress); */ } else { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root, bcol_ptpcoll_bcast_binomial_scatter_gatther_known_root_progress); } break; case PTPCOLL_NARRAY_KNOMIAL_SG: if (PTPCOLL_EXTRA == ptpcoll_module->narray_type) { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_extra, 
bcol_ptpcoll_bcast_known_root_extra_progress); } else { mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root, bcol_ptpcoll_bcast_narray_knomial_scatter_gatther_known_root_progress); } break; default: PTPCOLL_ERROR(("Unknown algorithm index was selected %", mca_bcol_ptpcoll_component.bcast_large_messages_known_root_alg)); return OMPI_ERROR; } return OMPI_SUCCESS; }