/*
 * $HEADER$
 */

#include "ompi_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mpi.h"
#include "communicator/communicator.h"
#include "proc/proc.h"
#include "util/bit_ops.h"
#include "include/constants.h"
#include "mca/pcm/pcm.h"
#include "mca/pml/pml.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "mca/ns/base/base.h"
#include "mca/ptl/ptl.h"
#include "mca/ptl/base/ptl_base_comm.h"
#include "mca/ptl/base/ptl_base_recvfrag.h"
#include "mca/ptl/base/ptl_base_header.h"
#include "mca/ptl/base/ptl_base_match.h"

/*
** sort-function for MPI_Comm_split
*/
static int rankkeycompare(const void *, const void *);

/*
** typedef for the allgather_intra function required in comm_split.
** the reason for introducing this abstraction is that for
** Comm_split on inter-communicators we do not have this function,
** so we need to emulate it.
*/
typedef int ompi_comm_allgatherfct (void* inbuf, int incount, MPI_Datatype intype,
                                    void* outbuf, int outcount, MPI_Datatype outtype,
                                    ompi_communicator_t *comm);

static int ompi_comm_allgather_emulate_intra (void* inbuf, int incount,
                                              MPI_Datatype intype,
                                              void* outbuf, int outcount,
                                              MPI_Datatype outtype,
                                              ompi_communicator_t *comm);

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
 * This is the function setting all elements of a communicator.
 * All other routines are just used to determine these elements.
 */
ompi_communicator_t * ompi_comm_set ( ompi_communicator_t* oldcomm,
                                      int local_size,
                                      ompi_proc_t **local_procs,
                                      int remote_size,
                                      ompi_proc_t **remote_procs,
                                      ompi_hash_table_t *attr,
                                      ompi_errhandler_t *errh,
                                      mca_base_module_t *collmodule,
                                      mca_base_module_t *topomodule )
{
    ompi_communicator_t *newcomm;
    ompi_proc_t *my_gpointer;
    int my_grank;

    /* Allocate comm structure */
    newcomm = ompi_comm_allocate ( local_size, remote_size );

    /* Set local_group information */
    memcpy ( newcomm->c_local_group->grp_proc_pointers,
             local_procs, local_size * sizeof(ompi_proc_t *));
    ompi_group_increment_proc_count(newcomm->c_local_group);

    /* determine my rank */
    my_grank    = oldcomm->c_local_group->grp_my_rank;
    my_gpointer = oldcomm->c_local_group->grp_proc_pointers[my_grank];
    ompi_set_group_rank(newcomm->c_local_group, my_gpointer);
    newcomm->c_my_rank = newcomm->c_local_group->grp_my_rank;

    /* Set remote group, if applicable */
    if ( 0 < remote_size) {
        memcpy ( newcomm->c_remote_group->grp_proc_pointers,
                 remote_procs, remote_size * sizeof(ompi_proc_t *));
        ompi_group_increment_proc_count(newcomm->c_remote_group);
        newcomm->c_flags |= OMPI_COMM_INTER;
    }

    /* Set error handler */
    newcomm->error_handler = errh;
    OBJ_RETAIN ( newcomm->error_handler );

    /* Set name for debugging purposes */
    snprintf(newcomm->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d",
             newcomm->c_contextid);

    /* Determine cube_dim */
    newcomm->c_cube_dim = ompi_cube_dim(newcomm->c_local_group->grp_proc_count);

    /* Set Topology, if required */
    if ( NULL != topomodule ) {
        if ( OMPI_COMM_IS_CART ( oldcomm ) ) {
            newcomm->c_flags |= OMPI_COMM_CART;
        }
        if ( OMPI_COMM_IS_GRAPH ( oldcomm ) ) {
            newcomm->c_flags |= OMPI_COMM_GRAPH;
        }
        /* set the topo-module */
    }

    /* Copy attributes and call according copy functions, if required */
    ompi_attr_hash_init(&newcomm->c_keyhash);
    if ( NULL != attr ) {
        ompi_attr_copy_all (COMM_ATTR, oldcomm, newcomm, attr, newcomm->c_keyhash);
    }

    /* Initialize the PML stuff in the newcomm */
    if ( OMPI_ERROR == mca_pml.pml_add_comm(newcomm) ) {
        goto err_exit;
    }
    /* Initialize the coll modules */
    /* Let the collectives modules fight over who will do
       collective on this new comm. */
    if ( OMPI_ERROR == mca_coll_base_comm_select(newcomm, collmodule) ) {
        goto err_exit;
    }

 err_exit:
    /* Free whatever has been allocated, print an appropriate
       error message and return a null pointer */
    return ( newcomm );
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
** Counterpart to MPI_Comm_group. To be used within OMPI functions.
*/
int ompi_comm_group ( ompi_communicator_t* comm, ompi_group_t **group )
{
    /* increment proc reference counters */
    OBJ_RETAIN(comm->c_local_group);

    *group = comm->c_local_group;
    return OMPI_SUCCESS;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
** Counterpart to MPI_Comm_create. To be used within OMPI.
*/
int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
                       ompi_communicator_t **newcomm )
{
    ompi_communicator_t *newcomp = MPI_COMM_NULL;
    int rsize;
    int mode;
    int *allranks=NULL;
    ompi_proc_t **rprocs=NULL;
    int rc = OMPI_SUCCESS;

    if ( OMPI_COMM_IS_INTER(comm) ) {
#ifdef HAVE_COMM_CREATE_FOR_INTERCOMMS
        int tsize, i, j;

        tsize = comm->c_remote_group->grp_proc_count;
        allranks = (int *) malloc ( tsize * sizeof(int));
        if ( NULL == allranks ) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        rc = comm->c_coll.coll_allgather_inter ( &group->grp_my_rank, 1, MPI_INT,
                                                 allranks, 1, MPI_INT, comm );
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        /* Count number of procs in future remote group */
        for (rsize=0, i = 0; i < tsize; i++) {
            if ( MPI_PROC_NULL != allranks[i] ) {
                rsize++;
            }
        }

        /* If any of those groups is empty, we have to return
           MPI_COMM_NULL */
        if ( 0 == rsize || 0 == group->grp_proc_count ) {
            *newcomm = MPI_COMM_NULL;
            rc = OMPI_SUCCESS;
            goto exit;
        }

        /* Set proc-pointers for remote group */
        rprocs = (ompi_proc_t **) malloc ( sizeof(ompi_proc_t *) * rsize);
        if ( NULL == rprocs ) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        /* scan the whole remote group, collecting the rsize
           non-MPI_PROC_NULL entries */
        for ( j = 0, i = 0; i < tsize; i++ ) {
            if ( MPI_PROC_NULL != allranks[i] ) {
                rprocs[j] = comm->c_remote_group->grp_proc_pointers[i];
                j++;
            }
        }
        mode = OMPI_COMM_CID_INTER;
#else
        return ( MPI_ERR_COMM );
#endif
    }
    else {
        rsize  = 0;
        rprocs = NULL;
        mode   = OMPI_COMM_CID_INTRA;
    }

    newcomp = ompi_comm_set ( comm,                     /* old comm */
                              group->grp_proc_count,    /* local_size */
                              group->grp_proc_pointers, /* local_procs */
                              rsize,                    /* remote_size */
                              rprocs,                   /* remote_procs */
                              NULL,                     /* attrs */
                              comm->error_handler,      /* error handler */
                              NULL,                     /* coll module */
                              NULL                      /* topo module */
                              );
    if ( MPI_COMM_NULL == newcomp ) {
        rc = MPI_ERR_INTERN;
        goto exit;
    }

    /* Determine context id. It is identical to f_2_c_handle */
    rc = ompi_comm_nextcid ( newcomp,  /* new communicator */
                             comm,     /* old comm */
                             NULL,     /* bridge comm */
                             NULL,     /* local leader */
                             NULL,     /* remote_leader */
                             mode );   /* mode */
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    /* Check whether we are part of the new comm.
       If not, we have to free the structure again.
       However, we could not avoid the comm_nextcid step, since
       all processes of the original comm have to participate in
       that function call. Additionally, all errhandler stuff etc.
       has to be set to make ompi_comm_free happy */
    if ( MPI_PROC_NULL == newcomp->c_local_group->grp_my_rank ) {
        ompi_comm_free ( &newcomp );
    }

 exit:
    if ( NULL != allranks ) {
        free ( allranks );
    }
    if ( NULL != rprocs ) {
        free ( rprocs );
    }

    *newcomm = newcomp;
    return ( rc );
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
*/
int ompi_comm_split ( ompi_communicator_t* comm, int color, int key,
                      ompi_communicator_t **newcomm )
{
    int myinfo[2];
    int size, my_size;
    int my_rsize;
    int mode;
#ifdef HAVE_COMM_SPLIT_FOR_INTERCOMMS
    int rsize;
#endif
    int i, loc;
    int inter;
    int *results=NULL, *sorted=NULL;
    int *rresults=NULL, *rsorted=NULL;
    int rc=OMPI_SUCCESS;
    ompi_proc_t **procs=NULL, **rprocs=NULL;
    ompi_communicator_t *newcomp;

    /* Step 1: determine all the information for the local group */
    /* --------------------------------------------------------- */

    /* sort according to color and rank. Gather information from everyone */
    myinfo[0] = color;
    myinfo[1] = key;

    size  = ompi_comm_size ( comm );
    inter = OMPI_COMM_IS_INTER(comm);
    results = (int*) malloc ( 2 * size * sizeof(int));
    if ( NULL == results ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = comm->c_coll.coll_allgather( myinfo, 2, MPI_INT,
                                      results, 2, MPI_INT, comm );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    /* how many have the same color as me */
    for ( my_size = 0, i=0; i < size; i++) {
        if ( results[(2*i)+0] == color) {
            my_size++;
        }
    }

    sorted = (int *) malloc ( sizeof( int ) * my_size * 2);
    if ( NULL == sorted) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* ok, we can now fill this info */
    for( loc = 0, i = 0; i < size; i++ ) {
        if ( results[(2*i)+0] == color) {
            sorted[(2*loc)+0] = i;                 /* copy org rank */
            sorted[(2*loc)+1] = results[(2*i)+1];  /* copy key */
            loc++;
        }
    }

    /* the new array needs to be sorted so that it is in 'key' order */
    /* if two keys are equal then it is sorted in original rank order! */
    if ( my_size > 1 ) {
        qsort ((int*)sorted, my_size, sizeof(int)*2, rankkeycompare);
    }

    /* put group elements in a list */
    procs = (ompi_proc_t **) malloc ( sizeof(ompi_proc_t *) * my_size);
    if ( NULL == procs ) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    for (i = 0; i < my_size; i++) {
        procs[i] = comm->c_local_group->grp_proc_pointers[sorted[i*2]];
    }

    /* Step 2: determine all the information for the remote group */
    /* --------------------------------------------------------- */
    if ( inter ) {
#ifdef HAVE_COMM_SPLIT_FOR_INTERCOMMS
        rsize = comm->c_remote_group->grp_proc_count;
        rresults = (int *) malloc ( rsize * 2 * sizeof(int));
        if ( NULL == rresults ) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        rc = comm->c_coll.coll_allgather_inter ( myinfo, 2, MPI_INT,
                                                 rresults, 2, MPI_INT, comm );
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        /* how many have the same color as me */
        for ( my_rsize = 0, i=0; i < rsize; i++) {
            if ( rresults[(2*i)+0] == color) {
                my_rsize++;
            }
        }

        rsorted = (int *) malloc ( sizeof( int ) * my_rsize * 2);
        if ( NULL == rsorted) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        /* ok, we can now fill this info */
        for( loc = 0, i = 0; i < rsize; i++ ) {
            if ( rresults[(2*i)+0] == color) {
                rsorted[(2*loc)+0] = i;                  /* org rank */
                rsorted[(2*loc)+1] = rresults[(2*i)+1];  /* key */
                loc++;
            }
        }

        /* the new array needs to be sorted so that it is in 'key' order */
        /* if two keys are equal then it is sorted in original rank order! */
        if ( my_rsize > 1 ) {
            qsort ((int*)rsorted, my_rsize, sizeof(int)*2, rankkeycompare);
        }

        /* put group elements in a list */
        rprocs = (ompi_proc_t **) malloc ( sizeof(ompi_proc_t *) * my_rsize);
        if ( NULL == rprocs ) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for (i = 0; i < my_rsize; i++) {
            rprocs[i] = comm->c_remote_group->grp_proc_pointers[rsorted[i*2]];
        }
        mode = OMPI_COMM_CID_INTER;
#else
        /* splitting an inter-communicator will be supported soon,
           but not in this version */
        rc = MPI_ERR_COMM;
        goto exit;
#endif
    }
    else {
        my_rsize = 0;
        rprocs   = NULL;
        mode     = OMPI_COMM_CID_INTRA;
    }

    /* Step 3: set up the communicator */
    /* --------------------------------------------------------- */
    /* Create the communicator finally */
    newcomp = ompi_comm_set ( comm,                /* old comm */
                              my_size,             /* local_size */
                              procs,               /* local_procs */
                              my_rsize,            /* remote_size */
                              rprocs,              /* remote_procs */
                              NULL,                /* attrs */
                              comm->error_handler, /* error handler */
                              NULL,                /* coll module */
                              NULL                 /* topo module */
                              );
    if ( MPI_COMM_NULL == newcomp ) {
        rc = MPI_ERR_INTERN;
        goto exit;
    }

    /* Determine context id. It is identical to f_2_c_handle */
    rc = ompi_comm_nextcid ( newcomp,  /* new communicator */
                             comm,     /* old comm */
                             NULL,     /* bridge comm */
                             NULL,     /* local leader */
                             NULL,     /* remote_leader */
                             mode );   /* mode */
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

 exit:
    if ( NULL != results ) {
        free ( results );
    }
    if ( NULL != sorted ) {
        free ( sorted );
    }
    if ( NULL != rresults) {
        free ( rresults );
    }
    if ( NULL != rsorted ) {
        free ( rsorted );
    }
    if ( NULL != procs ) {
        free ( procs );
    }
    if ( NULL != rprocs ) {
        free ( rprocs );
    }

    /* Step 4: if we are not part of the comm, free the struct */
    /* --------------------------------------------------------- */
    if ( MPI_UNDEFINED == color ) {
        ompi_comm_free ( &newcomp );
    }

    *newcomm = newcomp;
    return ( rc );
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
 * Implementation of MPI_Allgather for the local_group in an inter-comm.
 * The algorithm consists of two steps:
 * 1. an inter-gather to rank 0 in the remote group
 * 2. an inter-bcast from rank 0 in the remote group.
 */
#define OMPI_COMM_ALLGATHER_TAG 31078

static int ompi_comm_allgather_emulate_intra( void *inbuf, int incount,
                                              MPI_Datatype intype, void* outbuf,
                                              int outcount, MPI_Datatype outtype,
                                              ompi_communicator_t *comm)
{
    int rank, rsize, i, rc;
    int *tmpbuf=NULL;
    MPI_Request *req=NULL, sendreq;
    MPI_Status *stats=NULL, status;

    rsize = ompi_comm_remote_size(comm);
    rank  = ompi_comm_rank(comm);

    /* Step 1: the gather-step */
    if ( 0 == rank ) {
        tmpbuf = (int *) malloc (rsize*outcount*sizeof(int));
        req    = (MPI_Request *)malloc (rsize*outcount*sizeof(MPI_Request));
        stats  = (MPI_Status *)malloc (rsize*outcount*sizeof(MPI_Status));
        if ( NULL == tmpbuf || NULL == req || NULL == stats) {
            return (OMPI_ERR_OUT_OF_RESOURCE);
        }

        /* rank 0 posts a receive for the contribution of every
           process in the remote group */
        for ( i=0; i < rsize; i++) {
            rc = mca_pml.pml_irecv ( &tmpbuf[outcount*i], outcount, outtype, i,
                                     OMPI_COMM_ALLGATHER_TAG, comm, &req[i] );
            if ( OMPI_SUCCESS != rc ) {
                goto exit;
            }
        }
    }

    /* everybody sends its own contribution to rank 0 of the remote group */
    rc = mca_pml.pml_isend ( inbuf, incount, intype, 0,
                             OMPI_COMM_ALLGATHER_TAG,
                             MCA_PML_BASE_SEND_STANDARD, comm, &sendreq );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if ( 0 == rank ) {
        for ( i=0; i < rsize; i++) {
            rc = mca_pml.pml_wait ( 1, &req[i], NULL, &stats[i] );
            if ( OMPI_SUCCESS != rc ) {
                goto exit;
            }
        }
    }

    rc = mca_pml.pml_wait ( 1, &sendreq, NULL, &status );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    /* Step 2: the inter-bcast step: everybody receives the gathered
       data from rank 0 of the remote group */
    rc = mca_pml.pml_irecv ( outbuf, ompi_comm_size(comm)*outcount, outtype, 0,
                             OMPI_COMM_ALLGATHER_TAG, comm, &sendreq );
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if ( 0 == rank ) {
        for ( i=0; i < rsize; i++ ) {
            rc = mca_pml.pml_send ( tmpbuf, rsize*outcount, outtype, i,
                                    OMPI_COMM_ALLGATHER_TAG,
                                    MCA_PML_BASE_SEND_STANDARD, comm );
            if ( OMPI_SUCCESS != rc ) {
                goto exit;
            }
        }
    }

    rc = mca_pml.pml_wait ( 1, &sendreq, NULL, &status );

 exit:
    if ( NULL != req ) {
        free ( req );
    }
    if ( NULL != stats ) {
        free ( stats );
    }
    if ( NULL != tmpbuf ) {
        free ( tmpbuf );
    }

    return ( rc );
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/*
** Counterpart to MPI_Comm_free. To be used within OMPI.
*/
int ompi_comm_free ( ompi_communicator_t **comm )
{
    /* Release the attributes attached to this communicator */
    ompi_attr_delete_all ( COMM_ATTR, *comm, (*comm)->c_keyhash );
    OBJ_RELEASE((*comm)->c_keyhash);

    /* Release the communicator */
    OBJ_RELEASE ( (*comm) );

    *comm = MPI_COMM_NULL;
    return OMPI_SUCCESS;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
                                     ompi_communicator_t *bridge_comm,
                                     int local_leader,
                                     int remote_leader,
                                     int tag,
                                     int rsize)
{
    ompi_proc_t **rprocs = NULL;
#if 0
    /* TSW - fix this */
    int local_rank, local_size;
    ompi_process_id_t jobid;
    uint32_t *rvpids=NULL, *vpids=NULL;
    int rc, i;

    local_rank = ompi_comm_rank ( local_comm );

    vpids  = (uint32_t *) malloc ( local_size * sizeof(uint32_t));
    rvpids = (uint32_t *) malloc ( rsize * sizeof(uint32_t));
    rprocs = (ompi_proc_t **) malloc ( rsize * sizeof(ompi_proc_t *));
    if ( NULL == vpids || NULL == rvpids || NULL == rprocs ) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto err_exit;
    }

    if ( local_rank == local_leader ) {
        MPI_Request req;
        MPI_Status status;

        /* generate vpid list */
        for ( i = 0; i < local_size; i++ ){
            vpids[i] = (uint32_t) local_comm->c_local_group->grp_proc_pointers[i]->proc_name;
        }

        /* local leaders exchange group sizes and vpid lists */
        rc = mca_pml.pml_irecv (rvpids, rsize, MPI_UNSIGNED, remote_leader, tag,
                                bridge_comm, &req );
        if ( rc != MPI_SUCCESS ) {
            goto err_exit;
        }
        rc = mca_pml.pml_send (vpids, local_size, MPI_UNSIGNED, remote_leader, tag,
                               MCA_PML_BASE_SEND_STANDARD, bridge_comm );
        if ( rc != MPI_SUCCESS ) {
            goto err_exit;
        }
        rc = mca_pml.pml_wait ( 1, &req, NULL, &status);
        if ( rc != MPI_SUCCESS ) {
            goto err_exit;
        }
    }

    rc = local_comm->c_coll.coll_bcast( rvpids, rsize, MPI_UNSIGNED,
                                        local_leader, local_comm );
    if ( rc != MPI_SUCCESS ) {
        goto err_exit;
    }

    for ( i = 0; i < rsize; i++ ) {
        rprocs[i] = ompi_proc_find ( jobid, rvpids[i] );
    }

 err_exit:
    if ( NULL != vpids ) {
        free ( vpids );
    }
    if ( NULL != rvpids) {
        free ( rvpids );
    }
    /* rprocs has to be freed in the level above (i.e. intercomm_create) */
#endif
    return rprocs;
}
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high )
{
    int flag, rhigh;
    int local_rank, rc;
    ompi_proc_t *lvpid, *rvpid;
    ompi_ns_cmp_bitmask_t mask;

    lvpid = intercomm->c_local_group->grp_proc_pointers[0];
    rvpid = intercomm->c_remote_group->grp_proc_pointers[0];
    local_rank = ompi_comm_rank ( intercomm );

    /*
     * determine maximal high value over the intercomm
     */
    mask = OMPI_NS_CMP_CELLID | OMPI_NS_CMP_JOBID | OMPI_NS_CMP_VPID;
    if ( ompi_name_server.compare(mask, &lvpid->proc_name,
                                  &rvpid->proc_name) < 0 ) {
        if ( 0 == local_rank ) {
            rc = intercomm->c_coll.coll_bcast(&high, 1, MPI_INT, MPI_ROOT,
                                              intercomm );
        }
        else {
            rc = intercomm->c_coll.coll_bcast(&high, 1, MPI_INT, MPI_PROC_NULL,
                                              intercomm );
        }
        if ( rc != MPI_SUCCESS ) {
            return rc;
        }

        rc = intercomm->c_coll.coll_bcast ( &rhigh, 1, MPI_INT, 0, intercomm );
        if ( rc != MPI_SUCCESS ) {
            return rc;
        }
    }
    else {
        rc = intercomm->c_coll.coll_bcast ( &rhigh, 1, MPI_INT, 0, intercomm );
        if ( rc != MPI_SUCCESS ) {
            return rc;
        }

        if ( 0 == local_rank ) {
            rc = intercomm->c_coll.coll_bcast ( &high, 1, MPI_INT, MPI_ROOT,
                                                intercomm );
        }
        else {
            rc = intercomm->c_coll.coll_bcast(&high, 1, MPI_INT, MPI_PROC_NULL,
                                              intercomm);
        }
        if ( rc != MPI_SUCCESS ) {
            return rc;
        }
    }

    /* This is the logic for determining who is first and who is second */
    if ( high && !rhigh ) {
        flag = true;
    }
    else if ( !high && rhigh ) {
        flag = false;
    }
    else {
        if ( lvpid > rvpid ) {
            flag = true;
        }
        else {
            flag = false;
        }
    }

    return flag;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
int ompi_comm_dump ( ompi_communicator_t *comm )
{
    mca_pml_ptl_comm_t *pml_comm;
    mca_ptl_sequence_t *seq;
    ompi_list_t *list;
    int i;

    printf("Dumping information for comm_cid %d\n", comm->c_contextid);
    printf("  f2c index:%d cube_dim: %d\n", comm->c_f_to_c_index,
           comm->c_cube_dim);
    printf("  Local group: size = %d my_rank = %d\n",
           comm->c_local_group->grp_proc_count,
           comm->c_local_group->grp_my_rank );

    printf("  Communicator is:");
    /* Display flags */
    if ( OMPI_COMM_IS_INTER(comm) ) {
        printf(" inter-comm,");
    }
    if ( OMPI_COMM_IS_CART(comm) ) {
        printf(" topo-cart,");
    }
    if ( OMPI_COMM_IS_GRAPH(comm) ) {
        printf(" topo-graph");
    }
    printf("\n");

    if (OMPI_COMM_IS_INTER(comm)) {
        printf("  Remote group size:%d\n", comm->c_remote_group->grp_proc_count);
    }

    /* Dump the c_pml_comm->c_unexpected_frags for each process */
    pml_comm = (mca_pml_ptl_comm_t *)comm->c_pml_comm;
    seq = (mca_ptl_sequence_t *) pml_comm->c_frags_cant_match;
    for ( i = 0; i < comm->c_local_group->grp_proc_count; i++ ){
        list = (ompi_list_t *)seq+i;
        printf("%d: head->list_next:%p head->list_prev:%p"
               " tail->list_next:%p tail->list_prev:%p\n",
               i,
               (char*)list->ompi_list_head.ompi_list_next,
               (char*)list->ompi_list_head.ompi_list_prev,
               (char*)list->ompi_list_tail.ompi_list_next,
               (char*)list->ompi_list_tail.ompi_list_prev );
        fflush(stdout);
    }

    return MPI_SUCCESS;
}

/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
/* static functions */

/*
** rankkeycompare() compares a tuple of (rank,key) producing
** sorted lists that match the rules needed for MPI_Comm_split
*/
static int rankkeycompare (const void *p, const void *q)
{
    int *a, *b;

    /* ranks at [0], key at [1] */
    /* i.e. we cast and just compare the keys and then the original ranks.. */
    a = (int*)p;
    b = (int*)q;

    /* simple tests are those where the keys are different */
    if (a[1] < b[1]) {
        return (-1);
    }
    if (a[1] > b[1]) {
        return (1);
    }

    /* ok, if the keys are the same then we check the original ranks */
    if (a[1] == b[1]) {
        if (a[0] < b[0]) {
            return (-1);
        }
        if (a[0] == b[0]) {
            return (0);
        }
        if (a[0] > b[0]) {
            return (1);
        }
    }
    return ( 0 );
}
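/*
** Illustrative example of the ordering rankkeycompare produces (hand-worked,
** not taken from a run): qsort'ing the (rank,key) pairs
** {0,5}, {1,3}, {2,5}, {3,3} yields {1,3}, {3,3}, {0,5}, {2,5} --
** ascending by key, with ties broken by the original rank, which is
** exactly the ordering ompi_comm_split relies on when it builds the
** proc list for the new communicator.
*/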