diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 59e812a5fd..1789f714fc 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -139,54 +139,8 @@ static opal_mutex_t ompi_cid_lock; static opal_list_t ompi_registered_comms; -/* This variable is zero (false) if all processes in MPI_COMM_WORLD - * did not require MPI_THREAD_MULTIPLE support, and is 1 (true) as - * soon as at least one process requested support for THREAD_MULTIPLE */ -static int ompi_comm_world_thread_level_mult=0; - - int ompi_comm_cid_init (void) { -#if OMPI_ENABLE_THREAD_MULTIPLE - ompi_proc_t **procs, *thisproc; - uint8_t thread_level; - uint8_t *tlpointer; - int ret; - size_t i, size, numprocs; - - /** Note that the following call only returns processes - * with the same jobid. This is on purpose, since - * we switch for the dynamic communicators anyway - * to the original (slower) cid allocation algorithm. - */ - procs = ompi_proc_world ( &numprocs ); - - for ( i=0; i<numprocs; i++ ) { - thisproc = procs[i]; - - OPAL_MODEX_RECV_STRING(ret, "MPI_THREAD_LEVEL", &thisproc->super.proc_name, - (uint8_t**)&tlpointer, &size); - if (OMPI_SUCCESS == ret) { - thread_level = *((uint8_t *) tlpointer); - if ( OMPI_THREADLEVEL_IS_MULTIPLE (thread_level) ) { - ompi_comm_world_thread_level_mult = 1; - break; - } - } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { - if (ompi_mpi_thread_multiple) { - ompi_comm_world_thread_level_mult = 1; - } - break; - } else { - return ret; - } - } - free(procs); -#else - ompi_comm_world_thread_level_mult = 0; // silence compiler warning if not used -#endif - return OMPI_SUCCESS; } diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 2b761d93df..a7f302bbd4 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2010 University of Houston. All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2012-2014 Los Alamos National Security, LLC. + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011-2013 Inria. All rights reserved. * Copyright (c) 2011-2013 Universite Bordeaux 1 @@ -102,12 +102,26 @@ int ompi_comm_init(void) OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t); assert(ompi_mpi_comm_world.comm.c_f_to_c_index == 0); group = OBJ_NEW(ompi_group_t); - group->grp_proc_pointers = ompi_proc_world(&size); - group->grp_proc_count = (int)size; + + size = ompi_process_info.num_procs; + group->grp_proc_pointers = (ompi_proc_t **) calloc (size, sizeof (ompi_proc_t *)); + group->grp_proc_count = size; + + for (size_t i = 0 ; i < size ; ++i) { + opal_process_name_t name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid}; + /* look for existing ompi_proc_t that matches this name */ + group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_lookup (name); + if (NULL == group->grp_proc_pointers[i]) { + /* set sentinel value */ + group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_name_to_sentinel (name); + } else { + OBJ_RETAIN (group->grp_proc_pointers[i]); + } + } + OMPI_GROUP_SET_INTRINSIC (group); OMPI_GROUP_SET_DENSE (group); ompi_set_group_rank(group, ompi_proc_local()); - ompi_group_increment_proc_count (group); ompi_mpi_comm_world.comm.c_contextid = 0; ompi_mpi_comm_world.comm.c_id_start_index = 4; diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index a9e7f6c960..2d298127a6 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -13,7 +13,7 @@ * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. 
* Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science @@ -1293,6 +1293,22 @@ static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs) /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ +static bool ompi_dpm_group_is_dyn (ompi_group_t *group, ompi_jobid_t thisjobid) +{ + int size = group ? ompi_group_size (group) : 0; + + for (int i = 1 ; i < size ; ++i) { + opal_process_name_t name = ompi_group_get_proc_name (group, i); + + if (thisjobid != ((ompi_process_name_t *) &name)->jobid) { + /* at least one is different */ + return true; + } + } + + return false; +} + /* All we want to do in this function is determine if the number of * jobids in the local and/or remote group is > 1. This tells us to * set the disconnect flag. We don't actually care what the true @@ -1300,56 +1316,30 @@ static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs) */ void ompi_dpm_mark_dyncomm(ompi_communicator_t *comm) { - int i; - int size, rsize; - bool found=false; + bool found; ompi_jobid_t thisjobid; - ompi_group_t *grp=NULL; - ompi_proc_t *proc = NULL; /* special case for MPI_COMM_NULL */ if (comm == MPI_COMM_NULL) { return; } - size = ompi_comm_size(comm); - rsize = ompi_comm_remote_size(comm); + thisjobid = ompi_group_get_proc_name (comm->c_local_group, 0).jobid; /* loop over all processes in local group and check for * a different jobid */ - grp = comm->c_local_group; - proc = ompi_group_peer_lookup(grp,0); - thisjobid = ((ompi_process_name_t*)&proc->super.proc_name)->jobid; - - for (i=1; i< size; i++) { - proc = ompi_group_peer_lookup(grp,i); - if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) { - /* at least one is different */ - found = true; - goto complete; - } + found = ompi_dpm_group_is_dyn (comm->c_local_group, thisjobid); + if (!found) { + /* if inter-comm, loop over all processes in remote_group + * and see if any are different from thisjobid + */ + found = ompi_dpm_group_is_dyn (comm->c_remote_group, thisjobid); } - /* if inter-comm, loop over all processes in remote_group - * and see if any are different from thisjobid - */ - grp = comm->c_remote_group; - for (i=0; i< rsize; i++) { - proc = ompi_group_peer_lookup(grp,i); - if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) { - /* at least one is different */ - found = true; - break; - } - } - - complete: /* if a different jobid was found, set the disconnect flag*/ if (found) { ompi_comm_num_dyncomm++; OMPI_COMM_SET_DYNAMIC(comm); } - - return; } diff --git a/ompi/group/group.c b/ompi/group/group.c index d489028d58..fe0b60c246 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -14,7 +14,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012-2013 Inria. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. 
* Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -49,16 +49,14 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, ompi_group_t *group2, int *ranks2) { - int rank, proc, proc2; - struct ompi_proc_t *proc1_pointer, *proc2_pointer; - if ( MPI_GROUP_EMPTY == group1 || MPI_GROUP_EMPTY == group2 ) { - for (proc = 0; proc < n_ranks ; proc++) { + for (int proc = 0; proc < n_ranks ; ++proc) { ranks2[proc] = MPI_UNDEFINED; } return MPI_SUCCESS; } +#if OMPI_GROUP_SPARSE /* * If we are translating from a parent to a child that uses the sparse format * or vice versa, we use the translate ranks function corresponding to the @@ -80,8 +78,11 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, (group1,n_ranks,ranks1,group2,ranks2); } + /* unknown sparse group type */ + assert (0); } - else if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/ + + if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/ if(OMPI_GROUP_IS_SPORADIC(group2)) { return ompi_group_translate_ranks_sporadic (group1,n_ranks,ranks1,group2,ranks2); @@ -95,28 +96,32 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, (group1,n_ranks,ranks1,group2,ranks2); } + /* unknown sparse group type */ + assert (0); } - else { - /* loop over all ranks */ - for (proc = 0; proc < n_ranks; proc++) { - rank=ranks1[proc]; - if ( MPI_PROC_NULL == rank) { - ranks2[proc] = MPI_PROC_NULL; - } - else { - proc1_pointer = ompi_group_peer_lookup(group1 ,rank); - /* initialize to no "match" */ - ranks2[proc] = MPI_UNDEFINED; - for (proc2 = 0; proc2 < group2->grp_proc_count; proc2++) { - proc2_pointer= ompi_group_peer_lookup(group2, proc2); - if ( proc1_pointer == proc2_pointer) { - ranks2[proc] = proc2; - break; - } - } /* end proc2 loop */ - } /* end proc loop */ +#endif + + /* loop over all ranks */ + for (int proc = 0; proc < n_ranks; ++proc) { + struct ompi_proc_t *proc1_pointer, *proc2_pointer; + int rank = ranks1[proc]; + + if ( MPI_PROC_NULL == rank) { + ranks2[proc] = MPI_PROC_NULL; + continue; } - } + + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, rank); + /* initialize to no "match" */ + ranks2[proc] = MPI_UNDEFINED; + for (int proc2 = 0; proc2 < group2->grp_proc_count; ++proc2) { + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + if ( proc1_pointer == proc2_pointer) { + ranks2[proc] = proc2; + break; + } + } /* end proc2 loop */ + } /* end proc loop */ return MPI_SUCCESS; } @@ -168,25 +173,6 @@ int ompi_group_dump (ompi_group_t* group) return OMPI_SUCCESS; } -/* - * This is the function that iterates through the sparse groups to the dense group - * to reach the process pointer - */ -ompi_proc_t* ompi_group_get_proc_ptr (ompi_group_t* group , int rank) -{ - int ranks1,ranks2; - do { - if(OMPI_GROUP_IS_DENSE(group)) { - return group->grp_proc_pointers[rank]; - } - ranks1 = rank; - ompi_group_translate_ranks( group, 1, &ranks1, - group->grp_parent_group_ptr,&ranks2); - rank = ranks2; - group = group->grp_parent_group_ptr; - } while (1); -} - int ompi_group_minloc ( int list[] , int length ) { int i,index,min; @@ -568,3 +554,23 @@ int ompi_group_compare(ompi_group_t *group1, return return_value; } + +bool ompi_group_have_remote_peers (ompi_group_t *group) +{ + for (size_t i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + if ((intptr_t) group->grp_proc_pointers[i] < 0) { + return true; + } + proc = 
group->grp_proc_pointers[i]; +#endif + if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + return true; + } + } + + return false; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 797f52933c..0f8871fb7f 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -14,7 +14,7 @@ * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -252,8 +252,6 @@ int ompi_group_free (ompi_group_t **group); /** * Functions to handle process pointers for sparse group formats */ -OMPI_DECLSPEC ompi_proc_t* ompi_group_get_proc_ptr (ompi_group_t* group , int rank); - int ompi_group_translate_ranks_sporadic ( ompi_group_t *group1, int n_ranks, const int *ranks1, ompi_group_t *group2, @@ -324,25 +322,93 @@ int ompi_group_calc_bmap ( int n, int orig_size , const int *ranks ); */ int ompi_group_minloc (int list[], int length); +/** + * @brief Helper function for retrieving the proc of a group member in a dense group + * + * This function exists to handle the translation of sentinel group members to real + * ompi_proc_t's. If a sentinel value is found and allocate is true then this function + * looks for an existing ompi_proc_t using ompi_proc_for_name which will allocate an + * ompi_proc_t if one does not exist. If allocate is false then sentinel values translate + * to NULL. + */ +static inline struct ompi_proc_t *ompi_group_dense_lookup (ompi_group_t *group, const int peer_id, const bool allocate) +{ +#if OPAL_ENABLE_DEBUG + if (peer_id >= group->grp_proc_count) { + opal_output(0, "ompi_group_dense_lookup: invalid peer index (%d)", peer_id); + return (struct ompi_proc_t *) NULL; + } +#endif + + if (OPAL_UNLIKELY((intptr_t) group->grp_proc_pointers[peer_id] < 0)) { + if (!allocate) { + return NULL; + } + + /* replace sentinel value with an actual ompi_proc_t */ + group->grp_proc_pointers[peer_id] = + (ompi_proc_t *) ompi_proc_for_name (ompi_proc_sentinel_to_name ((intptr_t) group->grp_proc_pointers[peer_id])); + OBJ_RETAIN(group->grp_proc_pointers[peer_id]); + } + + return group->grp_proc_pointers[peer_id]; +} + +/* + * This is the function that iterates through the sparse groups to the dense group + * to reach the process pointer + */ +static inline ompi_proc_t *ompi_group_get_proc_ptr (ompi_group_t *group, int rank, const bool allocate) +{ +#if OMPI_GROUP_SPARSE + do { + if (OMPI_GROUP_IS_DENSE(group)) { + return ompi_group_dense_lookup (group, rank, allocate); + } + int ranks1 = rank; + ompi_group_translate_ranks (group, 1, &ranks1, group->grp_parent_group_ptr, &rank); + group = group->grp_parent_group_ptr; + } while (1); +#else + return ompi_group_dense_lookup (group, rank, allocate); +#endif +} + +/** + * @brief Get the raw proc pointer from the group + * + * This function will either return an ompi_proc_t if one exists (either stored in the group + * or cached in the proc hash table) or a sentinel value representing the proc. This + * differs from ompi_group_get_proc_ptr() which returns the ompi_proc_t or NULL. 
+ */ +ompi_proc_t *ompi_group_get_proc_ptr_raw (ompi_group_t *group, int rank); + +static inline opal_process_name_t ompi_group_get_proc_name (ompi_group_t *group, int rank) +{ + ompi_proc_t *proc = ompi_group_get_proc_ptr_raw (group, rank); + if ((intptr_t) proc < 0) { + return ompi_proc_sentinel_to_name ((intptr_t) proc); + } + + return proc->super.proc_name; +} + /** * Inline function to check if sparse groups are enabled and return the direct access * to the proc pointer, otherwise the lookup function */ static inline struct ompi_proc_t* ompi_group_peer_lookup(ompi_group_t *group, int peer_id) { -#if OPAL_ENABLE_DEBUG - if (peer_id >= group->grp_proc_count) { - opal_output(0, "ompi_group_lookup_peer: invalid peer index (%d)", peer_id); - return (struct ompi_proc_t *) NULL; - } -#endif -#if OMPI_GROUP_SPARSE - return ompi_group_get_proc_ptr (group, peer_id); -#else - return group->grp_proc_pointers[peer_id]; -#endif + return ompi_group_get_proc_ptr (group, peer_id, true); } +static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t *group, int peer_id) +{ + return ompi_group_get_proc_ptr (group, peer_id, false); +} + +bool ompi_group_have_remote_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/group/group_init.c b/ompi/group/group_init.c index 67e5af61e4..5352493c4f 100644 --- a/ompi/group/group_init.c +++ b/ompi/group/group_init.c @@ -210,14 +210,13 @@ ompi_group_t *ompi_group_allocate_bmap(int orig_group_size , int group_size) */ void ompi_group_increment_proc_count(ompi_group_t *group) { - int proc; ompi_proc_t * proc_pointer; - for (proc = 0; proc < group->grp_proc_count; proc++) { - proc_pointer = ompi_group_peer_lookup(group,proc); - OBJ_RETAIN(proc_pointer); + for (int proc = 0 ; proc < group->grp_proc_count ; ++proc) { + proc_pointer = ompi_group_peer_lookup_existing (group, proc); + if (proc_pointer) { + OBJ_RETAIN(proc_pointer); + } } - - return; } /* @@ -226,14 +225,13 @@ void ompi_group_increment_proc_count(ompi_group_t *group) void ompi_group_decrement_proc_count(ompi_group_t *group) { - int proc; ompi_proc_t * proc_pointer; - for (proc = 0; proc < group->grp_proc_count; proc++) { - proc_pointer = ompi_group_peer_lookup(group,proc); - OBJ_RELEASE(proc_pointer); + for (int proc = 0 ; proc < group->grp_proc_count ; ++proc) { + proc_pointer = ompi_group_peer_lookup_existing (group, proc); + if (proc_pointer) { + OBJ_RELEASE(proc_pointer); + } } - - return; } /* @@ -255,9 +253,6 @@ static void ompi_group_construct(ompi_group_t *new_group) /* default the sparse values for groups */ new_group->grp_parent_group_ptr = NULL; - - /* return */ - return; } @@ -300,9 +295,6 @@ static void ompi_group_destruct(ompi_group_t *group) opal_pointer_array_set_item(&ompi_group_f_to_c_table, group->grp_f_to_c_index, NULL); } - - /* return */ - return; } diff --git a/ompi/group/group_plist.c b/ompi/group/group_plist.c index ebf2f1a85a..0f17422e22 100644 --- a/ompi/group/group_plist.c +++ b/ompi/group/group_plist.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -29,6 +29,66 @@ #include +static int ompi_group_dense_overlap (ompi_group_t *group1, ompi_group_t *group2, opal_bitmap_t *bitmap) +{ + ompi_proc_t *proc1_pointer, *proc2_pointer; + int rc, overlap_count; + + overlap_count = 0; + + for (int proc1 = 0 ; proc1 < group1->grp_proc_count ; ++proc1) { + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1); + + /* check to see if this proc is in group2 */ + for (int proc2 = 0 ; proc2 < group2->grp_proc_count ; ++proc2) { + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + if( proc1_pointer == proc2_pointer ) { + rc = opal_bitmap_set_bit (bitmap, proc2); + if (OPAL_SUCCESS != rc) { + return rc; + } + ++overlap_count; + + break; + } + } /* end proc2 loop */ + } /* end proc1 loop */ + + return overlap_count; +} + +static struct ompi_proc_t *ompi_group_dense_lookup_raw (ompi_group_t *group, const int peer_id) +{ + if (OPAL_UNLIKELY((intptr_t) group->grp_proc_pointers[peer_id] < 0)) { + ompi_proc_t *proc = + (ompi_proc_t *) ompi_proc_lookup (ompi_proc_sentinel_to_name ((intptr_t) group->grp_proc_pointers[peer_id])); + if (NULL != proc) { + /* replace sentinel value with an actual ompi_proc_t */ + group->grp_proc_pointers[peer_id] = proc; + /* retain the proc */ + OBJ_RETAIN(group->grp_proc_pointers[peer_id]); + } + } + + return group->grp_proc_pointers[peer_id]; +} + +ompi_proc_t *ompi_group_get_proc_ptr_raw (ompi_group_t *group, int rank) +{ +#if OMPI_GROUP_SPARSE + do { + if (OMPI_GROUP_IS_DENSE(group)) { + return ompi_group_dense_lookup_raw (group, rank); + } + int ranks1 = rank; + ompi_group_translate_ranks (group, 1, &ranks1, group->grp_parent_group_ptr, &rank); + group = group->grp_parent_group_ptr; + } while (1); +#else + return ompi_group_dense_lookup_raw (group, rank); +#endif +} + int ompi_group_calc_plist ( int n , const int *ranks ) { return sizeof(char *) * n ; } @@ -37,9 +97,8 @@ int ompi_group_incl_plist(ompi_group_t* group, int n, const int *ranks, ompi_group_t **new_group) { /* local variables */ - int proc,my_group_rank; + int my_group_rank; ompi_group_t *group_pointer, *new_group_pointer; - ompi_proc_t *my_proc_pointer; group_pointer = (ompi_group_t *)group; @@ -56,9 +115,9 @@ } /* put group elements in the list */ - for (proc = 0; proc < n; proc++) { + for (int proc = 0; proc < n; proc++) { new_group_pointer->grp_proc_pointers[proc] = - ompi_group_peer_lookup(group_pointer,ranks[proc]); + ompi_group_get_proc_ptr_raw (group_pointer, ranks[proc]); } /* end proc loop */ /* increment proc reference counters */ @@ -67,10 +126,8 @@ /* find my rank */ my_group_rank=group_pointer->grp_my_rank; if (MPI_UNDEFINED != my_group_rank) { - my_proc_pointer=ompi_group_peer_lookup (group_pointer,my_group_rank); - ompi_set_group_rank(new_group_pointer,my_proc_pointer); - } - else { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); + } else { new_group_pointer->grp_my_rank = MPI_UNDEFINED; } @@ -87,114 +144,77 @@ int ompi_group_union (ompi_group_t* group1, ompi_group_t* group2, ompi_group_t **new_group) { /* local variables */ - int new_group_size, proc1, proc2, found_in_group; - int my_group_rank, cnt; - ompi_group_t *group1_pointer, *group2_pointer, *new_group_pointer; - ompi_proc_t *proc1_pointer, *proc2_pointer, *my_proc_pointer = NULL; - - group1_pointer = (ompi_group_t *) group1; - group2_pointer = (ompi_group_t *) group2; + int new_group_size, cnt, 
rc, overlap_count; + ompi_group_t *new_group_pointer; + ompi_proc_t *proc2_pointer; + opal_bitmap_t bitmap; /* * form union */ /* get new group size */ - new_group_size = group1_pointer->grp_proc_count; + OBJ_CONSTRUCT(&bitmap, opal_bitmap_t); + rc = opal_bitmap_init (&bitmap, 32); + if (OPAL_SUCCESS != rc) { + return rc; + } /* check group2 elements to see if they need to be included in the list */ - for (proc2 = 0; proc2 < group2_pointer->grp_proc_count; proc2++) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - - /* check to see if this proc2 is alread in the group */ - found_in_group = 0; - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - - if (proc1_pointer == proc2_pointer) { - /* proc2 is in group1 - don't double count */ - found_in_group = 1; - break; - } - } /* end proc1 loop */ - - if (found_in_group) { - continue; - } - - new_group_size++; - } /* end proc loop */ + overlap_count = ompi_group_dense_overlap (group1, group2, &bitmap); + if (0 > overlap_count) { + OBJ_DESTRUCT(&bitmap); + return overlap_count; + } + new_group_size = group1->grp_proc_count + group2->grp_proc_count - overlap_count; if ( 0 == new_group_size ) { *new_group = MPI_GROUP_EMPTY; OBJ_RETAIN(MPI_GROUP_EMPTY); + OBJ_DESTRUCT(&bitmap); return MPI_SUCCESS; } /* get new group struct */ new_group_pointer = ompi_group_allocate(new_group_size); if (NULL == new_group_pointer) { + OBJ_DESTRUCT(&bitmap); return MPI_ERR_GROUP; } /* fill in the new group list */ /* put group1 elements in the list */ - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { + for (int proc1 = 0; proc1 < group1->grp_proc_count; ++proc1) { new_group_pointer->grp_proc_pointers[proc1] = - ompi_group_peer_lookup(group1_pointer,proc1); + ompi_group_get_proc_ptr_raw (group1, proc1); } - cnt = group1_pointer->grp_proc_count; + cnt = group1->grp_proc_count; /* check group2 elements to see if they need to be included in the list */ - for (proc2 = 0; proc2 < group2_pointer->grp_proc_count; proc2++) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - - /* check to see if this proc2 is alread in the group */ - found_in_group = 0; - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - - if (proc1_pointer == proc2_pointer) { - /* proc2 is in group1 - don't double count */ - found_in_group = 1; - break; - } - } /* end proc1 loop */ - - if (found_in_group) { + for (int proc2 = 0; proc2 < group2->grp_proc_count; ++proc2) { + if (opal_bitmap_is_set_bit (&bitmap, proc2)) { continue; } - new_group_pointer->grp_proc_pointers[cnt] = - ompi_group_peer_lookup(group2_pointer,proc2); - cnt++; + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + new_group_pointer->grp_proc_pointers[cnt++] = proc2_pointer; } /* end proc loop */ + OBJ_DESTRUCT(&bitmap); + /* increment proc reference counters */ ompi_group_increment_proc_count(new_group_pointer); /* find my rank */ - my_group_rank = group1_pointer->grp_my_rank; - if (MPI_UNDEFINED == my_group_rank) { - my_group_rank = group2_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank) { - my_proc_pointer = ompi_group_peer_lookup(group2_pointer,my_group_rank); - } + if (MPI_UNDEFINED != group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); } else { - my_proc_pointer = ompi_group_peer_lookup(group1_pointer,my_group_rank); - } - - if 
( MPI_UNDEFINED == my_group_rank ) { new_group_pointer->grp_my_rank = MPI_UNDEFINED; } - else { - ompi_set_group_rank(new_group_pointer, my_proc_pointer); - } *new_group = (MPI_Group) new_group_pointer; - return OMPI_SUCCESS; } @@ -206,96 +226,65 @@ int ompi_group_difference(ompi_group_t* group1, ompi_group_t* group2, ompi_group_t **new_group) { /* local varibles */ - int new_group_size, proc1, proc2, found_in_group2, cnt; - int my_group_rank; - ompi_group_t *group1_pointer, *group2_pointer, *new_group_pointer; - ompi_proc_t *proc1_pointer, *proc2_pointer, *my_proc_pointer = NULL; - - - group1_pointer=(ompi_group_t *)group1; - group2_pointer=(ompi_group_t *)group2; + int new_group_size, overlap_count, rc; + ompi_group_t *new_group_pointer; + ompi_proc_t *proc1_pointer; + opal_bitmap_t bitmap; /* * form union */ /* get new group size */ - new_group_size=0; + OBJ_CONSTRUCT(&bitmap, opal_bitmap_t); + rc = opal_bitmap_init (&bitmap, 32); + if (OPAL_SUCCESS != rc) { + return rc; + } - /* loop over group1 members */ - for( proc1=0; proc1 < group1_pointer->grp_proc_count; proc1++ ) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - /* check to see if this proc is in group2 */ - found_in_group2=0; - for( proc2=0 ; proc2 < group2_pointer->grp_proc_count ; proc2++ ) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - if( proc1_pointer == proc2_pointer ) { - found_in_group2=true; - break; - } - } /* end proc1 loop */ - if(found_in_group2) { - continue; - } - new_group_size++; - } /* end proc loop */ + /* check group2 elements to see if they need to be included in the list */ + overlap_count = ompi_group_dense_overlap (group2, group1, &bitmap); + if (0 > overlap_count) { + OBJ_DESTRUCT(&bitmap); + return overlap_count; + } + new_group_size = group1->grp_proc_count - overlap_count; if ( 0 == new_group_size ) { *new_group = MPI_GROUP_EMPTY; OBJ_RETAIN(MPI_GROUP_EMPTY); + OBJ_DESTRUCT(&bitmap); return MPI_SUCCESS; } /* allocate a new ompi_group_t structure */ - new_group_pointer=ompi_group_allocate(new_group_size); + new_group_pointer = ompi_group_allocate(new_group_size); if( NULL == new_group_pointer ) { + OBJ_DESTRUCT(&bitmap); return MPI_ERR_GROUP; } /* fill in group list */ - cnt=0; /* loop over group1 members */ - for( proc1=0; proc1 < group1_pointer->grp_proc_count; proc1++ ) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - /* check to see if this proc is in group2 */ - found_in_group2=0; - for( proc2=0 ; proc2 < group2_pointer->grp_proc_count ; proc2++ ) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - if( proc1_pointer == proc2_pointer ) { - found_in_group2=true; - break; - } - } /* end proc1 loop */ - if(found_in_group2) { + for (int proc1 = 0, cnt = 0 ; proc1 < group1->grp_proc_count ; ++proc1) { + if (opal_bitmap_is_set_bit (&bitmap, proc1)) { continue; } - new_group_pointer->grp_proc_pointers[cnt] = - ompi_group_peer_lookup(group1_pointer,proc1); - - cnt++; + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1); + new_group_pointer->grp_proc_pointers[cnt++] = proc1_pointer; } /* end proc loop */ + OBJ_DESTRUCT(&bitmap); + /* increment proc reference counters */ ompi_group_increment_proc_count(new_group_pointer); /* find my rank */ - my_group_rank=group1_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank ) { - my_proc_pointer = ompi_group_peer_lookup(group1_pointer,my_group_rank); - } - else { - my_group_rank=group2_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank ) { - my_proc_pointer = 
ompi_group_peer_lookup(group2_pointer,my_group_rank); - } - } - - if ( MPI_UNDEFINED == my_group_rank ) { + if (MPI_UNDEFINED == group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) { new_group_pointer->grp_my_rank = MPI_UNDEFINED; - } - else { - ompi_set_group_rank(new_group_pointer,my_proc_pointer); + } else { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); } *new_group = (MPI_Group)new_group_pointer; diff --git a/ompi/group/group_set_rank.c b/ompi/group/group_set_rank.c index 8529970ae7..16b8401743 100644 --- a/ompi/group/group_set_rank.c +++ b/ompi/group/group_set_rank.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,12 +41,10 @@ void ompi_set_group_rank(ompi_group_t *group, struct ompi_proc_t *proc_pointer) for (proc = 0; proc < group->grp_proc_count; proc++) { /* check and see if this proc pointer matches proc_pointer */ - if (ompi_group_peer_lookup(group,proc) == proc_pointer) { + if (ompi_group_peer_lookup_existing (group, proc) == proc_pointer) { group->grp_my_rank = proc; - } + break; + } } /* end proc loop */ } - - /* return */ - return; } diff --git a/ompi/mca/bml/base/base.h b/ompi/mca/bml/base/base.h index 27cd1e568c..595deaf72a 100644 --- a/ompi/mca/bml/base/base.h +++ b/ompi/mca/bml/base/base.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,6 +28,7 @@ #include "ompi/mca/mca.h" #include "opal/mca/base/mca_base_framework.h" #include "ompi/mca/bml/bml.h" +#include "ompi/proc/proc.h" /* @@ -60,6 +64,14 @@ OMPI_DECLSPEC extern mca_bml_base_component_t mca_bml_component; OMPI_DECLSPEC extern mca_bml_base_module_t mca_bml; OMPI_DECLSPEC extern mca_base_framework_t ompi_bml_base_framework; +static inline struct mca_bml_base_endpoint_t *mca_bml_base_get_endpoint (struct ompi_proc_t *proc) { + if (OPAL_UNLIKELY(NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML])) { + mca_bml.bml_add_proc (proc); + } + + return (struct mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; +} + END_C_DECLS #endif /* MCA_BML_BASE_H */ diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 41e8496066..df731a64a0 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -160,14 +160,11 @@ static inline bool mca_bml_base_btl_array_remove( mca_bml_base_btl_array_t* arra */ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t* array, size_t item_index) { -#if OPAL_ENABLE_DEBUG - if(item_index >= array->arr_size) { - opal_output(0, "mca_bml_base_btl_array_get_index: invalid array index %lu >= %lu", - (unsigned long)item_index, (unsigned long)array->arr_size); - return 0; + if (item_index < array->arr_size) { + return &array->bml_btls[item_index]; } -#endif - return &array->bml_btls[item_index]; + + return NULL; } /** @@ -441,7 +438,7 @@ typedef int (*mca_bml_base_module_finalize_fn_t)( void ); * @return OMPI_SUCCESS or error status on failure. * * The mca_bml_base_module_add_procs_fn_t() is called by the PML to - * determine the set of BMLs that should be used to reach each process. + * determine the set of BTLs that should be used to reach each process. * Any addressing information exported by the peer via the mca_base_modex_send() * function should be available during this call via the corresponding * mca_base_modex_recv() function. The BML may utilize this information to @@ -465,6 +462,25 @@ typedef int (*mca_bml_base_module_add_procs_fn_t)( struct opal_bitmap_t* reachable ); +/** + * PML->BML notification of change in the process list. + * + * @param proc (IN) Process + * @return OMPI_SUCCESS or error status on failure. + * + * The mca_bml_base_module_add_proc_fn_t() is called by the PML to + * determine the set of BTLs that should be used to reach each process. + * Any addressing information exported by the peer via the mca_base_modex_send() + * function should be available during this call via the corresponding + * mca_base_modex_recv() function. The BML may utilize this information to + * determine reachability of each peer process. + * + * \note This function will return OMPI_ERR_UNREACH if the process can not + * be reached by a currently active BTL. This is not a fatal error, and the + * calling layer is free to continue using the BML interface. + */ +typedef int (*mca_bml_base_module_add_proc_fn_t) (struct ompi_proc_t *proc); + /** * Notification of change to the process list. 
* @@ -559,6 +575,7 @@ struct mca_bml_base_module_t { mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */ /* BML function table */ + mca_bml_base_module_add_proc_fn_t bml_add_proc; mca_bml_base_module_add_procs_fn_t bml_add_procs; mca_bml_base_module_del_procs_fn_t bml_del_procs; mca_bml_base_module_add_btl_fn_t bml_add_btl; diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 345facd037..182b0da00d 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved @@ -144,6 +144,293 @@ static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *bt } } +static mca_bml_base_endpoint_t *mca_bml_r2_allocate_endpoint (ompi_proc_t *proc) { + mca_bml_base_endpoint_t *bml_endpoint; + + /* allocate bml specific proc data */ + bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); + if (NULL == bml_endpoint) { + opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); + return NULL; + } + + /* preallocate space in array for max number of r2s */ + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); + bml_endpoint->btl_max_send_size = -1; + bml_endpoint->btl_proc = proc; + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; + + bml_endpoint->btl_flags_or = 0; + return bml_endpoint; +} + +static void mca_bml_r2_register_progress (mca_btl_base_module_t *btl) +{ + if (NULL != btl->btl_component->btl_progress) { + bool found = false; + + for (size_t p = 0 ; p < mca_bml_r2.num_btl_progress ; ++p) { + if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { + found = true; + break; + } + } + + if (found == false) { + mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress++] = + btl->btl_component->btl_progress; + opal_progress_register (btl->btl_component->btl_progress); + } + } +} + +static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_endpoint_t *bml_endpoint, + mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_endpoint) +{ + mca_bml_base_btl_t* bml_btl = NULL; + int btl_flags = btl->btl_flags; + bool btl_in_use = false; + size_t size; + + /* NTH: these flags should have been sanitized by the btl. Once that is verified these + * checks can be safely removed. */ + if ((btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put)) { + opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" + " the %s BTL without any PUT function attached. Discard the flag !", + btl->btl_component->btl_version.mca_component_name); + btl_flags ^= MCA_BTL_FLAGS_PUT; + } + if ((btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get)) { + opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" + " the %s BTL without any GET function attached. 
Discard the flag !", + btl->btl_component->btl_version.mca_component_name); + btl_flags ^= MCA_BTL_FLAGS_GET; + } + + if ((btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0) { + /* If no protocol specified, we have 2 choices: we ignore the BTL + * as we don't know which protocl to use, or we suppose that all + * BTLs support the send protocol. This is really a btl error as + * these flags should have been sanitized by the btl. */ + btl_flags |= MCA_BTL_FLAGS_SEND; + } + + if (btl_flags & MCA_BTL_FLAGS_SEND) { + /* dont allow an additional BTL with a lower exclusivity ranking */ + bml_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_send, size - 1); + size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send); + + if (!bml_btl || bml_btl->btl->btl_exclusivity < btl->btl_exclusivity) { + /* this btl has higher exclusivity than an existing btl or none exists */ + + opal_output_verbose(1, opal_btl_base_framework.framework_output, + "mca: bml: Using %s btl for send to %s on node %s", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), + proc->super.proc_hostname); + + /* cache the endpoint on the proc */ + if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { + bml_btl = mca_bml_base_btl_array_insert (&bml_endpoint->btl_send); + bml_btl->btl = btl; + bml_btl->btl_endpoint = btl_endpoint; + bml_btl->btl_weight = 0; + bml_btl->btl_flags = btl_flags; + + /** + * calculate the bitwise OR of the btl flags + */ + bml_endpoint->btl_flags_or |= bml_btl->btl_flags; + } else { + opal_output_verbose(20, opal_btl_base_framework.framework_output, + "mca: bml: Not using %s btl for send to %s on node %s " + "because %s btl has higher exclusivity (%d > %d)", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname, + bml_btl->btl->btl_component->btl_version.mca_component_name, + bml_btl->btl->btl_exclusivity, + btl->btl_exclusivity); + } + + btl_in_use = true; + } + } + + /* always add rdma endpoints */ + if ((btl_flags & MCA_BTL_FLAGS_RDMA) && + !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && + (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { + mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); + + bml_btl_rdma->btl = btl; + bml_btl_rdma->btl_endpoint = btl_endpoint; + bml_btl_rdma->btl_weight = 0; + bml_btl_rdma->btl_flags = btl_flags; + + if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) { + bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length; + } + + if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) { + bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size; + } + + btl_in_use = true; + } + + return btl_in_use ? OMPI_SUCCESS : OMPI_ERR_NOT_AVAILABLE; +} + +static void mca_bml_r2_compute_endpoint_metrics (mca_bml_base_endpoint_t *bml_endpoint) +{ + double total_bandwidth = 0; + uint32_t latency; + size_t n_send, n_rdma; + + /* (1) determine the total bandwidth available across all btls + * note that we need to do this here, as we may already have btls configured + * (2) determine the highest priority ranking for latency + * (3) compute the maximum amount of bytes that can be send without any + * weighting. Once the left over is smaller than this number we will + * start using the weight to compute the correct amount. 
+ */ + n_send = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send); + n_rdma = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma); + + /* sort BTLs in descending order according to bandwidth value */ + qsort (bml_endpoint->btl_send.bml_btls, n_send, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + + bml_endpoint->btl_rdma_index = 0; + + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency); + + /* (1) set the weight of each btl as a percentage of overall bandwidth + * (2) copy all btl instances at the highest priority ranking into the + * list of btls used for first fragments + */ + for (size_t n_index = 0 ; n_index < n_send ; ++n_index) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); + mca_btl_base_module_t *btl = bml_btl->btl; + + /* compute weighting factor for this r2 */ + if(btl->btl_bandwidth > 0) { + bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); + } else { + bml_btl->btl_weight = (float)(1.0 / n_send); + } + + /* check to see if this r2 is already in the array of r2s + * used for first fragments - if not add it. + */ + if(btl->btl_latency == latency) { + mca_bml_base_btl_t* bml_btl_new = + mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); + *bml_btl_new = *bml_btl; + } + + /* set endpoint max send size as min of available btls */ + if (bml_endpoint->btl_max_send_size > btl->btl_max_send_size) + bml_endpoint->btl_max_send_size = btl->btl_max_send_size; + } + + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency); + + /* set rdma btl weights */ + for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index); + + /* compute weighting factor for this r2 */ + if (bml_btl->btl->btl_bandwidth > 0.0) { + bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth); + } else { + bml_btl->btl_weight = (float)(1.0 / n_rdma); + } + } +} + +static int mca_bml_r2_add_proc (struct ompi_proc_t *proc) +{ + mca_bml_base_endpoint_t *bml_endpoint; + /* at least one btl is in use */ + bool btl_in_use = false; + int rc; + + if (OPAL_UNLIKELY(NULL == proc)) { + return OMPI_ERR_BAD_PARAM; + } + + /* check if this endpoint is already set up */ + if (NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { + OBJ_RETAIN(proc); + return OMPI_SUCCESS; + } + + /* add btls if not already done */ + if (OMPI_SUCCESS != (rc = mca_bml_r2_add_btls())) { + return rc; + } + + bml_endpoint = mca_bml_r2_allocate_endpoint (proc); + if (OPAL_UNLIKELY(NULL == bml_endpoint)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + for (int p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) { + mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index]; + struct mca_btl_base_endpoint_t *btl_endpoint = NULL; + + /* if the r2 can reach the destination proc it sets the + * corresponding bit (proc index) in the reachable bitmap + * and can return addressing information for each proc + * that is passed back to the r2 on data transfer calls + */ + rc = btl->btl_add_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint, NULL); + if (OMPI_SUCCESS != rc || NULL == btl_endpoint) { + /* This BTL has troubles adding the nodes. 
Let's continue maybe some other BTL + * can take care of this task. */ + continue; + } + + rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoint); + if (OMPI_SUCCESS != rc) { + btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint); + } else { + mca_bml_r2_register_progress (btl); + btl_in_use = true; + } + } + + if (!btl_in_use) { + /* no btl is available for this proc */ + if (mca_bml_r2.show_unreach_errors) { + opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true, + OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), + (NULL != ompi_proc_local_proc->super.proc_hostname ? + ompi_proc_local_proc->super.proc_hostname : "unknown!"), + OMPI_NAME_PRINT(&(proc->super.proc_name)), + (NULL != proc->super.proc_hostname ? + proc->super.proc_hostname : "unknown!"), + btl_names); + } + + return OMPI_ERR_UNREACH; + } + + /* compute metrics for registered btls */ + mca_bml_r2_compute_endpoint_metrics (bml_endpoint); + + return OMPI_SUCCESS; +} + /* * For each proc setup a datastructure that indicates the BTLs * that can be used to reach the destination. @@ -154,7 +441,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct opal_bitmap_t* reachable ) { - size_t p, p_index, n_new_procs = 0; + size_t n_new_procs = 0; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; int rc, ret = OMPI_SUCCESS; @@ -170,7 +457,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, /* Select only the procs that don't yet have the BML proc struct. This prevent * us from calling btl->add_procs several times on the same destination proc. */ - for(p_index = 0; p_index < nprocs; p_index++) { + for (size_t p_index = 0 ; p_index < nprocs ; ++p_index) { struct ompi_proc_t* proc = procs[p_index]; if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { @@ -203,10 +490,9 @@ static int mca_bml_r2_add_procs( size_t nprocs, return OMPI_ERR_OUT_OF_RESOURCE; } - for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { - mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; + for (size_t p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) { + mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; - int btl_flags; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap @@ -217,240 +503,69 @@ static int mca_bml_r2_add_procs( size_t nprocs, memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, (opal_proc_t**)new_procs, btl_endpoints, reachable); - if(OMPI_SUCCESS != rc) { - /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL - * can take care of this task. - */ + if (OMPI_SUCCESS != rc) { + /* This BTL encountered an error while adding procs. Continue in case some other + * BTL(s) can be used. 
*/ continue; } /* for each proc that is reachable */ - for( p = 0; p < n_new_procs; p++ ) { - if(opal_bitmap_is_set_bit(reachable, p)) { - ompi_proc_t *proc = new_procs[p]; - mca_bml_base_endpoint_t * bml_endpoint = - (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - mca_bml_base_btl_t* bml_btl = NULL; - size_t size; - - if(NULL == bml_endpoint) { - /* allocate bml specific proc data */ - bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); - if (NULL == bml_endpoint) { - opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); - free(btl_endpoints); - free(new_procs); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* preallocate space in array for max number of r2s */ - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); - bml_endpoint->btl_max_send_size = -1; - bml_endpoint->btl_proc = proc; - proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; - - bml_endpoint->btl_flags_or = 0; - } - - btl_flags = btl->btl_flags; - if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { - opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" - " the %s BTL without any PUT function attached. Discard the flag !", - btl->btl_component->btl_version.mca_component_name); - btl_flags ^= MCA_BTL_FLAGS_PUT; - } - if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { - opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" - " the %s BTL without any GET function attached. Discard the flag !", - btl->btl_component->btl_version.mca_component_name); - btl_flags ^= MCA_BTL_FLAGS_GET; - } - - if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { - /** - * If no protocol specified, we have 2 choices: we ignore the BTL - * as we don't know which protocl to use, or we suppose that all - * BTLs support the send protocol. 
- */ - btl_flags |= MCA_BTL_FLAGS_SEND; - } - - /* dont allow an additional BTL with a lower exclusivity ranking */ - size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - if(size > 0) { - bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); - /* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */ - if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) { - btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output, - "mca: bml: Not using %s btl to %s on node %s " - "because %s btl has higher exclusivity (%d > %d)", - btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname, - bml_btl->btl->btl_component->btl_version.mca_component_name, - bml_btl->btl->btl_exclusivity, - btl->btl_exclusivity); - continue; - } - } - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output, - "mca: bml: Using %s btl to %s on node %s", - btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), - proc->super.proc_hostname); - - /* cache the endpoint on the proc */ - if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { - bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); - bml_btl->btl = btl; - bml_btl->btl_endpoint = btl_endpoints[p]; - bml_btl->btl_weight = 0; - bml_btl->btl_flags = btl_flags; - - /** - * calculate the bitwise OR of the btl flags - */ - bml_endpoint->btl_flags_or |= bml_btl->btl_flags; - } - - /* always add rdma endpoints */ - if ((btl_flags & MCA_BTL_FLAGS_RDMA) && - !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && - (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { - mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); - - bml_btl_rdma->btl = btl; - bml_btl_rdma->btl_endpoint = btl_endpoints[p]; - bml_btl_rdma->btl_weight = 0; - bml_btl_rdma->btl_flags = btl_flags; - - if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) { - bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length; - } - - if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) { - bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size; - } - } - - /* This BTL is in use, allow the progress registration */ - btl_inuse++; + for (size_t p = 0 ; p < n_new_procs ; ++p) { + if (!opal_bitmap_is_set_bit(reachable, p)) { + continue; } + + ompi_proc_t *proc = new_procs[p]; + mca_bml_base_endpoint_t *bml_endpoint = + (mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_bml_base_btl_t *bml_btl = NULL; + size_t size; + + if (NULL == bml_endpoint) { + bml_endpoint = mca_bml_r2_allocate_endpoint (proc); + if (NULL == bml_endpoint) { + free(btl_endpoints); + free(new_procs); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoints[p]); + if (OMPI_SUCCESS != rc) { + btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); + continue; + } + + /* This BTL is in use, allow the progress registration */ + btl_inuse++; } - if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { - size_t p; - bool found = false; - for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) { - 
if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { - found = true; - break; - } - } - if(found == false) { - mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = - btl->btl_component->btl_progress; - mca_bml_r2.num_btl_progress++; - opal_progress_register( btl->btl_component->btl_progress ); - } + if (btl_inuse) { + mca_bml_r2_register_progress (btl); } } + free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ - for(p=0; pproc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - double total_bandwidth = 0; - uint32_t latency; - size_t n_send, n_rdma; + for (size_t p = 0; p < n_new_procs ; ++p) { + mca_bml_base_endpoint_t *bml_endpoint = + (mca_bml_base_endpoint_t *) new_procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; /* skip over procs w/ no btl's registered */ - if(NULL == bml_endpoint) { - continue; - } - - /* (1) determine the total bandwidth available across all btls - * note that we need to do this here, as we may already have btls configured - * (2) determine the highest priority ranking for latency - * (3) compute the maximum amount of bytes that can be send without any - * weighting. Once the left over is smaller than this number we will - * start using the weight to compute the correct amount. - */ - n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); - - /* sort BTLs in descending order according to bandwidth value */ - qsort(bml_endpoint->btl_send.bml_btls, n_send, - sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); - - bml_endpoint->btl_rdma_index = 0; - - mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency); - - /* (1) set the weight of each btl as a percentage of overall bandwidth - * (2) copy all btl instances at the highest priority ranking into the - * list of btls used for first fragments - */ - for (size_t n_index = 0 ; n_index < n_send ; ++n_index) { - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); - mca_btl_base_module_t *btl = bml_btl->btl; - - /* compute weighting factor for this r2 */ - if(btl->btl_bandwidth > 0) { - bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); - } else { - bml_btl->btl_weight = (float)(1.0 / n_send); - } - - /* check to see if this r2 is already in the array of r2s - * used for first fragments - if not add it. 
- */ - if(btl->btl_latency == latency) { - mca_bml_base_btl_t* bml_btl_new = - mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); - *bml_btl_new = *bml_btl; - } - - /* set endpoint max send size as min of available btls */ - if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) - bml_endpoint->btl_max_send_size = btl->btl_max_send_size; - } - - /* sort BTLs in descending order according to bandwidth value */ - qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma, - sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); - - mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency); - - /* set rdma btl weights */ - for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) { - mca_bml_base_btl_t *bml_btl = - mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index); - - /* compute weighting factor for this r2 */ - if (bml_btl->btl->btl_bandwidth > 0.0) { - bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth); - } else { - bml_btl->btl_weight = (float)(1.0 / n_rdma); - } + if (NULL != bml_endpoint) { + mca_bml_r2_compute_endpoint_metrics (bml_endpoint); } } /* see if we have a connection to everyone else */ - for(p = 0; p < n_new_procs; p++) { + for(size_t p = 0; p < n_new_procs ; ++p) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { ret = OMPI_ERR_UNREACH; if (mca_bml_r2.show_unreach_errors) { - opal_show_help("help-mca-bml-r2.txt", - "unreachable proc", - true, + opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), (NULL != ompi_proc_local_proc->super.proc_hostname ? ompi_proc_local_proc->super.proc_hostname : "unknown!"), @@ -459,6 +574,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, proc->super.proc_hostname : "unknown!"), btl_names); } + break; } } @@ -476,7 +592,6 @@ static int mca_bml_r2_add_procs( size_t nprocs, static int mca_bml_r2_del_procs(size_t nprocs, struct ompi_proc_t** procs) { - size_t p; int rc; struct ompi_proc_t** del_procs = (struct ompi_proc_t**) malloc(nprocs * sizeof(struct ompi_proc_t*)); @@ -486,26 +601,27 @@ static int mca_bml_r2_del_procs(size_t nprocs, return OMPI_ERR_OUT_OF_RESOURCE; } - for(p = 0; p < nprocs; p++) { + for (size_t p = 0 ; p < nprocs ; ++p) { ompi_proc_t *proc = procs[p]; /* We much check that there are 2 references to the proc (not 1). The * first reference belongs to ompi/proc the second belongs to the bml * since we retained it. We will release that reference at the end of * the loop below. 
*/ - if(((opal_object_t*)proc)->obj_reference_count == 2) { + if (((opal_object_t*)proc)->obj_reference_count == 2 && + NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { del_procs[n_del_procs++] = proc; } } - for(p = 0; p < n_del_procs; p++) { + for (size_t p = 0 ; p < n_del_procs ; ++p) { ompi_proc_t *proc = del_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - size_t f_index, f_size; + size_t f_size; /* notify each btl that the proc is going away */ f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - for(f_index = 0; f_index < f_size; f_index++) { + for (size_t f_index = 0 ; f_index < f_size ; ++f_index) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index); mca_btl_base_module_t* btl = bml_btl->btl; @@ -521,10 +637,12 @@ static int mca_bml_r2_del_procs(size_t nprocs, */ } + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; + OBJ_RELEASE(proc); + /* do any required cleanup */ OBJ_RELEASE(bml_endpoint); - proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; } free(del_procs); @@ -835,6 +953,7 @@ int mca_bml_r2_component_fini(void) mca_bml_r2_module_t mca_bml_r2 = { .super = { .bml_component = &mca_bml_r2_component, + .bml_add_proc = mca_bml_r2_add_proc, .bml_add_procs = mca_bml_r2_add_procs, .bml_del_procs = mca_bml_r2_del_procs, .bml_add_btl = mca_bml_r2_add_btl, @@ -843,8 +962,7 @@ mca_bml_r2_module_t mca_bml_r2 = { .bml_register = mca_bml_r2_register, .bml_register_error = mca_bml_r2_register_error, .bml_finalize = mca_bml_r2_finalize, - .bml_ft_event = mca_bml_r2_ft_event - } - + .bml_ft_event = mca_bml_r2_ft_event, + }, }; diff --git a/ompi/mca/coll/fca/coll_fca_module.c b/ompi/mca/coll/fca/coll_fca_module.c index 2c3922cf34..cda756dfa5 100644 --- a/ompi/mca/coll/fca/coll_fca_module.c +++ b/ompi/mca/coll/fca/coll_fca_module.c @@ -35,25 +35,6 @@ int mca_coll_fca_init_query(bool enable_progress_threads, return OMPI_SUCCESS; } -static int have_remote_peers(ompi_group_t *group, size_t size, int *local_peers) -{ - ompi_proc_t *proc; - size_t i; - int ret; - - *local_peers = 0; - ret = 0; - for (i = 0; i < size; ++i) { - proc = ompi_group_peer_lookup(group, i); - if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { - ++*local_peers; - } else { - ret = 1; - } - } - return ret; -} - static inline ompi_proc_t* __local_rank_lookup(ompi_communicator_t *comm, int rank) { return ompi_group_peer_lookup(comm->c_local_group, rank); @@ -618,7 +599,7 @@ mca_coll_fca_comm_query(struct ompi_communicator_t *comm, int *priority) if (size < mca_coll_fca_component.fca_np) goto exit; - if (!have_remote_peers(comm->c_local_group, size, &local_peers) || OMPI_COMM_IS_INTER(comm)) + if (!ompi_group_have_remote_peers(comm->c_local_group) || OMPI_COMM_IS_INTER(comm)) goto exit; fca_module = OBJ_NEW(mca_coll_fca_module_t); diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 37a7cbdc2d..4739217bc1 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -74,7 +74,6 @@ uint32_t mca_coll_sm_one = 1; */ static int sm_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm); -static bool have_local_peers(ompi_group_t *group, size_t size); static int bootstrap_comm(ompi_communicator_t *comm, mca_coll_sm_module_t *module); static int mca_coll_sm_module_disable(mca_coll_base_module_t *module, @@ -172,8 +171,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, 
int *priority) /* If we're intercomm, or if there's only one process in the communicator, or if not all the processes in the communicator are on this node, then we don't want to run */ - if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || - !have_local_peers(comm->c_local_group, ompi_comm_size(comm))) { + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); return NULL; @@ -490,23 +488,6 @@ int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module, return OMPI_SUCCESS; } - -static bool have_local_peers(ompi_group_t *group, size_t size) -{ - size_t i; - ompi_proc_t *proc; - - for (i = 0; i < size; ++i) { - proc = ompi_group_peer_lookup(group,i); - if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { - return false; - } - } - - return true; -} - - static int bootstrap_comm(ompi_communicator_t *comm, mca_coll_sm_module_t *module) { diff --git a/ompi/mca/mtl/psm/mtl_psm.h b/ompi/mca/mtl/psm/mtl_psm.h index 36aedbfcc5..52a590b3d3 100644 --- a/ompi/mca/mtl/psm/mtl_psm.h +++ b/ompi/mca/mtl/psm/mtl_psm.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -23,6 +26,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" +#include "ompi/proc/proc.h" #include "opal/datatype/opal_convertor.h" #include <psm.h> #include <psm_mq.h> diff --git a/ompi/mca/mtl/psm/mtl_psm_endpoint.h b/ompi/mca/mtl/psm/mtl_psm_endpoint.h index 83a1ecfa8f..b08e9fdbc4 100644 --- a/ompi/mca/mtl/psm/mtl_psm_endpoint.h +++ b/ompi/mca/mtl/psm/mtl_psm_endpoint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -54,5 +57,14 @@ struct mca_mtl_psm_endpoint_t { typedef struct mca_mtl_psm_endpoint_t mca_mtl_psm_endpoint_t; OBJ_CLASS_DECLARATION(mca_mtl_psm_endpoint); +static inline mca_mtl_psm_endpoint_t *ompi_mtl_psm_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_psm_add_procs (mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} + END_C_DECLS #endif diff --git a/ompi/mca/mtl/psm/mtl_psm_send.c b/ompi/mca/mtl/psm/mtl_psm_send.c index ddedd65265..c30801b1fb 100644 --- a/ompi/mca/mtl/psm/mtl_psm_send.c +++ b/ompi/mca/mtl/psm/mtl_psm_send.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,7 +45,7 @@ ompi_mtl_psm_send(struct mca_mtl_base_module_t* mtl, int ret; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm_endpoint_t* psm_endpoint = (mca_mtl_psm_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm_endpoint_t* psm_endpoint = ompi_mtl_psm_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm.super); @@ -94,7 +97,7 @@ ompi_mtl_psm_isend(struct mca_mtl_base_module_t* mtl, mca_mtl_psm_request_t * mtl_psm_request = (mca_mtl_psm_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm_endpoint_t* psm_endpoint = (mca_mtl_psm_endpoint_t*)ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm_endpoint_t* psm_endpoint = ompi_mtl_psm_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm.super); diff --git a/ompi/mca/mtl/psm2/mtl_psm2.h b/ompi/mca/mtl/psm2/mtl_psm2.h index b48e07a039..44152656bf 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2.h +++ b/ompi/mca/mtl/psm2/mtl_psm2.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,6 +12,8 @@ * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,6 +27,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" +#include "ompi/proc/proc.h" #include "opal/datatype/opal_convertor.h" #include <psm2.h> #include <psm2_mq.h> diff --git a/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h b/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h index e3233db352..aeb6bccadc 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h +++ b/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h @@ -55,5 +55,14 @@ struct mca_mtl_psm2_endpoint_t { typedef struct mca_mtl_psm2_endpoint_t mca_mtl_psm2_endpoint_t; OBJ_CLASS_DECLARATION(mca_mtl_psm2_endpoint); +static inline mca_mtl_psm2_endpoint_t *ompi_mtl_psm2_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_psm2_add_procs (mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} + END_C_DECLS #endif diff --git a/ompi/mca/mtl/psm2/mtl_psm2_send.c b/ompi/mca/mtl/psm2/mtl_psm2_send.c index 76fb5a1cd0..73cf769873 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_send.c +++ b/ompi/mca/mtl/psm2/mtl_psm2_send.c @@ -43,7 +43,7 @@ ompi_mtl_psm2_send(struct mca_mtl_base_module_t* mtl, int ret; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm2_endpoint_t* psm_endpoint = (mca_mtl_psm2_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm2_endpoint_t* psm_endpoint = ompi_mtl_psm2_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm2.super); @@ -95,7 +95,7 @@ ompi_mtl_psm2_isend(struct mca_mtl_base_module_t* mtl, mca_mtl_psm2_request_t * mtl_psm2_request = (mca_mtl_psm2_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm2_endpoint_t* psm_endpoint = (mca_mtl_psm2_endpoint_t*)ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm2_endpoint_t* psm_endpoint = ompi_mtl_psm2_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm2.super); diff --git a/ompi/mca/osc/portals4/osc_portals4.h b/ompi/mca/osc/portals4/osc_portals4.h index c403683627..fcba31ffad 100644 --- a/ompi/mca/osc/portals4/osc_portals4.h +++ b/ompi/mca/osc/portals4/osc_portals4.h @@ -299,7 +299,7 @@ ompi_osc_portals4_get_peer(ompi_osc_portals4_module_t *module, int rank) static inline ptl_process_t ompi_osc_portals4_get_peer_group(struct ompi_group_t *group, int rank) { - ompi_proc_t *proc = ompi_group_get_proc_ptr(group, rank); + ompi_proc_t *proc = ompi_group_get_proc_ptr(group, rank, true); return *((ptl_process_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]); } diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index b51e7d1c92..c8c940d535 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -134,10 +134,8 @@ check_win_ok(ompi_communicator_t *comm, int flavor) return OMPI_ERR_NOT_SUPPORTED; } - for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { - if (!OPAL_PROC_ON_LOCAL_NODE(ompi_comm_peer_lookup(comm, i)->super.proc_flags)) { - return OMPI_ERR_RMA_SHARED; - } + if (ompi_group_have_remote_peers (comm->c_local_group)) { + return OMPI_ERR_RMA_SHARED; } return OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index cee5cd3756..55de7d150f 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -191,11 +191,9 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) { /* allocate pml specific comm
data */ mca_pml_ob1_comm_t* pml_comm = OBJ_NEW(mca_pml_ob1_comm_t); - opal_list_item_t *item, *next_item; - mca_pml_ob1_recv_frag_t* frag; + mca_pml_ob1_recv_frag_t *frag, *next_frag; mca_pml_ob1_comm_proc_t* pml_proc; mca_pml_ob1_match_hdr_t* hdr; - int i; if (NULL == pml_comm) { return OMPI_ERR_OUT_OF_RESOURCE; @@ -210,16 +208,8 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) mca_pml_ob1_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count); comm->c_pml_comm = pml_comm; - for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { - pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i); - OBJ_RETAIN(pml_comm->procs[i].ompi_proc); - } /* Grab all related messages from the non_existing_communicator pending queue */ - for( item = opal_list_get_first(&mca_pml_ob1.non_existing_communicator_pending); - item != opal_list_get_end(&mca_pml_ob1.non_existing_communicator_pending); - item = next_item ) { - frag = (mca_pml_ob1_recv_frag_t*)item; - next_item = opal_list_get_next(item); + OPAL_LIST_FOREACH_SAFE(frag, next_frag, &mca_pml_ob1.non_existing_communicator_pending, mca_pml_ob1_recv_frag_t) { hdr = &frag->hdr.hdr_match; /* Is this fragment for the current communicator ? */ @@ -229,8 +219,8 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) /* As we now know we work on a fragment for this communicator * we should remove it from the * non_existing_communicator_pending list. */ - opal_list_remove_item( &mca_pml_ob1.non_existing_communicator_pending, - item ); + opal_list_remove_item (&mca_pml_ob1.non_existing_communicator_pending, + (opal_list_item_t *) frag); add_fragment_to_unexpected: @@ -249,7 +239,7 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) * We just have to push the fragment into the unexpected list of the corresponding * proc, or into the out-of-order (cant_match) list. */ - pml_proc = &(pml_comm->procs[hdr->hdr_src]); + pml_proc = mca_pml_ob1_peer_lookup(comm, hdr->hdr_src); if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) { /* We're now expecting the next sequence number. */ @@ -283,12 +273,6 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) int mca_pml_ob1_del_comm(ompi_communicator_t* comm) { - mca_pml_ob1_comm_t* pml_comm = comm->c_pml_comm; - int i; - - for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { - OBJ_RELEASE(pml_comm->procs[i].ompi_proc); - } OBJ_RELEASE(comm->c_pml_comm); comm->c_pml_comm = NULL; return OMPI_SUCCESS; @@ -303,9 +287,9 @@ int mca_pml_ob1_del_comm(ompi_communicator_t* comm) int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs) { + mca_btl_base_selected_module_t *sm; opal_bitmap_t reachable; int rc; - opal_list_item_t *item; if(nprocs == 0) return OMPI_SUCCESS; @@ -347,11 +331,7 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs) BTLs requires iterating over the procs, as the BML does not expose all currently in use btls. 
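* Every eager fragment must be able to carry at least a PML match header, which is why each BTL's btl_eager_limit is checked against sizeof(mca_pml_ob1_hdr_t) below.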
*/ - for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ; - item != opal_list_get_end(&mca_btl_base_modules_initialized) ; - item = opal_list_get_next(item)) { - mca_btl_base_selected_module_t *sm = - (mca_btl_base_selected_module_t*) item; + OPAL_LIST_FOREACH(sm, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_ob1_hdr_t)) { opal_show_help("help-mpi-pml-ob1.txt", "eager_limit_too_small", true, @@ -589,13 +569,19 @@ int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose) /* iterate through all procs on communicator */ for( i = 0; i < (int)pml_comm->num_procs; i++ ) { - mca_pml_ob1_comm_proc_t* proc = &pml_comm->procs[i]; + mca_pml_ob1_comm_proc_t* proc = pml_comm->procs[i]; + + if (NULL == proc) { + continue; + } + mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; size_t n; opal_output(0, "[Rank %d] expected_seq %d ompi_proc %p send_seq %d\n", i, proc->expected_sequence, (void*) proc->ompi_proc, proc->send_sequence); + /* dump all receive queues */ if( opal_list_get_size(&proc->specific_receives) ) { opal_output(0, "expected specific receives\n"); diff --git a/ompi/mca/pml/ob1/pml_ob1_comm.c b/ompi/mca/pml/ob1/pml_ob1_comm.c index d66bebef5d..9ab64c4614 100644 --- a/ompi/mca/pml/ob1/pml_ob1_comm.c +++ b/ompi/mca/pml/ob1/pml_ob1_comm.c @@ -40,14 +40,15 @@ static void mca_pml_ob1_comm_proc_destruct(mca_pml_ob1_comm_proc_t* proc) OBJ_DESTRUCT(&proc->frags_cant_match); OBJ_DESTRUCT(&proc->specific_receives); OBJ_DESTRUCT(&proc->unexpected_frags); + if (proc->ompi_proc) { + OBJ_RELEASE(proc->ompi_proc); + } } -static OBJ_CLASS_INSTANCE( - mca_pml_ob1_comm_proc_t, - opal_object_t, - mca_pml_ob1_comm_proc_construct, - mca_pml_ob1_comm_proc_destruct); +OBJ_CLASS_INSTANCE(mca_pml_ob1_comm_proc_t, opal_object_t, + mca_pml_ob1_comm_proc_construct, + mca_pml_ob1_comm_proc_destruct); static void mca_pml_ob1_comm_construct(mca_pml_ob1_comm_t* comm) @@ -63,11 +64,16 @@ static void mca_pml_ob1_comm_construct(mca_pml_ob1_comm_t* comm) static void mca_pml_ob1_comm_destruct(mca_pml_ob1_comm_t* comm) { - size_t i; - for(i=0; inum_procs; i++) - OBJ_DESTRUCT((&comm->procs[i])); - if(NULL != comm->procs) + if (NULL != comm->procs) { + for (size_t i = 0; i < comm->num_procs; ++i) { + if (comm->procs[i]) { + OBJ_RELEASE(comm->procs[i]); + } + } + free(comm->procs); + } + OBJ_DESTRUCT(&comm->wild_receives); OBJ_DESTRUCT(&comm->matching_lock); } @@ -80,18 +86,13 @@ OBJ_CLASS_INSTANCE( mca_pml_ob1_comm_destruct); -int mca_pml_ob1_comm_init_size(mca_pml_ob1_comm_t* comm, size_t size) +int mca_pml_ob1_comm_init_size (mca_pml_ob1_comm_t* comm, size_t size) { - size_t i; - /* send message sequence-number support - sender side */ - comm->procs = (mca_pml_ob1_comm_proc_t*)malloc(sizeof(mca_pml_ob1_comm_proc_t)*size); + comm->procs = (mca_pml_ob1_comm_proc_t **) calloc(size, sizeof (mca_pml_ob1_comm_proc_t *)); if(NULL == comm->procs) { return OMPI_ERR_OUT_OF_RESOURCE; } - for(i=0; iprocs+i, mca_pml_ob1_comm_proc_t); - } comm->num_procs = size; return OMPI_SUCCESS; } diff --git a/ompi/mca/pml/ob1/pml_ob1_comm.h b/ompi/mca/pml/ob1/pml_ob1_comm.h index 411310575d..7ef54c63e6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_comm.h +++ b/ompi/mca/pml/ob1/pml_ob1_comm.h @@ -24,6 +24,7 @@ #include "opal/threads/mutex.h" #include "opal/class/opal_list.h" #include "ompi/proc/proc.h" +#include "ompi/communicator/communicator.h" BEGIN_C_DECLS @@ -42,6 +43,7 @@ struct 
mca_pml_ob1_comm_proc_t { }; typedef struct mca_pml_ob1_comm_proc_t mca_pml_ob1_comm_proc_t; +OBJ_CLASS_DECLARATION(mca_pml_ob1_comm_proc_t); /** * Cached on ompi_communicator_t to hold queues/state @@ -56,7 +58,7 @@ struct mca_pml_comm_t { #endif opal_mutex_t matching_lock; /**< matching lock */ opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */ - mca_pml_ob1_comm_proc_t* procs; + mca_pml_ob1_comm_proc_t **procs; size_t num_procs; size_t last_probed; }; @@ -64,6 +66,18 @@ typedef struct mca_pml_comm_t mca_pml_ob1_comm_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_comm_t); +static inline mca_pml_ob1_comm_proc_t *mca_pml_ob1_peer_lookup (struct ompi_communicator_t *comm, int rank) +{ + mca_pml_ob1_comm_t *pml_comm = (mca_pml_ob1_comm_t *)comm->c_pml_comm; + + if (OPAL_UNLIKELY(NULL == pml_comm->procs[rank])) { + pml_comm->procs[rank] = OBJ_NEW(mca_pml_ob1_comm_proc_t); + pml_comm->procs[rank]->ompi_proc = ompi_comm_peer_lookup (comm, rank); + OBJ_RETAIN(pml_comm->procs[rank]->ompi_proc); + } + + return pml_comm->procs[rank]; +} /** * Initialize an instance of mca_pml_ob1_comm_t based on the communicator size. diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index f3a3434a1c..0e17d1e64f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -144,9 +144,12 @@ static int mca_pml_ob1_get_unex_msgq_size (const struct mca_base_pvar_t *pvar, v int i; for (i = 0 ; i < comm_size ; ++i) { - pml_proc = pml_comm->procs + i; - - values[i] = opal_list_get_size (&pml_proc->unexpected_frags); + pml_proc = pml_comm->procs[i]; + if (pml_proc) { + values[i] = opal_list_get_size (&pml_proc->unexpected_frags); + } else { + values[i] = 0; + } } return OMPI_SUCCESS; @@ -162,9 +165,13 @@ static int mca_pml_ob1_get_posted_recvq_size (const struct mca_base_pvar_t *pvar int i; for (i = 0 ; i < comm_size ; ++i) { - pml_proc = pml_comm->procs + i; + pml_proc = pml_comm->procs[i]; - values[i] = opal_list_get_size (&pml_proc->specific_receives); + if (pml_proc) { + values[i] = opal_list_get_size (&pml_proc->specific_receives); + } else { + values[i] = 0; + } } return OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1_irecv.c b/ompi/mca/pml/ob1/pml_ob1_irecv.c index 787a6e0139..16ffcf4f21 100644 --- a/ompi/mca/pml/ob1/pml_ob1_irecv.c +++ b/ompi/mca/pml/ob1/pml_ob1_irecv.c @@ -148,7 +148,6 @@ mca_pml_ob1_imrecv( void *buf, int src, tag; ompi_communicator_t *comm; mca_pml_ob1_comm_proc_t* proc; - mca_pml_ob1_comm_t* ob1_comm; uint64_t seq; /* get the request from the message and the frag from the request @@ -158,7 +157,6 @@ mca_pml_ob1_imrecv( void *buf, src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG; comm = (*message)->comm; - ob1_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm; seq = recvreq->req_recv.req_base.req_sequence; /* make the request a recv request again */ @@ -196,7 +194,7 @@ mca_pml_ob1_imrecv( void *buf, /* Note - sequence number already assigned */ recvreq->req_recv.req_base.req_sequence = seq; - proc = &ob1_comm->procs[recvreq->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, recvreq->req_recv.req_base.req_peer); recvreq->req_recv.req_base.req_proc = proc->ompi_proc; prepare_recv_req_converter(recvreq); @@ -243,7 +241,6 @@ mca_pml_ob1_mrecv( void *buf, int src, tag, rc; ompi_communicator_t *comm; mca_pml_ob1_comm_proc_t* proc; - mca_pml_ob1_comm_t* ob1_comm; uint64_t seq; /* get the 
request from the message and the frag from the request @@ -254,7 +251,6 @@ mca_pml_ob1_mrecv( void *buf, src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG; seq = recvreq->req_recv.req_base.req_sequence; - ob1_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm; /* make the request a recv request again */ /* The old request kept pointers to comm and the char datatype. @@ -290,7 +286,7 @@ mca_pml_ob1_mrecv( void *buf, /* Note - sequence number already assigned */ recvreq->req_recv.req_base.req_sequence = seq; - proc = &ob1_comm->procs[recvreq->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, recvreq->req_recv.req_base.req_peer); recvreq->req_recv.req_base.req_proc = proc->ompi_proc; prepare_recv_req_converter(recvreq); diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index a25a7250b2..5de0c89d8a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -126,15 +126,14 @@ int mca_pml_ob1_isend(const void *buf, ompi_communicator_t * comm, ompi_request_t ** request) { - mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); mca_pml_ob1_send_request_t *sendreq = NULL; - ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); int16_t seqn; int rc; - seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, @@ -176,10 +175,9 @@ int mca_pml_ob1_send(const void *buf, mca_pml_base_send_mode_t sendmode, ompi_communicator_t * comm) { - mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; - ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); mca_pml_ob1_send_request_t *sendreq = NULL; int16_t seqn; int rc; @@ -202,7 +200,7 @@ int mca_pml_ob1_send(const void *buf, return OMPI_ERR_UNREACH; } - seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); /** * The immediate send will not have a request, so they are diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 16a7636e55..797d276b61 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -143,7 +143,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm; /* source sequence number */ - proc = &comm->procs[hdr->hdr_src]; + proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src); /* We generate the MSG_ARRIVED event as soon as the PML is aware * of a matching fragment arrival. 
Independent of whether it is received @@ -650,7 +650,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, /* source sequence number */ frag_msg_seq = hdr->hdr_seq; - proc = &comm->procs[hdr->hdr_src]; + proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src); /** * We generate the MSG_ARRIVED event as soon as the PML is aware of a matching diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 792ae45a9c..fdbc130973 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -100,7 +100,8 @@ static int mca_pml_ob1_recv_request_free(struct ompi_request_t** request) static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, int complete) { mca_pml_ob1_recv_request_t* request = (mca_pml_ob1_recv_request_t*)ompi_request; - mca_pml_ob1_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; + ompi_communicator_t *comm = request->req_recv.req_base.req_comm; + mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; if( true == request->req_match_received ) { /* way too late to cancel this one */ assert( OMPI_ANY_TAG != ompi_request->req_status.MPI_TAG ); /* it has been matched, hasn't it? */ @@ -108,11 +109,11 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, } /* The rest should be protected behind the match logic lock */ - OPAL_THREAD_LOCK(&comm->matching_lock); + OPAL_THREAD_LOCK(&ob1_comm->matching_lock); if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) { - opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request ); + opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request ); } else { - mca_pml_ob1_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer; + mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer); opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request); } PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, @@ -122,7 +123,7 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, * to true. Otherwise, the request will never be freed.
*/ request->req_recv.req_base.req_pml_complete = true; - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request->req_status._cancelled = true; @@ -260,7 +261,7 @@ static int mca_pml_ob1_recv_request_ack( ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = NULL; - bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + bml_endpoint = mca_bml_base_get_endpoint (proc); /* by default copy everything */ recvreq->req_send_offset = bytes_received; @@ -654,7 +655,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq } /* lookup bml datastructures */ - bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + bml_endpoint = mca_bml_base_get_endpoint (recvreq->req_recv.req_base.req_proc); rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); #if OPAL_CUDA_SUPPORT @@ -1079,8 +1080,11 @@ static mca_pml_ob1_recv_frag_t* recv_req_match_specific_proc( const mca_pml_ob1_recv_request_t *req, mca_pml_ob1_comm_proc_t *proc ) { + if (NULL == proc) { + return NULL; + } + opal_list_t* unexpected_frags = &proc->unexpected_frags; - opal_list_item_t *i; mca_pml_ob1_recv_frag_t* frag; int tag = req->req_recv.req_base.req_tag; @@ -1088,20 +1092,12 @@ recv_req_match_specific_proc( const mca_pml_ob1_recv_request_t *req, return NULL; if( OMPI_ANY_TAG == tag ) { - for (i = opal_list_get_first(unexpected_frags); - i != opal_list_get_end(unexpected_frags); - i = opal_list_get_next(i)) { - frag = (mca_pml_ob1_recv_frag_t*)i; - + OPAL_LIST_FOREACH(frag, unexpected_frags, mca_pml_ob1_recv_frag_t) { if( frag->hdr.hdr_match.hdr_tag >= 0 ) return frag; } } else { - for (i = opal_list_get_first(unexpected_frags); - i != opal_list_get_end(unexpected_frags); - i = opal_list_get_next(i)) { - frag = (mca_pml_ob1_recv_frag_t*)i; - + OPAL_LIST_FOREACH(frag, unexpected_frags, mca_pml_ob1_recv_frag_t) { if( frag->hdr.hdr_match.hdr_tag == tag ) return frag; } @@ -1118,7 +1114,7 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_comm_proc_t **p) { mca_pml_ob1_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; - mca_pml_ob1_comm_proc_t* proc = comm->procs; + mca_pml_ob1_comm_proc_t **procp = comm->procs; size_t i; /* @@ -1133,10 +1129,10 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_recv_frag_t* frag; /* loop over messages from the current proc */ - if((frag = recv_req_match_specific_proc(req, &proc[i]))) { - *p = &proc[i]; + if((frag = recv_req_match_specific_proc(req, procp[i]))) { + *p = procp[i]; comm->last_probed = i; - req->req_recv.req_base.req_proc = proc[i].ompi_proc; + req->req_recv.req_base.req_proc = procp[i]->ompi_proc; prepare_recv_req_converter(req); return frag; /* match found */ } @@ -1145,10 +1141,10 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_recv_frag_t* frag; /* loop over messages from the current proc */ - if((frag = recv_req_match_specific_proc(req, &proc[i]))) { - *p = &proc[i]; + if((frag = recv_req_match_specific_proc(req, procp[i]))) { + *p = procp[i]; comm->last_probed = i; - req->req_recv.req_base.req_proc = proc[i].ompi_proc; + req->req_recv.req_base.req_proc = procp[i]->ompi_proc; prepare_recv_req_converter(req); return frag; /* match found */ } @@ -1161,7 +1157,8 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, void 
mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) { - mca_pml_ob1_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + ompi_communicator_t *comm = req->req_recv.req_base.req_comm; + mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; mca_pml_ob1_comm_proc_t* proc; mca_pml_ob1_recv_frag_t* frag; opal_list_t *queue; @@ -1179,7 +1176,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) MCA_PML_BASE_RECV_START(&req->req_recv.req_base); - OPAL_THREAD_LOCK(&comm->matching_lock); + OPAL_THREAD_LOCK(&ob1_comm->matching_lock); /** * The lapse of time between the ACTIVATE event and the SEARCH_UNEX one includes * the cost of the request lock. @@ -1188,12 +1185,12 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) &(req->req_recv.req_base), PERUSE_RECV); /* assign sequence number */ - req->req_recv.req_base.req_sequence = comm->recv_sequence++; + req->req_recv.req_base.req_sequence = ob1_comm->recv_sequence++; /* attempt to match posted recv */ if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { frag = recv_req_match_wild(req, &proc); - queue = &comm->wild_receives; + queue = &ob1_comm->wild_receives; #if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT /* As we are in a homogeneous environment we know that all remote * architectures are exactly the same as the local one. Therefore, @@ -1206,7 +1203,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) } #endif /* !OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ } else { - proc = &comm->procs[req->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, req->req_recv.req_base.req_peer); req->req_recv.req_base.req_proc = proc->ompi_proc; frag = recv_req_match_specific_proc(req, proc); queue = &proc->specific_receives; @@ -1221,7 +1218,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) it when the message comes in.
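* The target queue was selected above: wild_receives for MPI_ANY_SOURCE requests, otherwise the peer's specific_receives list.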
*/ append_recv_req_to_queue(queue, req); req->req_match_received = false; - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); } else { if(OPAL_LIKELY(!IS_PROB_REQ(req))) { PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, @@ -1239,7 +1236,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); switch(hdr->hdr_common.hdr_type) { case MCA_PML_OB1_HDR_TYPE_MATCH: @@ -1269,14 +1266,14 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) restarted with this request during mrecv */ opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); req->req_recv.req_base.req_addr = frag; mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } else { - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index a9e54f6c66..71fb8c3d5b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -433,8 +433,7 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, { size_t i; mca_bml_base_btl_t* bml_btl; - mca_bml_base_endpoint_t* endpoint = - (mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (proc); for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index d9d3bb13a8..9659dabb0b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -480,16 +480,16 @@ mca_pml_ob1_send_request_start_seq (mca_pml_ob1_send_request_t* sendreq, mca_bml static inline int mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq ) { - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; + mca_bml_base_endpoint_t *endpoint = mca_bml_base_get_endpoint (sendreq->req_send.req_base.req_proc); + ompi_communicator_t *comm = sendreq->req_send.req_base.req_comm; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, sendreq->req_send.req_base.req_peer); int32_t seqn; if (OPAL_UNLIKELY(NULL == endpoint)) { return OMPI_ERR_UNREACH; } - seqn = OPAL_THREAD_ADD32(&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence, 1); + seqn = OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); return mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seqn); } diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 9747fd7da1..98c4d2a70d 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. 
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014 Research Organization for Information Science @@ -43,6 +44,8 @@ static opal_list_t ompi_proc_list; static opal_mutex_t ompi_proc_lock; +static opal_hash_table_t ompi_proc_hash; + ompi_proc_t* ompi_proc_local_proc = NULL; static void ompi_proc_construct(ompi_proc_t* proc); @@ -83,49 +86,223 @@ void ompi_proc_destruct(ompi_proc_t* proc) } OPAL_THREAD_LOCK(&ompi_proc_lock); opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc); + opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name)); OPAL_THREAD_UNLOCK(&ompi_proc_lock); } +/** + * Allocate a new ompi_proc_t for the given jobid/vpid + * + * @param[in] jobid Job identifier + * @param[in] vpid Process identifier + * @param[out] procp New ompi_proc_t structure + * + * This function allocates a new ompi_proc_t and inserts it into + * the process list and hash table. + */ +static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t **procp) { + ompi_proc_t *proc = OBJ_NEW(ompi_proc_t); + + opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); + + OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = jobid; + OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid = vpid; + + opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name), + proc); + + *procp = proc; + + return OMPI_SUCCESS; +} + +/** + * Finish setting up an ompi_proc_t + * + * @param[in] proc ompi process structure + * + * This function contains the core code of ompi_proc_complete_init() and + * ompi_proc_refresh(). The tasks performed by this function include + * retrieving the hostname (if below the modex cutoff), determining the + * remote architecture, and calculating the locality of the process.
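+ *
+ * Callers serialize access through ompi_proc_lock, as both
+ * ompi_proc_complete_init() and ompi_proc_for_name() do. A minimal
+ * usage sketch (names as used in this patch):
+ *
+ *   ompi_proc_t *proc;
+ *   ret = ompi_proc_allocate (name.jobid, name.vpid, &proc);
+ *   if (OMPI_SUCCESS == ret) {
+ *       ret = ompi_proc_complete_init_single (proc);
+ *   }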
+ */ +static int ompi_proc_complete_init_single (ompi_proc_t *proc) +{ + uint16_t u16, *u16ptr; + int ret; + + u16ptr = &u16; + + if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid) { + /* nothing else to do */ + return OMPI_SUCCESS; + } + + /* get the locality information - all RTEs are required + * to provide this information at startup */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); + if (OPAL_SUCCESS != ret) { + proc->super.proc_flags = OPAL_PROC_NON_LOCAL; + } else { + proc->super.proc_flags = u16; + } + + /* we can retrieve the hostname at no cost because it + * was provided at startup */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, + (char**)&(proc->super.proc_hostname), OPAL_STRING); + if (OPAL_SUCCESS != ret) { + return ret; + } +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + /* get the remote architecture - this might force a modex except + * for those environments where the RM provides it */ + { + uint32_t *ui32ptr; + ui32ptr = &(proc->super.proc_arch); + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, + (void**)&ui32ptr, OPAL_UINT32); + if (OPAL_SUCCESS == ret) { + /* if arch is different than mine, create a new convertor for this proc */ + if (proc->super.proc_arch != opal_local_arch) { + OBJ_RELEASE(proc->super.proc_convertor); + proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); + } + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { + proc->super.proc_arch = opal_local_arch; + } else { + return ret; + } + } +#else + /* must be same arch as my own */ + proc->super.proc_arch = opal_local_arch; +#endif + + return OMPI_SUCCESS; +} + +opal_proc_t *ompi_proc_lookup (const opal_process_name_t proc_name) +{ + ompi_proc_t *proc = NULL; + int ret; + + /* try to lookup the value in the hash table */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + + if (OPAL_SUCCESS == ret) { + return &proc->super; + } + + return NULL; +} + +opal_proc_t *ompi_proc_for_name (const opal_process_name_t proc_name) +{ + ompi_proc_t *proc = NULL; + int ret; + + /* try to lookup the value in the hash table */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + if (OPAL_SUCCESS == ret) { + return &proc->super; + } + + OPAL_THREAD_LOCK(&ompi_proc_lock); + do { + /* double-check that another competing thread has not added this proc */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + if (OPAL_SUCCESS == ret) { + break; + } + + /* allocate a new ompi_proc_t object for the process and insert it into the process table */ + ret = ompi_proc_allocate (proc_name.jobid, proc_name.vpid, &proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + /* allocation fail */ + break; + } + + /* finish filling in the important proc data fields */ + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + proc = NULL; + break; + } + } while (0); + OPAL_THREAD_UNLOCK(&ompi_proc_lock); + + return (opal_proc_t *) proc; +} int ompi_proc_init(void) { - ompi_vpid_t i; -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + int opal_proc_hash_init_size = (ompi_process_info.num_procs < ompi_add_procs_cutoff) ? 
ompi_process_info.num_procs : + 1024; + ompi_proc_t *proc; int ret; -#endif OBJ_CONSTRUCT(&ompi_proc_list, opal_list_t); OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t); + OBJ_CONSTRUCT(&ompi_proc_hash, opal_hash_table_t); - /* create proc structures and find self */ - for( i = 0; i < ompi_process_info.num_procs; i++ ) { - ompi_proc_t *proc = OBJ_NEW(ompi_proc_t); - opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); + ret = opal_hash_table_init (&ompi_proc_hash, opal_proc_hash_init_size); + if (OPAL_SUCCESS != ret) { + return ret; + } - OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = OMPI_PROC_MY_NAME->jobid; - OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid = i; + /* create a proc for the local process */ + ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, OMPI_PROC_MY_NAME->vpid, &proc); + if (OMPI_SUCCESS != ret) { + return OMPI_ERR_OUT_OF_RESOURCE; + } - if (i == OMPI_PROC_MY_NAME->vpid) { - ompi_proc_local_proc = proc; - proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; - proc->super.proc_hostname = strdup(ompi_process_info.nodename); - proc->super.proc_arch = opal_local_arch; - /* Register the local proc with OPAL */ - opal_proc_local_set(&proc->super); + /* set local process data */ + ompi_proc_local_proc = proc; + proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; + proc->super.proc_hostname = strdup(ompi_process_info.nodename); + proc->super.proc_arch = opal_local_arch; + /* Register the local proc with OPAL */ + opal_proc_local_set(&proc->super); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - /* add our arch to the modex */ - OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, - OPAL_PMIX_ARCH, &opal_local_arch, OPAL_UINT32); - if (OPAL_SUCCESS != ret) { + /* add our arch to the modex */ + OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, + OPAL_PMIX_ARCH, &opal_local_arch, OPAL_UINT32); + if (OPAL_SUCCESS != ret) { + return ret; + } +#endif + + if (ompi_process_info.num_procs < ompi_add_procs_cutoff) { + /* create proc structures for all other procs in our job */ + for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { + if (i == OMPI_PROC_MY_NAME->vpid) { + continue; + } + + ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc); + if (OMPI_SUCCESS != ret) { return ret; } -#endif } } return OMPI_SUCCESS; } +static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b) +{ + ompi_proc_t *proca = (ompi_proc_t *) *a; + ompi_proc_t *procb = (ompi_proc_t *) *b; + + if (proca->super.proc_name.vpid > procb->super.proc_name.vpid) { + return 1; + } else { + return -1; + } + + /* they should never be equal */ +} /** * The process creation is split into two steps.
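* The first step, ompi_proc_init(), only allocates the proc structures and registers them in the process list and hash table.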
The second step @@ -140,58 +317,47 @@ int ompi_proc_complete_init(void) { ompi_proc_t *proc; int ret, errcode = OMPI_SUCCESS; - uint16_t u16, *u16ptr; OPAL_THREAD_LOCK(&ompi_proc_lock); - u16ptr = &u16; OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { - if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid != OMPI_PROC_MY_NAME->vpid) { - /* get the locality information - all RTEs are required - * to provide this information at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); - if (OPAL_SUCCESS != ret) { - proc->super.proc_flags = OPAL_PROC_NON_LOCAL; - } else { - proc->super.proc_flags = u16; - } - - /* we can retrieve the hostname at no cost because it - * was provided at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, - (char**)&(proc->super.proc_hostname), OPAL_STRING); - if (OPAL_SUCCESS != ret) { - /* we can live without it */ - proc->super.proc_hostname = NULL; - } -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - /* get the remote architecture - this might force a modex except - * for those environments where the RM provides it */ - { - uint32_t *ui32ptr; - ui32ptr = &(proc->super.proc_arch); - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, - (void**)&ui32ptr, OPAL_UINT32); - if (OPAL_SUCCESS == ret) { - /* if arch is different than mine, create a new convertor for this proc */ - if (proc->super.proc_arch != opal_local_arch) { - OBJ_RELEASE(proc->super.proc_convertor); - proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); - } - } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { - proc->super.proc_arch = opal_local_arch; - } else { - errcode = ret; - break; - } - } -#else - /* must be same arch as my own */ - proc->super.proc_arch = opal_local_arch; -#endif + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + errcode = ret; + break; } } OPAL_THREAD_UNLOCK(&ompi_proc_lock); + + if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) { + uint16_t u16, *u16ptr; + + u16ptr = &u16; + + /* find and add all local processes */ + for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { + opal_process_name_t proc_name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid}; + uint16_t locality = OPAL_PROC_NON_LOCAL; + + if (OMPI_PROC_MY_NAME->vpid == i) { + continue; + } + + /* the runtime is required to fill in locality for all local processes by this + * point. 
only local processes will have locality set */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16); + if (OPAL_SUCCESS == ret) { + locality = u16; + } + + if (OPAL_PROC_NON_LOCAL != locality) { + (void) ompi_proc_for_name (proc_name); + } + } + } + + opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid); + return errcode; } @@ -227,6 +393,7 @@ int ompi_proc_finalize (void) /* now destruct the list and thread lock */ OBJ_DESTRUCT(&ompi_proc_list); OBJ_DESTRUCT(&ompi_proc_lock); + OBJ_DESTRUCT(&ompi_proc_hash); return OMPI_SUCCESS; } @@ -248,9 +415,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) /* First count how many match this jobid */ OPAL_THREAD_LOCK(&ompi_proc_lock); - for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, OMPI_CAST_RTE_NAME(&proc->super.proc_name), &my_name)) { ++count; } @@ -265,9 +430,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) /* now save only the procs that match this jobid */ count = 0; - for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, &my_name)) { /* DO NOT RETAIN THIS OBJECT - the reference count on this * object will be adjusted by external callers. The intent @@ -305,9 +468,7 @@ ompi_proc_t** ompi_proc_all(size_t* size) } OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { /* We know this isn't consistent with the behavior in ompi_proc_world, * but we are leaving the RETAIN for now because the code using this function * assumes that the results need to be released when done. 
It will @@ -349,9 +510,7 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name ) /* return the proc-struct which matches this jobid+process id */ mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID; OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) { rproc = proc; break; @@ -366,21 +525,14 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name ) int ompi_proc_refresh(void) { ompi_proc_t *proc = NULL; - opal_list_item_t *item = NULL; ompi_vpid_t i = 0; int ret=OMPI_SUCCESS; - uint16_t u16, *u16ptr; OPAL_THREAD_LOCK(&ompi_proc_lock); - for( item = opal_list_get_first(&ompi_proc_list), i = 0; - item != opal_list_get_end(&ompi_proc_list); - item = opal_list_get_next(item), ++i ) { - proc = (ompi_proc_t*)item; - + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { /* Does not change: proc->super.proc_name.vpid */ OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = OMPI_PROC_MY_NAME->jobid; - u16ptr = &u16; /* Make sure to clear the local flag before we set it below */ proc->super.proc_flags = 0; @@ -392,56 +544,10 @@ int ompi_proc_refresh(void) proc->super.proc_arch = opal_local_arch; opal_proc_local_set(&proc->super); } else { - /* get the locality information - all RTEs are required - * to provide this information at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); - if (OPAL_SUCCESS != ret) { - proc->super.proc_flags = OPAL_PROC_NON_LOCAL; - } else { - proc->super.proc_flags = u16; + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; } - - if (ompi_process_info.num_procs < ompi_direct_modex_cutoff) { - /* IF the number of procs falls below the specified cutoff, - * then we assume the job is small enough that retrieving - * the hostname (which will typically cause retrieval of - * ALL modex info for this proc) will have no appreciable - * impact on launch scaling - */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, - (char**)&(proc->super.proc_hostname), OPAL_STRING); - if (OMPI_SUCCESS != ret) { - break; - } - } else { - /* just set the hostname to NULL for now - we'll fill it in - * as modex_recv's are called for procs we will talk to, thus - * avoiding retrieval of ALL modex info for this proc until - * required. Transports that delay calling modex_recv until - * first message will therefore scale better than those that - * call modex_recv on all procs during init. 
- */ - proc->super.proc_hostname = NULL; - } -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - { - /* get the remote architecture */ - uint32_t* uiptr = &(proc->super.proc_arch); - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, - (void**)&uiptr, OPAL_UINT32); - if (OMPI_SUCCESS != ret) { - break; - } - /* if arch is different than mine, create a new convertor for this proc */ - if (proc->super.proc_arch != opal_local_arch) { - OBJ_RELEASE(proc->super.proc_convertor); - proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); - } - } -#else - /* must be same arch as my own */ - proc->super.proc_arch = opal_local_arch; -#endif } } @@ -454,7 +560,7 @@ int ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_buffer_t* buf) { - int i, rc; + int rc; OPAL_THREAD_LOCK(&ompi_proc_lock); @@ -470,7 +576,7 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, * reduced. For now, just go ahead and pack the info so it * can be sent. */ - for (i=0; i < proclistsize; i++) { + for (int i = 0 ; i < proclistsize ; ++i) { rc = opal_dss.pack(buf, &(proclist[i]->super.proc_name), 1, OMPI_NAME); if(rc != OPAL_SUCCESS) { OMPI_ERROR_LOG(rc); @@ -503,9 +609,7 @@ ompi_proc_find_and_add(const ompi_process_name_t * name, bool* isnew) /* return the proc-struct which matches this jobid+process id */ mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID; OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) { rproc = proc; *isnew = false; @@ -538,7 +642,6 @@ ompi_proc_unpack(opal_buffer_t* buf, int proclistsize, ompi_proc_t ***proclist, int *newproclistsize, ompi_proc_t ***newproclist) { - int i; size_t newprocs_len = 0; ompi_proc_t **plist=NULL, **newprocs = NULL; @@ -558,7 +661,7 @@ ompi_proc_unpack(opal_buffer_t* buf, /* cycle through the array of provided procs and unpack * their info - as packed by ompi_proc_pack */ - for (i=0; i < proclistsize; i++){ + for (int i = 0 ; i < proclistsize ; ++i) { diff --git a/ompi/errhandler/errhandler.c b/ompi/errhandler/errhandler.c - *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i)->super.proc_name); + *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i, true)->super.proc_name); } } @@ -96,7 +99,7 @@ static void try_kill_peers(ompi_communicator_t *comm, for (i = 0; i < ompi_comm_remote_size(comm); ++i) { assert(count <= nprocs); procs[count++] = *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i, true)->super.proc_name); } if (nprocs > 0) { diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 0b9af10dba..8c23e17b18 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -400,6 +400,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) opal_compare_proc = _process_name_compare; opal_convert_string_to_process_name = _convert_string_to_process_name; opal_convert_process_name_to_string = _convert_process_name_to_string; + opal_proc_for_name = ompi_proc_for_name; /* Register MCA variables */ if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) { diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 021bf9b617..7a7305e150 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -64,6 +64,7 @@ int ompi_mpi_event_tick_rate = -1; char *ompi_mpi_show_mca_params_string = NULL; bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE); bool ompi_mpi_preconnect_mpi = false;
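+/* world-size threshold below which resources are pre-allocated for all peers;
+ * registered below as the "mpi_add_procs_cutoff" MCA variable */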
+uint32_t ompi_add_procs_cutoff = 1024; static bool show_default_mca_params = false; static bool show_file_mca_params = false; @@ -288,6 +289,16 @@ int ompi_mpi_register_params(void) ompi_rte_abort(1, NULL); } + ompi_add_procs_cutoff = 1024; + (void) mca_base_var_register ("ompi", "mpi", NULL, "add_procs_cutoff", + "Maximum world size for pre-allocating resources for all " + "remote processes. Increasing this limit may improve " + "communication performance at the cost of memory usage " + "(default: 1024)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, + 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, + &ompi_add_procs_cutoff); + + return OMPI_SUCCESS; } diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index e5edda3825..495f0f36fa 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. @@ -123,11 +124,16 @@ OMPI_DECLSPEC extern bool ompi_have_sparse_group_storage; */ OMPI_DECLSPEC extern bool ompi_use_sparse_group_storage; -/* +/** * Cutoff point for retrieving hostnames */ OMPI_DECLSPEC extern uint32_t ompi_direct_modex_cutoff; +/** + * Cutoff point for calling add_procs for all processes + */ +OMPI_DECLSPEC extern uint32_t ompi_add_procs_cutoff; + /** * Register MCA parameters used by the MPI layer. * diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 885a6fc0f4..b32a3d3c88 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -605,12 +605,15 @@ typedef int (*mca_btl_base_module_finalize_fn_t)( * modex_recv() function. The BTL may utilize this information to * determine reachability of each peer process. * - * For each process that is reachable by the BTL, the bit corresponding to the index - * into the proc array (nprocs) should be set in the reachable bitmask. The BTL - * will return an array of pointers to a data structure defined - * by the BTL that is then returned to the BTL on subsequent calls to the BTL data - * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing - * or connection information (e.g. TCP socket, IB queue pair). + * The caller may pass a "reachable" bitmap pointer. If it is not + * NULL, for each process that is reachable by the BTL, the bit + * corresponding to the index into the proc array (nprocs) should be + * set in the reachable bitmask. The BTL will return an array of + * pointers to a data structure defined by the BTL that is then + * returned to the BTL on subsequent calls to the BTL data transfer + * functions (e.g btl_send). This may be used by the BTL to cache any + * addressing or connection information (e.g. TCP socket, IB queue + * pair). 
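+ * A NULL reachable pointer is used by the on-demand, single-process
+ * lookups added in this change (for example mca_btl_openib_get_ep()
+ * below); in that case the BTL should still fill in the peers array
+ * for any process it accepts.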
diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 885a6fc0f4..b32a3d3c88 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h
@@ -605,12 +605,15 @@ typedef int (*mca_btl_base_module_finalize_fn_t)( * modex_recv() function. The BTL may utilize this information to * determine reachability of each peer process. * - * For each process that is reachable by the BTL, the bit corresponding to the index - * into the proc array (nprocs) should be set in the reachable bitmask. The BTL - * will return an array of pointers to a data structure defined - * by the BTL that is then returned to the BTL on subsequent calls to the BTL data - * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing - * or connection information (e.g. TCP socket, IB queue pair). + * The caller may pass a "reachable" bitmap pointer. If it is not + * NULL, for each process that is reachable by the BTL, the bit + * corresponding to the index into the proc array (nprocs) should be + * set in the reachable bitmask. The BTL will return an array of + * pointers to a data structure defined by the BTL that is then + * returned to the BTL on subsequent calls to the BTL data transfer + * functions (e.g btl_send). This may be used by the BTL to cache any + * addressing or connection information (e.g. TCP socket, IB queue + * pair). */ typedef int (*mca_btl_base_module_add_procs_fn_t)( struct mca_btl_base_module_t* btl,
diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 0cff59f2ab..9f82110cef 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c
@@ -871,6 +871,7 @@ int mca_btl_openib_add_procs( for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) { struct opal_proc_t* proc = procs[i]; mca_btl_openib_proc_t* ib_proc; + bool found_existing = false; int remote_matching_port; opal_output(-1, "add procs: adding proc %d", i);
@@ -898,6 +899,24 @@ int mca_btl_openib_add_procs( continue; } + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + found_existing = true; + break; + } + } + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + + if (found_existing) { + if (reachable) { + opal_bitmap_set_bit(reachable, i); + } + peers[i] = endpoint; + continue; + } + /* check if the remote proc has any ports that: - on the same subnet as the local proc, and - on that subnet, has a CPC in common with the local proc
@@ -1048,6 +1067,37 @@ int mca_btl_openib_add_procs( return OPAL_SUCCESS; } +struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc) +{ + mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl; + mca_btl_base_endpoint_t *endpoint; + mca_btl_openib_proc_t *ib_proc; + + if (NULL == (ib_proc = mca_btl_openib_proc_create(proc))) { + /* if we don't have connection info for this process, it's + * okay because some other method might be able to reach it, + * so just mark it as unreachable by us */ + return NULL; + } + + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + return endpoint; + } + } + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + + BTL_VERBOSE(("creating new endpoint for remote process {.jobid = 0x%x, .vpid = 0x%x}", + proc->proc_name.jobid, proc->proc_name.vpid)); + + endpoint = NULL; + (void) mca_btl_openib_add_procs (btl, 1, &proc, &endpoint, NULL); + return endpoint; +} + /* * delete the proc as reachable from this btl module */
diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 92506bc5d0..6e1c5ca50f 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h
@@ -874,6 +874,18 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp); const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type); +/** + * Get an endpoint for a process + * + * @param btl (IN) BTL module + * @param proc (IN) opal process object + * + * This function will return an existing endpoint if one exists; otherwise it will allocate + * a new endpoint and return it. + */ +struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, + struct opal_proc_t *proc); +
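mca_btl_openib_get_ep() is the find-or-create primitive this series leans on: the UDCM change further down resolves the sender from the name carried in the connect header and then asks the module for an endpoint. Roughly, with hdr standing in for the received udcm_msg_hdr_t and openib_btl for the module:

    /* sketch of the lookup path udcm_find_endpoint() now takes */
    struct opal_proc_t *remote = opal_proc_for_name (hdr->data.req.rem_name);
    mca_btl_base_endpoint_t *ep = NULL;
    if (NULL != remote) {
        /* returns the existing endpoint for this module, or creates one
         * through mca_btl_openib_add_procs() with a NULL reachable map */
        ep = mca_btl_openib_get_ep (&openib_btl->super, remote);
    }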
/** * Get a transport type of btl. */
diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 8cc9384417..d366443549 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c
@@ -565,7 +565,8 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024; mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024; mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; + MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | + MCA_BTL_FLAGS_SEND; #if BTL_OPENIB_FAILOVER_ENABLED mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT; #endif
diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index f438dfcdc8..6074473c05 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
@@ -218,6 +218,7 @@ typedef struct udcm_msg_hdr { union { /* UDCM_MESSAGE_CONNECT */ struct msg_connect { + opal_process_name_t rem_name; int32_t rem_ep_index; uint8_t rem_port_num; } req;
@@ -1473,36 +1474,26 @@ static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep) /* JMS: optimization target -- can we send something in private data to find the proc directly instead of having to search through *all* procs? */ -static mca_btl_openib_endpoint_t *udcm_find_endpoint (opal_pointer_array_t *endpoints, +static mca_btl_openib_endpoint_t *udcm_find_endpoint (struct mca_btl_openib_module_t *btl, uint32_t qp_num, uint16_t lid, udcm_msg_hdr_t *msg_hdr) { - uint8_t port_num; - int i; + mca_btl_base_endpoint_t *endpoint; + struct opal_proc_t *opal_proc; - port_num = msg_hdr->data.req.rem_port_num; - - for (i = 0 ; i < opal_pointer_array_get_size (endpoints) ; ++i) { - mca_btl_openib_endpoint_t *endpoint; - modex_msg_t *msg; - - endpoint = (mca_btl_openib_endpoint_t *) - opal_pointer_array_get_item (endpoints, i); - if (NULL == endpoint) { - continue; - } - - msg = UDCM_ENDPOINT_REM_MODEX(endpoint); - - if (msg->mm_qp_num == qp_num && msg->mm_port_num == port_num && - msg->mm_lid == lid) - return endpoint; + opal_proc = opal_proc_for_name (msg_hdr->data.req.rem_name); + if (NULL == opal_proc) { + BTL_ERROR(("could not get proc associated with remote peer")); + return NULL; } - BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d", - port_num, lid, msg_hdr->type)); + endpoint = mca_btl_openib_get_ep (&btl->super, opal_proc); + if (NULL == endpoint) { + BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d", + msg_hdr->data.req.rem_port_num, lid, msg_hdr->type)); + } - return NULL; + return endpoint; } static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep)
@@ -1678,6 +1669,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; + msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; for (i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) { msg->data->qps[i].psn = htonl(lcl_ep->qps[i].qp->lcl_psn);
@@ -1981,8 +1973,7 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m) lcl_ep = message->hdr.lcl_ep; if (NULL == lcl_ep) { - lcl_ep = udcm_find_endpoint (m->btl->device->endpoints, wc[i].src_qp, - wc[i].slid, &message->hdr); + lcl_ep =
udcm_find_endpoint (m->btl, wc[i].src_qp, wc[i].slid, &message->hdr); } if (NULL == lcl_ep ) {
@@ -2824,6 +2815,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_ msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; + msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; if (UDCM_MESSAGE_XCONNECT == msg_type) { BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num,
diff --git a/opal/mca/btl/portals4/btl_portals4_component.c b/opal/mca/btl/portals4/btl_portals4_component.c index 94b4dd3023..8e4f2864f1 100644 --- a/opal/mca/btl/portals4/btl_portals4_component.c +++ b/opal/mca/btl/portals4/btl_portals4_component.c
@@ -221,7 +221,8 @@ mca_btl_portals4_component_open(void) mca_btl_portals4_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_portals4_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_RDMA_MATCHED; + MCA_BTL_FLAGS_RDMA_MATCHED | + MCA_BTL_FLAGS_SEND; mca_btl_portals4_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
diff --git a/opal/mca/btl/self/btl_self_component.c b/opal/mca/btl/self/btl_self_component.c index 78e400ea6e..42ea125d44 100644 --- a/opal/mca/btl/self/btl_self_component.c +++ b/opal/mca/btl/self/btl_self_component.c
@@ -98,7 +98,7 @@ static int mca_btl_self_component_register(void) mca_btl_self.btl_rdma_pipeline_send_length = INT_MAX; mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_min_rdma_pipeline_size = 0; - mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; + mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_latency = 0; mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,
diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index c14d655f9b..87e5b0ef15 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c
@@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$
@@ -72,6 +72,7 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, struct opal_proc_t* opal_proc = procs[i]; mca_btl_tcp_proc_t* tcp_proc; mca_btl_base_endpoint_t* tcp_endpoint; + bool existing_found = false; /* Do not create loopback TCP connections */ if( my_proc == opal_proc ) {
@@ -90,28 +91,43 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&tcp_proc->proc_lock); - /* The btl_proc datastructure is shared by all TCP BTL - * instances that are trying to reach this destination. - * Cache the peer instance on the btl_proc.
- */ - tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); - if(NULL == tcp_endpoint) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - return OPAL_ERR_OUT_OF_RESOURCE; + for (int j = 0 ; j < tcp_proc->proc_endpoint_count ; ++j) { + tcp_endpoint = tcp_proc->proc_endpoints[j]; + if (tcp_endpoint->endpoint_btl == tcp_btl) { + existing_found = true; + break; + } } - tcp_endpoint->endpoint_btl = tcp_btl; - rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); - if(rc != OPAL_SUCCESS) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - OBJ_RELEASE(tcp_endpoint); - continue; + if (!existing_found) { + /* The btl_proc datastructure is shared by all TCP BTL + * instances that are trying to reach this destination. + * Cache the peer instance on the btl_proc. + */ + tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); + if(NULL == tcp_endpoint) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + tcp_endpoint->endpoint_btl = tcp_btl; + rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); + if(rc != OPAL_SUCCESS) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + OBJ_RELEASE(tcp_endpoint); + continue; + } + + opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); } - opal_bitmap_set_bit(reachable, i); OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + + if (NULL != reachable) { + opal_bitmap_set_bit(reachable, i); + } + peers[i] = tcp_endpoint; - opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); /* we increase the count of MPI users of the event library once per peer, so that we are used until we aren't
diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 4332d2d74c..a43d6453d0 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -269,7 +269,8 @@ static int mca_btl_tcp_component_register(void) MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | - MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; + MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | + MCA_BTL_FLAGS_SEND; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100;
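The btl_tcp_proc.c hunk that follows gives TCP the same on-demand behavior: looking up a peer that was never passed to add_procs() now bootstraps its proc and per-module endpoints instead of failing. From the caller's perspective (a sketch; peer_name would come from the connection handshake):

    mca_btl_tcp_proc_t *tcp_proc = mca_btl_tcp_proc_lookup (&peer_name);
    if (NULL == tcp_proc) {
        /* peer is neither known nor resolvable via opal_proc_for_name() */
        return OPAL_ERR_UNREACH;
    }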
diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index c86977dde3..c0d3399fb8 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c
@@ -14,7 +14,9 @@ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow
@@ -738,6 +740,31 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name) opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, *name, (void**)&proc); OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_btl_base_endpoint_t *endpoint; + opal_proc_t *opal_proc; + int rc; + + BTL_VERBOSE(("adding tcp proc for unknown peer {.jobid = 0x%x, .vpid = 0x%x}", + name->jobid, name->vpid)); + + opal_proc = opal_proc_for_name (*name); + if (NULL == opal_proc) { + return NULL; + } + + /* try adding this proc to each btl */ + for (int i = 0 ; i < mca_btl_tcp_component.tcp_num_btls ; ++i) { + endpoint = NULL; + (void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc, + &endpoint, NULL); + if (NULL != endpoint && NULL == proc) { + /* get the proc and continue on (could probably just break here) */ + proc = endpoint->endpoint_proc; + } + } + } + return proc; }
diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index 0fad0465bb..e6d9634f58 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h
@@ -49,7 +49,7 @@ /* ompi and smsg endpoint attributes */ typedef struct mca_btl_ugni_endpoint_attr_t { - uint64_t proc_id; + opal_process_name_t proc_name; uint32_t index; gni_smsg_attr_t smsg_attr; gni_mem_handle_t rmt_irq_mem_hndl;
@@ -67,6 +67,7 @@ typedef struct mca_btl_ugni_module_t { opal_common_ugni_device_t *device; + opal_mutex_t endpoint_lock; size_t endpoint_count; opal_pointer_array_t endpoints; opal_hash_table_t id_to_endpoint;
@@ -229,6 +230,8 @@ mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers); +struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc); + /** * Initiate an asynchronous send.
*
diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index fbeff5b5f0..8d7f571e7c 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c
@@ -28,13 +28,11 @@ static void mca_btl_ugni_module_set_max_reg (mca_btl_ugni_module_t *ugni_module, int nlocal_procs); static int mca_btl_ugni_smsg_setup (int nprocs); -int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t **peers, - opal_bitmap_t *reachable) { +int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs, + struct opal_proc_t **procs, + struct mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; - size_t i; int rc; void *mmap_start_addr;
@@ -59,36 +57,45 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, } } - for (i = 0 ; i < nprocs ; ++i) { + for (size_t i = 0 ; i < nprocs ; ++i) { struct opal_proc_t *opal_proc = procs[i]; uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); - if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { - ugni_module->nlocal_procs++; + /* check for an existing endpoint */ + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) (peers + i))) { + if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { + ugni_module->nlocal_procs++; - /* ugni is allowed on local processes to provide support for network - * atomic operations */ + /* ugni is allowed on local processes to provide support for network + * atomic operations */ + } + + /* Create and Init endpoints */ + rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + BTL_ERROR(("btl/ugni error initializing endpoint")); + return rc; + } + + /* go ahead and connect the local endpoint for RDMA/CQ write */ + if (opal_proc == opal_proc_local_get ()) { + ugni_module->local_ep = peers[i]; + } + + /* Add this endpoint to the pointer array. */ + BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); + opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); + + ++ugni_module->endpoint_count; } + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); - /* Create and Init endpoints */ - rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("btl/ugni error initializing endpoint")); - return rc; + /* Set the reachable bit if necessary */ + if (reachable) { + rc = opal_bitmap_set_bit (reachable, i); } - - /* go ahead and connect the local endpoint for RDMA/CQ write */ - if (opal_proc == opal_proc_local_get ()) { - ugni_module->local_ep = peers[i]; - } - - /* Add this endpoint to the pointer array.
*/ - BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); - opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); - - /* Set the reachable bit */ - rc = opal_bitmap_set_bit (reachable, i); - ++ugni_module->endpoint_count; } mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs);
@@ -224,6 +231,41 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } + +struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) module; + uint64_t proc_id = mca_btl_ugni_proc_name_to_id(proc->proc_name); + mca_btl_base_endpoint_t *ep = NULL; + int rc; + + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + + do { + rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); + if (OPAL_SUCCESS == rc) { + break; + } + + /* Create and Init endpoints */ + rc = mca_btl_ugni_init_ep (ugni_module, &ep, ugni_module, proc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("btl/ugni error initializing endpoint")); + ep = NULL; + break; + } + + /* Add this endpoint to the pointer array. */ + BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) ep)); + opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, ep); + } while (0); + + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + + return ep; +} + + static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, mca_mpool_base_registration_t *reg) {
diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 5941e18417..1248f2f1c3 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c
@@ -386,8 +386,8 @@ mca_btl_ugni_component_init (int *num_btl_modules, static inline int mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) { + uint64_t datagram_id, data, proc_id; uint32_t remote_addr, remote_id; - uint64_t datagram_id, data; mca_btl_base_endpoint_t *ep; gni_post_state_t post_state; gni_ep_handle_t handle;
@@ -425,15 +425,24 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */ if (handle == ugni_module->wildcard_ep) { - BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, ugni_module->wc_remote_attr.proc_id)); - rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, - ugni_module->wc_remote_attr.proc_id, - (void *) &ep); + proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name); + + BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, + proc_id)); + + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + /* check if the endpoint is known */ if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) { - BTL_ERROR(("received connection attempt from an unknown peer.
rc: %d, ep: %p, id: 0x%" PRIx64, - rc, (void *) ep, ugni_module->wc_remote_attr.proc_id)); - return OPAL_ERR_NOT_FOUND; + struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name); + BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vpid = 0x%x}", + ugni_module->wc_remote_attr.proc_name.jobid, ugni_module->wc_remote_attr.proc_name.vpid)); + ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc); + if (OPAL_UNLIKELY(NULL == ep)) { + return rc; + } } } else { BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 4977659fc1..7f008c607f 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c
@@ -91,6 +91,7 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock,opal_mutex_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t); + OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t); OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t); OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
@@ -208,6 +209,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) OBJ_DESTRUCT(&ugni_module->smsg_mboxes); OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb); OBJ_DESTRUCT(&ugni_module->id_to_endpoint); + OBJ_DESTRUCT(&ugni_module->endpoint_lock); OBJ_DESTRUCT(&ugni_module->endpoints); OBJ_DESTRUCT(&ugni_module->eager_get_pending);
diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c index f4f255edfb..5d9ea1eef6 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.c +++ b/opal/mca/btl/ugni/btl_ugni_smsg.c
@@ -27,7 +27,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { mbox->attr.smsg_attr.msg_buffer = base_reg->base; mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size; mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle; - mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME); + mbox->attr.proc_name = OPAL_PROC_MY_NAME; mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; }
diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 283d794614..33904eab34 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c
@@ -427,7 +427,7 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, /* Find all the endpoints with a complete set of USD destinations and mark them as reachable */ - for (size_t i = 0; i < nprocs; ++i) { + for (size_t i = 0; NULL != reachable && i < nprocs; ++i) { if (NULL != endpoints[i]) { bool happy = true; for (int channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
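A side effect of this rework is that btl_flags assignments now spell out send capability: the hunks above for openib, portals4, self, and tcp add MCA_BTL_FLAGS_SEND explicitly, and the vader hunk below restructures its flag setup the same way, establishing the baseline flags first and OR-ing in RDMA only when a single-copy mechanism is available. The repeated pattern, sketched generically (have_single_copy is a hypothetical condition):

    /* pattern adopted across the BTL components in this patch */
    module->super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND;
    if (have_single_copy) {
        module->super.btl_flags |= MCA_BTL_FLAGS_RDMA;
    }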
diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 79f12aa70b..2f46785ff4 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c
@@ -239,8 +239,10 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; + if (MCA_BTL_VADER_NONE != mca_btl_vader_component.single_copy_mechanism) { - mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; + mca_btl_vader.super.btl_flags |= MCA_BTL_FLAGS_RDMA; /* Single copy mechanisms should provide better bandwidth */ mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
@@ -248,7 +250,6 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_get = (mca_btl_base_module_get_fn_t) mca_btl_vader_dummy_rdma; mca_btl_vader.super.btl_put = (mca_btl_base_module_get_fn_t) mca_btl_vader_dummy_rdma; } else { - mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE; mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */ }
diff --git a/opal/util/proc.c b/opal/util/proc.c index fa9ac41c5d..5fba5fd6a3 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c
@@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 The University of Tennessee and The University * of Tennessee Research Foundation. All rights
@@ -6,6 +7,8 @@ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow
@@ -162,6 +165,11 @@ static int opal_convert_string_to_jobid_should_never_be_called(opal_jobid_t *job return OPAL_ERR_NOT_SUPPORTED; } +static struct opal_proc_t *opal_proc_for_name_should_never_be_called (opal_process_name_t name) +{ + return NULL; +} + char* (*opal_process_name_print)(const opal_process_name_t) = opal_process_name_print_should_never_be_called; char* (*opal_vpid_print)(const opal_vpid_t) = opal_vpid_print_should_never_be_called; char* (*opal_jobid_print)(const opal_jobid_t) = opal_jobid_print_should_never_be_called;
@@ -169,6 +177,7 @@ int (*opal_convert_string_to_process_name)(opal_process_name_t *name, const char int (*opal_convert_process_name_to_string)(char** name_string, const opal_process_name_t *name) = opal_convert_process_name_to_string_should_never_be_called; char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid) = opal_convert_jobid_to_string_should_never_be_called; int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string) = opal_convert_string_to_jobid_should_never_be_called; +struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opal_proc_for_name_should_never_be_called; char* opal_get_proc_hostname(const opal_proc_t *proc) {
diff --git a/opal/util/proc.h b/opal/util/proc.h index 9c642c932c..250430ba3c 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h
@@ -136,6 +136,13 @@ OMPI_DECLSPEC extern char* (*opal_jobid_print)(const opal_jobid_t); OMPI_DECLSPEC extern char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid); OMPI_DECLSPEC extern int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string); +/** + * Lookup an opal_proc_t by name + * + * @param name (IN) name to lookup + */ +OPAL_DECLSPEC extern struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name); + #define OPAL_NAME_PRINT(OPAL_PN) opal_process_name_print(OPAL_PN) #define OPAL_JOBID_PRINT(OPAL_PN) opal_jobid_print(OPAL_PN) #define OPAL_VPID_PRINT(OPAL_PN) opal_vpid_print(OPAL_PN)
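Like the other opal_* conversion hooks above it, opal_proc_for_name is exposed as a function pointer because OPAL sits below OMPI in the layering and cannot reference ompi_proc_for_name() directly; the MPI layer installs the real implementation during ompi_mpi_init() (see the ompi_mpi_init.c hunk earlier in this patch). The default stub simply returns NULL, so a BTL that probes for an unknown peer before the hook is wired up just sees an unreachable process rather than a crash.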
diff --git a/oshmem/mca/scoll/mpi/scoll_mpi_module.c b/oshmem/mca/scoll/mpi/scoll_mpi_module.c index d50d842fa4..c933512b33 100644 --- a/oshmem/mca/scoll/mpi/scoll_mpi_module.c +++ b/oshmem/mca/scoll/mpi/scoll_mpi_module.c
@@ -113,6 +113,8 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) if (NULL == oshmem_group_all) { osh_group->ompi_comm = &(ompi_mpi_comm_world.comm); } else { + int my_rank = MPI_UNDEFINED; + err = ompi_comm_group(&(ompi_mpi_comm_world.comm), &parent_group); if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { return NULL;
@@ -132,6 +134,10 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) break; } } + /* NTH: keep track of my rank in the new group for the workaround below */ + if (ranks[i] == ompi_comm_rank (&ompi_mpi_comm_world.comm)) { + my_rank = i; + } } err = ompi_group_incl(parent_group, osh_group->proc_count, ranks, &new_group);
@@ -139,6 +145,15 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) free(ranks); return NULL; } + + /* NTH: XXX -- WORKAROUND -- The oshmem code overwrites ompi_proc_local_proc with its + * own proc but does not update the proc list in comm world or comm self. That breaks + * the code in ompi_group_incl that updates grp_my_rank, and it will break any + * application that mixes oshmem and mpi, so the real fix belongs in oshmem/proc, not + * here. For now, to work around a new jenkins failure, set my group rank explicitly + * so we do not crash when running ompi_comm_create_group. */ + new_group->grp_my_rank = my_rank; + err = ompi_comm_create_group(&(ompi_mpi_comm_world.comm), new_group, tag, &newcomm); if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { free(ranks);