diff --git a/ompi/debuggers/mpi_interface.h b/ompi/debuggers/mpi_interface.h index 0a5d4284c5..66403aa9da 100644 --- a/ompi/debuggers/mpi_interface.h +++ b/ompi/debuggers/mpi_interface.h @@ -441,7 +441,7 @@ typedef struct mqs_process_callbacks { mqs_get_global_rank_ft mqs_get_global_rank_fp; mqs_get_image_ft mqs_get_image_fp; - mqs_fetch_data_ft mqs_fetch_data_fp; + mqs_fetch_data_ft mqs_fetch_data_fp; mqs_target_to_host_ft mqs_target_to_host_fp; #if (FOR_MPI2) mqs_get_process_job_ft mqs_get_process_job_fp; diff --git a/ompi/debuggers/ompi_dll.c b/ompi/debuggers/ompi_dll.c index 1d2a63bdd6..16da68f28f 100644 --- a/ompi/debuggers/ompi_dll.c +++ b/ompi/debuggers/ompi_dll.c @@ -99,13 +99,13 @@ /** * The internal debugging interface. */ -#define VERBOSE_GENERAL 1 +#define VERBOSE_GENERAL 2 #define VERBOSE_GROUP 10 #define VERBOSE_COMM 10 #define VERBOSE_LISTS 10 #define VERBOSE_REQ 50 -#define VERBOSE 0 +#define VERBOSE 1 #if VERBOSE #define DEBUG(LEVEL, WHAT) if( (LEVEL) > VERBOSE ) { printf WHAT; } #else @@ -118,8 +118,6 @@ */ static const mqs_basic_callbacks *mqs_basic_entrypoints; static int host_is_big_endian; -static int world_proc_array_entries = 0; -static mqs_taddr_t* world_proc_array = NULL; void mqs_setup_basic_callbacks (const mqs_basic_callbacks * cb) { @@ -286,15 +284,15 @@ static mqs_tword_t fetch_bool(mqs_process * proc, mqs_taddr_t addr, mpi_process_ } /* fetch_bool */ /***********************************************************************/ -static mqs_tword_t fetch_size_t(mqs_process * proc, mqs_taddr_t addr, mpi_process_info *p_info) +static mqs_taddr_t fetch_size_t(mqs_process * proc, mqs_taddr_t addr, mpi_process_info *p_info) { int isize = p_info->sizes.size_t_size; char buffer[8]; /* ASSUME the type fits in 8 bytes */ - mqs_tword_t res = 0; + mqs_taddr_t res = 0; if (mqs_ok == mqs_fetch_data (proc, addr, isize, &buffer)) mqs_target_to_host (proc, buffer, - ((char *)&res) + (host_is_big_endian ? sizeof(mqs_tword_t)-isize : 0), + ((char *)&res) + (host_is_big_endian ? sizeof(mqs_taddr_t)-isize : 0), isize); return res; @@ -322,14 +320,13 @@ static group_t * find_or_create_group( mqs_process *proc, mqs_taddr_t table ) { mpi_process_info *p_info = (mpi_process_info *)mqs_get_process_info (proc); - mqs_image * image = mqs_get_image (proc); + mqs_image * image = mqs_get_image (proc); mpi_image_info *i_info = (mpi_image_info *)mqs_get_image_info (image); - communicator_t *comm = p_info->communicator_list; + communicator_t *comm = p_info->communicator_list; int *tr; char *trbuffer; - int i; + int i, np; group_t *group; - int np; mqs_taddr_t value; np = fetch_int( proc, @@ -342,7 +339,7 @@ static group_t * find_or_create_group( mqs_process *proc, /* Iterate over each communicator seeing if we can find this group */ for (;comm; comm = comm->next) { group = comm->group; - if (group && group->table_base == table) { + if( group && (group->group_base == table) ) { group->ref_count++; /* Someone else is interested */ DEBUG(VERBOSE_GROUP, ("Increase refcount for group 0x%p to %d\n", (void*)group, group->ref_count) ); @@ -355,7 +352,7 @@ static group_t * find_or_create_group( mqs_process *proc, tr = (int *)mqs_malloc (np*sizeof(int)); trbuffer = (char *)mqs_malloc (np*sizeof(mqs_taddr_t)); group->local_to_global = tr; - group->table_base = table; + group->group_base = table; DEBUG(VERBOSE_GROUP, ("Create a new group 0x%p with %d members\n", (void*)group, np) ); @@ -373,15 +370,15 @@ static group_t * find_or_create_group( mqs_process *proc, * structure. By comparing this pointers to the MPI_COMM_WORLD group * we can figure out the global rank in the MPI_COMM_WORLD of the process. */ - if( NULL == world_proc_array ) { - world_proc_array = mqs_malloc( np * sizeof(mqs_taddr_t) ); + if( NULL == p_info->world_proc_array ) { + p_info->world_proc_array = mqs_malloc( np * sizeof(mqs_taddr_t) ); for( i = 0; i < np; i++ ) { mqs_target_to_host( proc, trbuffer + p_info->sizes.pointer_size*i, &value, p_info->sizes.pointer_size ); - world_proc_array[i] = value; + p_info->world_proc_array[i] = value; group->local_to_global[i] = i; } - world_proc_array_entries = np; + p_info->world_proc_array_entries = np; } else { int j; @@ -389,8 +386,8 @@ static group_t * find_or_create_group( mqs_process *proc, mqs_target_to_host( proc, trbuffer + p_info->sizes.pointer_size*i, &value, p_info->sizes.pointer_size ); /* get the global rank this MPI process */ - for( j = 0; j < world_proc_array_entries; j++ ) { - if( value == world_proc_array[j] ) { + for( j = 0; j < p_info->world_proc_array_entries; j++ ) { + if( value == p_info->world_proc_array[j] ) { group->local_to_global[i] = j; break; } @@ -465,7 +462,7 @@ int mqs_image_has_queues (mqs_image *image, char **message) } /** - * Open MPI use a bunch of lists in order to kep track of the internal + * Open MPI use a bunch of lists in order to keep track of the internal * objects. We have to make sure we're able to find all of them in the image * and compute their ofset in order to be able to parse them later. * We need to find the opal_list_item_t, the opal_list_t, the ompi_free_list_item_t, @@ -643,6 +640,10 @@ int mqs_image_has_queues (mqs_image *image, char **message) i_info->ompi_communicator_t.offset.c_contextid = mqs_field_offset(qh_type, "c_contextid"); i_info->ompi_communicator_t.offset.c_my_rank = mqs_field_offset(qh_type, "c_my_rank" ); i_info->ompi_communicator_t.offset.c_local_group = mqs_field_offset(qh_type, "c_local_group" ); + printf( "Communicator structure size %d, offset c_contextid %d offset c_my_rank %d\n", + i_info->ompi_communicator_t.size, + i_info->ompi_communicator_t.offset.c_contextid, + i_info->ompi_communicator_t.offset.c_my_rank ); } { mqs_type* qh_type = mqs_find_type( image, "ompi_group_t", mqs_lang_c ); @@ -695,13 +696,13 @@ int mqs_setup_process (mqs_process *process, const mqs_process_callbacks *pcb) if (p_info) { mqs_image *image; - mpi_image_info *i_info; + mpi_image_info *i_info; p_info->process_callbacks = pcb; /* Now we can get the rest of the info ! */ image = mqs_get_image (process); - i_info = (mpi_image_info *)mqs_get_image_info (image); + i_info = (mpi_image_info *)mqs_get_image_info (image); /* We have no communicators yet */ p_info->communicator_list = NULL; @@ -711,6 +712,9 @@ int mqs_setup_process (mqs_process *process, const mqs_process_callbacks *pcb) /* By default we don't show our internal requests*/ p_info->show_internal_requests = 0; + p_info->world_proc_array_entries = 0; + p_info->world_proc_array = NULL; + mqs_get_type_sizes (process, &p_info->sizes); /** * Before going any further make sure we know exactly how the Open MPI @@ -771,8 +775,8 @@ int mqs_setup_process (mqs_process *process, const mqs_process_callbacks *pcb) int mqs_process_has_queues (mqs_process *proc, char **msg) { mpi_process_info *p_info = (mpi_process_info *)mqs_get_process_info (proc); - mqs_image * image = mqs_get_image (proc); - mpi_image_info *i_info = (mpi_image_info *)mqs_get_image_info (image); + mqs_image * image = mqs_get_image (proc); + mpi_image_info *i_info = (mpi_image_info *)mqs_get_image_info (image); /* Don't bother with a pop up here, it's unlikely to be helpful */ *msg = 0; @@ -810,13 +814,13 @@ static int communicators_changed (mqs_process *proc) p_info ); if( (lowest_free != p_info->comm_lowest_free) || (number_free != p_info->comm_number_free) ) { - p_info->comm_lowest_free = lowest_free; - p_info->comm_number_free = number_free; DEBUG(VERBOSE_COMM, ("Recreate the communicator list\n" " lowest_free [current] %d != [stored] %d\n" " number_free [current] %d != [stored] %d\n", (int)lowest_free, (int)p_info->comm_lowest_free, (int)number_free, (int)p_info->comm_number_free) ); + p_info->comm_lowest_free = lowest_free; + p_info->comm_number_free = number_free; return 1; } DEBUG(VERBOSE_COMM, ("Communicator list not modified\n") ); @@ -859,20 +863,19 @@ static int compare_comms (const void *a, const void *b) static int rebuild_communicator_list (mqs_process *proc) { mpi_process_info *p_info = (mpi_process_info *)mqs_get_process_info (proc); - mqs_image * image = mqs_get_image (proc); + mqs_image * image = mqs_get_image (proc); mpi_image_info *i_info = (mpi_image_info *)mqs_get_image_info (image); communicator_t **commp, *old; - int i, commcount = 0; + int i, commcount = 0, context_id, local_rank; mqs_tword_t comm_size, lowest_free, number_free; - mqs_taddr_t comm_addr_base = p_info->commlist_base + i_info->ompi_pointer_array_t.offset.addr; + mqs_taddr_t comm_addr_base; mqs_taddr_t comm_ptr; - mqs_communicator remote_comm; DEBUG(VERBOSE_COMM,("rebuild_communicator_list called " "(commlist_base %lx, array offset %ld array size %d)\n", p_info->commlist_base, (long)i_info->ompi_pointer_array_t.offset.addr, - i_info->ompi_pointer_array_t.offset.size)); + i_info->ompi_pointer_array_t.size)); /** * Start by getting the number of registered communicators in the * global communicator array. @@ -895,58 +898,69 @@ static int rebuild_communicator_list (mqs_process *proc) * We can use the fact that MPI_COMM_WORLD is at index 0 to force the * creation of the world_proc_array. */ - world_proc_array_entries = 0; - mqs_free( world_proc_array ); - world_proc_array = NULL; + p_info->world_proc_array_entries = 0; + mqs_free( p_info->world_proc_array ); + p_info->world_proc_array = NULL; - /* Now get the pointer to the first communicator pointer */ - comm_addr_base = fetch_pointer( proc, comm_addr_base, p_info ); + /* Now get the pointer to the array of pointers to communicators */ + comm_addr_base = + fetch_pointer( proc, + p_info->commlist_base + i_info->ompi_pointer_array_t.offset.addr, + p_info ); + DEBUG(VERBOSE_COMM,("Array of communicators starting at 0x%llx (sizeof(void*) = %d)\n", + (long long)comm_addr_base, (int)sizeof(mqs_taddr_t))); for( i = 0; (commcount < (comm_size - number_free)) && (i < comm_size); i++ ) { /* Get the communicator pointer */ comm_ptr = fetch_pointer( proc, comm_addr_base + i * p_info->sizes.pointer_size, p_info ); + DEBUG(VERBOSE_GENERAL,("Fetch communicator pointer 0x%llx\n", (long long)comm_ptr)); if( 0 == comm_ptr ) continue; commcount++; /* Now let's grab the data we want from inside */ - remote_comm.unique_id = fetch_int( proc, - comm_ptr + i_info->ompi_communicator_t.offset.c_contextid, - p_info ); - remote_comm.local_rank = fetch_int( proc, - comm_ptr + i_info->ompi_communicator_t.offset.c_my_rank, - p_info ); - mqs_fetch_data( proc, comm_ptr + i_info->ompi_communicator_t.offset.c_name, - 64, remote_comm.name ); + DEBUG(VERBOSE_GENERAL, ("Retrieve context_id from 0x%llx and local_rank from 0x%llx\n", + comm_ptr + i_info->ompi_communicator_t.offset.c_contextid, + comm_ptr + i_info->ompi_communicator_t.offset.c_my_rank)); + context_id = fetch_int( proc, + comm_ptr + i_info->ompi_communicator_t.offset.c_contextid, + p_info ); + local_rank = fetch_int( proc, + comm_ptr + i_info->ompi_communicator_t.offset.c_my_rank, + p_info ); /* Do we already have this communicator ? */ - old = find_communicator(p_info, remote_comm.unique_id); + old = find_communicator(p_info, context_id); if( NULL == old ) { mqs_taddr_t group_base; old = (communicator_t *)mqs_malloc (sizeof (communicator_t)); /* Save the results */ - old->next = p_info->communicator_list; + old->next = p_info->communicator_list; p_info->communicator_list = old; - old->comm_ptr = comm_ptr; - old->recv_context = remote_comm.unique_id; + old->comm_ptr = comm_ptr; + old->recv_context = context_id; + old->comm_info.local_rank = local_rank; + DEBUG(VERBOSE_COMM,("Create new communicator 0x%llX with context_id %d and local_rank %d\n", + (long long)old, context_id, local_rank)); /* Now get the information about the group */ group_base = fetch_pointer( proc, comm_ptr + i_info->ompi_communicator_t.offset.c_local_group, p_info ); old->group = find_or_create_group( proc, group_base ); } - strncpy(old->comm_info.name, remote_comm.name, 64); - old->comm_info.unique_id = remote_comm.unique_id; - old->comm_info.local_rank = remote_comm.local_rank; + mqs_fetch_data( proc, comm_ptr + i_info->ompi_communicator_t.offset.c_name, + 64, old->comm_info.name ); + assert( old->comm_info.unique_id == remote_comm.unique_id); + assert( old->comm_info.local_rank == remote_comm.local_rank); if( NULL != old->group ) { old->comm_info.size = old->group->entries; } old->present = TRUE; - DEBUG(VERBOSE_COMM,("Communicator %d local_rank %d name %s\n", - (int)old->comm_info.unique_id, (int)old->comm_info.local_rank, - old->comm_info.name)); + DEBUG(VERBOSE_COMM,("Communicator 0x%llx %d local_rank %d name %s\n", + (long long)old->comm_ptr, (int)old->comm_info.unique_id, + (int)old->comm_info.local_rank, old->comm_info.name)); } /* Now iterate over the list tidying up any communicators which diff --git a/ompi/debuggers/ompi_dll_defs.h b/ompi/debuggers/ompi_dll_defs.h index 791bdb0de0..827b0c2521 100644 --- a/ompi/debuggers/ompi_dll_defs.h +++ b/ompi/debuggers/ompi_dll_defs.h @@ -167,54 +167,6 @@ typedef struct int _cancelled; } offset; } ompi_status_public_t; - /* Fields in MPID_QHDR */ - int unexpected_offs; - - /* Fields in MPID_QUEUE */ - int first_offs; - - /* Fields in MPID_QEL */ - int context_id_offs; - int tag_offs; - int tagmask_offs; - int lsrc_offs; - int srcmask_offs; - int next_offs; - int ptr_offs; - - /* Fields in MPIR_SQEL */ - int db_shandle_offs; - int db_comm_offs; - int db_target_offs; - int db_tag_offs; - int db_data_offs; - int db_byte_length_offs; - int db_next_offs; - - /* Fields in MPIR_RHANDLE */ - int is_complete_offs; - int buf_offs; - int len_offs; - int datatype_offs; - int comm_offs; - int start_offs; - - /* in the embedded MPI_Status object */ - int count_offs; - int MPI_SOURCE_offs; - int MPI_TAG_offs; - - /* Fields in MPIR_Comm_list */ - int sequence_number_offs; - int comm_first_offs; - - /* Fields in MPIR_COMMUNICATOR */ - int np_offs; - int lrank_to_grank_offs; - int send_context_offs; - int recv_context_offs; - int comm_next_offs; - int comm_name_offs; } mpi_image_info; /*********************************************************************** @@ -223,10 +175,10 @@ typedef struct typedef struct group_t { - mqs_taddr_t table_base; /* Where was it in the process */ - int ref_count; /* How many references to us */ - int entries; /* How many entries */ - int *local_to_global; /* The translation table */ + mqs_taddr_t group_base; /* Where was it in the process */ + int ref_count; /* How many references to us */ + int entries; /* How many entries */ + int* local_to_global; /* The translation table */ } group_t; /* Internal structure we hold for each communicator */ @@ -284,6 +236,9 @@ typedef struct mqs_tword_t show_internal_requests; /* show or not the Open MPI internal requests */ /* State for the iterators */ struct communicator_t *current_communicator; /* Easy, we're walking a simple list */ + + int world_proc_array_entries; + mqs_taddr_t* world_proc_array; mqs_ompi_free_list_t_pos next_msg; /* And state for the message iterator */ mqs_op_class what; /* What queue are we looking on */