1
1

Two changes to improve the sm situation with spawn:

  * Have the mpool size be based on the procs in MCW
    (MPI_COMM_WORLD), not on all the procs we know about,
    which includes procs in other jobs.  This solves the
    problem of the spawned job having a much bigger sm
    file than needed.
  * We can't assume that "me" is in the list of procs
    passed to add_procs, so we need slightly different
    logic, and we don't go through all of add_procs unless
    there's a proc in my job that isn't me.

This seems to greatly improve the situation, although
MPI_INIT still seems to be slower for 'n' spawned
children (when there is more than one child) than it
is for 'n' parents.  Hopefully that made sense ;)

This commit was SVN r13417.
Этот коммит содержится в:
Brian Barrett 2007-02-01 17:18:35 +00:00
родитель c754523a14
Коммит 58b325b03f
2 изменённых файлов: 16 добавлений и 9 удалений

Просмотреть файл

@ -163,6 +163,7 @@ int mca_btl_sm_add_procs_same_base_addr(
ptrdiff_t diff;
volatile char **tmp_ptr;
volatile int *tmp_int_ptr;
bool have_connected_peer = false;
/* initializion */
for( i = 0 ; i < nprocs ; i++ ) {
@ -201,20 +202,24 @@ int mca_btl_sm_add_procs_same_base_addr(
#endif
struct mca_btl_base_endpoint_t *peer;
/* check to see if this is me */
if( my_proc == procs[proc] ) {
mca_btl_sm_component.my_smp_rank = n_local_procs;
}
/* check to see if this proc can be reached via shmem (i.e.,
if they're on my local host and in my job) */
else if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
0 == (procs[proc]->proc_flags & OMPI_PROC_FLAG_LOCAL)) {
continue;
}
/* If we got here, the proc is reachable via sm. So
initialize the peers information */
/* check to see if this is me */
if( my_proc == procs[proc] ) {
mca_btl_sm_component.my_smp_rank = n_local_procs;
} else {
/* we have someone to talk to */
have_connected_peer = true;
}
peer = peers[proc] = (struct mca_btl_base_endpoint_t*)malloc(sizeof(struct mca_btl_base_endpoint_t));
if( NULL == peer ){
return_code=OMPI_ERR_OUT_OF_RESOURCE;
@ -236,8 +241,8 @@ int mca_btl_sm_add_procs_same_base_addr(
mca_btl_sm_component.sm_proc_connect[proc]=SM_CONNECTED;
}
/* There is always at least a local proc (myself). */
if( n_local_procs == 1) {
/* jump out if there's not someone we can talk to */
if (!have_connected_peer) {
return_code = OMPI_SUCCESS;
goto CLEANUP;
}

Просмотреть файл

@ -136,7 +136,9 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
mca_base_param_lookup_int(min_size_param, &min_size);
mca_base_param_lookup_int(peer_size_param, &peer_size);
procs = ompi_proc_all(&num_all_procs);
/* README: this needs to change if procs in different jobs (even
spawned ones) are to talk using shared memory */
procs = ompi_proc_world(&num_all_procs);
for (i = 0 ; i < num_all_procs ; ++i) {
if (procs[i]->proc_flags & OMPI_PROC_FLAG_LOCAL) {
num_local_procs++;