Two changes to improve the sm situation with spawn:
* Have the mpool size be based on MCW (MPI_COMM_WORLD), not on the number of procs in other jobs we know about. This solves the problem of the spawned job having a much-bigger-than-needed sm file.
* We can't assume that "me" is in the list of procs passed to add_procs, so we need to use slightly different logic and not go through all of add_procs unless there's a proc in my job that isn't me.

This seems to greatly improve the situation, although there still seems to be more of a slowdown through MPI_INIT for the children (if there is more than one child) than through MPI_INIT for the parent, when comparing 'n' children against 'n' parents. Hopefully that made sense ;)

This commit was SVN r13417.
Этот коммит содержится в:
родитель
c754523a14
Коммит
58b325b03f
@ -163,6 +163,7 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
ptrdiff_t diff;
|
||||
volatile char **tmp_ptr;
|
||||
volatile int *tmp_int_ptr;
|
||||
bool have_connected_peer = false;
|
||||
|
||||
/* initialization */
|
||||
for( i = 0 ; i < nprocs ; i++ ) {
|
||||
@ -201,20 +202,24 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
#endif
|
||||
struct mca_btl_base_endpoint_t *peer;
|
||||
|
||||
/* check to see if this is me */
|
||||
if( my_proc == procs[proc] ) {
|
||||
mca_btl_sm_component.my_smp_rank = n_local_procs;
|
||||
}
|
||||
|
||||
/* check to see if this proc can be reached via shmem (i.e.,
|
||||
if they're on my local host and in my job) */
|
||||
else if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
|
||||
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
|
||||
0 == (procs[proc]->proc_flags & OMPI_PROC_FLAG_LOCAL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* If we got here, the proc is reachable via sm. So
|
||||
initialize the peers information */
|
||||
|
||||
/* check to see if this is me */
|
||||
if( my_proc == procs[proc] ) {
|
||||
mca_btl_sm_component.my_smp_rank = n_local_procs;
|
||||
} else {
|
||||
/* we have someone to talk to */
|
||||
have_connected_peer = true;
|
||||
}
|
||||
|
||||
peer = peers[proc] = (struct mca_btl_base_endpoint_t*)malloc(sizeof(struct mca_btl_base_endpoint_t));
|
||||
if( NULL == peer ){
|
||||
return_code=OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -236,8 +241,8 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
mca_btl_sm_component.sm_proc_connect[proc]=SM_CONNECTED;
|
||||
}
|
||||
|
||||
/* There is always at least a local proc (myself). */
|
||||
if( n_local_procs == 1) {
|
||||
/* jump out if there's not someone we can talk to */
|
||||
if (!have_connected_peer) {
|
||||
return_code = OMPI_SUCCESS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
@ -136,7 +136,9 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
|
||||
mca_base_param_lookup_int(min_size_param, &min_size);
|
||||
mca_base_param_lookup_int(peer_size_param, &peer_size);
|
||||
|
||||
procs = ompi_proc_all(&num_all_procs);
|
||||
/* README: this needs to change if procs in different jobs (even
|
||||
spawned ones) are to talk using shared memory */
|
||||
procs = ompi_proc_world(&num_all_procs);
|
||||
for (i = 0 ; i < num_all_procs ; ++i) {
|
||||
if (procs[i]->proc_flags & OMPI_PROC_FLAG_LOCAL) {
|
||||
num_local_procs++;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user