If the RTE fails to deliver the daemon information,
gracefully fall back to a non-reordered communicator. Optimize the loops that build the process hierarchy.
Этот коммит содержится в:
родитель
23886754f0
Коммит
8d0baf140f
@ -44,9 +44,9 @@
|
|||||||
while(0);
|
while(0);
|
||||||
|
|
||||||
#define FALLBACK() \
|
#define FALLBACK() \
|
||||||
do { free(nodes_roots); \
|
do { free(nodes_roots); \
|
||||||
free(local_procs); \
|
free(local_procs); \
|
||||||
hwloc_bitmap_free(set); \
|
if( NULL != set) hwloc_bitmap_free(set); \
|
||||||
goto fallback; } \
|
goto fallback; } \
|
||||||
while(0);
|
while(0);
|
||||||
|
|
||||||
@ -181,19 +181,16 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
|
|||||||
num_procs_in_node++;
|
num_procs_in_node++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Get the ranks of the local procs in comm_old */
|
vpids = (int *)malloc(size * sizeof(int));
|
||||||
|
colors = (int *)malloc(size * sizeof(int));
|
||||||
local_procs = (int *)malloc(num_procs_in_node * sizeof(int));
|
local_procs = (int *)malloc(num_procs_in_node * sizeof(int));
|
||||||
for(i = idx = 0 ; i < size ; i++){
|
for(i = idx = 0 ; i < size ; i++){
|
||||||
proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
|
proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
|
||||||
if (( i == rank ) ||
|
if (( i == rank ) ||
|
||||||
(OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)))
|
(OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags))) {
|
||||||
local_procs[idx++] = i;
|
local_procs[idx++] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
vpids = (int *)malloc(size * sizeof(int));
|
|
||||||
colors = (int *)malloc(size * sizeof(int));
|
|
||||||
for(i = 0; i < size ; i++) {
|
|
||||||
proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
|
|
||||||
pval = &val;
|
pval = &val;
|
||||||
OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
|
OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
|
||||||
if( OPAL_SUCCESS != err ) {
|
if( OPAL_SUCCESS != err ) {
|
||||||
@ -220,22 +217,30 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
/* clean-up dupes in the array */
|
/* clean-up dupes in the array */
|
||||||
for(i = 0; i < size ; i++)
|
for(i = 0; i < size; i++) {
|
||||||
if ( -1 == vpids[i] )
|
if( -1 == vpids[i] )
|
||||||
continue;
|
continue;
|
||||||
else
|
|
||||||
for(j = i+1 ; j < size ; j++)
|
num_nodes++; /* update the number of nodes */
|
||||||
if( vpids[j] != -1 )
|
|
||||||
if( vpids[i] == vpids[j] )
|
for(j = i+1; j < size; j++)
|
||||||
vpids[j] = -1;
|
if( vpids[j] != -1 )
|
||||||
/* compute number of nodes */
|
if( vpids[i] == vpids[j] )
|
||||||
for(i = 0; i < size ; i++)
|
vpids[j] = -1;
|
||||||
if( vpids[i] != -1 )
|
}
|
||||||
num_nodes++;
|
if( 0 == num_nodes ) {
|
||||||
|
/* No useful info has been retrieved from the runtime. Fallback
|
||||||
|
* and create a duplicate of the original communicator */
|
||||||
|
free(vpids);
|
||||||
|
free(colors);
|
||||||
|
free(local_procs);
|
||||||
|
err = OMPI_SUCCESS; /* return with success */
|
||||||
|
goto fallback;
|
||||||
|
}
|
||||||
/* compute local roots ranks in comm_old */
|
/* compute local roots ranks in comm_old */
|
||||||
/* Only the global root needs to do this */
|
/* Only the global root needs to do this */
|
||||||
if(0 == rank) {
|
if(0 == rank) {
|
||||||
nodes_roots = (int *)calloc(num_nodes,sizeof(int));
|
nodes_roots = (int *)calloc(num_nodes, sizeof(int));
|
||||||
for(i = idx = 0; i < size ; i++)
|
for(i = idx = 0; i < size ; i++)
|
||||||
if( vpids[i] != -1 )
|
if( vpids[i] != -1 )
|
||||||
nodes_roots[idx++] = i;
|
nodes_roots[idx++] = i;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user