1
1
This commit was SVN r9998.
Этот коммит содержится в:
Galen Shipman 2006-05-20 02:39:05 +00:00
родитель faf63c68f8
Коммит 9165882c07
7 изменённых файлов: 131 добавлений и 87 удалений

Просмотреть файл

@ -140,6 +140,35 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_insert(mca_bml_base_btl
return &array->bml_btls[array->arr_size++];
}
/**
 * Remove a btl from a bml_btl array.
 *
 * Searches the first arr_size entries of the array for one whose btl
 * pointer equals the given module. On the first match, every later
 * entry is moved down one slot so the matching entry is overwritten,
 * arr_size is decremented and arr_index is reset to 0. Only the first
 * matching entry is removed.
 *
 * @param array (IN/OUT)  Array to remove the entry from.
 * @param btl (IN)        BTL module whose entry should be removed.
 * @return                true if an entry was removed, false if no entry
 *                        references btl (the array is left untouched).
 */
static inline bool mca_bml_base_btl_array_remove(mca_bml_base_btl_array_t* array,
                                                 struct mca_btl_base_module_t* btl)
{
    size_t i;

    /* find the btl */
    for (i = 0; i < array->arr_size; i++) {
        if (array->bml_btls[i].btl != btl) {
            continue;
        }

        /* move all following btl's back by 1, so the found btl is
           "removed"; stop one short of the end so that
           bml_btls[arr_size] is never read (it may lie outside the
           storage backing the array) */
        for (; i + 1 < array->arr_size; i++) {
            array->bml_btls[i] = array->bml_btls[i + 1];
        }

        array->arr_size--;
        /* NOTE(review): arr_index is presumably a scheduling cursor into
           this array; it is reset because the entries it referred to
           have shifted -- confirm against the other array helpers */
        array->arr_index = 0;
        return true;
    }

    return false;
}
/**
* Return an array item at the specified index.
*

Просмотреть файл

@ -492,10 +492,12 @@ int mca_bml_r2_finalize( void ) {
int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
{
ompi_proc_t** procs;
size_t i, m, p, num_procs;
size_t i, j, m, p, num_procs;
opal_list_item_t* item;
mca_btl_base_module_t** modules;
mca_btl_base_component_progress_fn_t * btl_progress_new;
procs = ompi_proc_all(&num_procs);
if(NULL == procs)
return OMPI_SUCCESS;
@ -529,6 +531,30 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
mca_bml_r2.btl_modules = modules;
mca_bml_r2.num_btl_modules = m;
/* remove progress function so btl_progress isn't
called on the failed BTL */
if(mca_bml_r2.num_btl_progress <= 1) {
/* nothing left to send on! */
opal_output(0, "%s:%d:%s: only one BTL, can't fail-over!",
__FILE__, __LINE__, __func__);
return OMPI_ERROR;
}
/* figure out which progress functions to keep */
btl_progress_new = (mca_btl_base_component_progress_fn_t*)
malloc(sizeof(mca_btl_base_component_progress_fn_t) *
(mca_bml_r2.num_btl_progress - 1));
j = 0;
for(i = 0; i < mca_bml_r2.num_btl_progress; i++) {
if(btl->btl_component->btl_progress != mca_bml_r2.btl_progress[i]) {
btl_progress_new[j] = mca_bml_r2.btl_progress[i];
j++;
}
}
free(mca_bml_r2.btl_progress);
mca_bml_r2.btl_progress = btl_progress_new;
mca_bml_r2.num_btl_progress--;
/* cleanup */
btl->btl_finalize(btl);
free(procs);
@ -537,90 +563,69 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
{
mca_bml_base_endpoint_t* ep_old = (mca_bml_base_endpoint_t*)proc->proc_pml;
mca_bml_base_endpoint_t* ep_new = (mca_bml_base_endpoint_t*)opal_obj_new(mca_bml_r2.endpoint_class);
mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_pml;
mca_btl_base_component_progress_fn_t * btl_progress_new;
int i, j;
double total_bandwidth = 0;
size_t b;
/* initialize */
ep_new->super.proc_ompi = proc;
ep_new->btl_max_send_size = -1;
ep_new->btl_rdma_size = -1;
ep_new->btl_rdma_align = 0;
ep_new->btl_rdma_offset = 0;
/* build new eager list */
mca_bml_base_btl_array_reserve(&ep_new->btl_eager, mca_bml_base_btl_array_get_size(&ep_old->btl_eager));
for(b=0; b< mca_bml_base_btl_array_get_size(&ep_old->btl_eager); b++) {
mca_bml_base_btl_t* bml_btl_old = mca_bml_base_btl_array_get_index(&ep_old->btl_eager, b);
if(bml_btl_old->btl != btl) {
mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&ep_new->btl_eager);
*bml_btl_new = *bml_btl_old;
/* remove btl from eager list */
mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
/* remove btl from send list */
if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {
/* compute total_bandwidth and
reset max_send_size to the min of all btl's */
total_bandwidth = 0;
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
total_bandwidth += bml_btl->btl->btl_bandwidth;
if (bml_btl->btl_max_send_size < ep->btl_max_send_size) {
ep->btl_max_send_size = bml_btl->btl->btl_max_send_size;
}
}
}
/* build new send list */
total_bandwidth = 0;
mca_bml_base_btl_array_reserve(&ep_new->btl_send, mca_bml_base_btl_array_get_size(&ep_old->btl_send));
for(b=0; b< mca_bml_base_btl_array_get_size(&ep_old->btl_send); b++) {
mca_bml_base_btl_t* bml_btl_old = mca_bml_base_btl_array_get_index(&ep_old->btl_send, b);
if(bml_btl_old->btl != btl) {
mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&ep_new->btl_send);
*bml_btl_new = *bml_btl_old;
total_bandwidth += bml_btl_new->btl->btl_bandwidth;
if (bml_btl_new->btl_max_send_size < ep_new->btl_max_send_size) {
ep_new->btl_max_send_size = bml_btl_new->btl->btl_max_send_size;
/* compute weighting factor for this btl */
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
if(bml_btl->btl->btl_bandwidth > 0) {
bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
} else {
bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send);
}
}
}
/* compute weighting factor for this btl */
for(b=0; b< mca_bml_base_btl_array_get_size(&ep_new->btl_send); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep_old->btl_send, b);
if(bml_btl->btl->btl_bandwidth > 0) {
bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
} else {
bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep_new->btl_send);
}
}
/* build new rdma list */
total_bandwidth = 0;
mca_bml_base_btl_array_reserve(&ep_new->btl_rdma, mca_bml_base_btl_array_get_size(&ep_old->btl_rdma));
for(b=0; b< mca_bml_base_btl_array_get_size(&ep_old->btl_rdma); b++) {
mca_bml_base_btl_t* bml_btl_old = mca_bml_base_btl_array_get_index(&ep_old->btl_rdma, b);
if(bml_btl_old->btl != btl) {
mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&ep_new->btl_rdma);
*bml_btl_new = *bml_btl_old;
/* remove btl from RDMA list */
if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {
/* compute total bandwidth */
total_bandwidth = 0;
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
/* update aggregate endpoint info */
total_bandwidth += bml_btl_new->btl->btl_bandwidth;
if (ep_new->btl_rdma_offset < bml_btl_new->btl_min_rdma_size) {
ep_new->btl_rdma_offset = bml_btl_new->btl_min_rdma_size;
total_bandwidth += bml_btl->btl->btl_bandwidth;
if (ep->btl_rdma_offset < bml_btl->btl_min_rdma_size) {
ep->btl_rdma_offset = bml_btl->btl_min_rdma_size;
}
if (ep->btl_rdma_size > bml_btl->btl_max_rdma_size) {
ep->btl_rdma_size = bml_btl->btl_max_rdma_size;
ep->btl_rdma_align = bml_base_log2(ep->btl_rdma_size);
}
if (ep_new->btl_rdma_size > bml_btl_new->btl_max_rdma_size) {
ep_new->btl_rdma_size = bml_btl_new->btl_max_rdma_size;
ep_new->btl_rdma_align = bml_base_log2(ep_new->btl_rdma_size);
}
/* compute weighting factor for this btl */
for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
if(bml_btl->btl->btl_bandwidth > 0) {
bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
} else {
bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma);
}
}
}
/* compute weighting factor for this btl */
for(b=0; b< mca_bml_base_btl_array_get_size(&ep_new->btl_rdma); b++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep_old->btl_rdma, b);
if(bml_btl->btl->btl_bandwidth > 0) {
bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
} else {
bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep_new->btl_rdma);
}
}
/* copy over any additional state ... */
if(NULL != ep_new->copy) {
ep_new->copy(ep_new, ep_old);
}
/* save on proc */
proc->proc_pml = (mca_pml_proc_t*)ep_new;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -154,15 +154,16 @@ int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
mca_pml_dr.free_list_max,
mca_pml_dr.free_list_inc,
NULL);
for(i = 0; i < nprocs; i++) {
for(i = 0; i < nprocs; i++) {
int idx;
/* this won't work for comm spawn and other dynamic
/* this won't work for comm spawn and other dynamic
processes, but will work for initial job start */
idx = ompi_pointer_array_add(&mca_pml_dr.procs,
idx = ompi_pointer_array_add(&mca_pml_dr.endpoints,
(void*) endpoints[i]);
if(orte_ns.compare(ORTE_NS_CMP_ALL,
orte_process_info.my_name,
&endpoints[i]->base.super.proc_ompi->proc_name) == 0) {
if(orte_ns.compare(ORTE_NS_CMP_ALL,
orte_process_info.my_name,
&endpoints[i]->base.super.proc_ompi->proc_name) == 0) {
mca_pml_dr.my_rank = idx;
}
endpoints[i]->local = endpoints[i]->dst = idx;

Просмотреть файл

@ -73,8 +73,8 @@ struct mca_pml_dr_t {
ompi_free_list_t vfrags;
ompi_free_list_t buffers;
/* proc pointer array */
ompi_pointer_array_t procs;
/* endpoint pointer array */
ompi_pointer_array_t endpoints;
/* my 'global' rank */
int32_t my_rank;

Просмотреть файл

@ -159,7 +159,7 @@ int mca_pml_dr_component_open(void)
OBJ_CONSTRUCT(&mca_pml_dr.send_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_dr.acks_pending, opal_list_t);
OBJ_CONSTRUCT(&mca_pml_dr.buffers, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_pml_dr.procs, ompi_pointer_array_t);
OBJ_CONSTRUCT(&mca_pml_dr.endpoints, ompi_pointer_array_t);
OBJ_CONSTRUCT(&mca_pml_dr.lock, opal_mutex_t);
mca_pml_dr.enabled = false;

Просмотреть файл

@ -19,6 +19,7 @@
#include "ompi_config.h"
#include "pml_dr.h"
#include "pml_dr_endpoint.h"
#include "orte/mca/ns/ns.h"
static void mca_pml_dr_endpoint_copy(mca_pml_dr_endpoint_t* dst, mca_pml_dr_endpoint_t* src)
@ -30,11 +31,19 @@ static void mca_pml_dr_endpoint_copy(mca_pml_dr_endpoint_t* dst, mca_pml_dr_endp
ompi_seq_tracker_copy(&dst->seq_recvs, &src->seq_recvs);
ompi_seq_tracker_copy(&dst->seq_recvs_matched, &src->seq_recvs_matched);
dst->vfrag_seq = src->vfrag_seq;
/* this won't work for comm spawn and other dynamic
processes, but will work for initial job start */
/* dst->local = dst->dst = ompi_pointer_array_add(&mca_pml_dr.endpoints, */
/* (void*) dst); */
}
static void mca_pml_dr_endpoint_construct(mca_pml_dr_endpoint_t* ep)
{
int idx;
OBJ_CONSTRUCT(&ep->seq_sends, ompi_seq_tracker_t);
OBJ_CONSTRUCT(&ep->seq_recvs, ompi_seq_tracker_t);
OBJ_CONSTRUCT(&ep->seq_recvs_matched, ompi_seq_tracker_t);

Просмотреть файл

@ -51,7 +51,7 @@ do {
return; \
} \
} \
ep = ompi_pointer_array_get_item(&mca_pml_dr.procs, hdr->hdr_common.hdr_src); \
ep = ompi_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src); \
assert(ep != NULL); \
if(ompi_seq_tracker_check_duplicate(&ep->seq_sends, hdr->hdr_common.hdr_vid)) { \
OPAL_OUTPUT((0, "%s:%d: dropping duplicate ack", __FILE__, __LINE__)); \
@ -127,7 +127,7 @@ void mca_pml_dr_recv_frag_callback(
__FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst));
return;
}
ep = ompi_pointer_array_get_item(&mca_pml_dr.procs, hdr->hdr_common.hdr_src);
ep = ompi_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src);
assert(ep != NULL);
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) {
mca_pml_dr_recv_frag_ack(&ep->base,
@ -173,7 +173,7 @@ void mca_pml_dr_recv_frag_callback(
__FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst));
return;
}
ep = ompi_pointer_array_get_item(&mca_pml_dr.procs, hdr->hdr_common.hdr_src);
ep = ompi_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src);
assert(ep != NULL);
/* seq_recvs protected by matching lock */
@ -256,7 +256,7 @@ void mca_pml_dr_recv_frag_callback(
__FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst));
return;
}
ep = ompi_pointer_array_get_item(&mca_pml_dr.procs, hdr->hdr_common.hdr_src);
ep = ompi_pointer_array_get_item(&mca_pml_dr.endpoints, hdr->hdr_common.hdr_src);
assert(ep != NULL);
/* seq_recvs protected by matching lock */