1
1

Get the elan PTL up and running again. There is a bug in mca_base_module_close(),

triggered when some PTL is missing.  With my really slow connection,
it is really hard to debug right now.  I will post more on this later.

This commit was SVN r1828.
Этот коммит содержится в:
Weikuan Yu 2004-07-31 02:44:28 +00:00
родитель 76275092da
Коммит e1fae8ab7a
5 изменённых файлов: 67 добавлений и 22 удалений

Просмотреть файл

@ -72,8 +72,8 @@ struct mca_ptl_elan_module_1_0_0_t {
ompi_list_t elan_procs; /**< elan proc's */
ompi_list_t elan_send_frags;
ompi_list_t elan_pending_acks;
ompi_list_t elan_recv_frags;
ompi_list_t elan_pending_acks;
ompi_free_list_t elan_send_frags_free;
ompi_free_list_t elan_recv_frags_free;

Просмотреть файл

@ -25,6 +25,11 @@
#include "ptl_elan_frag.h"
#include "ptl_elan_priv.h"
#ifdef CHECK_ELAN
#undef CHECK_ELAN
#define CHECK_ELAN 0
#endif
extern ompi_proc_t *ompi_proc_local_proc;
mca_ptl_elan_module_1_0_0_t mca_ptl_elan_module = {
@ -121,6 +126,9 @@ mca_ptl_elan_module_open (void)
OBJ_CONSTRUCT (&elan_mp->elan_procs, ompi_list_t);
OBJ_CONSTRUCT (&elan_mp->elan_pending_acks, ompi_list_t);
OBJ_CONSTRUCT (&elan_mp->elan_recv_frags, ompi_list_t);
OBJ_CONSTRUCT (&elan_mp->elan_send_frags, ompi_list_t);
OBJ_CONSTRUCT (&elan_mp->elan_send_frags_free, ompi_free_list_t);
OBJ_CONSTRUCT (&elan_mp->elan_recv_frags_free, ompi_free_list_t);
/* initialize other objects */
@ -216,6 +224,12 @@ mca_ptl_elan_module_init (int *num_ptls,
*allow_multi_user_threads = true;
*have_hidden_threads = OMPI_HAVE_THREADS;
if (CHECK_ELAN) {
char hostname[32]; gethostname(hostname, 32);
fprintf(stderr, "[%s:%s:%d] before list init...\n",
hostname, __FUNCTION__, __LINE__);
}
ompi_free_list_init (&(elan_mp->elan_send_frags_free),
sizeof (mca_ptl_elan_send_frag_t),
OBJ_CLASS (mca_ptl_elan_recv_frag_t),
@ -223,6 +237,13 @@ mca_ptl_elan_module_init (int *num_ptls,
elan_mp->elan_free_list_max,
elan_mp->elan_free_list_inc, NULL);
if (CHECK_ELAN) {
char hostname[32]; gethostname(hostname, 32);
fprintf(stderr, "[%s:%s:%d] after list init...\n",
hostname, __FUNCTION__, __LINE__);
}
ompi_free_list_init (&(elan_mp->elan_recv_frags_free),
sizeof (mca_ptl_elan_recv_frag_t),
OBJ_CLASS (mca_ptl_elan_recv_frag_t),
@ -230,6 +251,12 @@ mca_ptl_elan_module_init (int *num_ptls,
elan_mp->elan_free_list_max,
elan_mp->elan_free_list_inc, NULL);
if (CHECK_ELAN) {
char hostname[32]; gethostname(hostname, 32);
fprintf(stderr, "[%s:%s:%d] after list init...\n",
hostname, __FUNCTION__, __LINE__);
}
/* open basic elan device */
if (OMPI_SUCCESS != ompi_mca_ptl_elan_init(&mca_ptl_elan_module)) {
ompi_output(0,
@ -285,6 +312,16 @@ int
mca_ptl_elan_module_progress (mca_ptl_tstamp_t tstamp)
{
START_FUNC();
/*if (times <= -1000)*/
if (times <= -1)
{
char hostname[32]; gethostname(hostname, 32);
fprintf(stderr, "[%s:%s:%d] debugging ...\n",
hostname, __FUNCTION__, __LINE__);
exit(1);
} else {
times ++;
}
mca_ptl_elan_drain_recv(elan_mp);
mca_ptl_elan_update_send(elan_mp);
END_FUNC();

Просмотреть файл

@ -146,19 +146,6 @@ mca_ptl_elan_start_desc (mca_ptl_elan_send_frag_t * desc,
START_FUNC();
/* fragment state */
#if 0
sendfrag->frag_owner = &ptl_peer->peer_ptl->super;
sendfrag->frag_send.frag_request = sendreq;
sendfrag->frag_send.frag_base.frag_addr = sendfrag->frag_vec[1].iov_base;
sendfrag->frag_send.frag_base.frag_size = size_out;
sendfrag->frag_peer = ptl_peer;
/* XXX: Fragment state, is this going to be set anywhere in PML */
sendfrag->frag_progressed = 0;
#endif
if (desc->desc->desc_type == MCA_PTL_ELAN_DESC_QDMA) {
struct ompi_ptl_elan_qdma_desc_t *qdma;
@ -182,6 +169,15 @@ mca_ptl_elan_start_desc (mca_ptl_elan_send_frag_t * desc,
return OMPI_ERROR;
}
/*mca_ptl_base_frag_t frag_base; */
/* fragment state */
desc->frag_base.frag_owner = &ptl_peer->peer_ptl->super;
desc->frag_base.frag_peer = ptl_peer;
desc->frag_base.frag_addr = NULL;
desc->frag_base.frag_size = *size;
desc->frag_progressed = 0;
END_FUNC();
return OMPI_SUCCESS;
}
@ -293,10 +289,10 @@ mca_ptl_elan_drain_recv (mca_ptl_elan_module_1_0_0_t * emp)
OMPI_LOCK (&queue->rx_lock);
#if 1
#if 0
rc = (*(int *) (&rxq->qr_doneWord));
#else
rc = elan4_pollevent_word (ctx, &rxq->qr_doneWord, 1);
rc = elan4_pollevent_word (ctx, &rxq->qr_doneWord, 2000);
#endif
if (rc) {
@ -310,7 +306,7 @@ mca_ptl_elan_drain_recv (mca_ptl_elan_module_1_0_0_t * emp)
gethostname(hostname, 32);
fprintf(stderr,
"[%s recv...] type %x flag %x size %x\n",
"[%s recv...] type %d flag %d size %d\n",
hostname,
header->hdr_common.hdr_type,
header->hdr_common.hdr_flags,
@ -401,11 +397,11 @@ mca_ptl_elan_update_send (mca_ptl_elan_module_1_0_0_t * emp)
while (ompi_list_get_size (&queue->tx_desc) > 0) {
desc = (mca_ptl_elan_send_frag_t *)
ompi_list_get_first (&queue->tx_desc);
#if 1
#if 0
rc = * ((int *) (&desc->desc->main_doneWord));
#else
/* Poll the completion event for 1usec */
rc = elan4_pollevent_word(ctx, &desc->desc->main_doneWord, 1);
rc = elan4_pollevent_word(ctx, &desc->desc->main_doneWord, 2000);
#endif
if (rc) {
mca_ptl_base_header_t *header;
@ -417,6 +413,18 @@ mca_ptl_elan_update_send (mca_ptl_elan_module_1_0_0_t * emp)
header = (mca_ptl_base_header_t *)&
((ompi_ptl_elan_qdma_desc_t *)desc->desc)->buff[0];
if (CHECK_ELAN) {
char hostname[32];
gethostname(hostname, 32);
fprintf(stderr,
"[%s comp sending...] type %d flag %d size %d\n",
hostname,
header->hdr_common.hdr_type,
header->hdr_common.hdr_flags,
header->hdr_common.hdr_size);
}
if(NULL == req) { /* An ack descriptor */
OMPI_FREE_LIST_RETURN (&queue->tx_desc_free,
(ompi_list_item_t *) desc);

Просмотреть файл

@ -48,9 +48,9 @@
} \
} while (0)
#define CHECK_ELAN 0
#define CHECK_ELAN 1
#if CHECK_ELAN
#if CHECK_ELAN && 0
#define START_FUNC() \
do { \
char hostname[32]; gethostname(hostname, 32); \

Просмотреть файл

@ -14,7 +14,7 @@ int main (int argc, char ** argv)
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &proc);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Barrier(MPI_COMM_WORLD);
/*MPI_Barrier(MPI_COMM_WORLD);*/
fprintf(stdout, "[%s:%s:%d] done with init \n",
hostname, __FUNCTION__, __LINE__);
fflush(stdout);