Get the elan PTL up and running again. There is a bug in mca_base_module_close(),
triggered when some PTL is missing. With my really slow connection, it is really hard to debug right now. I will update more on this. This commit was SVN r1828.
Этот коммит содержится в:
родитель
76275092da
Коммит
e1fae8ab7a
@ -72,8 +72,8 @@ struct mca_ptl_elan_module_1_0_0_t {
|
||||
|
||||
ompi_list_t elan_procs; /**< elan proc's */
|
||||
ompi_list_t elan_send_frags;
|
||||
ompi_list_t elan_pending_acks;
|
||||
ompi_list_t elan_recv_frags;
|
||||
ompi_list_t elan_pending_acks;
|
||||
|
||||
ompi_free_list_t elan_send_frags_free;
|
||||
ompi_free_list_t elan_recv_frags_free;
|
||||
|
@ -25,6 +25,11 @@
|
||||
#include "ptl_elan_frag.h"
|
||||
#include "ptl_elan_priv.h"
|
||||
|
||||
#ifdef CHECK_ELAN
|
||||
#undef CHECK_ELAN
|
||||
#define CHECK_ELAN 0
|
||||
#endif
|
||||
|
||||
extern ompi_proc_t *ompi_proc_local_proc;
|
||||
|
||||
mca_ptl_elan_module_1_0_0_t mca_ptl_elan_module = {
|
||||
@ -121,6 +126,9 @@ mca_ptl_elan_module_open (void)
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_procs, ompi_list_t);
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_pending_acks, ompi_list_t);
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_recv_frags, ompi_list_t);
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_send_frags, ompi_list_t);
|
||||
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_send_frags_free, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT (&elan_mp->elan_recv_frags_free, ompi_free_list_t);
|
||||
|
||||
/* initialize other objects */
|
||||
@ -216,6 +224,12 @@ mca_ptl_elan_module_init (int *num_ptls,
|
||||
*allow_multi_user_threads = true;
|
||||
*have_hidden_threads = OMPI_HAVE_THREADS;
|
||||
|
||||
if (CHECK_ELAN) {
|
||||
char hostname[32]; gethostname(hostname, 32);
|
||||
fprintf(stderr, "[%s:%s:%d] before list init...\n",
|
||||
hostname, __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
ompi_free_list_init (&(elan_mp->elan_send_frags_free),
|
||||
sizeof (mca_ptl_elan_send_frag_t),
|
||||
OBJ_CLASS (mca_ptl_elan_recv_frag_t),
|
||||
@ -223,6 +237,13 @@ mca_ptl_elan_module_init (int *num_ptls,
|
||||
elan_mp->elan_free_list_max,
|
||||
elan_mp->elan_free_list_inc, NULL);
|
||||
|
||||
if (CHECK_ELAN) {
|
||||
char hostname[32]; gethostname(hostname, 32);
|
||||
fprintf(stderr, "[%s:%s:%d] after list init...\n",
|
||||
hostname, __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
ompi_free_list_init (&(elan_mp->elan_recv_frags_free),
|
||||
sizeof (mca_ptl_elan_recv_frag_t),
|
||||
OBJ_CLASS (mca_ptl_elan_recv_frag_t),
|
||||
@ -230,6 +251,12 @@ mca_ptl_elan_module_init (int *num_ptls,
|
||||
elan_mp->elan_free_list_max,
|
||||
elan_mp->elan_free_list_inc, NULL);
|
||||
|
||||
if (CHECK_ELAN) {
|
||||
char hostname[32]; gethostname(hostname, 32);
|
||||
fprintf(stderr, "[%s:%s:%d] after list init...\n",
|
||||
hostname, __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
/* open basic elan device */
|
||||
if (OMPI_SUCCESS != ompi_mca_ptl_elan_init(&mca_ptl_elan_module)) {
|
||||
ompi_output(0,
|
||||
@ -285,6 +312,16 @@ int
|
||||
mca_ptl_elan_module_progress (mca_ptl_tstamp_t tstamp)
|
||||
{
|
||||
START_FUNC();
|
||||
/*if (times <= -1000)*/
|
||||
if (times <= -1)
|
||||
{
|
||||
char hostname[32]; gethostname(hostname, 32);
|
||||
fprintf(stderr, "[%s:%s:%d] debugging ...\n",
|
||||
hostname, __FUNCTION__, __LINE__);
|
||||
exit(1);
|
||||
} else {
|
||||
times ++;
|
||||
}
|
||||
mca_ptl_elan_drain_recv(elan_mp);
|
||||
mca_ptl_elan_update_send(elan_mp);
|
||||
END_FUNC();
|
||||
|
@ -146,19 +146,6 @@ mca_ptl_elan_start_desc (mca_ptl_elan_send_frag_t * desc,
|
||||
|
||||
START_FUNC();
|
||||
|
||||
/* fragment state */
|
||||
#if 0
|
||||
sendfrag->frag_owner = &ptl_peer->peer_ptl->super;
|
||||
sendfrag->frag_send.frag_request = sendreq;
|
||||
sendfrag->frag_send.frag_base.frag_addr = sendfrag->frag_vec[1].iov_base;
|
||||
sendfrag->frag_send.frag_base.frag_size = size_out;
|
||||
sendfrag->frag_peer = ptl_peer;
|
||||
|
||||
/* XXX: Fragment state, is this going to be set anywhere in PML */
|
||||
sendfrag->frag_progressed = 0;
|
||||
#endif
|
||||
|
||||
|
||||
if (desc->desc->desc_type == MCA_PTL_ELAN_DESC_QDMA) {
|
||||
struct ompi_ptl_elan_qdma_desc_t *qdma;
|
||||
|
||||
@ -182,6 +169,15 @@ mca_ptl_elan_start_desc (mca_ptl_elan_send_frag_t * desc,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/*mca_ptl_base_frag_t frag_base; */
|
||||
|
||||
/* fragment state */
|
||||
desc->frag_base.frag_owner = &ptl_peer->peer_ptl->super;
|
||||
desc->frag_base.frag_peer = ptl_peer;
|
||||
desc->frag_base.frag_addr = NULL;
|
||||
desc->frag_base.frag_size = *size;
|
||||
desc->frag_progressed = 0;
|
||||
|
||||
END_FUNC();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -293,10 +289,10 @@ mca_ptl_elan_drain_recv (mca_ptl_elan_module_1_0_0_t * emp)
|
||||
|
||||
OMPI_LOCK (&queue->rx_lock);
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
rc = (*(int *) (&rxq->qr_doneWord));
|
||||
#else
|
||||
rc = elan4_pollevent_word (ctx, &rxq->qr_doneWord, 1);
|
||||
rc = elan4_pollevent_word (ctx, &rxq->qr_doneWord, 2000);
|
||||
#endif
|
||||
|
||||
if (rc) {
|
||||
@ -310,7 +306,7 @@ mca_ptl_elan_drain_recv (mca_ptl_elan_module_1_0_0_t * emp)
|
||||
gethostname(hostname, 32);
|
||||
|
||||
fprintf(stderr,
|
||||
"[%s recv...] type %x flag %x size %x\n",
|
||||
"[%s recv...] type %d flag %d size %d\n",
|
||||
hostname,
|
||||
header->hdr_common.hdr_type,
|
||||
header->hdr_common.hdr_flags,
|
||||
@ -401,11 +397,11 @@ mca_ptl_elan_update_send (mca_ptl_elan_module_1_0_0_t * emp)
|
||||
while (ompi_list_get_size (&queue->tx_desc) > 0) {
|
||||
desc = (mca_ptl_elan_send_frag_t *)
|
||||
ompi_list_get_first (&queue->tx_desc);
|
||||
#if 1
|
||||
#if 0
|
||||
rc = * ((int *) (&desc->desc->main_doneWord));
|
||||
#else
|
||||
/* Poll the completion event for 1usec */
|
||||
rc = elan4_pollevent_word(ctx, &desc->desc->main_doneWord, 1);
|
||||
rc = elan4_pollevent_word(ctx, &desc->desc->main_doneWord, 2000);
|
||||
#endif
|
||||
if (rc) {
|
||||
mca_ptl_base_header_t *header;
|
||||
@ -417,6 +413,18 @@ mca_ptl_elan_update_send (mca_ptl_elan_module_1_0_0_t * emp)
|
||||
header = (mca_ptl_base_header_t *)&
|
||||
((ompi_ptl_elan_qdma_desc_t *)desc->desc)->buff[0];
|
||||
|
||||
if (CHECK_ELAN) {
|
||||
char hostname[32];
|
||||
gethostname(hostname, 32);
|
||||
|
||||
fprintf(stderr,
|
||||
"[%s comp sending...] type %d flag %d size %d\n",
|
||||
hostname,
|
||||
header->hdr_common.hdr_type,
|
||||
header->hdr_common.hdr_flags,
|
||||
header->hdr_common.hdr_size);
|
||||
}
|
||||
|
||||
if(NULL == req) { /* An ack descriptor */
|
||||
OMPI_FREE_LIST_RETURN (&queue->tx_desc_free,
|
||||
(ompi_list_item_t *) desc);
|
||||
|
@ -48,9 +48,9 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CHECK_ELAN 0
|
||||
#define CHECK_ELAN 1
|
||||
|
||||
#if CHECK_ELAN
|
||||
#if CHECK_ELAN && 0
|
||||
#define START_FUNC() \
|
||||
do { \
|
||||
char hostname[32]; gethostname(hostname, 32); \
|
||||
|
@ -14,7 +14,7 @@ int main (int argc, char ** argv)
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &proc);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
/*MPI_Barrier(MPI_COMM_WORLD);*/
|
||||
fprintf(stdout, "[%s:%s:%d] done with init \n",
|
||||
hostname, __FUNCTION__, __LINE__);
|
||||
fflush(stdout);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user