1
1
This commit was SVN r2268.
Этот коммит содержится в:
Weikuan Yu 2004-08-23 19:31:29 +00:00
родитель 066dc18fa2
Коммит 14ecc21913
3 изменённых файлов: 230 добавлений и 112 удалений

Просмотреть файл

@ -85,8 +85,46 @@ ompi_init_elan_queue_events (mca_ptl_elan_module_t * ptl,
/* Initialize some of the dma structures */
desc->main_dma.dma_dstAddr = 0;
#if OMPI_PTL_ELAN_COMP_QUEUE
/* Have all the source event fired to the Queue */
/* XXX: Chain a QDMA to each queue and
* Have all the srcEvent fired to the Queue */
/* Setup the chain dma */
desc->comp_dma.dma_typeSize = E4_DMA_TYPE_SIZE (
sizeof(mca_ptl_base_frag_header_t),
DMA_DataTypeByte, DMA_QueueWrite, 8);
desc->comp_dma.dma_cookie = elan4_local_cookie(ptl->queue->tx_cpoll,
E4_COOKIE_TYPE_LOCAL_DMA, ptl->elan_vp);
desc->comp_dma.dma_vproc = ptl->elan_vp;
desc->comp_dma.dma_srcAddr = elan4_main2elan (ctx, (void *) hdr);
desc->comp_dma.dma_dstAddr = 0x0ULL;
/* XXX: If completion is to be detected from the Queue
* there is no need to trigger a local event */
desc->comp_dma.dma_dstEvent = elan4_main2elan (ctx,
(void *) ptl->comp->input);
/*desc->main_dma.dma_srcEvent = SDRAM2ELAN (ctx, desc->elan_event);*/
desc->comp_dma.dma_srcEvent = 0x0ULL;
desc->comp_dma.dma_typeSize |= RUN_DMA_CMD;
desc->comp_dma.dma_pad = NOP_CMD;
memcpy ((void *)desc->comp_buff, (void *)&desc->comp_dma,
sizeof (E4_DMA64));
desc->comp_event->ev_CountAndType = E4_EVENT_INIT_VALUE(-32,
E4_EVENT_COPY, E4_EVENT_DTYPE_LONG, 8);
desc->comp_event->ev_Params[0] = elan4_main2elan (ctx,
(void *)desc->comp_buff);
/* XXX: The chain dma will go directly into a command stream
* so we need to add the command queue control bits.
* Allocate space from command queues hung off the CTX.
*/
desc->comp_event->ev_Params[1] = elan4_alloccq_space (ctx, 8, CQ_Size8K);
desc->main_dma.dma_srcEvent= elan4_main2elan(ctx,
(E4_Event *)desc->comp_event);
desc->main_dma.dma_dstEvent= SDRAM2ELAN (ctx, queue->input);
#else
desc->main_dma.dma_srcEvent = SDRAM2ELAN (ctx, desc->elan_event);
desc->main_dma.dma_dstEvent = SDRAM2ELAN (ctx, queue->input);
@ -139,7 +177,7 @@ mca_ptl_elan_putget_desc_contruct (
desc->main_dma.dma_dstAddr = dst_elan4_addr;
#if OMPI_PTL_ELAN_COMP_QUEUE
/* Have all the source event fired to the Queue */
/* Have all the source event fired to the Queue */
#else
if (local) {
desc->main_dma.dma_srcEvent = elan4_main2elan(ctx, elan_event);
@ -205,9 +243,10 @@ ompi_ptl_elan_init_putget_ctrl (mca_ptl_elan_module_t * ptl,
main_size = OMPI_PTL_ELAN_ALIGNUP(sizeof(ompi_ptl_elan_putget_desc_t),
main_align);
/* Contain elan_event, chain_event and a chain_buff */
/* FIXME: Get the correct number of elan_size,
* Contain elan_event, chain_event and a chain_buff */
elan_size = OMPI_PTL_ELAN_ALIGNUP(
(sizeof(E4_Event32)*2 + ELAN_BLOCK_SIZE), elan_align);
(ELAN_BLOCK_SIZE * 2 + sizeof(E4_Event32)*3 ), elan_align);
rail = (RAIL *) ptl->ptl_elan_rail;
ctx = (ELAN4_CTX *) ptl->ptl_elan_ctx;
@ -278,6 +317,91 @@ ompi_init_elan_qdma (mca_ptl_elan_component_t * emp,
START_FUNC(PTL_ELAN_DEBUG_INIT);
#if OMPI_PTL_ELAN_COMP_QUEUE || 1
/* Create a complete queue here, later use the queue above directly */
/* Init the Transmit Queue structure */
for (i = 0; i < num_rails; i++) {
ompi_ptl_elan_recv_queue_t *rxq;
ompi_ptl_elan_comp_queue_t *comp;
ptl = emp->modules[i];
rail = (RAIL *) ptl->ptl_elan_rail;
ctx = (ELAN4_CTX *) ptl->ptl_elan_ctx;
comp = ptl->comp = (ompi_ptl_elan_comp_queue_t *)
malloc (sizeof (ompi_ptl_elan_comp_queue_t));
OMPI_PTL_ELAN_CHECK_UNEX (comp, NULL, OMPI_ERROR, 0);
memset (comp, 0, sizeof (ompi_ptl_elan_comp_queue_t));
/* Allocate input queue */
comp->input = (E4_InputQueue *) elan4_allocElan (rail->r_alloc,
INPUT_QUEUE_ALIGN,
INPUT_QUEUE_SIZE);
OMPI_PTL_ELAN_CHECK_UNEX (comp->input, NULL, OMPI_ERROR, 0);
/* Init the Receive Queue structure */
comp->rx_nslots = nslots;
nslots += OMPI_PTL_ELAN_LOST_QSLOTS;
comp->rx_buffsize = (slotsize > INPUT_QUEUE_MAX) ?
INPUT_QUEUE_MAX : slotsize;
comp->rx_slotsize = ELAN_ALIGNUP (slotsize, OMPI_PTL_ELAN_SLOT_ALIGN);
rxq = comp->rxq = (ompi_ptl_elan_recv_queue_t *)
elan4_allocMain (rail->r_alloc, 64,
sizeof (ompi_ptl_elan_recv_queue_t));
OMPI_PTL_ELAN_CHECK_UNEX (rxq, NULL, OMPI_ERROR, 0);
memset (rxq, 0, sizeof (ompi_ptl_elan_recv_queue_t));
rxq->qr_rail = rail;
rxq->qr_fptr = elan4_allocMain (rail->r_alloc,
128, nslots * comp->rx_slotsize);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_fptr, NULL, OMPI_ERROR, 0);
memset (rxq->qr_fptr, 0xeb, nslots * comp->rx_slotsize);
rxq->qr_elanDone = ALLOC_ELAN (rail,
OMPI_PTL_ELAN_SLOT_ALIGN, sizeof (EVENT32));
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_elanDone, NULL, OMPI_ERROR, 0);
/* Set the top et al */
rxq->qr_efitem = (E4_uint64) elan4_main2elan (ctx, rxq->qr_fptr);
assert(rxq->qr_efitem != ELAN_BAD_ADDR);
rxq->qr_base = rxq->qr_fptr;
rxq->qr_top = (void *) ((uintptr_t) rxq->qr_base +
(comp->rx_slotsize * (nslots - OMPI_PTL_ELAN_LOST_QSLOTS)));
rxq->qr_efptr = rxq->qr_efitem;
rxq->qr_elitem = rxq->qr_efitem +
(comp->rx_slotsize * (nslots - OMPI_PTL_ELAN_LOST_QSLOTS));
/* Event to wait/block on, Bug here for the event */
rxq->qr_qEvent = rxq->qr_elanDone;
comp->input->q_event= SDRAM2ELAN (ctx, (void *) rxq->qr_elanDone);
comp->input->q_fptr = rxq->qr_efitem;
comp->input->q_bptr = rxq->qr_efitem;
comp->input->q_control =
E4_InputQueueControl (rxq->qr_efitem, rxq->qr_elitem,
comp->rx_slotsize);
/* The event */
INITEVENT_WORD (ctx, (EVENT *) rxq->qr_elanDone,
&rxq->qr_doneWord);
RESETEVENT_WORD (&rxq->qr_doneWord);
PRIMEEVENT_WORD (ctx, (EVENT *) rxq->qr_elanDone, 1);
rxq->qr_cmdq = elan4_alloc_cmdq (ctx, rail->r_alloc,
CQ_Size1K,
CQ_WriteEnableBit |
CQ_WaitEventEnableBit, NULL);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_cmdq, NULL, OMPI_ERROR, 0);
/* Allocate a sleepDesc for threads to block on */
rxq->qr_es = ompi_init_elan_sleepdesc (&mca_ptl_elan_global_state,
rxq->qr_rail);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_es, NULL, OMPI_ERROR, 0);
OBJ_CONSTRUCT (&comp->rx_lock, ompi_mutex_t);
}
#endif
/* Init the Transmit Queue structure */
for (i = 0; i < num_rails; i++) {
@ -358,8 +482,7 @@ ompi_init_elan_qdma (mca_ptl_elan_component_t * emp,
/* Event to wait/block on, Bug here for the event */
rxq->qr_qEvent = rxq->qr_elanDone;
queue->input->q_event =
SDRAM2ELAN (ctx, (void *) rxq->qr_elanDone);
queue->input->q_event= SDRAM2ELAN (ctx, (void *) rxq->qr_elanDone);
queue->input->q_fptr = rxq->qr_efitem;
queue->input->q_bptr = rxq->qr_efitem;
queue->input->q_control =
@ -385,91 +508,6 @@ ompi_init_elan_qdma (mca_ptl_elan_component_t * emp,
OBJ_CONSTRUCT (&queue->rx_lock, ompi_mutex_t);
}
#if OMPI_PTL_ELAN_COMP_QUEUE || 1
/* Create a complete queue here, later use the queue above directly */
/* Init the Transmit Queue structure */
for (i = 0; i < num_rails; i++) {
ompi_ptl_elan_recv_queue_t *rxq;
ompi_ptl_elan_comp_queue_t *comp;
ptl = emp->modules[i];
rail = (RAIL *) ptl->ptl_elan_rail;
ctx = (ELAN4_CTX *) ptl->ptl_elan_ctx;
comp = ptl->comp = (ompi_ptl_elan_comp_queue_t *)
malloc (sizeof (ompi_ptl_elan_comp_queue_t));
OMPI_PTL_ELAN_CHECK_UNEX (comp, NULL, OMPI_ERROR, 0);
memset (comp, 0, sizeof (ompi_ptl_elan_comp_queue_t));
/* Allocate input queue */
comp->input = (E4_InputQueue *) elan4_allocElan (rail->r_alloc,
INPUT_QUEUE_ALIGN,
INPUT_QUEUE_SIZE);
OMPI_PTL_ELAN_CHECK_UNEX (comp->input, NULL, OMPI_ERROR, 0);
/* Init the Receive Queue structure */
comp->rx_nslots = nslots;
nslots += OMPI_PTL_ELAN_LOST_QSLOTS;
comp->rx_buffsize = (slotsize > INPUT_QUEUE_MAX) ?
INPUT_QUEUE_MAX : slotsize;
comp->rx_slotsize = ELAN_ALIGNUP (slotsize, OMPI_PTL_ELAN_SLOT_ALIGN);
rxq = comp->rxq = (ompi_ptl_elan_recv_queue_t *)
elan4_allocMain (rail->r_alloc, 64,
sizeof (ompi_ptl_elan_recv_queue_t));
OMPI_PTL_ELAN_CHECK_UNEX (rxq, NULL, OMPI_ERROR, 0);
memset (rxq, 0, sizeof (ompi_ptl_elan_recv_queue_t));
rxq->qr_rail = rail;
rxq->qr_fptr = elan4_allocMain (rail->r_alloc,
128, nslots * comp->rx_slotsize);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_fptr, NULL, OMPI_ERROR, 0);
memset (rxq->qr_fptr, 0xeb, nslots * comp->rx_slotsize);
rxq->qr_elanDone = ALLOC_ELAN (rail,
OMPI_PTL_ELAN_SLOT_ALIGN, sizeof (EVENT32));
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_elanDone, NULL, OMPI_ERROR, 0);
/* Set the top et al */
rxq->qr_efitem = (E4_uint64) elan4_main2elan (ctx, rxq->qr_fptr);
assert(rxq->qr_efitem != ELAN_BAD_ADDR);
rxq->qr_base = rxq->qr_fptr;
rxq->qr_top = (void *) ((uintptr_t) rxq->qr_base +
(comp->rx_slotsize * (nslots - OMPI_PTL_ELAN_LOST_QSLOTS)));
rxq->qr_efptr = rxq->qr_efitem;
rxq->qr_elitem = rxq->qr_efitem +
(comp->rx_slotsize * (nslots - OMPI_PTL_ELAN_LOST_QSLOTS));
/* Event to wait/block on, Bug here for the event */
rxq->qr_qEvent = rxq->qr_elanDone;
comp->input->q_event =
SDRAM2ELAN (ctx, (void *) rxq->qr_elanDone);
comp->input->q_fptr = rxq->qr_efitem;
comp->input->q_bptr = rxq->qr_efitem;
comp->input->q_control =
E4_InputQueueControl (rxq->qr_efitem, rxq->qr_elitem,
comp->rx_slotsize);
/* The event */
INITEVENT_WORD (ctx, (EVENT *) rxq->qr_elanDone,
&rxq->qr_doneWord);
RESETEVENT_WORD (&rxq->qr_doneWord);
PRIMEEVENT_WORD (ctx, (EVENT *) rxq->qr_elanDone, 1);
rxq->qr_cmdq = elan4_alloc_cmdq (ctx, rail->r_alloc,
CQ_Size1K,
CQ_WriteEnableBit |
CQ_WaitEventEnableBit, NULL);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_cmdq, NULL, OMPI_ERROR, 0);
/* Allocate a sleepDesc for threads to block on */
rxq->qr_es = ompi_init_elan_sleepdesc (&mca_ptl_elan_global_state,
rxq->qr_rail);
OMPI_PTL_ELAN_CHECK_UNEX (rxq->qr_es, NULL, OMPI_ERROR, 0);
OBJ_CONSTRUCT (&comp->rx_lock, ompi_mutex_t);
}
#endif
END_FUNC(PTL_ELAN_DEBUG_INIT);
return (OMPI_SUCCESS);

Просмотреть файл

@ -169,12 +169,13 @@ mca_ptl_elan_init_qdma_desc (struct mca_ptl_elan_send_frag_t *frag,
mca_ptl_base_header_t *hdr;
struct ompi_ptl_elan_qdma_desc_t * desc;
ELAN4_CTX *ctx;
START_FUNC(PTL_ELAN_DEBUG_SEND);
desc = (ompi_ptl_elan_qdma_desc_t *)frag->desc;
destvp = ptl_peer->peer_vp;
size_in = *size;
ctx = ptl->ptl_elan_ctx,
hdr = (mca_ptl_base_header_t *) & desc->buff[0];
@ -190,7 +191,7 @@ mca_ptl_elan_init_qdma_desc (struct mca_ptl_elan_send_frag_t *frag,
/* Stash local buffer address into the header, for ptl_elan_get */
hdr->hdr_frag.hdr_dst_ptr.pval = 0;
hdr->hdr_frag.hdr_dst_ptr.lval = elan4_main2elan(
ptl->ptl_elan_ctx, pml_req->req_base.req_addr);
ctx, pml_req->req_base.req_addr);
hdr->hdr_match.hdr_contextid = pml_req->req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = pml_req->req_base.req_comm->c_my_rank;
@ -257,9 +258,36 @@ mca_ptl_elan_init_qdma_desc (struct mca_ptl_elan_send_frag_t *frag,
* For now just save the information to the provided header
* Later will use the inline header to report the progress */
frag->frag_base.frag_header = *hdr;
desc->main_dma.dma_srcAddr = MAIN2ELAN (desc->ptl->ptl_elan_ctx,
&desc->buff[0]);
#if OMPI_PTL_ELAN_COMP_QUEUE || 1
/* XXX: Chain a QDMA to each queue and
* Have all the srcEvent fired to the Queue */
desc->comp_dma.dma_cookie = elan4_local_cookie(ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, ptl->elan_vp);
desc->comp_dma.dma_srcAddr = elan4_main2elan (ctx,
(void *) &frag->frag_base.frag_header);
memcpy ((void *)desc->comp_buff, (void *)&desc->comp_dma,
sizeof (E4_DMA64));
/* XXX: The chain dma will go directly into a command stream
* so we need to add the command queue control bits.
* Allocate space from command queues hung off the CTX.
*/
desc->comp_event->ev_Params[1] = elan4_alloccq_space (ctx, 8, CQ_Size8K);
desc->main_dma.dma_srcEvent= elan4_main2elan(
ctx, (E4_Event *)desc->comp_event);
desc->main_dma.dma_srcAddr = MAIN2ELAN (ctx, &desc->buff[0]);
/* XXX: Hardcoded DMA retry count */
desc->main_dma.dma_typeSize = E4_DMA_TYPE_SIZE ((header_length +
size_out),
DMA_DataTypeByte,
DMA_QueueWrite, 16);
desc->main_dma.dma_cookie= elan4_local_cookie (ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, destvp);
desc->main_dma.dma_vproc = destvp;
#else
desc->main_dma.dma_srcAddr = MAIN2ELAN (ctx, &desc->buff[0]);
/* XXX: Hardcoded DMA retry count */
desc->main_dma.dma_typeSize = E4_DMA_TYPE_SIZE ((header_length +
size_out),
@ -268,6 +296,7 @@ mca_ptl_elan_init_qdma_desc (struct mca_ptl_elan_send_frag_t *frag,
desc->main_dma.dma_cookie = elan4_local_cookie (ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, destvp);
desc->main_dma.dma_vproc = destvp;
#endif
LOG_PRINT(PTL_ELAN_DEBUG_MAC,
"[ send...] destvp %d type %d flag %d size %d\n",
@ -381,7 +410,32 @@ mca_ptl_elan_init_putget_desc (struct mca_ptl_elan_send_frag_t *frag,
(void *) ptl->queue->input);
#if OMPI_PTL_ELAN_COMP_QUEUE
/* Have all the source event fired to the Queue */
/* XXX: Chain a QDMA to each queue and
* Have all the srcEvent fired to the Queue */
desc->comp_dma.dma_cookie = elan4_local_cookie(ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, ptl->elan_vp);
desc->comp_dma.dma_srcAddr = elan4_main2elan (ctx,
(void *) &frag->frag_base.frag_header);
memcpy ((void *)desc->comp_buff, (void *)&desc->comp_dma,
sizeof (E4_DMA64));
/* XXX: The chain dma will go directly into a command stream
* so we need to add the command queue control bits.
* Allocate space from command queues hung off the CTX.
*/
desc->comp_event->ev_Params[1] = elan4_alloccq_space (ctx, 8, CQ_Size8K);
desc->main_dma.dma_srcEvent= elan4_main2elan(
ctx, (E4_Event *)desc->comp_event);
desc->main_dma.dma_srcAddr = MAIN2ELAN (ctx, &desc->buff[0]);
/* XXX: Hardcoded DMA retry count */
desc->main_dma.dma_typeSize = E4_DMA_TYPE_SIZE ((header_length +
size_out),
DMA_DataTypeByte,
DMA_QueueWrite, 16);
desc->main_dma.dma_cookie= elan4_local_cookie (ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, destvp);
desc->main_dma.dma_vproc = destvp;
#else
desc->chain_dma.dma_srcEvent = elan4_main2elan (ctx, desc->elan_event);
INITEVENT_WORD (ctx, (E4_Event *) desc->elan_event, &desc->main_doneWord);
@ -497,8 +551,23 @@ mca_ptl_elan_init_get_desc (mca_ptl_elan_module_t *ptl,
desc->chain_dma.dma_dstEvent = elan4_main2elan (ctx,
(void *) ptl->queue->input);
#if OMPI_PTL_ELAN_COMP_QUEUE
/* Have all the source event fired to the Queue */
#if OMPI_PTL_ELAN_COMP_QUEUE || 1
/* XXX: Chain a QDMA to each queue and
* Have all the srcEvent fired to the Queue */
desc->comp_dma.dma_cookie = elan4_local_cookie(ptl->queue->tx_cpool,
E4_COOKIE_TYPE_LOCAL_DMA, ptl->elan_vp);
desc->comp_dma.dma_srcAddr = elan4_main2elan (ctx,
(void *) &frag->frag_base.frag_header);
memcpy ((void *)desc->comp_buff, (void *)&desc->comp_dma,
sizeof (E4_DMA64));
/* XXX: The chained COMP/DMA will go directly into a command stream
* so we need to add the command queue control bits.
* Allocate space from command queues hung off the CTX.
*/
desc->comp_event->ev_Params[1] = elan4_alloccq_space (ctx, 8, CQ_Size8K);
desc->chain_dma.dma_srcEvent= elan4_main2elan(ctx,
(E4_Event *)desc->comp_event);
#else
desc->chain_dma.dma_srcEvent = elan4_main2elan (ctx, desc->elan_event);
INITEVENT_WORD (ctx, (E4_Event *) desc->elan_event, &desc->main_doneWord);

Просмотреть файл

@ -175,20 +175,31 @@ typedef struct ompi_ptl_elan_comp_queue_t ompi_ptl_elan_comp_queue_t;
/**
* ELAN descriptor for send
*/
#define ELAN_BASE_DESC_FIELDS \
E4_DMA64 main_dma; /**< 8-byte aligned */ \
/* 8 byte aligned */ \
volatile E4_uint64 main_doneWord; \
/* 8 byte aligned */ \
E4_Event *elan_event; \
void *desc_buff; \
/* 8 byte aligned */ \
mca_pml_base_request_t *req; \
mca_ptl_elan_module_t *ptl; \
/* 8 byte aligned */ \
int desc_type; \
int desc_status; \
/* 8 byte aligned */
#define ELAN_BASE_DESC_FIELDS \
E4_DMA64 main_dma; /**< 8-byte aligned */ \
/* 8 byte aligned */ \
volatile E4_uint64 main_doneWord; \
/* 8 byte aligned */ \
E4_Event *elan_event; \
void *desc_buff; \
/* 8 byte aligned */ \
mca_pml_base_request_t *req; \
mca_ptl_elan_module_t *ptl; \
/* 8 byte aligned */ \
int desc_type; \
int desc_status; \
/* 8 byte aligned */ \
E4_DMA64 comp_dma; \
/* 8 byte aligned */ \
volatile E4_uint64 comp_doneWord; \
/* 8 byte aligned */ \
E4_Event32 *comp_event; /* E4_Event plus pad */ \
/* 8 byte aligned */ \
E4_Addr *comp_buff; \
E4_Addr *comp_pad; \
E4_Addr comp_srcAddr; \
E4_Addr comp_dstAddr; \
/* 8 byte aligned */ \
struct ompi_ptl_elan_base_desc_t {
ELAN_BASE_DESC_FIELDS