diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index a2f16863dd..d82daee29a 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -647,7 +647,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc, int rc; mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t), - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL); if(NULL == fin) { MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); diff --git a/ompi/mca/pml/ob1/pml_ob1_hdr.h b/ompi/mca/pml/ob1/pml_ob1_hdr.h index 8248216dba..71e52ae608 100644 --- a/ompi/mca/pml/ob1/pml_ob1_hdr.h +++ b/ompi/mca/pml/ob1/pml_ob1_hdr.h @@ -53,6 +53,7 @@ #define MCA_PML_OB1_HDR_FLAGS_PIN 4 /* is user buffer pinned */ #define MCA_PML_OB1_HDR_FLAGS_CONTIG 8 /* is user buffer contiguous */ #define MCA_PML_OB1_HDR_FLAGS_NORDMA 16 /* rest will be send by copy-in-out */ +#define MCA_PML_OB1_HDR_FLAGS_SIGNAL 32 /* message can be optionally signalling */ /** * Common hdr attributes - must be first element in each hdr type diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 4900de9333..a8206af0ca 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -227,7 +227,8 @@ int mca_pml_ob1_recv_request_ack_send_btl( /* allocate descriptor */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_ack_hdr_t), - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } @@ -370,7 +371,7 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, /* prepare a descriptor for rdma control message */ mca_bml_base_alloc (bml_btl, &ctl, 
MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); if (OPAL_UNLIKELY(NULL == ctl)) { return OMPI_ERR_OUT_OF_RESOURCE; } @@ -985,7 +986,8 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, /* prepare a descriptor for rdma control message */ mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == ctl) ) { mca_bml_base_free(bml_btl,dst); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 65d2c10770..86d7dc0dce 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -384,7 +384,8 @@ int mca_pml_ob1_send_request_start_buffered( mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rendezvous_hdr_t) + size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } @@ -720,7 +721,8 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, /* allocate space for get hdr + segment list */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == des) ) { /* NTH: no need to reset the converter here. 
it will be reset before it is retried */ mca_bml_base_free(bml_btl, src); @@ -811,7 +813,8 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rendezvous_hdr_t), &size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_FLAGS_SIGNAL, &des ); MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, @@ -828,7 +831,7 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, /* build hdr */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = flags; + hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; @@ -1023,7 +1026,9 @@ cannot_pack: &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t), - &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des); + &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | + MCA_BTL_DES_FLAGS_SIGNAL, + &des); MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, sendreq->req_send.req_base.req_addr, diff --git a/opal/mca/btl/ugni/Makefile.am b/opal/mca/btl/ugni/Makefile.am index 9bdfb1a297..7304f1baeb 100644 --- a/opal/mca/btl/ugni/Makefile.am +++ b/opal/mca/btl/ugni/Makefile.am @@ -38,6 +38,7 @@ ugni_SOURCES = \ btl_ugni.h \ btl_ugni_smsg.h \ btl_ugni_smsg.c \ + btl_ugni_progress_thread.c \ btl_ugni_prepare.h mcacomponentdir = $(opallibdir) diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index bbffde6c5d..3d69197252 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h @@ -47,6 +47,8 @@ #define MCA_BTL_UGNI_CONNECT_DIRECTED_ID 0x8000000000000000ull #define 
MCA_BTL_UGNI_DATAGRAM_MASK 0x8000000000000000ull +extern int howards_progress_var; + /* ompi and smsg endpoint attributes */ typedef struct mca_btl_ugni_endpoint_attr_t { uint64_t proc_id; @@ -311,5 +313,8 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) { return ((uint64_t) (name.jobid & 0x7fffffff) << 32 | name.vpid); } +int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl); +int mca_btl_ugni_kill_progress_thread(void); + #endif diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index 7d476a0e32..2dc5c08003 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c @@ -20,6 +20,8 @@ #include "opal/include/opal/align.h" #include "opal/mca/dstore/dstore.h" +extern int howards_progress_var; + #define INITIAL_GNI_EPS 10000 static int @@ -28,6 +30,8 @@ static void mca_btl_ugni_module_set_max_reg (mca_btl_ugni_module_t *ugni_module, int nlocal_procs); static int mca_btl_ugni_smsg_setup (int nprocs); +void *howards_start_addr; + int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, size_t nprocs, struct opal_proc_t **procs, @@ -119,6 +123,49 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, return opal_common_rc_ugni_to_opal (rc); } + if (howards_progress_var) { + fprintf(stderr,"setting up irq cqs\n"); + OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); + rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.local_cq_size, + 0, GNI_CQ_BLOCKING, NULL, NULL, &ugni_module->rdma_local_irq_cq); + OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error creating local BTE/FMA CQ")); + return opal_common_rc_ugni_to_opal (rc); + } + + fprintf(stderr,"created blocking cq 0x%lx\n",ugni_module->rdma_local_irq_cq); + + OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); + rc = GNI_CqCreate (ugni_module->device->dev_handle, mca_btl_ugni_component.remote_cq_size, + 
0, GNI_CQ_BLOCKING, NULL, NULL, &ugni_module->smsg_remote_irq_cq);
+            OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
+            if (GNI_RC_SUCCESS != rc) {
+                BTL_ERROR(("error creating remote SMSG CQ"));
+                return opal_common_rc_ugni_to_opal (rc);
+            }
+
+            OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
+            rc = GNI_EpCreate (ugni_module->device->dev_handle, ugni_module->rdma_local_cq,
+                               &ugni_module->local_ep);
+            OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
+            if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
+                BTL_ERROR(("error creating local ugni endpoint"));
+                return opal_common_rc_ugni_to_opal (rc);
+            }
+
+            OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
+            rc = GNI_EpBind (ugni_module->local_ep,
+                             ugni_module->device->dev_addr,
+                             getpid());
+            OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
+            if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
+                BTL_ERROR(("error binding local ugni endpoint"));
+                return opal_common_rc_ugni_to_opal (rc);
+            }
+
+        }
+
         rc = mca_btl_ugni_setup_mpools (ugni_module);
         if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
             BTL_ERROR(("btl/ugni error setting up mpools/free lists"));
@@ -131,6 +178,39 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
             return rc;
         }
 
+        if (howards_progress_var) {
+            /* 4096-byte anonymous mapping that is registered below against
+             * smsg_remote_irq_cq; mmap reports failure with MAP_FAILED, never NULL */
+            howards_start_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+            if (MAP_FAILED == howards_start_addr) {
+                fprintf(stderr,"Hey, mmap failed!\n");
+                howards_start_addr = NULL;
+                return OPAL_ERR_OUT_OF_RESOURCE;
+            }
+
+            OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
+            status = GNI_MemRegister(ugni_module->device->dev_handle,
+                                     (unsigned long)howards_start_addr,
+                                     4096,
+                                     ugni_module->smsg_remote_irq_cq,
+                                     GNI_MEM_READWRITE,
+                                     -1,
+                                     &ugni_module->device->smsg_irq_mhndl);
+#if 1
+            {
+                unsigned long *vec = (unsigned long *)&mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
+                fprintf(stderr,"status = %d memory handle contents 0x%lx 0x%lx\n",status,vec[0],vec[1]);
+            }
+#endif
+            OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
+            if (GNI_RC_SUCCESS != status) {
+                BTL_ERROR(("error registering irq memory region"));
+                return opal_common_rc_ugni_to_opal (status);
+            }
+
+            mca_btl_ugni_spawn_progress_thread(btl);
+        }
+
ugni_module->initialized = true; } diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 7758a6d8f9..22e7ab7310 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c @@ -564,7 +564,9 @@ static int mca_btl_ugni_component_progress (void) count += mca_btl_ugni_progress_local_smsg (ugni_module); count += mca_btl_ugni_progress_remote_smsg (ugni_module); count += mca_btl_ugni_progress_rdma (ugni_module, 0); - + if (howards_progress_var) { + count += mca_btl_ugni_progress_rdma (ugni_module, 1); + } } return count; diff --git a/opal/mca/btl/ugni/btl_ugni_endpoint.c b/opal/mca/btl/ugni/btl_ugni_endpoint.c index dc7e9dfd55..e8eefc170b 100644 --- a/opal/mca/btl/ugni/btl_ugni_endpoint.c +++ b/opal/mca/btl/ugni/btl_ugni_endpoint.c @@ -159,6 +159,7 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) { ep->rmt_irq_mem_hndl = ep->remote_attr.rmt_irq_mem_hndl; ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED; + fprintf(stderr,"ep->rmt_irq_mem_hndl rmt_irq_mem_hndl 0x%lx 0x%lx\n",ep->rmt_irq_mem_hndl.qword1,ep->rmt_irq_mem_hndl.qword2); /* send all pending messages */ BTL_VERBOSE(("endpoint connected. 
posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list))); diff --git a/opal/mca/btl/ugni/btl_ugni_frag.h b/opal/mca/btl/ugni/btl_ugni_frag.h index a4e6bac1ba..319fd89f0a 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.h +++ b/opal/mca/btl/ugni/btl_ugni_frag.h @@ -81,6 +81,7 @@ typedef struct mca_btl_ugni_base_frag_t { mca_btl_base_endpoint_t *endpoint; mca_btl_ugni_reg_t *registration; ompi_free_list_t *my_list; + frag_cb_t *cbfunc; } mca_btl_ugni_base_frag_t; typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t; diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 03bec3cac7..aab80ab9f7 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -20,6 +20,8 @@ #include "btl_ugni_prepare.h" #include "btl_ugni_smsg.h" +int howards_progress_var = 0; + static int mca_btl_ugni_free (struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des); @@ -114,6 +116,7 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, return rc; } + if (getenv("HOWARDS_PROGESS") != NULL) howards_progress_var = 1; return OPAL_SUCCESS; } @@ -145,21 +148,35 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) rc = opal_hash_table_get_next_key_uint64 (&ugni_module->id_to_endpoint, &key, (void **) &ep, node, &node); } + if (howards_progress_var) { + mca_btl_ugni_kill_progress_thread(); + } + /* destroy all cqs */ OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); rc = GNI_CqDestroy (ugni_module->rdma_local_cq); if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error tearing down local BTE/FMA CQ")); + BTL_ERROR(("error tearing down local BTE/FMA CQ - %s",gni_err_str[rc])); } rc = GNI_CqDestroy (ugni_module->smsg_local_cq); if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error tearing down local SMSG CQ")); + BTL_ERROR(("error tearing down TX SMSG CQ - %s",gni_err_str[rc])); } rc = GNI_CqDestroy (ugni_module->smsg_remote_cq); if (GNI_RC_SUCCESS != rc) { - BTL_ERROR(("error 
tearing down remote SMSG CQ")); + BTL_ERROR(("error tearing down RX SMSG CQ - %s",gni_err_str[rc])); + } + + rc = GNI_CqDestroy (ugni_module->rdma_local_irq_cq); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error tearing down local BTE/FMA CQ - %s",gni_err_str[rc])); + } + + rc = GNI_CqDestroy (ugni_module->smsg_remote_irq_cq); + if (GNI_RC_SUCCESS != rc) { + BTL_ERROR(("error tearing down remote SMSG CQ - %s",gni_err_str[rc])); } /* cancel wildcard post */ @@ -173,7 +190,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) /* tear down wildcard endpoint */ rc = GNI_EpDestroy (ugni_module->wildcard_ep); if (GNI_RC_SUCCESS != rc) { - BTL_VERBOSE(("btl/ugni error destroying endpoint")); + BTL_VERBOSE(("btl/ugni error destroying endpoint - %s",gni_err_str[rc])); } OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); }
diff --git a/opal/mca/btl/ugni/btl_ugni_progress_thread.c b/opal/mca/btl/ugni/btl_ugni_progress_thread.c
new file mode 100644
index 0000000000..9603de0df6
--- /dev/null
+++ b/opal/mca/btl/ugni/btl_ugni_progress_thread.c
@@ -0,0 +1,229 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "btl_ugni.h"
+#include "btl_ugni_frag.h"
+#include "btl_ugni_smsg.h"
+
+#include "opal/include/opal/align.h"
+
+
+static pthread_t mca_btl_ugni_progress_thread_id;
+static pthread_mutex_t progress_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t progress_cond = PTHREAD_COND_INITIALIZER;
+
+/* set by the main thread to ask the progress thread to leave its loop */
+static volatile int stop_progress_thread = 0;
+/* set by the progress thread (under progress_mutex) right before it exits */
+static volatile int progress_thread_done = 0;
+/* non-zero once pthread_create succeeded; nothing to wait for otherwise */
+static int progress_thread_started = 0;
+
+/* number of times the monitor call returned successfully (debug statistic) */
+static int thread_wakeups = 0;
+
+/**
+ * Body of the asynchronous progress thread.
+ *
+ * Blocks in GNI_CqVectorMonitor() on the module's two irq CQs and calls the
+ * component progress function whenever the monitor returns successfully.
+ * Once stop_progress_thread is set the loop ends, progress_thread_done is
+ * set and progress_cond is signalled.
+ *
+ * @param data  the mca_btl_ugni_module_t whose irq CQs are monitored
+ * @return always NULL
+ */
+static void *mca_btl_ugni_prog_thread_fn(void * data)
+{
+    int rc;
+    uint32_t which;
+    gni_return_t status;
+    gni_cq_handle_t cq_vec[2];
+
+    struct mca_btl_ugni_module_t *btl = (mca_btl_ugni_module_t *)data;
+
+    /*
+     * need to block signals
+     */
+
+    cq_vec[0] = btl->smsg_remote_irq_cq;
+    cq_vec[1] = btl->rdma_local_irq_cq;
+
+
+    while (stop_progress_thread == 0) {
+
+        /* give 'which' a defined value in case the call fails without writing it */
+        which = 0;
+
+        /*
+         * this ugni call doesn't need a lock
+         */
+
+        status = GNI_CqVectorMonitor(cq_vec,
+                                     2,
+                                     -1,
+                                     &which);
+
+        if (which == 1) {
+            fprintf(stderr,"broke out of GNI_CqVectorMonitor which = %u status = %s\n",(unsigned int) which,gni_err_str[status]);
+        }
+
+        if (status == GNI_RC_NOT_DONE) continue;
+
+        if ((status == GNI_RC_SUCCESS) && (stop_progress_thread == 0)) {
+            thread_wakeups++;
+            if (which == 1)
+                fprintf(stderr,"Calling the progress function\n");
+#if 0
+            opal_progress();
+#endif
+#if 1
+            mca_btl_ugni_component.super.btl_progress(); /* TODO: probably needs to be higher up */
+#endif
+        }
+    }
+
+    /* Send a signal to the main thread saying we are done */
+    rc = pthread_mutex_lock(&progress_mutex);
+    if (rc != 0) {
+        fprintf(stderr,"Hey pthread_mutex_lock failed\n");
+    }
+
+    progress_thread_done = 1;
+
+    rc = pthread_mutex_unlock(&progress_mutex);
+    if (rc != 0) {
+        fprintf(stderr,"Hey pthread_mutex_unlock failed\n");
+    }
+    (void) pthread_cond_signal(&progress_cond);
+
+    /* the start routine returns a pointer, not an OPAL status code */
+    return NULL;
+}
+
+/**
+ * Start the asynchronous progress thread for a module.
+ *
+ * The thread is created detached, so it cannot be joined;
+ * mca_btl_ugni_kill_progress_thread() waits on progress_cond instead.
+ *
+ * @param btl  module passed to the thread function as its argument
+ * @return OPAL_SUCCESS if the thread was created, OPAL_ERROR otherwise
+ */
+int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl)
+{
+    int rc, ret = OPAL_SUCCESS;
+    pthread_attr_t attr;
+
+    rc = pthread_attr_init(&attr);
+    if (rc != 0) {
+        fprintf(stderr,"Hey, pthread_attr_init returned with error %d (%s) \n",rc,strerror(rc));
+        return OPAL_ERROR;
+    }
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+    fprintf(stderr,"Firing off the progress thread \n");
+
+    /* pthread calls report failure through their return value, not errno */
+    rc = pthread_create(&mca_btl_ugni_progress_thread_id,
+                        &attr, mca_btl_ugni_prog_thread_fn, (void *)btl);
+    if (rc != 0) {
+        fprintf(stderr,"Hey, pthread_create returned with error %d (%s) \n",rc,strerror(rc));
+        ret = OPAL_ERROR;
+    } else {
+        progress_thread_started = 1;
+    }
+
+    rc = pthread_attr_destroy(&attr);
+    if (rc != 0) {
+        fprintf(stderr,"Hey, pthread_attr_destroy returned with error %d (%s) \n",rc,strerror(rc));
+    }
+
+    return ret;
+}
+
+/**
+ * Stop the asynchronous progress thread and destroy the local endpoint.
+ *
+ * Sets stop_progress_thread, posts a CQ write through local_ep of module 0
+ * to get the thread out of GNI_CqVectorMonitor(), waits until the thread
+ * reports progress_thread_done and finally destroys local_ep.
+ *
+ * @return OPAL_SUCCESS, or the translated uGNI error if GNI_EpDestroy fails
+ */
+int mca_btl_ugni_kill_progress_thread(void)
+{
+    gni_return_t status;
+    static mca_btl_ugni_base_frag_t cq_write_frag;
+
+    stop_progress_thread = 1;
+
+    fprintf(stderr,"async progress thread being killed off\n");
+
+    if (progress_thread_started) {
+        /*
+         * post a CQ to myself to wake my thread up
+         */
+
+        cq_write_frag.post_desc.base.type = GNI_POST_CQWRITE;
+        cq_write_frag.post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */
+        cq_write_frag.post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
+        cq_write_frag.post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER;
+        cq_write_frag.post_desc.base.src_cq_hndl = mca_btl_ugni_component.modules[0].rdma_local_cq;
+        cq_write_frag.post_desc.base.remote_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
+        cq_write_frag.post_desc.tries = 0;
+        cq_write_frag.cbfunc = NULL;
+        OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
+        status = GNI_PostCqWrite(mca_btl_ugni_component.modules[0].local_ep,
+                                 &cq_write_frag.post_desc.base);
+        OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
+        /*
+         * TODO: if error returned, need to kill off thread manually;
+         * until then the wait below may never return in that case
+         */
+        if (GNI_RC_SUCCESS != status) {
+            BTL_ERROR(("GNI_PostCqWrite returned error - %s",gni_err_str[status]));
+        }
+
+        /* pthread_cond_wait must be entered with the mutex locked */
+        pthread_mutex_lock(&progress_mutex);
+        while (!progress_thread_done) {
+            pthread_cond_wait(&progress_cond, &progress_mutex);
+        }
+        pthread_mutex_unlock(&progress_mutex);
+
+        progress_thread_started = 0;
+    }
+
+    fprintf(stderr,"async progress thread killed off wakeups = %d\n",thread_wakeups);
+
+    /*
+     * destroy the local_ep
+     */
+
+    OPAL_THREAD_LOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
+    status = GNI_EpDestroy (mca_btl_ugni_component.modules[0].local_ep);
+    OPAL_THREAD_UNLOCK(&mca_btl_ugni_component.modules[0].device->dev_lock);
+    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status)) {
+        BTL_ERROR(("error destroy local ep endpoint - %s", gni_err_str[status]));
+        return opal_common_rc_ugni_to_opal (status);
+    }
+
+    return OPAL_SUCCESS;
+}
+
diff --git a/opal/mca/btl/ugni/btl_ugni_rdma.h b/opal/mca/btl/ugni/btl_ugni_rdma.h index 0194c6aeff..03374526f7 100644 --- a/opal/mca/btl/ugni/btl_ugni_rdma.h +++ b/opal/mca/btl/ugni/btl_ugni_rdma.h @@ -67,12 +67,26 @@ static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_pos static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type, mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg) { + int rc; gni_return_t status; + mca_btl_ugni_base_frag_t *cq_frag = NULL; + extern void *howards_start_addr; /* Post descriptor */ - init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, - rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, - frag->endpoint->btl->rdma_local_cq); +#if 0 + if (howards_progress_var && (getenv("GENERATE_RDMA_IRQS") != NULL)) { + fprintf(stderr,"Calling GNI_PostRdma with to trigger interrupt on rdma_local_irq_cq %p\n",frag->endpoint->btl->rdma_local_irq_cq); + init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, + rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, + frag->endpoint->btl->rdma_local_irq_cq); + } else { +#endif + init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle, + rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, +
frag->endpoint->btl->rdma_local_cq); +#if 0 + } +#endif OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); status = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base); @@ -82,6 +96,43 @@ static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_pos return opal_common_rc_ugni_to_opal(status); } + if (howards_progress_var && (getenv("GENERATE_RDMA_IRQS") != NULL)) { + + rc = mca_btl_ugni_frag_alloc(frag->endpoint, + &frag->endpoint->btl->rdma_frags, + &cq_frag); +#if 0 + fprintf(stderr,"allocated cq_frag %p\n",cq_frag); +#endif + if (rc == OPAL_SUCCESS) { + cq_frag->registration = NULL; + cq_frag->post_desc.base.type = GNI_POST_RDMA_PUT; + cq_frag->post_desc.base.length = 4; + cq_frag->post_desc.base.remote_addr = (uint64_t)howards_start_addr; + cq_frag->post_desc.base.remote_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; + cq_frag->post_desc.base.local_addr = (uint64_t)howards_start_addr; + cq_frag->post_desc.base.cq_mode = GNI_CQMODE_REMOTE_EVENT | GNI_CQMODE_GLOBAL_EVENT; + cq_frag->post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; + cq_frag->post_desc.base.src_cq_hndl = mca_btl_ugni_component.modules[0].rdma_local_cq; + cq_frag->post_desc.base.rdma_mode = 0; + cq_frag->post_desc.base.local_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; + cq_frag->post_desc.base.post_id = 0xFFFF; + cq_frag->post_desc.tries = 0; + cq_frag->cbfunc = mca_btl_ugni_write_to_self_complete; + OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); + status = GNI_PostRdma(mca_btl_ugni_component.modules[0].local_ep,&cq_frag->post_desc.base); + OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); + if (status == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */ + fprintf(stderr,"GNI_PostRdma returned %s\n",gni_err_str[status]); + mca_btl_ugni_frag_return (cq_frag); + } else { + if (status != GNI_RC_SUCCESS) { + fprintf(stderr,"GNI_PostRdma returned 
%s\n",gni_err_str[status]); + } + } + } + } + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.h b/opal/mca/btl/ugni/btl_ugni_smsg.h index 18cde99d48..02ad8bb6a7 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.h +++ b/opal/mca/btl/ugni/btl_ugni_smsg.h @@ -17,6 +17,8 @@ #include "btl_ugni_endpoint.h" #include "btl_ugni_frag.h" #include "btl_ugni_rdma.h" +/* TODO: need to fix this one */ +#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" typedef enum { MCA_BTL_UGNI_TAG_SEND, @@ -82,12 +84,23 @@ static inline int mca_btl_ugni_progress_local_smsg (mca_btl_ugni_module_t *ugni_ return 1; } +static void mca_btl_ugni_cqwrite_complete (mca_btl_ugni_base_frag_t *frag, int rc) +{ + frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE; + + BTL_VERBOSE(("cqwrite frag complete")); + mca_btl_ugni_frag_return (frag); +} + static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag, void *hdr, size_t hdr_len, void *payload, size_t payload_len, mca_btl_ugni_smsg_tag_t tag) { + int rc; + int pml_tag; gni_return_t grc; + mca_btl_ugni_base_frag_t *cq_write_frag = NULL; OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); grc = GNI_SmsgSendWTag (frag->endpoint->smsg_ep_handle, hdr, hdr_len, @@ -98,6 +111,38 @@ static inline int opal_mca_btl_ugni_smsg_send (mca_btl_ugni_base_frag_t *frag, /* increment the active send counter */ opal_atomic_add_32(&frag->endpoint->btl->active_send_count,1); + if (howards_progress_var == 1 && (getenv("GENERATE_MDH_IRQS") != NULL)) { + pml_tag = frag->hdr.send.lag >> 24; + if (pml_tag > MCA_PML_OB1_HDR_TYPE_MATCH) { + rc = mca_btl_ugni_frag_alloc(frag->endpoint, + &frag->endpoint->btl->rdma_frags, + &cq_write_frag); + if (rc == OPAL_SUCCESS) { + cq_write_frag->registration = NULL; + cq_write_frag->endpoint = frag->endpoint; + cq_write_frag->post_desc.base.type = GNI_POST_CQWRITE; + cq_write_frag->post_desc.base.cqwrite_value = 0xdead; /* up to 48 bytes here, not used for now */ + cq_write_frag->post_desc.base.cq_mode = 
GNI_CQMODE_GLOBAL_EVENT; + cq_write_frag->post_desc.base.dlvr_mode = GNI_DLVMODE_IN_ORDER; + cq_write_frag->post_desc.base.src_cq_hndl = frag->endpoint->btl->rdma_local_cq; + cq_write_frag->post_desc.base.remote_mem_hndl = frag->endpoint->rmt_irq_mem_hndl; + cq_write_frag->post_desc.tries = 0; + cq_write_frag->cbfunc = mca_btl_ugni_cqwrite_complete; +#if 0 + fprintf(stderr,"doing a GNI_PostCqWrite to 0x%lx 0x%lx \n",cq_write_frag->post_desc.base.remote_mem_hndl.qword1, + cq_write_frag->post_desc.base.remote_mem_hndl.qword2); +#endif + OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock); + grc = GNI_PostCqWrite(frag->endpoint->rdma_ep_handle, &cq_write_frag->post_desc.base); + OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock); + if (grc == GNI_RC_ERROR_RESOURCE) { /* errors for PostCqWrite treated as non-fatal */ + fprintf(stderr,"GNI_PostCqWrite returned gni error %s\n",gni_err_str[grc]); + mca_btl_ugni_frag_return (cq_write_frag); + } + } + } + } + (void) mca_btl_ugni_progress_local_smsg ((mca_btl_ugni_module_t *) frag->endpoint->btl); return OPAL_SUCCESS; }