* start cleaning up debugging output (still much to do)
* make buffers really big so that we pass allocmem until we figure out why we're not flow controlling as I expected * set event queue to invalid initially and use that as the enabled test rather than a separate bool - shrinks the module a bit * add dropped count checks, with a panic if one occurs. Still need to implement some type of retransmit logic. This commit was SVN r5704.
Этот коммит содержится в:
родитель
e2c2c72b84
Коммит
0c6eaaebe3
@ -156,7 +156,7 @@ AC_DEFUN([MCA_CONFIGURE_STUB],[
|
||||
[Portals table id to use for fragment receive queue])
|
||||
|
||||
MCA_PTL_PORTALS_CONFIG_VAL([debug-level],
|
||||
[PTL_PORTALS_DEFAULT_DEBUG_LEVEL], [1000],
|
||||
[PTL_PORTALS_DEFAULT_DEBUG_LEVEL], [99],
|
||||
[Default debugging level for portals ptl])
|
||||
|
||||
MCA_PTL_PORTALS_CONFIG_VAL([request-cache-size],
|
||||
@ -168,11 +168,11 @@ AC_DEFUN([MCA_CONFIGURE_STUB],[
|
||||
[Default first frag size for portals ptl])
|
||||
|
||||
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-num-entries],
|
||||
[PTL_PORTALS_DEFAULT_FIRST_FRAG_NUM_ENTRIES], [3],
|
||||
[PTL_PORTALS_DEFAULT_FIRST_FRAG_NUM_ENTRIES], [5],
|
||||
[Default number of memory descriptors for first fragments])
|
||||
|
||||
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-entry-size],
|
||||
[PTL_PORTALS_DEFAULT_FIRST_FRAG_ENTRY_SIZE], [1048576],
|
||||
[PTL_PORTALS_DEFAULT_FIRST_FRAG_ENTRY_SIZE], [10485760],
|
||||
[Default size of memory associeted with first fag md])
|
||||
|
||||
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-queue-size],
|
||||
|
@ -99,7 +99,7 @@ mca_ptl_portals_add_procs(struct mca_ptl_base_module_t* ptl,
|
||||
portals_procs[i],
|
||||
&distance);
|
||||
if (ret != PTL_OK) {
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,
|
||||
"Could not find distance to process %d", i);
|
||||
continue;
|
||||
}
|
||||
@ -139,7 +139,7 @@ mca_ptl_portals_module_enable(struct mca_ptl_portals_module_t *ptl,
|
||||
/* BWB - not really sure how - would have to track a lot more data... */
|
||||
} else {
|
||||
/* only do all the hard stuff if we haven't created the queue */
|
||||
if (ptl->frag_queues_created) return OMPI_SUCCESS;
|
||||
if (ptl->frag_eq_handle != PTL_EQ_NONE) return OMPI_SUCCESS;
|
||||
|
||||
/* create an event queue, then the match entries for the match
|
||||
entries */
|
||||
@ -159,7 +159,6 @@ mca_ptl_portals_module_enable(struct mca_ptl_portals_module_t *ptl,
|
||||
for (i = 0 ; i < ptl->first_frag_num_entries ; ++i) {
|
||||
ret = ptl_portals_post_recv_md(ptl, NULL);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
ptl->frag_queues_created = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -176,7 +175,7 @@ mca_ptl_portals_finalize(struct mca_ptl_base_module_t *ptl_base)
|
||||
|
||||
ret = PtlNIFini(ptl->ni_handle);
|
||||
if (PTL_OK != ret) {
|
||||
ompi_output_verbose(50, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
|
||||
"PtlNIFini returned %d\n", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
@ -45,7 +45,7 @@ struct mca_ptl_portals_component_t {
|
||||
* - 0 : critical user information
|
||||
* - 10: initialization / shutdown diagnostic information
|
||||
* - 20: general execution diagnostic information
|
||||
* - 99: useful only to developers
|
||||
* - 90: useful only to developers
|
||||
*/
|
||||
int portals_output;
|
||||
|
||||
@ -169,8 +169,6 @@ struct mca_ptl_portals_module_t {
|
||||
/* size for event queue */
|
||||
int first_frag_queue_size;
|
||||
|
||||
/* frag receive data */
|
||||
bool frag_queues_created;
|
||||
/* frag receive event queue */
|
||||
ptl_handle_eq_t frag_eq_handle;
|
||||
|
||||
@ -178,6 +176,9 @@ struct mca_ptl_portals_module_t {
|
||||
ptl_handle_ni_t ni_handle;
|
||||
/** the limits returned from PtlNIInit for interface */
|
||||
ptl_ni_limits_t limits;
|
||||
|
||||
/** number of dropped messages */
|
||||
ptl_sr_value_t dropped;
|
||||
};
|
||||
typedef struct mca_ptl_portals_module_t mca_ptl_portals_module_t;
|
||||
|
||||
|
@ -129,8 +129,8 @@ mca_ptl_portals_add_procs_compat(struct mca_ptl_portals_module_t* ptl,
|
||||
return ret;
|
||||
} else if (sizeof(ptl_process_id_t) != size) {
|
||||
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,
|
||||
"mca_base_modex_recv returned size%d",
|
||||
size);
|
||||
"mca_base_modex_recv returned size %d, expected %d",
|
||||
size, sizeof(ptl_process_id_t));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
@ -170,9 +170,14 @@ mca_ptl_portals_component_open(void)
|
||||
mca_ptl_portals_component.portals_output =
|
||||
ompi_output_open(&portals_output_stream);
|
||||
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
|
||||
"mca_ptl_portals_component_open()");
|
||||
|
||||
/* fill in defaults for module data */
|
||||
mca_ptl_portals_module.frag_eq_handle = PTL_EQ_NONE;
|
||||
mca_ptl_portals_module.ni_handle = PTL_INVALID_HANDLE;
|
||||
mca_ptl_portals_module.dropped = 0;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -183,7 +188,7 @@ mca_ptl_portals_component_open(void)
|
||||
int
|
||||
mca_ptl_portals_component_close(void)
|
||||
{
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
|
||||
"mca_ptl_portals_component_close()");
|
||||
|
||||
/* finalize interface? */
|
||||
@ -219,9 +224,12 @@ mca_ptl_portals_component_init(int *num_ptls,
|
||||
mca_ptl_base_module_t** ptls;
|
||||
*num_ptls = 0;
|
||||
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
|
||||
"mca_ptl_portals_component_init()");
|
||||
|
||||
/* BWB - no support for threads */
|
||||
if (enable_progress_threads || enable_mpi_threads) return NULL;
|
||||
|
||||
ompi_free_list_init(&mca_ptl_portals_component.portals_send_frags,
|
||||
sizeof(mca_ptl_portals_send_frag_t),
|
||||
OBJ_CLASS(mca_ptl_portals_send_frag_t),
|
||||
@ -238,9 +246,6 @@ mca_ptl_portals_component_init(int *num_ptls,
|
||||
mca_ptl_portals_component.portals_free_list_inc_num,
|
||||
NULL); /* use default allocator */
|
||||
|
||||
/* BWB - no support for progress threads */
|
||||
if (enable_progress_threads) return NULL;
|
||||
|
||||
/* initialize portals ptl. note that this is in the compat code because
|
||||
it's fairly non-portable between implementations */
|
||||
if (OMPI_SUCCESS != mca_ptl_portals_init(&mca_ptl_portals_component)) {
|
||||
@ -307,8 +312,20 @@ mca_ptl_portals_component_progress(mca_ptl_tstamp_t tstamp)
|
||||
struct mca_ptl_portals_module_t *module =
|
||||
mca_ptl_portals_component.portals_modules[i];
|
||||
ptl_event_t ev;
|
||||
ptl_sr_value_t numdropped;
|
||||
|
||||
if (! module->frag_queues_created) continue;
|
||||
if (module->frag_eq_handle == PTL_EQ_NONE) continue;
|
||||
|
||||
/* BWB - this is going to kill performance */
|
||||
PtlNIStatus(module->ni_handle,
|
||||
PTL_SR_DROP_COUNT,
|
||||
&numdropped);
|
||||
if (numdropped != module->dropped) {
|
||||
ompi_output_verbose(30, mca_ptl_portals_component.portals_output,
|
||||
"*** Dropped message count changed. %lld, %lld",
|
||||
module->dropped, numdropped);
|
||||
module->dropped = numdropped;
|
||||
}
|
||||
|
||||
ret = PtlEQPoll(&(module->frag_eq_handle),
|
||||
1, /* number of eq handles */
|
||||
@ -336,8 +353,8 @@ mca_ptl_portals_component_progress(mca_ptl_tstamp_t tstamp)
|
||||
|
||||
#if PTL_PORTALS_HAVE_EVENT_UNLINK
|
||||
if (PTL_EVENT_UNLINK == ev.type) {
|
||||
ompi_output_verbose(2000, mca_ptl_portals_component.portals_output,
|
||||
"-----> unlink event occurred <-----");
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
"unlink event occurred");
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
@ -66,7 +66,7 @@ ptl_portals_post_recv_md(struct mca_ptl_portals_module_t *ptl, void *data_ptr)
|
||||
md.start = mem;
|
||||
md.length = ptl->first_frag_entry_size;
|
||||
md.threshold = PTL_MD_THRESH_INF;
|
||||
md.max_size = md.length - ptl->super.ptl_first_frag_size;
|
||||
md.max_size = ptl->super.ptl_first_frag_size;
|
||||
md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE;
|
||||
md.user_ptr = NULL;
|
||||
md.eq_handle = ptl->frag_eq_handle;
|
||||
@ -80,7 +80,7 @@ ptl_portals_post_recv_md(struct mca_ptl_portals_module_t *ptl, void *data_ptr)
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ompi_output_verbose(50, mca_ptl_portals_component.portals_output,
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
"new receive buffer posted");
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -271,10 +271,10 @@ mca_ptl_portals_process_recv_event(struct mca_ptl_portals_module_t *ptl,
|
||||
}
|
||||
|
||||
/* see if we need to repost an md */
|
||||
if (ev->offset + ev->md.length > ev->md.max_size) {
|
||||
if (ev->md.length - (ev->offset + ev->mlength) < ev->md.max_size) {
|
||||
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
|
||||
"must repost event: %lld, %lld, %lld",
|
||||
ev->offset, ev->md.length, ev->md.max_size);
|
||||
ev->offset, ev->mlength, ev->md.max_size);
|
||||
/* use the same memory as the old md - it's not using it anymore */
|
||||
ret = ptl_portals_post_recv_md(ptl, ev->md.start);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
|
@ -233,8 +233,10 @@ mca_ptl_portals_process_send_event(ptl_event_t *ev)
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* unlink memory descriptor */
|
||||
PtlMDUnlink(ev->md_handle);
|
||||
#endif
|
||||
|
||||
} else {
|
||||
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user