1
1

* start cleaning up debugging output (still much to do)

* make buffers really big so that we pass allocmem until we figure out
  why we're not flow controlling as I expected
* set event queue to invalid initially and use that as the enabled test
  rather than a separate bool - shrinks the module a bit
* add dropped count checks, with a panic if one occurs.  Still need to
  implement some type of retransmit logic.

This commit was SVN r5704.
Этот коммит содержится в:
Brian Barrett 2005-05-12 21:28:48 +00:00
родитель e2c2c72b84
Коммит 0c6eaaebe3
7 изменённых файлов: 44 добавлений и 25 удалений

Просмотреть файл

@ -156,7 +156,7 @@ AC_DEFUN([MCA_CONFIGURE_STUB],[
[Portals table id to use for fragment receive queue])
MCA_PTL_PORTALS_CONFIG_VAL([debug-level],
[PTL_PORTALS_DEFAULT_DEBUG_LEVEL], [1000],
[PTL_PORTALS_DEFAULT_DEBUG_LEVEL], [99],
[Default debugging level for portals ptl])
MCA_PTL_PORTALS_CONFIG_VAL([request-cache-size],
@ -168,11 +168,11 @@ AC_DEFUN([MCA_CONFIGURE_STUB],[
[Default first frag size for portals ptl])
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-num-entries],
[PTL_PORTALS_DEFAULT_FIRST_FRAG_NUM_ENTRIES], [3],
[PTL_PORTALS_DEFAULT_FIRST_FRAG_NUM_ENTRIES], [5],
[Default number of memory descriptors for first fragments])
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-entry-size],
[PTL_PORTALS_DEFAULT_FIRST_FRAG_ENTRY_SIZE], [1048576],
[PTL_PORTALS_DEFAULT_FIRST_FRAG_ENTRY_SIZE], [10485760],
[Default size of memory associeted with first fag md])
MCA_PTL_PORTALS_CONFIG_VAL([first-frag-queue-size],

Просмотреть файл

@ -99,7 +99,7 @@ mca_ptl_portals_add_procs(struct mca_ptl_base_module_t* ptl,
portals_procs[i],
&distance);
if (ret != PTL_OK) {
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,
"Could not find distance to process %d", i);
continue;
}
@ -139,7 +139,7 @@ mca_ptl_portals_module_enable(struct mca_ptl_portals_module_t *ptl,
/* BWB - not really sure how - would have to track a lot more data... */
} else {
/* only do all the hard stuff if we haven't created the queue */
if (ptl->frag_queues_created) return OMPI_SUCCESS;
if (ptl->frag_eq_handle != PTL_EQ_NONE) return OMPI_SUCCESS;
/* create an event queue, then the match entries for the match
entries */
@ -159,7 +159,6 @@ mca_ptl_portals_module_enable(struct mca_ptl_portals_module_t *ptl,
for (i = 0 ; i < ptl->first_frag_num_entries ; ++i) {
ret = ptl_portals_post_recv_md(ptl, NULL);
if (OMPI_SUCCESS != ret) return ret;
ptl->frag_queues_created = true;
}
}
@ -176,7 +175,7 @@ mca_ptl_portals_finalize(struct mca_ptl_base_module_t *ptl_base)
ret = PtlNIFini(ptl->ni_handle);
if (PTL_OK != ret) {
ompi_output_verbose(50, mca_ptl_portals_component.portals_output,
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
"PtlNIFini returned %d\n", ret);
return OMPI_ERROR;
}

Просмотреть файл

@ -45,7 +45,7 @@ struct mca_ptl_portals_component_t {
* - 0 : critical user information
* - 10: initialization / shutdown diagnostic information
* - 20: general execution diagnostic information
* - 99: useful only to developers
* - 90: useful only to developers
*/
int portals_output;
@ -169,8 +169,6 @@ struct mca_ptl_portals_module_t {
/* size for event queue */
int first_frag_queue_size;
/* frag receive data */
bool frag_queues_created;
/* frag receive event queue */
ptl_handle_eq_t frag_eq_handle;
@ -178,6 +176,9 @@ struct mca_ptl_portals_module_t {
ptl_handle_ni_t ni_handle;
/** the limits returned from PtlNIInit for interface */
ptl_ni_limits_t limits;
/** number of dropped messages */
ptl_sr_value_t dropped;
};
typedef struct mca_ptl_portals_module_t mca_ptl_portals_module_t;

Просмотреть файл

@ -129,8 +129,8 @@ mca_ptl_portals_add_procs_compat(struct mca_ptl_portals_module_t* ptl,
return ret;
} else if (sizeof(ptl_process_id_t) != size) {
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,
"mca_base_modex_recv returned size%d",
size);
"mca_base_modex_recv returned size %d, expected %d",
size, sizeof(ptl_process_id_t));
return OMPI_ERROR;
}

Просмотреть файл

@ -170,9 +170,14 @@ mca_ptl_portals_component_open(void)
mca_ptl_portals_component.portals_output =
ompi_output_open(&portals_output_stream);
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
"mca_ptl_portals_component_open()");
/* fill in defaults for module data */
mca_ptl_portals_module.frag_eq_handle = PTL_EQ_NONE;
mca_ptl_portals_module.ni_handle = PTL_INVALID_HANDLE;
mca_ptl_portals_module.dropped = 0;
return OMPI_SUCCESS;
}
@ -183,7 +188,7 @@ mca_ptl_portals_component_open(void)
int
mca_ptl_portals_component_close(void)
{
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
"mca_ptl_portals_component_close()");
/* finalize interface? */
@ -219,9 +224,12 @@ mca_ptl_portals_component_init(int *num_ptls,
mca_ptl_base_module_t** ptls;
*num_ptls = 0;
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
ompi_output_verbose(90, mca_ptl_portals_component.portals_output,
"mca_ptl_portals_component_init()");
/* BWB - no support for threads */
if (enable_progress_threads || enable_mpi_threads) return NULL;
ompi_free_list_init(&mca_ptl_portals_component.portals_send_frags,
sizeof(mca_ptl_portals_send_frag_t),
OBJ_CLASS(mca_ptl_portals_send_frag_t),
@ -238,9 +246,6 @@ mca_ptl_portals_component_init(int *num_ptls,
mca_ptl_portals_component.portals_free_list_inc_num,
NULL); /* use default allocator */
/* BWB - no support for progress threads */
if (enable_progress_threads) return NULL;
/* initialize portals ptl. note that this is in the compat code because
it's fairly non-portable between implementations */
if (OMPI_SUCCESS != mca_ptl_portals_init(&mca_ptl_portals_component)) {
@ -307,8 +312,20 @@ mca_ptl_portals_component_progress(mca_ptl_tstamp_t tstamp)
struct mca_ptl_portals_module_t *module =
mca_ptl_portals_component.portals_modules[i];
ptl_event_t ev;
ptl_sr_value_t numdropped;
if (! module->frag_queues_created) continue;
if (module->frag_eq_handle == PTL_EQ_NONE) continue;
/* BWB - this is going to kill performance */
PtlNIStatus(module->ni_handle,
PTL_SR_DROP_COUNT,
&numdropped);
if (numdropped != module->dropped) {
ompi_output_verbose(30, mca_ptl_portals_component.portals_output,
"*** Dropped message count changed. %lld, %lld",
module->dropped, numdropped);
module->dropped = numdropped;
}
ret = PtlEQPoll(&(module->frag_eq_handle),
1, /* number of eq handles */
@ -336,8 +353,8 @@ mca_ptl_portals_component_progress(mca_ptl_tstamp_t tstamp)
#if PTL_PORTALS_HAVE_EVENT_UNLINK
if (PTL_EVENT_UNLINK == ev.type) {
ompi_output_verbose(2000, mca_ptl_portals_component.portals_output,
"-----> unlink event occurred <-----");
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
"unlink event occurred");
continue;
}
#endif

Просмотреть файл

@ -66,7 +66,7 @@ ptl_portals_post_recv_md(struct mca_ptl_portals_module_t *ptl, void *data_ptr)
md.start = mem;
md.length = ptl->first_frag_entry_size;
md.threshold = PTL_MD_THRESH_INF;
md.max_size = md.length - ptl->super.ptl_first_frag_size;
md.max_size = ptl->super.ptl_first_frag_size;
md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE;
md.user_ptr = NULL;
md.eq_handle = ptl->frag_eq_handle;
@ -80,7 +80,7 @@ ptl_portals_post_recv_md(struct mca_ptl_portals_module_t *ptl, void *data_ptr)
return OMPI_ERROR;
}
ompi_output_verbose(50, mca_ptl_portals_component.portals_output,
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
"new receive buffer posted");
return OMPI_SUCCESS;
@ -271,10 +271,10 @@ mca_ptl_portals_process_recv_event(struct mca_ptl_portals_module_t *ptl,
}
/* see if we need to repost an md */
if (ev->offset + ev->md.length > ev->md.max_size) {
if (ev->md.length - (ev->offset + ev->mlength) < ev->md.max_size) {
ompi_output_verbose(100, mca_ptl_portals_component.portals_output,
"must repost event: %lld, %lld, %lld",
ev->offset, ev->md.length, ev->md.max_size);
ev->offset, ev->mlength, ev->md.max_size);
/* use the same memory as the old md - it's not using it anymore */
ret = ptl_portals_post_recv_md(ptl, ev->md.start);
if (OMPI_SUCCESS != ret) {

Просмотреть файл

@ -233,8 +233,10 @@ mca_ptl_portals_process_send_event(ptl_event_t *ev)
}
}
#if 0
/* unlink memory descriptor */
PtlMDUnlink(ev->md_handle);
#endif
} else {
ompi_output_verbose(10, mca_ptl_portals_component.portals_output,