OSHMEM: OOM in yoda
Fix: do not fail on a BML allocation error; instead, wait for some puts to complete and retry. Fixed by Roman, reviewed by Mike/Alex. cmr=v1.7.5:reviewer=ompi-rm1.7 This commit was SVN r30779.
Этот коммит содержится в:
родитель
63803f5e61
Коммит
684e78e669
@ -107,19 +107,57 @@ static inline void calc_nfrags(mca_bml_base_btl_t* bml_btl,
|
||||
*nfrags = 1 + (size - 1) / (*frag_size);
|
||||
}
|
||||
|
||||
static int mca_spml_yoda_fence_internal(int puts_wait)
|
||||
{
|
||||
int n_puts_wait;
|
||||
|
||||
/* Waiting for certain number of puts : 'puts_wait'
|
||||
* if 'puts_wait' == 0 waiting for all puts ('n_active_puts')
|
||||
* if 'puts_wait' > 'n_active_puts' waiting for 'n_active_puts' */
|
||||
|
||||
n_puts_wait = puts_wait > 0 ? mca_spml_yoda.n_active_puts - puts_wait : 0;
|
||||
|
||||
if (n_puts_wait < 0) {
|
||||
n_puts_wait = 0;
|
||||
}
|
||||
|
||||
while (n_puts_wait < mca_spml_yoda.n_active_puts) {
|
||||
oshmem_request_wait_any_completion();
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
static inline void mca_spml_yoda_bml_alloc( mca_bml_base_btl_t* bml_btl,
|
||||
mca_btl_base_descriptor_t** des,
|
||||
uint8_t order, size_t size, uint32_t flags,
|
||||
int use_send)
|
||||
{
|
||||
bool is_done;
|
||||
bool is_fence_complete;
|
||||
|
||||
is_done = false;
|
||||
is_fence_complete = false;
|
||||
|
||||
if (use_send) {
|
||||
size = (0 == size ? size : size + SPML_YODA_SEND_CONTEXT_SIZE);
|
||||
}
|
||||
mca_bml_base_alloc(bml_btl,
|
||||
|
||||
do {
|
||||
mca_bml_base_alloc(bml_btl,
|
||||
des,
|
||||
MCA_BTL_NO_ORDER,
|
||||
size,
|
||||
flags);
|
||||
|
||||
if (OPAL_UNLIKELY(!(*des) || !(*des)->des_src ) && !is_fence_complete) {
|
||||
mca_spml_yoda_fence_internal(mca_spml_yoda.bml_alloc_threshold);
|
||||
|
||||
is_fence_complete = true;
|
||||
} else {
|
||||
is_done = true;
|
||||
}
|
||||
|
||||
} while (!is_done);
|
||||
}
|
||||
|
||||
static inline void spml_yoda_prepare_for_put(void* buffer, size_t size, void* p_src, void* p_dst, int use_send)
|
||||
@ -861,11 +899,7 @@ int mca_spml_yoda_put_nb(void* dst_addr,
|
||||
|
||||
/*
 * Wait until no put is active.
 *
 * Delegates to mca_spml_yoda_fence_internal with puts_wait == 0, which
 * loops on request completion until 'n_active_puts' reaches zero, and
 * returns its result (OSHMEM_SUCCESS).
 */
int mca_spml_yoda_fence(void)
{
    return mca_spml_yoda_fence_internal(0);
}
|
||||
|
||||
int mca_spml_yoda_wait_gets(void)
|
||||
|
@ -66,6 +66,8 @@ struct mca_spml_yoda_t {
|
||||
int free_list_num; /* initial size of free list */
|
||||
int free_list_max; /* maximum size of free list */
|
||||
int free_list_inc; /* number of elements to grow free list */
|
||||
int bml_alloc_threshold; /* number of puts to wait
|
||||
in case of put/get temporary buffer allocation failure */
|
||||
|
||||
/* lock queue access */
|
||||
opal_mutex_t lock;
|
||||
|
@ -78,6 +78,12 @@ static int mca_spml_yoda_component_register(void)
|
||||
mca_spml_yoda_param_register_int("free_list_max", 1024, 0);
|
||||
mca_spml_yoda.free_list_inc =
|
||||
mca_spml_yoda_param_register_int("free_list_inc", 16, 0);
|
||||
mca_spml_yoda.bml_alloc_threshold =
|
||||
mca_spml_yoda_param_register_int("bml_alloc_threshold",
|
||||
3,
|
||||
"number of puts to wait \
|
||||
in case of put/get temporary buffer \
|
||||
allocation failture");
|
||||
mca_spml_yoda.priority =
|
||||
mca_spml_yoda_param_register_int("priority",
|
||||
10,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user