Must release the lock before entering the non-blocking recv, since
it is possible that, if the message has already arrived, the callback will be called before recv_buffer_nb() returns. This causes a deadlock, as we try to acquire the lock while already holding it. This was causing orterun and orteds to stall in certain situations. It became evident when stress testing dynamics with remote nodes. This commit was SVN r7543.
Этот коммит содержится в:
родитель
997644af31
Коммит
d39841174d
@ -71,10 +71,11 @@ void orte_gpr_replica_recv(int status, orte_process_name_t* sender,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
|
||||
|
||||
/* reissue the non-blocking receive before returning */
|
||||
orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR, 0, orte_gpr_replica_recv, NULL);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
|
||||
return;
|
||||
}
|
||||
|
@ -519,13 +519,14 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
OBJ_RELEASE(answer);
|
||||
|
||||
DONE:
|
||||
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
|
||||
|
||||
/* reissue the non-blocking receive */
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DAEMON, 0, orte_daemon_recv, NULL);
|
||||
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
|
||||
return;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user