Remove the progress engine stuff from abort. This was causing
some orteds to stall on locks in the MPI Dynamics cases. Since it is not essential that we call these functions, they can go away. Unlock the peer lock when aborting. Aborting while still holding it causes a potential deadlock in do_waitall [see comment in code]. This was causing orteds to deadlock at times when the seed had terminated. With the right interleaving and timing, the orted was deadlocking. This seems to have fixed the problem in my stress testing with MPI 2 Dynamics. This commit was SVN r7539.
Этот коммит содержится в:
родитель
e825b4522f
Коммит
c11ba09655
@ -472,6 +472,10 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
if(memcmp(&peer->peer_name,&mca_oob_name_seed,sizeof(mca_oob_name_seed)) == 0) {
|
||||
/* If we are not already inside orte_finalize, then call abort */
|
||||
if (ORTE_UNIVERSE_STATE_FINALIZE > orte_universe_info.state) {
|
||||
/* Should free the peer lock before we abort so we don't
|
||||
* get stuck in the orte_wait_kill when receiving messages in the
|
||||
* tcp OOB. */
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
orte_errmgr.abort();
|
||||
}
|
||||
}
|
||||
|
@ -53,12 +53,6 @@ int orte_abort(int status, char *fmt, ...)
|
||||
* - Assume errmgr cleans up child processes before we exit.
|
||||
*/
|
||||
|
||||
/* - Turn of progress engine */
|
||||
opal_progress_finalize();
|
||||
|
||||
/* - Turn off event loop */
|
||||
opal_event_fini();
|
||||
|
||||
/* - Clean up session directory */
|
||||
orte_session_dir_finalize(orte_process_info.my_name);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user