1
1

Must release the lock before entering the non-blocking recv, since

it is possible that, if the message has already arrived, the callback will
be called before recv_buffer_nb() returns. This causes a deadlock,
as we try to acquire the lock but already hold it.

This was causing orterun and orteds to stall in certain situations.
It became evident when stress testing dynamics with remote nodes.

This commit was SVN r7543.
Этот коммит содержится в:
Josh Hursey 2005-09-29 14:24:11 +00:00
родитель 997644af31
Коммит d39841174d
2 изменённых файлов: 4 добавлений и 2 удалений

Просмотреть файл

@ -71,10 +71,11 @@ void orte_gpr_replica_recv(int status, orte_process_name_t* sender,
ORTE_ERROR_LOG(rc);
}
}
OPAL_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
/* reissue the non-blocking receive before returning */
orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR, 0, orte_gpr_replica_recv, NULL);
OPAL_THREAD_UNLOCK(&orte_gpr_replica_globals.mutex);
return;
}

Просмотреть файл

@ -519,13 +519,14 @@ static void orte_daemon_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(answer);
DONE:
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
/* reissue the non-blocking receive */
ret = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DAEMON, 0, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
}
OPAL_THREAD_UNLOCK(&orted_globals.mutex);
return;
}