1
1

Continue to resolve priority issues. Clean up the case of forced termination in mpirun during launch processing by ensuring we can respond to socket closures, and ensuring that the remote daemons correctly close their sockets when terminating.

Jeff: please test a variety of conditions to ensure we get this right.

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r31058.
Этот коммит содержится в:
Ralph Castain 2014-03-13 04:02:24 +00:00
родитель 24020ef1e3
Коммит 2abed09d7c
4 изменённых файлов: 27 добавлений и 12 удалений

Просмотреть файл

@ -157,8 +157,31 @@ static void tcp_init(struct mca_oob_tcp_module_t *md)
static void tcp_fini(struct mca_oob_tcp_module_t *md)
{
mca_oob_tcp_module_t *mod = (mca_oob_tcp_module_t*)md;
uint64_t ui64;
char *nptr;
mca_oob_tcp_peer_t *peer;
/* cleanup all peers */
if (OPAL_SUCCESS == opal_hash_table_get_first_key_uint64(&mod->peers, &ui64,
(void**)&peer, (void**)&nptr)) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s RELEASING PEER OBJ %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == peer) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
if (NULL != peer) {
OBJ_RELEASE(peer);
}
while (OPAL_SUCCESS == opal_hash_table_get_next_key_uint64(&mod->peers, &ui64,
(void**)&peer, nptr, (void**)&nptr)) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s RELEASING PEER OBJ %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == peer) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
if (NULL != peer) {
OBJ_RELEASE(peer);
}
}
}
OBJ_DESTRUCT(&mod->peers);
if (mod->ev_active) {

Просмотреть файл

@ -1606,6 +1606,10 @@ static void peer_cons(mca_oob_tcp_peer_t *peer)
static void peer_des(mca_oob_tcp_peer_t *peer)
{
if (0 <= peer->sd) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s CLOSING SOCKET %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
peer->sd);
CLOSE_THE_SOCKET(peer->sd);
}
OPAL_LIST_DESTRUCT(&peer->addrs);

Просмотреть файл

@ -71,8 +71,6 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
__FILE__, __LINE__); \
ORTE_UPDATE_EXIT_STATUS(x); \
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); \
/* set the global abnormal exit flag */ \
orte_abnormal_term_ordered = true; \
} \
} while(0);

Просмотреть файл

@ -86,11 +86,6 @@ static char *get_orted_comm_cmd_str(int command);
static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL;
/*
 * Event callback that activates ORTE_JOB_STATE_DAEMONS_TERMINATED with a
 * NULL job argument.
 *
 * The sd, args and cbdata parameters are not used by the body.
 *
 * Elsewhere in this file the callback is armed through
 * ORTE_TIMER_EVENT(10, 0, suicide, ORTE_ERROR_PRI) as a fallback in case
 * one of the daemon's dependent routes fails to terminate.
 */
static void suicide(int sd, short args, void *cbdata)
{
/* no job object is passed; the state is activated with NULL */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
void orte_daemon_recv(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
@ -457,11 +452,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
/* set a timer to suicide, just in case one of our
* dependent routes fails to terminate
*/
ORTE_TIMER_EVENT(10, 0, suicide, ORTE_ERROR_PRI);
}
return;
break;