1
1

In order to prevent orphaned processes when using non-unity routing methods, the procs need to realize that their local daemon is a critical connection - if that connection unexpectedly closes, they need to terminate.

This commit adds a definition for a "lifeline" connection. For an HNP, there is no lifeline, so the lifeline proc is NULL. For a daemon, the lifeline is the HNP - the daemon should abort if it loses that connection.

For a proc using unity routing, the lifeline is the HNP, since the proc connects directly to the HNP.

For a proc using tree routing, the lifeline is the local daemon.

Adjusted OOB to call abort if the lifeline (as opposed to HNP) connection is lost.

This commit was SVN r17761.
Этот коммит содержится в:
Ralph Castain 2008-03-06 15:30:44 +00:00
родитель 498190e326
Коммит ff99aa054f
6 изменённых файлов: 26 добавлений и 3 удалений

Просмотреть файл

@ -576,15 +576,16 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
peer->peer_state);
}
/* if we lose the connection to the HNP - abort */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, ORTE_PROC_MY_HNP)) {
/* if we lose the connection to the lifeline - abort */
if (NULL != ORTE_PROC_MY_LIFELINE &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, ORTE_PROC_MY_LIFELINE)) {
/* If we are not already inside orte_finalize, then call abort */
if (!orte_finalizing) {
/* Should free the peer lock before we abort so we don't
* get stuck in the orte_wait_kill when receiving messages in the
* tcp OOB. */
OPAL_THREAD_UNLOCK(&peer->peer_lock);
orte_errmgr.abort(1, "OOB: Connection to HNP lost");
orte_errmgr.abort(1, "OOB: Connection to lifeline lost");
}
}

Просмотреть файл

@ -272,6 +272,10 @@ int orte_routed_tree_init_routes(orte_jobid_t job, opal_buffer_t *ndat)
*/
orte_routed_tree_module.wildcard_route.jobid = ORTE_PROC_MY_HNP->jobid;
orte_routed_tree_module.wildcard_route.vpid = ORTE_PROC_MY_HNP->vpid;
/* set our lifeline to the HNP - we will abort if that connection is lost */
orte_process_info.lifeline = ORTE_PROC_MY_HNP;
/* daemons will send their contact info back to the HNP as
* part of the message confirming they are ready to go. HNPs
* load their contact info during orte_init
@ -309,6 +313,7 @@ int orte_routed_tree_init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_ERROR_LOG(rc);
return rc;
}
/* the HNP has no lifeline, so leave that field the default NULL */
} else {
/* if this is for my own jobid, then I am getting an update of RML info
* for the daemons - so update our contact info and routes
@ -414,6 +419,9 @@ int orte_routed_tree_init_routes(orte_jobid_t job, opal_buffer_t *ndat)
orte_routed_tree_module.wildcard_route.jobid = ORTE_PROC_MY_DAEMON->jobid;
orte_routed_tree_module.wildcard_route.vpid = ORTE_PROC_MY_DAEMON->vpid;
/* set our lifeline to the local daemon - we will abort if this connection is lost */
orte_process_info.lifeline = ORTE_PROC_MY_DAEMON;
/* register ourselves -this sends a message to the daemon (warming up that connection)
* and sends our contact info to the HNP when all local procs have reported
*

Просмотреть файл

@ -389,6 +389,10 @@ int orte_routed_unity_init_routes(orte_jobid_t job, opal_buffer_t *ndata)
/* we don't have to update the route as the unity component is
* always "direct"
*/
/* set our lifeline as the HNP - we will abort if that connection fails */
orte_process_info.lifeline = ORTE_PROC_MY_HNP;
return ORTE_SUCCESS;
}
@ -432,6 +436,7 @@ int orte_routed_unity_init_routes(orte_jobid_t job, opal_buffer_t *ndata)
}
}
/* I do not have a lifeline, so leave it as the default NULL */
return ORTE_SUCCESS;
}
@ -539,6 +544,11 @@ int orte_routed_unity_init_routes(orte_jobid_t job, opal_buffer_t *ndata)
return rc;
}
/* declare the HNP as our "lifeline" - this means that we will automatically
* abort if we lose that connection
*/
orte_process_info.lifeline = ORTE_PROC_MY_HNP;
/* we don't have to update the route as the unity component is
* always "direct"
*/

Просмотреть файл

@ -275,6 +275,8 @@ ORTE_DECLSPEC extern orte_process_name_t orte_globals_name_invalid; /** instant
/* define the name of my daemon: expands to the address of the
 * my_daemon field of the global orte_process_info
 */
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
/* define the name of my lifeline: expands to the lifeline pointer stored in
 * the global orte_process_info. Unlike the address-of style macro for the
 * daemon name, this is a plain pointer that defaults to NULL (meaning "no
 * lifeline"), so callers must check it for NULL before dereferencing or
 * comparing against it.
 */
#define ORTE_PROC_MY_LIFELINE (orte_process_info.lifeline)
/* global variables used by RTE - instanced in orte_globals.c */
ORTE_DECLSPEC extern bool orte_debug_flag, orte_reuse_daemons, orte_timing;

Просмотреть файл

@ -42,6 +42,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .my_daemon_uri = */ NULL,
/* .my_hnp = */ {0, 0},
/* .my_hnp_uri = */ NULL,
/* .lifeline = */ NULL,
/* .hnp_pid = */ 0,
/* ,app_num = */ -1,
/* ,universe_size = */ -1,

Просмотреть файл

@ -50,6 +50,7 @@ struct orte_proc_info_t {
char *my_daemon_uri; /**< Contact info to local daemon */
orte_process_name_t my_hnp; /**< Name of my hnp */
char *my_hnp_uri; /**< Contact info for my hnp */
orte_process_name_t *lifeline; /**< Name of the contact I cannot live without */
pid_t hnp_pid; /**< hnp pid - used if singleton */
orte_std_cntr_t app_num; /**< our index into the app_context array */
orte_std_cntr_t universe_size; /**< the size of the universe we are in */