1
1

When in the unity message routing mode, we have to update the RML contact info in the parent procs so that they know how to talk to the children. Ideally, this would be done in the MPI layer since that layer knows which procs are actively involved in the comm_spawn. However, it isn't being done there, which causes comm_spawn to fail, so do it explicitly in the RTE.

Note that this means ALL procs in the parent job are updated, even though they may not be participating in the comm_spawn. This doesn't really hurt anything - just unnecessary.

Comm_spawn still has a problem when a child process shares a node with a parent, so this doesn't fix everything. It only fixes the bug where not all procs knew how to talk to each other.

This commit was SVN r16460.
Этот коммит содержится в:
Ralph Castain 2007-10-16 16:09:41 +00:00
родитель 713b6e13a5
Коммит ec5fe78876
8 изменённых файлов: 129 добавлений и 36 удалений

Просмотреть файл

@ -91,14 +91,14 @@ BEGIN_C_DECLS
/* Barrier, route-setup and sync tags.
 * ORTE_RML_TAG_UPDATE_ROUTES occupies value 33, so ORTE_RML_TAG_SYNC and
 * the component tags that follow are each shifted up by one. Every tag in
 * this run is defined exactly once and no two of them share a value.
 */
#define ORTE_RML_TAG_BARRIER 31
#define ORTE_RML_TAG_INIT_ROUTES 32
#define ORTE_RML_TAG_UPDATE_ROUTES 33
#define ORTE_RML_TAG_SYNC 34
/* For FileM RSH Component */
#define ORTE_RML_TAG_FILEM_RSH 35
/* For SnapC Full Component */
#define ORTE_RML_TAG_SNAPC_FULL 36
/* For CRCP Coord Component */
#define OMPI_CRCP_COORD_BOOKMARK_TAG 4242

Просмотреть файл

@ -118,6 +118,11 @@ orte_routed_base_select(void)
if (NULL == selected_component) return ORTE_ERROR;
/* initialize the selected component */
if (ORTE_SUCCESS != orte_routed.initialize()) {
return ORTE_ERROR;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -31,6 +31,8 @@
#endif
/* API functions */
static int orte_routed_cnos_module_init(void);
static int orte_routed_cnos_finalize(void);
static int orte_routed_cnos_update_route(orte_process_name_t *target,
@ -43,6 +45,7 @@ static int orte_routed_cnos_init_routes(orte_jobid_t job, orte_gpr_notify_data_t
static int orte_routed_cnos_warmup_routes(void);
orte_routed_module_t orte_routed_cnos_module = {
orte_routed_cnos_module_init,
orte_routed_cnos_finalize,
orte_routed_cnos_update_route,
orte_routed_cnos_get_route,
@ -50,6 +53,12 @@ orte_routed_module_t orte_routed_cnos_module = {
orte_routed_cnos_warmup_routes
};
/*
 * Initialization entry point of the cnos routed module.
 * There is nothing to set up here, so it reports success unconditionally.
 */
static int orte_routed_cnos_module_init(void)
{
    return ORTE_SUCCESS;
}
static int
orte_routed_cnos_finalize(void)
{

Просмотреть файл

@ -86,6 +86,15 @@ typedef struct orte_routed_component_1_0_0_t orte_routed_component_t;
/* ******************************************************************** */
/**
* Initialize the routed module
*
* Do whatever needs to be done to initialize the selected module.
* The function takes no arguments and reports its outcome through
* the integer return value.
*
* @retval ORTE_SUCCESS Success
* @retval ORTE_ERROR (or another ORTE error code) describing whatever
*         failure was encountered
*/
typedef int (*orte_routed_module_init_fn_t)(void);
/**
* Finalize the routed module
@ -142,9 +151,10 @@ typedef int (*orte_routed_module_warmup_routes_fn_t)(void);
* active routed interface.
*/
struct orte_routed_module_t {
/** Shutdown the communication system and clean up resources */
/** Startup/shutdown the communication system and clean up resources */
orte_routed_module_init_fn_t initialize;
orte_routed_module_finalize_fn_t finalize;
/* API functions */
orte_routed_module_update_route_fn_t update_route;
orte_routed_module_get_route_fn_t get_route;
orte_routed_module_init_routes_fn_t init_routes;

Просмотреть файл

@ -40,6 +40,8 @@ ORTE_MODULE_DECLSPEC extern orte_routed_component_t mca_routed_tree_component;
extern orte_routed_tree_module_t orte_routed_tree_module;
int orte_routed_tree_module_init(void);
int orte_routed_tree_finalize(void);
int orte_routed_tree_update_route(orte_process_name_t *target,

Просмотреть файл

@ -21,7 +21,7 @@
#include "orte/mca/ns/ns.h"
static orte_routed_module_t* routed_tree_init(int* priority);
static bool selected=false;
/**
* component definition
@ -53,6 +53,7 @@ orte_routed_component_t mca_routed_tree_component = {
orte_routed_tree_module_t orte_routed_tree_module = {
{
orte_routed_tree_module_init,
orte_routed_tree_finalize,
orte_routed_tree_update_route,
orte_routed_tree_get_route,
@ -69,39 +70,47 @@ routed_tree_init(int* priority)
{
*priority = 5;
OBJ_CONSTRUCT(&orte_routed_tree_module.peer_list, opal_list_t);
OBJ_CONSTRUCT(&orte_routed_tree_module.vpid_wildcard_list, opal_list_t);
OBJ_CONSTRUCT(&orte_routed_tree_module.jobid_wildcard_list, opal_list_t);
orte_routed_tree_module.full_wildcard_entry.target.jobid = ORTE_JOBID_WILDCARD;
orte_routed_tree_module.full_wildcard_entry.target.vpid = ORTE_VPID_WILDCARD;
orte_routed_tree_module.full_wildcard_entry.route.jobid = ORTE_JOBID_INVALID;
orte_routed_tree_module.full_wildcard_entry.route.vpid = ORTE_VPID_INVALID;
return &orte_routed_tree_module.super;
}
/**
 * Initialize the tree routed module.
 *
 * Seeds the full-wildcard entry (wildcard target, invalid route),
 * constructs the three route lists, and sets the file-scope 'selected'
 * flag so that finalize knows the lists exist.
 *
 * @retval ORTE_SUCCESS Always
 */
int orte_routed_tree_module_init(void)
{
    /* the catch-all entry matches any target but has no route yet */
    orte_routed_tree_module.full_wildcard_entry.target.jobid = ORTE_JOBID_WILDCARD;
    orte_routed_tree_module.full_wildcard_entry.target.vpid = ORTE_VPID_WILDCARD;
    orte_routed_tree_module.full_wildcard_entry.route.jobid = ORTE_JOBID_INVALID;
    orte_routed_tree_module.full_wildcard_entry.route.vpid = ORTE_VPID_INVALID;

    /* set up the (initially empty) route lists */
    OBJ_CONSTRUCT(&orte_routed_tree_module.peer_list, opal_list_t);
    OBJ_CONSTRUCT(&orte_routed_tree_module.vpid_wildcard_list, opal_list_t);
    OBJ_CONSTRUCT(&orte_routed_tree_module.jobid_wildcard_list, opal_list_t);

    selected = true;

    return ORTE_SUCCESS;
}
/**
 * Finalize the tree routed module.
 *
 * Drains and destructs the three route lists, but only when
 * orte_routed_tree_module_init() has constructed them (tracked by the
 * file-scope 'selected' flag). The teardown is performed exactly once:
 * the flag is cleared afterwards so that a repeated call does not
 * destruct the lists a second time.
 *
 * @retval ORTE_SUCCESS Always
 */
int
orte_routed_tree_finalize(void)
{
    opal_list_item_t *item;

    if (selected) {
        /* release every entry before destructing its list */
        while (NULL != (item = opal_list_remove_first(&orte_routed_tree_module.peer_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&orte_routed_tree_module.peer_list);

        while (NULL != (item = opal_list_remove_first(&orte_routed_tree_module.vpid_wildcard_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&orte_routed_tree_module.vpid_wildcard_list);

        while (NULL != (item = opal_list_remove_first(&orte_routed_tree_module.jobid_wildcard_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&orte_routed_tree_module.jobid_wildcard_list);

        /* lists are gone; do not tear them down again */
        selected = false;
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -18,6 +18,8 @@ BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_routed_component_t mca_routed_unity_component;
int orte_routed_unity_module_init(void);
int orte_routed_unity_finalize(void);
int orte_routed_unity_update_route(orte_process_name_t *target,

Просмотреть файл

@ -29,7 +29,7 @@
#include "routed_unity.h"
static orte_routed_module_t* routed_unity_init(int* priority);
static bool recv_issued=false;
/**
* component definition
@ -61,6 +61,7 @@ orte_routed_component_t mca_routed_unity_component = {
};
orte_routed_module_t orte_routed_unity_module = {
orte_routed_unity_module_init,
orte_routed_unity_finalize,
orte_routed_unity_update_route,
orte_routed_unity_get_route,
@ -76,10 +77,53 @@ routed_unity_init(int* priority)
return &orte_routed_unity_module;
}
/**
 * Receive callback for route-update messages.
 *
 * Unpacks one orte_gpr_notify_data_t from the incoming buffer and hands
 * it to orte_rml_base_contact_info_notify(). The notify-data object is
 * released on every exit path, including unpack failure.
 *
 * @param status  Unused here
 * @param sender  Unused here
 * @param buffer  Buffer holding the packed notify data
 * @param tag     Unused here
 * @param cbdata  Unused here
 */
static void orte_routed_unity_recv(int status, orte_process_name_t* sender,
                                   orte_buffer_t* buffer, orte_rml_tag_t tag,
                                   void* cbdata)
{
    orte_std_cntr_t cnt;
    orte_gpr_notify_data_t *ndat;
    int rc;

    /* NOTE(review): if orte_dss.unpack allocates its own object for this
     * type, the object created here would be orphaned - confirm against
     * the DSS unpack routine for ORTE_GPR_NOTIFY_DATA.
     */
    ndat = OBJ_NEW(orte_gpr_notify_data_t);
    cnt = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &ndat, &cnt, ORTE_GPR_NOTIFY_DATA))) {
        ORTE_ERROR_LOG(rc);
        /* do not leak the notify data when the message cannot be unpacked */
        OBJ_RELEASE(ndat);
        return;
    }

    orte_rml_base_contact_info_notify(ndat, NULL);
    OBJ_RELEASE(ndat);
}
/**
 * Initialize the unity routed module.
 *
 * Posts a non-blocking buffer receive on ORTE_RML_TAG_UPDATE_ROUTES,
 * matching ORTE_NAME_WILDCARD and flagged ORTE_RML_PERSISTENT, with
 * orte_routed_unity_recv as the callback. On success the file-scope
 * 'recv_issued' flag is set so finalize knows to cancel the receive.
 *
 * @retval ORTE_SUCCESS The receive was posted
 * @retval other        The error code returned by recv_buffer_nb
 */
int orte_routed_unity_module_init(void)
{
    int rc;

    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                 ORTE_RML_TAG_UPDATE_ROUTES,
                                 ORTE_RML_PERSISTENT,
                                 orte_routed_unity_recv,
                                 NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    recv_issued = true;
    return ORTE_SUCCESS;
}
/**
 * Finalize the unity routed module.
 *
 * If a receive on ORTE_RML_TAG_UPDATE_ROUTES was posted (tracked by
 * 'recv_issued'), cancel it and clear the flag. When the cancel fails
 * the flag is left set and the error code is returned.
 *
 * @retval ORTE_SUCCESS Nothing to cancel, or the cancel succeeded
 * @retval other        The error code returned by recv_cancel
 */
int
orte_routed_unity_finalize(void)
{
    int rc;

    /* nothing was posted, so there is nothing to undo */
    if (!recv_issued) {
        return ORTE_SUCCESS;
    }

    rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTES);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    recv_issued = false;
    return ORTE_SUCCESS;
}
@ -122,7 +166,9 @@ int orte_routed_unity_init_routes(orte_jobid_t job, orte_gpr_notify_data_t *ndat
orte_std_cntr_t cnt;
char *rml_uri;
orte_gpr_notify_data_t *ndat;
orte_process_name_t name;
orte_jobid_t parent;
/* if I am a daemon... */
if (orte_process_info.daemon) {
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
@ -211,8 +257,6 @@ int orte_routed_unity_init_routes(orte_jobid_t job, orte_gpr_notify_data_t *ndat
}
#endif
{
orte_process_name_t name;
/* if ndata != NULL, then we can ignore it - some routing algos
* need to call init_routes during launch, but we don't
*/
@ -231,14 +275,16 @@ int orte_routed_unity_init_routes(orte_jobid_t job, orte_gpr_notify_data_t *ndat
return rc;
}
/* does this job have a parent? */
if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&name.jobid, job))) {
if (ORTE_SUCCESS != (rc = orte_ns.get_parent_job(&parent, job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (name.jobid != job) {
if (parent != job) {
/* yes it does - so get that contact info and send it along as well.
* get_contact_info will simply add to the ndat structure
*/
name.jobid = parent;
name.vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(&name, &ndat))) {
ORTE_ERROR_LOG(rc);
return rc;
@ -274,6 +320,16 @@ int orte_routed_unity_init_routes(orte_jobid_t job, orte_gpr_notify_data_t *ndat
OBJ_DESTRUCT(&buf);
return rc;
}
/* if this job has a parent, send it to them too - must send to their update
* tag as they won't be listening to the init_routes one
*/
if (parent != job) {
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(parent, &buf, ORTE_RML_TAG_UPDATE_ROUTES))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buf);
return rc;
}
}
OBJ_DESTRUCT(&buf);
return ORTE_SUCCESS;