From c693d3a5d1ab1845eef269a5c425e950f3372e6a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 20 Jun 2008 03:26:13 +0000 Subject: [PATCH] I hadn't honestly considered before that an MPI process might attempt to call functions in the routed framework intended solely for daemons and HNPs. By design, MPI processes are not allowed to route RML/OOB messages, and hence the routed module in an MPI process has no knowledge whatsoever of how a message will reach its destination (except in the direct module). Thus, it has no way to return a valid routing tree, update a routing tree, or get wireup info. This commit ensures that attempts to access information that is unknowable or undefined return appropriate invalid or not_supported values to avoid unexpected behavior and/or segfaults. This commit was SVN r18692. --- orte/mca/routed/binomial/routed_binomial.c | 24 ++++++++++++++++- orte/mca/routed/direct/routed_direct.c | 31 ++++++++++++++++++---- orte/mca/routed/linear/routed_linear.c | 29 +++++++++++++++++--- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index b5baeb3568..dfb45e2cf4 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -681,6 +681,13 @@ static int update_routing_tree(void) { opal_list_item_t *item; + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + /* clear the list of children if any are already present */ while (NULL != (item = opal_list_remove_first(&my_children))) { OBJ_RELEASE(item); @@ -700,6 +707,13 @@ static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_item_t *item; orte_namelist_t *nm, *child; + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to 
route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_VPID_INVALID; + } + /* the binomial routing tree always goes to our children, * for any job */ @@ -723,6 +737,14 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) { int rc; + /* if I am anything other than the HNP, this + * is a meaningless command as I cannot get + * the requested info + */ + if (!orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + /* if we are not using static ports, then we need to share the * comm info - otherwise, just return */ @@ -730,7 +752,7 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) { + if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(job, buf))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return rc; diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index c837c9aa04..fbf583773b 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -670,6 +670,13 @@ static bool route_is_defined(const orte_process_name_t *target) static int update_routing_tree(void) { + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + /* nothing to do here as the routing tree is fixed */ return ORTE_SUCCESS; } @@ -679,11 +686,17 @@ static orte_vpid_t get_routing_tree(orte_jobid_t job, { orte_namelist_t *nm; - /* for anyone other than the HNP, the direct routing - * does not go anywhere - we don't relay - and our + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_VPID_INVALID; + } + + /* if I am a daemon, I have no 
children and my * parent is the HNP */ - if (!orte_process_info.hnp) { + if (orte_process_info.daemon) { return ORTE_PROC_MY_HNP->vpid; } @@ -702,11 +715,19 @@ static orte_vpid_t get_routing_tree(orte_jobid_t job, return ORTE_VPID_INVALID; } -static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) +static int get_wireup_info(orte_jobid_t jobid, opal_buffer_t *buf) { int rc; - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) { + /* if I am anything other than the HNP, this + * is a meaningless command as I cannot get + * the requested info + */ + if (!orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + + if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(jobid, buf))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return rc; diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index da1b4c51c1..9479814f46 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -616,6 +616,13 @@ static bool route_is_defined(const orte_process_name_t *target) static int update_routing_tree(void) { + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + /* nothing to do here as the routing tree is fixed */ return ORTE_SUCCESS; } @@ -625,11 +632,17 @@ static orte_vpid_t get_routing_tree(orte_jobid_t job, { orte_namelist_t *nm; - /* for anyone other than the HNP, the linear routing - * does not go anywhere - we don't relay - and our + /* if I am anything other than a daemon or the HNP, this + * is a meaningless command as I am not allowed to route + */ + if (!orte_process_info.daemon && !orte_process_info.hnp) { + return ORTE_VPID_INVALID; + } + + /* if I am a daemon, I have no children and my * parent is the HNP */ - if (!orte_process_info.hnp) { + if (orte_process_info.daemon) { 
return ORTE_PROC_MY_HNP->vpid; } @@ -652,6 +665,14 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) { int rc; + /* if I am anything other than the HNP, this + * is a meaningless command as I cannot get + * the requested info + */ + if (!orte_process_info.hnp) { + return ORTE_ERR_NOT_SUPPORTED; + } + /* if we are not using static ports, then we need to share the * comm info - otherwise, just return */ @@ -659,7 +680,7 @@ static int get_wireup_info(orte_jobid_t job, opal_buffer_t *buf) return ORTE_SUCCESS; } - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) { + if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(job, buf))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return rc;