1
1

Pass the job family to tools that need to connect to specific HNPs

This commit was SVN r22853.
Этот коммит содержится в:
Ralph Castain 2010-03-19 04:01:33 +00:00
родитель a479e6c320
Коммит abbdc2b527

Просмотреть файл

@ -29,6 +29,8 @@
#include "opal/mca/paffinity/paffinity.h" #include "opal/mca/paffinity/paffinity.h"
#include "opal/mca/sysinfo/sysinfo.h" #include "opal/mca/sysinfo/sysinfo.h"
#include "opal/mca/sysinfo/base/base.h" #include "opal/mca/sysinfo/base/base.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "orte/mca/rmcast/base/base.h" #include "orte/mca/rmcast/base/base.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
@ -73,81 +75,97 @@ orte_ess_base_module_t orte_ess_cm_module = {
NULL /* ft_event */ NULL /* ft_event */
}; };
/* support for setting name */
static bool arrived = false;
static bool name_success = false;
static opal_mutex_t lock;
static opal_condition_t cond;
static int cm_set_name(void); static int cm_set_name(void);
static int rte_init(void) static int rte_init(void)
{ {
int ret; int ret;
char *error = NULL; char *error = NULL;
char **hosts = NULL; char **hosts = NULL;
char *nodelist; char *nodelist;
char *tmp=NULL;
/* only daemons that are bootstrapping should orte_vpid_t vpid;
* be calling this module int32_t jfam;
*/
/* construct the thread support */
OBJ_CONSTRUCT(&lock, opal_mutex_t);
OBJ_CONSTRUCT(&cond, opal_condition_t);
/* run the prolog */ /* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog"; error = "orte_ess_base_std_prolog";
goto error; goto error;
} }
/* open the reliable multicast framework */
if (ORTE_SUCCESS != (ret = orte_rmcast_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rmcast_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_select";
goto error;
}
if (ORTE_PROC_IS_DAEMON) { if (ORTE_PROC_IS_DAEMON) {
/* open and setup the local resource discovery framework */
if (ORTE_SUCCESS != (ret = opal_sysinfo_base_open())) {
ORTE_ERROR_LOG(ret);
error = "opal_sysinfo_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = opal_sysinfo_base_select())) {
ORTE_ERROR_LOG(ret);
error = "opal_sysinfo_base_select";
goto error;
}
/* if we do not know the HNP, then we have to /* if we do not know the HNP, then we have to
* use the multicast system to find it * use the multicast system to find it
*/ */
if (NULL == orte_process_info.my_hnp_uri) { if (NULL == orte_process_info.my_hnp_uri) {
/* Runtime Messaging Layer - this opens/selects the OOB as well /* if we were given a job family to join, get it */
* so we can get our url mca_base_param_reg_string_name("orte", "ess_job_family", "Job family",
*/ true, false, NULL, &tmp);
if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { if (NULL != tmp) {
ORTE_ERROR_LOG(ret); jfam = strtol(tmp, NULL, 10);
error = "orte_rml_base_open"; ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_JOB_FAMILY(jfam);
goto error; ORTE_PROC_MY_NAME->vpid = ORTE_VPID_INVALID;
} }
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_select";
goto error;
}
/* open the reliable multicast framework */
if (ORTE_SUCCESS != (ret = orte_rmcast_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rmcast_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_select";
goto error;
}
/* open and setup the local resource discovery framework */
if (ORTE_SUCCESS != (ret = opal_sysinfo_base_open())) {
ORTE_ERROR_LOG(ret);
error = "opal_sysinfo_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = opal_sysinfo_base_select())) {
ORTE_ERROR_LOG(ret);
error = "opal_sysinfo_base_select";
goto error;
}
/* get a name for ourselves */ /* get a name for ourselves */
if (ORTE_SUCCESS != (ret = cm_set_name())) { if (ORTE_SUCCESS != (ret = cm_set_name())) {
error = "set_name"; error = "set_name";
goto error; goto error;
} }
} else { } else {
/* if we were given an HNP, then we must have also /* if we were given an HNP, we can get the jobid from
* been given a vpid - we can get the jobid from * the HNP's name - this is decoded in proc_info.c during
* the HNP's name * the prolog
*/ */
ORTE_PROC_MY_NAME->jobid = orte_process_info.my_hnp.jobid; ORTE_PROC_MY_NAME->jobid = orte_process_info.my_hnp.jobid;
/* get vpid from environ */
mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
true, false, NULL, &tmp);
if (NULL == tmp) {
ret = ORTE_ERR_NOT_FOUND;
error = "get_ess_vpid";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, tmp))) {
error = "convert_string_to_vpid";
goto error;
}
free(tmp);
ORTE_PROC_MY_NAME->vpid = vpid;
} }
/* get the list of nodes used for this job */ /* get the list of nodes used for this job */
@ -164,48 +182,41 @@ static int rte_init(void)
} }
opal_argv_free(hosts); opal_argv_free(hosts);
} else if (ORTE_PROC_IS_TOOL) { } else if (ORTE_PROC_IS_TOOL) {
if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { /* if we were given a job family to join, get it */
ORTE_ERROR_LOG(ret); mca_base_param_reg_string_name("orte", "ess_job_family", "Job family",
error = "orte_plm_base_open"; true, false, NULL, &tmp);
goto error; if (NULL != tmp) {
} jfam = strtol(tmp, NULL, 10);
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID((jfam << 16), ORTE_JOBID_INVALID);
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_PROC_MY_NAME->vpid = ORTE_VPID_INVALID;
ORTE_ERROR_LOG(ret); } else {
error = "orte_plm_base_select"; /* create our own name */
goto error; if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
}
if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_set_hnp_name";
goto error;
}
/* close the plm since we opened it to set our
* name, but have no further use for it
*/
orte_plm_base_close();
/* if we do not know the HNP, then we have to use
* the multicast system to find it
*/
if (NULL == orte_process_info.my_hnp_uri) {
/* open the reliable multicast framework */
if (ORTE_SUCCESS != (ret = orte_rmcast_base_open())) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_open"; error = "orte_plm_base_open";
goto error; goto error;
} }
if (ORTE_SUCCESS != (ret = orte_rmcast_base_select())) { if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
error = "orte_rmcast_base_select"; error = "orte_plm_base_select";
goto error; goto error;
} }
/* checkin with the HNP */ if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
if (ORTE_SUCCESS != (ret = cm_set_name())) { ORTE_ERROR_LOG(ret);
error = "set_name"; error = "orte_plm_set_hnp_name";
goto error; goto error;
} }
/* close the plm since we opened it to set our
* name, but have no further use for it
*/
orte_plm_base_close();
}
/* checkin with the HNP to get a name or just make contact */
if (ORTE_SUCCESS != (ret = cm_set_name())) {
error = "set_name";
goto error;
} }
/* do the rest of the standard tool init */ /* do the rest of the standard tool init */
@ -215,6 +226,11 @@ static int rte_init(void)
goto error; goto error;
} }
} }
/* destruct the thread support */
OBJ_DESTRUCT(&lock);
OBJ_DESTRUCT(&cond);
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
@ -222,6 +238,9 @@ error:
"orte_init:startup:internal-failure", "orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret); true, error, ORTE_ERROR_NAME(ret), ret);
/* destruct the thread support */
OBJ_DESTRUCT(&lock);
OBJ_DESTRUCT(&cond);
return ret; return ret;
} }
@ -409,10 +428,6 @@ static int update_nidmap(opal_byte_object_t *bo)
return rc; return rc;
} }
/* support for setting name */
static bool arrived = false;
static bool name_success = false;
static void cbfunc(int status, static void cbfunc(int status,
int channel, orte_rmcast_tag_t tag, int channel, orte_rmcast_tag_t tag,
orte_process_name_t *sender, orte_process_name_t *sender,
@ -425,6 +440,8 @@ static void cbfunc(int status,
char *uri; char *uri;
char *host; char *host;
OPAL_THREAD_LOCK(&lock);
/* ensure we default to failure */ /* ensure we default to failure */
name_success = false; name_success = false;
@ -432,38 +449,38 @@ static void cbfunc(int status,
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &cmd, &n, ORTE_DAEMON_CMD_T))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &cmd, &n, ORTE_DAEMON_CMD_T))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
arrived = true; goto cleanup;
return;
} }
if (ORTE_DAEMON_NAME_REQ_CMD == cmd) { /* unpack the intended recipient's hostname */
/* unpack the intended recipient's hostname */ n=1;
n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &host, &n, OPAL_STRING))) {
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &host, &n, OPAL_STRING))) { ORTE_ERROR_LOG(rc);
ORTE_ERROR_LOG(rc); return;
arrived = true; }
return;
} /* is this intended for me? */
if (0 != strcmp(host, orte_process_info.nodename)) {
/* is this intended for me? */ /* nope - ignore it */
if (0 != strcmp(host, orte_process_info.nodename)) { goto cleanup;
/* nope - ignore it */ }
return;
} /* unpack the name */
n = 1;
/* unpack the name */ if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &name, &n, ORTE_NAME))) {
n = 1; ORTE_ERROR_LOG(rc);
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &name, &n, ORTE_NAME))) { return;
ORTE_ERROR_LOG(rc); }
arrived = true; /* if we got an invalid name, then declare failure */
return; if (ORTE_JOBID_INVALID == name.jobid &&
} ORTE_VPID_INVALID == name.vpid) {
/* if we got an invalid name, then declare failure */ arrived = true;
if (ORTE_JOBID_INVALID == name.jobid && name_success = false;
ORTE_VPID_INVALID == name.vpid) { goto cleanup;
arrived = true; }
return;
} /* if I didn't already have a name, set it */
if (ORTE_VPID_INVALID == ORTE_PROC_MY_NAME->vpid) {
ORTE_PROC_MY_NAME->jobid = name.jobid; ORTE_PROC_MY_NAME->jobid = name.jobid;
ORTE_PROC_MY_NAME->vpid = name.vpid; ORTE_PROC_MY_NAME->vpid = name.vpid;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
@ -474,28 +491,29 @@ static void cbfunc(int status,
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &uri, &n, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &uri, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
arrived = true; goto cleanup;
return;
} }
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s got hnp uri %s", "%s got hnp uri %s",
ORTE_NAME_PRINT(&name), uri)); ORTE_NAME_PRINT(&name), uri));
orte_process_info.my_hnp_uri = uri; orte_process_info.my_hnp_uri = uri;
if (ORTE_DAEMON_NAME_REQ_CMD == cmd || if (ORTE_DAEMON_CHECKIN_CMD == cmd) {
ORTE_DAEMON_CHECKIN_CMD == cmd) {
/* unpack the number of daemons */ /* unpack the number of daemons */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &np, &n, OPAL_INT32))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &np, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
arrived = true; goto cleanup;
return;
} }
orte_process_info.num_procs = np; orte_process_info.num_procs = np;
} }
name_success = true; name_success = true;
arrived = true; arrived = true;
cleanup:
opal_condition_signal(&cond);
OPAL_THREAD_UNLOCK(&lock);
} }
static int cm_set_name(void) static int cm_set_name(void)
@ -514,33 +532,21 @@ static int cm_set_name(void)
opal_list_item_t *item; opal_list_item_t *item;
opal_sysinfo_value_t *info; opal_sysinfo_value_t *info;
int32_t num_values; int32_t num_values;
char *rml_uri;
/* setup the query */ /* setup the query */
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (ORTE_PROC_IS_DAEMON) { if (ORTE_PROC_IS_DAEMON) {
/* use the reliable multicast system to contact the HNP and cmd = ORTE_DAEMON_CHECKIN_CMD;
* get a name
*/
cmd = ORTE_DAEMON_NAME_REQ_CMD;
opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD_T);
} else if (ORTE_PROC_IS_TOOL) { } else if (ORTE_PROC_IS_TOOL) {
cmd = ORTE_TOOL_CHECKIN_CMD; cmd = ORTE_TOOL_CHECKIN_CMD;
opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD_T);
} }
opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD_T);
/* always include our node name */ /* always include our node name */
opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING); opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING);
if (ORTE_PROC_IS_DAEMON) { if (ORTE_PROC_IS_DAEMON) {
/* get and send our url */
if (NULL == (rml_uri = orte_rml.get_contact_info())) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
opal_dss.pack(&buf, &rml_uri, 1, OPAL_STRING);
free(rml_uri);
/* get our local resources */ /* get our local resources */
OBJ_CONSTRUCT(&resources, opal_list_t); OBJ_CONSTRUCT(&resources, opal_list_t);
opal_sysinfo.query(keys, &resources); opal_sysinfo.query(keys, &resources);
@ -587,7 +593,11 @@ static int cm_set_name(void)
OBJ_DESTRUCT(&buf); OBJ_DESTRUCT(&buf);
/* wait for response */ /* wait for response */
ORTE_PROGRESSED_WAIT(arrived, 0, 1); OPAL_THREAD_LOCK(&lock);
while (!arrived) {
opal_condition_wait(&cond, &lock);
}
OPAL_THREAD_UNLOCK(&lock);
/* cancel the recv */ /* cancel the recv */
orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL, orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL,