Fix some major bit-rot on scalable launch. If static ports are provided, then daemons can connect back to the HNP via the routed connection tree instead of doing so directly. In order to do that at scale, the node list must be passed as a regular expression - otherwise, the orted command line gets too long.
Over the course of time, usage of static ports got corrupted in several places, the "parent" info got incorrectly reset, etc. So correct all that and get the regex-based wireup going again. Also, don't pass node lists if static ports aren't enabled - they are of no value to the orted and just create the possibility of overly-long cmd lines. This commit was SVN r24860.
Этот коммит содержится в:
родитель
6496b2f845
Коммит
1ee7c39982
@ -299,33 +299,23 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
* own port, which we need in order to construct the nidmap
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
if (NULL != orted_launch_cmd) {
|
||||
/* the launch cmd was given via regexp on the cmd line - parse
|
||||
* it to get the contact info
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_decode_maps(orted_launch_cmd, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_regex_decode_maps";
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree!
|
||||
* to mpirun goes through the tree if static ports were enabled - still
|
||||
* need to do it anyway just to initialize things
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "opal/mca/paffinity/paffinity.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -50,7 +51,6 @@
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/ess/slurm/ess_slurm.h"
|
||||
|
||||
static char *get_slurm_nodename(int nodeid);
|
||||
static int slurm_set_name(void);
|
||||
|
||||
static int rte_init(void);
|
||||
@ -90,7 +90,6 @@ static int rte_init(void)
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
char **hosts = NULL;
|
||||
char *slurm_nodelist;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -105,13 +104,12 @@ static int rte_init(void)
|
||||
* default procedure
|
||||
*/
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
/* get the list of nodes used for this job */
|
||||
mca_base_param_reg_string_name("orte", "nodelist", "List of nodes in job",
|
||||
true, false, NULL, &slurm_nodelist);
|
||||
|
||||
if (NULL != slurm_nodelist) {
|
||||
/* split the node list into an argv array */
|
||||
hosts = opal_argv_split(slurm_nodelist, ',');
|
||||
if (NULL != orte_node_regex) {
|
||||
/* extract the nodes */
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
|
||||
error = "orte_regex_extract_node_names";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -388,12 +386,12 @@ static int slurm_set_name(void)
|
||||
if (NULL != orte_process_info.nodename) {
|
||||
free(orte_process_info.nodename);
|
||||
}
|
||||
orte_process_info.nodename = get_slurm_nodename(slurm_nodeid);
|
||||
orte_process_info.nodename = getenv("SLURMD_NODENAME");
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:slurm set nodename to %s",
|
||||
orte_process_info.nodename));
|
||||
(NULL == orte_process_info.nodename) ? "NULL" : orte_process_info.nodename));
|
||||
|
||||
/* get the non-name common environmental variables */
|
||||
if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
|
||||
@ -403,35 +401,3 @@ static int slurm_set_name(void)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static char *
|
||||
get_slurm_nodename(int nodeid)
|
||||
{
|
||||
char **names = NULL;
|
||||
char *slurm_nodelist;
|
||||
char *ret;
|
||||
|
||||
mca_base_param_reg_string_name("orte", "nodelist", "List of nodes in job",
|
||||
true, false, NULL, &slurm_nodelist);
|
||||
if (NULL == slurm_nodelist) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* split the node list into an argv array */
|
||||
names = opal_argv_split(slurm_nodelist, ',');
|
||||
if (NULL == names) { /* got an error */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* check to see if there are enough entries */
|
||||
if (nodeid > opal_argv_count(names)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = strdup(names[nodeid]);
|
||||
|
||||
opal_argv_free(names);
|
||||
|
||||
/* All done */
|
||||
return ret;
|
||||
}
|
||||
|
@ -124,87 +124,8 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
if (NULL == map) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* are we passing a regexp? */
|
||||
if (orte_use_regexp && jdata->num_apps < 2 && NULL == orte_debugger_daemon) {
|
||||
char *regexp;
|
||||
flag = 1;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
regexp = orte_regex_encode_maps(jdata);
|
||||
opal_dss.pack(data, &regexp, 1, OPAL_STRING);
|
||||
free(regexp);
|
||||
/* if we are not using static ports, then we need to add the daemon wireup info */
|
||||
if (!orte_static_ports) {
|
||||
/* pack a flag indicating that wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* get wireup info for daemons per the selected routing module */
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* if anything was inserted, put it in a byte object for xmission */
|
||||
if (0 < wireup->bytes_used) {
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the number of bytes required by payload */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* pack the byte object */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
} else {
|
||||
/* pack numbytes=0 so the unpack routine remains sync'd to us */
|
||||
numbytes = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
} else {
|
||||
/* pack a flag indicating no wireup info is provided */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
}
|
||||
/* insert an "add-procs" command here so we can cleanly process it on the
|
||||
* other end
|
||||
*/
|
||||
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* since we will have processed this to update daemons, flag that we don't
|
||||
* have the regexp again
|
||||
*/
|
||||
flag = 2;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* pack the jobid so it can be extracted later */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* all done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
nodemap:
|
||||
/* if we are not passing a regexp, then pass the nodemap */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* construct a nodemap */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -276,13 +197,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the flag indicating that we are not using regexps - required to
|
||||
* keep things in order when unpacking due to different ways the data
|
||||
* can get to the unpacking routine
|
||||
*/
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
|
||||
/* are we co-locating debugger daemons? */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
orte_app_context_t **apps;
|
||||
@ -472,23 +386,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int unpack_regexp(orte_odls_job_t **jobdat, opal_buffer_t *data)
|
||||
{
|
||||
char *regexp;
|
||||
int rc, cnt;
|
||||
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &regexp, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_decode_maps(regexp, jobdat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(regexp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
{
|
||||
opal_buffer_t wireup;
|
||||
@ -498,40 +395,7 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
int32_t numbytes;
|
||||
int8_t flag;
|
||||
|
||||
/* unpack the flag for regexp */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we have a regexp, then process it so we know the daemonmap */
|
||||
if (0 < flag) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:update:daemon:info updating nidmap from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if (ORTE_SUCCESS != (rc = unpack_regexp(NULL, data))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* update the routing tree */
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* see if we have wiring info as well */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (0 < flag) {
|
||||
/* yes - extract and process it */
|
||||
goto wireup;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* otherwise, extract the byte object holding the daemonmap */
|
||||
/* extract the byte object holding the daemonmap */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -573,7 +437,6 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
return rc;
|
||||
}
|
||||
|
||||
wireup:
|
||||
/* unpack the #bytes of daemon wireup info in the message */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) {
|
||||
@ -629,70 +492,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
|
||||
/* unpack the flag for regexp */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
if (0 < flag) {
|
||||
if (1 == flag) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: constructing jobdat from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* need to setup the job from the regexp */
|
||||
if (ORTE_SUCCESS != (rc = unpack_regexp(&jobdat, data))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* record the jobid */
|
||||
*job = jobdat->jobid;
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: using jobdat previously extracted from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* unpack the jobid */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* find the corresponding jobdat */
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_odls_job_t *jdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdat->jobid == *job) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:construct_child_list found existing jobdat for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
|
||||
jobdat = jdat;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
/* we have a problem */
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
}
|
||||
/* fake an app_idx array */
|
||||
app_idx = (orte_app_idx_t*)malloc(jobdat->num_procs * sizeof(orte_app_idx_t));
|
||||
memset(app_idx, 0, jobdat->num_procs * sizeof(orte_app_idx_t));
|
||||
/* if we are doing a timing test, store the time the msg was recvd */
|
||||
if (orte_timing) {
|
||||
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
|
||||
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
|
||||
}
|
||||
goto find_my_procs;
|
||||
}
|
||||
|
||||
/* unpack the flag - are we co-locating debugger daemons? */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
@ -743,7 +542,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
/* if the buffer was empty, then we know that all we are doing is
|
||||
* launching debugger daemons
|
||||
*/
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
|
||||
goto done;
|
||||
}
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
@ -942,7 +741,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
}
|
||||
}
|
||||
|
||||
find_my_procs:
|
||||
/* cycle through the procs and find mine */
|
||||
proc.jobid = jobdat->jobid;
|
||||
for (j=0; j < jobdat->num_procs; j++) {
|
||||
|
@ -609,7 +609,7 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
port in the range. Otherwise, tcp_port_min will be 0, which
|
||||
means "pick any port" */
|
||||
if (AF_INET == af_family) {
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
if (NULL != mca_oob_tcp_component.tcp4_static_ports) {
|
||||
/* if static ports were provided, the daemon takes the
|
||||
* first entry in the list
|
||||
@ -655,7 +655,7 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
/* if we are the HNP or a tool, then we must let the
|
||||
/* if we are a tool, then we must let the
|
||||
* system pick any port
|
||||
*/
|
||||
opal_argv_append_nosize(&ports, "0");
|
||||
@ -672,7 +672,7 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
|
||||
#if OPAL_WANT_IPV6
|
||||
if (AF_INET6 == af_family) {
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
|
||||
/* if static ports were provided, the daemon takes the
|
||||
* first entry in the list
|
||||
@ -718,7 +718,7 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
/* if we are the HNP or a tool, then we must let the
|
||||
/* if we are a tool, then we must let the
|
||||
* system pick any port
|
||||
*/
|
||||
opal_argv_append_nosize(&ports, "0");
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -255,10 +255,16 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
nodelist_flat = opal_argv_join(nodelist_argv, ',');
|
||||
opal_argv_free(nodelist_argv);
|
||||
opal_argv_append(&argc, &argv, "-L");
|
||||
asprintf(&tmp, "%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* if we are using all allocated nodes, then alps
|
||||
* doesn't need a nodelist
|
||||
*/
|
||||
if (map->num_new_daemons < orte_num_allocated_nodes) {
|
||||
opal_argv_append(&argc, &argv, "-L");
|
||||
asprintf(&tmp, "%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
@ -306,7 +312,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
cur_prefix = NULL;
|
||||
for (i=0; i < jdata->num_apps; i++) {
|
||||
char * app_prefix_dir = apps[i]->prefix_dir;
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
if (NULL != cur_prefix &&
|
||||
@ -357,7 +363,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
launch_apps:
|
||||
/* if we get here, then daemons launched - change to declaring apps failed */
|
||||
failed_job = active_job;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
@ -370,15 +376,15 @@ launch_apps:
|
||||
|
||||
if (mca_plm_alps_component.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "plm_alps: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "plm_alps: daemon block launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
opal_output(0, "plm_alps: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||
}
|
||||
opal_output(0, "plm_alps: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "plm_alps: daemon block launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
opal_output(0, "plm_alps: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -386,7 +392,7 @@ launch_apps:
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
|
@ -758,11 +758,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, "--report-bindings");
|
||||
}
|
||||
|
||||
/* check for bootstrap */
|
||||
if (orte_daemon_bootstrap) {
|
||||
opal_argv_append(argc, argv, "--bootstrap");
|
||||
}
|
||||
|
||||
if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
|
||||
opal_argv_append(argc, argv, "--debug-failure");
|
||||
asprintf(&param, "%d", orted_debug_failure);
|
||||
@ -828,11 +823,16 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
|
||||
/* if given, pass the node list */
|
||||
if (NULL != nodes) {
|
||||
opal_argv_append(argc, argv, "-mca");
|
||||
opal_argv_append(argc, argv, "orte_nodelist");
|
||||
opal_argv_append(argc, argv, nodes);
|
||||
/* if given and we have static ports, pass the node list */
|
||||
if (orte_static_ports && NULL != nodes) {
|
||||
/* convert the nodes to a regex */
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
opal_argv_append(argc, argv, "--nodes");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* pass along any cmd line MCA params provided to mpirun,
|
||||
|
@ -1171,7 +1171,8 @@ PRELOAD_FILES:
|
||||
int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr,
|
||||
char *nodename,
|
||||
int *node_name_index1,
|
||||
int *proc_vpid_index, char *prefix_dir)
|
||||
int *proc_vpid_index, char *prefix_dir,
|
||||
char *nodes)
|
||||
{
|
||||
int argc;
|
||||
char **argv;
|
||||
@ -1383,7 +1384,7 @@ int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr,
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"env",
|
||||
proc_vpid_index,
|
||||
NULL);
|
||||
nodes);
|
||||
|
||||
/* ensure that only the ssh plm is selected on the remote daemon */
|
||||
opal_argv_append_nosize(&argv, "-mca");
|
||||
|
@ -66,7 +66,8 @@ ORTE_DECLSPEC int orte_plm_base_rsh_setup_shell(orte_plm_rsh_shell_t *rshell,
|
||||
ORTE_DECLSPEC int orte_plm_base_rsh_setup_launch(int *argcptr, char ***argvptr,
|
||||
char *nodename,
|
||||
int *node_name_index1,
|
||||
int *proc_vpid_index, char *prefix_dir);
|
||||
int *proc_vpid_index, char *prefix_dir,
|
||||
char *nodes);
|
||||
ORTE_DECLSPEC void orte_plm_base_ssh_child(int argc, char **argv,
|
||||
orte_vpid_t vpid, int proc_vpid_index);
|
||||
|
||||
|
@ -212,7 +212,7 @@ static int spawn(orte_job_t *jdata)
|
||||
orte_node_t *node;
|
||||
int nnode;
|
||||
int argc;
|
||||
char **argv=NULL;
|
||||
char **argv=NULL, **nodes=NULL, *nodelist=NULL;
|
||||
char *prefix_dir;
|
||||
int node_name_index1;
|
||||
int proc_vpid_index;
|
||||
@ -287,7 +287,7 @@ static int spawn(orte_job_t *jdata)
|
||||
orte_leave_session_attached) &&
|
||||
mca_plm_rshbase_component.num_concurrent < map->num_new_daemons) {
|
||||
/**
|
||||
* If we are in '--debug-daemons' we keep the ssh connection
|
||||
* If we are in '--debug-daemons' we keep the ssh connection
|
||||
* alive for the span of the run. If we use this option
|
||||
* AND we launch on more than "num_concurrent" machines
|
||||
* then we will deadlock. No connections are terminated
|
||||
@ -339,11 +339,33 @@ static int spawn(orte_job_t *jdata)
|
||||
}
|
||||
prefix_dir = app->prefix_dir;
|
||||
|
||||
/* if we are using static ports, then setup a string showing the
|
||||
* nodes so we can use a regex to pass connection info
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
for (nnode=0; nnode < map->nodes->size; nnode++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
|
||||
continue;
|
||||
}
|
||||
opal_argv_append_nosize(&nodes, node->name);
|
||||
}
|
||||
nodelist = opal_argv_join(nodes, ',');
|
||||
opal_argv_free(nodes);
|
||||
}
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_rsh_setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||
&proc_vpid_index, prefix_dir))) {
|
||||
&proc_vpid_index, prefix_dir, nodelist))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
if (NULL != nodelist) {
|
||||
free(nodelist);
|
||||
nodelist = NULL;
|
||||
}
|
||||
}
|
||||
if (NULL != nodelist) {
|
||||
free(nodelist);
|
||||
nodelist = NULL;
|
||||
}
|
||||
|
||||
/* set the active jobid */
|
||||
@ -419,7 +441,7 @@ static int spawn(orte_job_t *jdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:rsh: recording launch of daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&node->daemon->name)));
|
||||
ORTE_NAME_PRINT(&node->daemon->name)));
|
||||
|
||||
/* setup callback on sigchild - wait until setup above is complete
|
||||
* as the callback can occur in the call to orte_wait_cb
|
||||
@ -448,7 +470,7 @@ static int spawn(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
|
||||
launch_apps:
|
||||
launch_apps:
|
||||
/* if we get here, then the daemons succeeded, so any failure would now be
|
||||
* for the application job
|
||||
*/
|
||||
|
@ -162,7 +162,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
bool failed_launch=true;
|
||||
bool using_regexp=false;
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
@ -217,7 +216,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
/* set the active jobid */
|
||||
active_job = jdata->jobid;
|
||||
active_job = jdata->jobid;
|
||||
|
||||
/* Get the map for this job */
|
||||
if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
|
||||
@ -250,6 +249,9 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
/* add the srun command */
|
||||
opal_argv_append(&argc, &argv, "srun");
|
||||
|
||||
/* alert us if any orteds die during startup */
|
||||
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
||||
|
||||
/* Append user defined arguments to srun */
|
||||
if ( NULL != mca_plm_slurm_component.custom_args ) {
|
||||
custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
|
||||
@ -260,17 +262,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
opal_argv_free(custom_strings);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* alert us if any orteds die during startup */
|
||||
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
||||
|
||||
/* create nodelist */
|
||||
nodelist_argv = NULL;
|
||||
|
||||
@ -297,9 +288,23 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
}
|
||||
nodelist_flat = opal_argv_join(nodelist_argv, ',');
|
||||
opal_argv_free(nodelist_argv);
|
||||
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* if we are using all allocated nodes, then srun doesn't
|
||||
* require any further arguments
|
||||
*/
|
||||
if (map->num_new_daemons < orte_num_allocated_nodes) {
|
||||
asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
|
||||
"%s plm:slurm: launching on nodes %s",
|
||||
@ -312,7 +317,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurm", &proc_vpid_index,
|
||||
nodelist_flat);
|
||||
@ -344,7 +349,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
continue;
|
||||
}
|
||||
app_prefix_dir = app->prefix_dir;
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
/* Check for already set cur_prefix_dir -- if different,
|
||||
complain */
|
||||
if (NULL != app_prefix_dir) {
|
||||
if (NULL != cur_prefix &&
|
||||
@ -375,20 +380,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
opal_setenv(var, "rsh", true, &env);
|
||||
free(var);
|
||||
|
||||
/* if we can do it, use the regexp to launch the apps - this
|
||||
* requires that the user requested this mode, that we were
|
||||
* provided with static ports, and that we only have one
|
||||
* app_context
|
||||
*/
|
||||
if (orte_use_regexp && orte_static_ports && jdata->num_apps < 2) {
|
||||
char *regexp;
|
||||
regexp = orte_regex_encode_maps(jdata);
|
||||
opal_argv_append(&argc, &argv, "--launch");
|
||||
opal_argv_append(&argc, &argv, regexp);
|
||||
free(regexp);
|
||||
using_regexp = true;
|
||||
}
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
@ -418,42 +409,16 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
launch_apps:
|
||||
launch_apps:
|
||||
/* get here if daemons launch okay - any failures now by apps */
|
||||
launching_daemons = false;
|
||||
failed_job = active_job;
|
||||
if (using_regexp) {
|
||||
/* daemons already have launch cmd - just wait for them to
|
||||
* report back
|
||||
*/
|
||||
opal_buffer_t launch;
|
||||
int8_t flag;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
OBJ_CONSTRUCT(&launch, opal_buffer_t);
|
||||
opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
|
||||
flag = 1;
|
||||
opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
|
||||
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
|
||||
OBJ_DESTRUCT(&launch);
|
||||
|
||||
#if 0
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:slurm:launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* declare the launch a success */
|
||||
@ -461,12 +426,12 @@ launch_apps:
|
||||
|
||||
if (orte_timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "plm_slurm: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "plm_slurm: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
}
|
||||
opal_output(0, "plm_slurm: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "plm_slurm: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -474,7 +439,7 @@ launch_apps:
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
if (NULL != argv) {
|
||||
opal_argv_free(argv);
|
||||
}
|
||||
|
@ -158,6 +158,9 @@ static int orte_ras_alps_allocate(opal_list_t *nodes)
|
||||
}
|
||||
free(str);
|
||||
|
||||
/* record the number of allocated nodes */
|
||||
orte_num_allocated_nodes = opal_list_get_size(nodes);
|
||||
|
||||
cleanup:
|
||||
|
||||
/* All done */
|
||||
|
@ -124,6 +124,8 @@ static int orte_ras_slurm_allocate(opal_list_t *nodes)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return ret;
|
||||
}
|
||||
/* record the number of allocated nodes */
|
||||
orte_num_allocated_nodes = opal_list_get_size(nodes);
|
||||
|
||||
/* All done */
|
||||
|
||||
|
@ -23,7 +23,6 @@
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -123,7 +122,7 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
|
||||
/* track how many procs were in the message */
|
||||
++num_procs;
|
||||
}
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -84,7 +84,6 @@ static opal_condition_t cond;
|
||||
static opal_mutex_t lock;
|
||||
static orte_process_name_t *lifeline=NULL;
|
||||
static orte_process_name_t local_lifeline;
|
||||
static orte_process_name_t my_parent;
|
||||
static int num_children;
|
||||
static opal_list_t my_children;
|
||||
static bool ack_recvd;
|
||||
@ -102,7 +101,7 @@ static int init(void)
|
||||
/* setup the list of children */
|
||||
OBJ_CONSTRUCT(&my_children, opal_list_t);
|
||||
num_children = 0;
|
||||
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -387,10 +386,10 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
}
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
if( !orte_static_ports &&
|
||||
if (orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing to the HNP through my PLM parent %s",
|
||||
"%s routing to the HNP through my parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
|
||||
ret = ORTE_PROC_MY_PARENT;
|
||||
@ -442,7 +441,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
/* if we get here, then the target daemon is not beneath
|
||||
* any of our children, so we have to step up through our parent
|
||||
*/
|
||||
daemon.vpid = my_parent.vpid;
|
||||
daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
|
||||
|
||||
ret = &daemon;
|
||||
|
||||
@ -537,7 +536,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
|
||||
/* if we are using static ports, set my lifeline to point at my parent */
|
||||
if (orte_static_ports) {
|
||||
lifeline = &my_parent;
|
||||
lifeline = ORTE_PROC_MY_PARENT;
|
||||
} else {
|
||||
/* set our lifeline to the HNP - we will abort if that connection is lost */
|
||||
lifeline = ORTE_PROC_MY_HNP;
|
||||
@ -992,13 +991,13 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
/* compute my direct children and the bitmap that shows which vpids
|
||||
* lie underneath their branch
|
||||
*/
|
||||
my_parent.vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
|
||||
ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
|
||||
orte_process_info.max_procs,
|
||||
&num_children, &my_children, NULL, true, jobid);
|
||||
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
|
||||
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {
|
||||
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), my_parent.vpid, num_children);
|
||||
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -1043,7 +1042,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
}
|
||||
|
||||
/* return my parent's vpid */
|
||||
return my_parent.vpid;
|
||||
return ORTE_PROC_MY_PARENT->vpid;
|
||||
}
|
||||
|
||||
static int get_wireup_info(opal_buffer_t *buf)
|
||||
|
@ -92,6 +92,8 @@ static int init(void)
|
||||
OBJ_CONSTRUCT(&cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&lock, opal_mutex_t);
|
||||
|
||||
ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
lifeline = NULL;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -345,8 +347,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
|
||||
if( !orte_static_ports &&
|
||||
/* if we are using static ports and this is going to the HNP, send through my parent */
|
||||
if (orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing to the HNP through my parent %s",
|
||||
@ -478,8 +480,13 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* set our lifeline to the the HNP - we will abort if that connection is lost */
|
||||
lifeline = ORTE_PROC_MY_HNP;
|
||||
/* if we are using static ports, set my lifeline to point at my parent */
|
||||
if (orte_static_ports) {
|
||||
lifeline = ORTE_PROC_MY_PARENT;
|
||||
} else {
|
||||
/* set our lifeline to the HNP - we will abort if that connection is lost */
|
||||
lifeline = ORTE_PROC_MY_HNP;
|
||||
}
|
||||
|
||||
/* daemons will send their contact info back to the HNP as
|
||||
* part of the message confirming they are read to go. HNP's
|
||||
@ -741,6 +748,11 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/* my parent is the my_vpid-1 daemon */
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
ORTE_PROC_MY_PARENT->vpid = ORTE_PROC_MY_NAME->vpid - 1;
|
||||
}
|
||||
|
||||
/* nothing to do here as the routing tree is fixed */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -83,7 +83,6 @@ static opal_condition_t cond;
|
||||
static opal_mutex_t lock;
|
||||
static orte_process_name_t *lifeline=NULL;
|
||||
static orte_process_name_t local_lifeline;
|
||||
static orte_process_name_t my_parent;
|
||||
static int num_children;
|
||||
static opal_list_t my_children;
|
||||
static bool ack_recvd;
|
||||
@ -100,7 +99,7 @@ static int init(void)
|
||||
/* setup the list of children */
|
||||
OBJ_CONSTRUCT(&my_children, opal_list_t);
|
||||
num_children = 0;
|
||||
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -370,8 +369,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
|
||||
if( !orte_static_ports &&
|
||||
/* if we are using static ports and this is going to the HNP, send through my parent */
|
||||
if (orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing to the HNP through my parent %s",
|
||||
@ -418,7 +417,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
/* if we get here, then the target daemon is not beneath
|
||||
* any of our children, so we have to step up through our parent
|
||||
*/
|
||||
daemon.vpid = my_parent.vpid;
|
||||
daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
|
||||
ret = &daemon;
|
||||
@ -510,8 +509,13 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* set our lifeline to the HNP - we will abort if that connection is lost */
|
||||
lifeline = ORTE_PROC_MY_HNP;
|
||||
/* if we are using static ports, set my lifeline to point at my parent */
|
||||
if (orte_static_ports) {
|
||||
lifeline = ORTE_PROC_MY_PARENT;
|
||||
} else {
|
||||
/* set our lifeline to the HNP - we will abort if that connection is lost */
|
||||
lifeline = ORTE_PROC_MY_HNP;
|
||||
}
|
||||
|
||||
/* daemons will send their contact info back to the HNP as
|
||||
* part of the message confirming they are read to go. HNP's
|
||||
@ -864,12 +868,12 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
NInPrevLevel = NInLevel/mca_routed_radix_component.radix;
|
||||
|
||||
if( 0 == Ii ) {
|
||||
my_parent.vpid = -1;
|
||||
ORTE_PROC_MY_PARENT->vpid = -1;
|
||||
} else {
|
||||
my_parent.vpid = (Ii-Sum) % NInPrevLevel;
|
||||
my_parent.vpid += (Sum - NInPrevLevel);
|
||||
ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel;
|
||||
ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel);
|
||||
}
|
||||
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
|
||||
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
|
||||
|
||||
/* compute my direct children and the bitmap that shows which vpids
|
||||
* lie underneath their branch
|
||||
@ -877,7 +881,7 @@ static int update_routing_tree(orte_jobid_t jobid)
|
||||
radix_tree(Ii, &num_children, &my_children, NULL);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {
|
||||
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), my_parent.vpid, num_children);
|
||||
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -921,7 +925,7 @@ static orte_vpid_t get_routing_tree(opal_list_t *children)
|
||||
}
|
||||
}
|
||||
/* return my parent's vpid */
|
||||
return my_parent.vpid;
|
||||
return ORTE_PROC_MY_PARENT->vpid;
|
||||
}
|
||||
|
||||
static int get_wireup_info(opal_buffer_t *buf)
|
||||
|
@ -188,18 +188,14 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Create a new xterm window and display output from the specified ranks there" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', "launch", "launch", 1,
|
||||
&orted_launch_cmd, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"A regular expression describing the job to be launched at startup" },
|
||||
|
||||
{ "orte", "daemon", "bootstrap", '\0', "bootstrap", "bootstrap", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Bootstrap the connection to the HNP" },
|
||||
|
||||
{ "orte", "report", "bindings", '\0', "report-bindings", "report-bindings", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to report process bindings to stderr" },
|
||||
|
||||
{ "orte", "node", "regex", '\0', "nodes", "nodes", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Regular expression defining nodes in system" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -228,17 +224,21 @@ int orte_daemon(int argc, char *argv[])
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
opal_cmd_line_create(cmd_line, orte_cmd_line_opts);
|
||||
if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
|
||||
OBJ_RELEASE(cmd_line);
|
||||
exit(1);
|
||||
}
|
||||
mca_base_cmd_line_setup(cmd_line);
|
||||
if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
|
||||
argc, argv))) {
|
||||
char *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(cmd_line);
|
||||
orte_show_help("help-orted.txt", "orted:usage", false,
|
||||
argv[0], args);
|
||||
fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
|
||||
free(args);
|
||||
OBJ_RELEASE(cmd_line);
|
||||
return ret;
|
||||
}
|
||||
opal_output(0, "DONE PARSING CMD LINE");
|
||||
|
||||
/*
|
||||
* Since this process can now handle MCA/GMCA parameters, make sure to
|
||||
@ -537,14 +537,8 @@ int orte_daemon(int argc, char *argv[])
|
||||
/* if we are not the HNP...the only time we will be an HNP
|
||||
* is if we are launched by a singleton to provide support
|
||||
* for it
|
||||
*
|
||||
* only do this if we were not given a regexp to launch - if
|
||||
* we were given one, we won't report back our existence
|
||||
* to the HNP, but instead will report when procs are launched
|
||||
* to avoid establishing an unnecessary direct connection back
|
||||
* to the HNP
|
||||
*/
|
||||
if (!ORTE_PROC_IS_HNP && NULL == orted_launch_cmd) {
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
/* send the information to the orted report-back point - this function
|
||||
* will process the data, but also counts the number of
|
||||
* orteds that reported back so the launch procedure can continue.
|
||||
@ -649,9 +643,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
opal_sysinfo_value_t *info;
|
||||
int32_t num_values;
|
||||
|
||||
/* Point my parent to be my HNP */
|
||||
orte_process_info.my_parent = orte_process_info.my_hnp;
|
||||
|
||||
/* include our node name */
|
||||
opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
|
||||
|
||||
@ -672,26 +663,15 @@ int orte_daemon(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
if (orte_daemon_bootstrap) {
|
||||
/* send to a different callback location as the
|
||||
* HNP didn't launch us and isn't waiting for a
|
||||
* callback
|
||||
*/
|
||||
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
|
||||
ORTE_RML_TAG_BOOTSTRAP, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
} else {
|
||||
/* send to the HNP's callback */
|
||||
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_PARENT, buffer,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
/* send to the HNP's callback - this will flow up the routing
|
||||
* tree if static ports are enabled
|
||||
*/
|
||||
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(buffer); /* done with this */
|
||||
}
|
||||
@ -700,25 +680,11 @@ int orte_daemon(int argc, char *argv[])
|
||||
opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
|
||||
/* if we were given a launch string, then process it */
|
||||
if (NULL != orted_launch_cmd) {
|
||||
opal_buffer_t launch;
|
||||
int8_t flag;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
OBJ_CONSTRUCT(&launch, opal_buffer_t);
|
||||
opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
|
||||
flag = 1;
|
||||
opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
|
||||
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
|
||||
OBJ_DESTRUCT(&launch);
|
||||
}
|
||||
|
||||
/* wait to hear we are done */
|
||||
opal_event_dispatch(opal_event_base);
|
||||
|
||||
/* should never get here, but if we do... */
|
||||
DONE:
|
||||
DONE:
|
||||
/* Finalize and clean up ourselves */
|
||||
orte_quit();
|
||||
return ret;
|
||||
|
@ -58,7 +58,6 @@ bool orte_debug_daemons_file_flag = false;
|
||||
bool orte_leave_session_attached;
|
||||
bool orte_do_not_launch = false;
|
||||
bool orted_spin_flag = false;
|
||||
bool orte_daemon_bootstrap = false;
|
||||
char *orte_local_cpu_type = NULL;
|
||||
char *orte_local_cpu_model = NULL;
|
||||
char *orte_basename = NULL;
|
||||
@ -114,7 +113,6 @@ int orte_clean_output = -1;
|
||||
/* Nidmap and job maps */
|
||||
opal_pointer_array_t orte_nidmap;
|
||||
opal_pointer_array_t orte_jobmap;
|
||||
bool orte_use_regexp;
|
||||
char *orted_launch_cmd = NULL;
|
||||
|
||||
/* list of local children on a daemon */
|
||||
@ -156,6 +154,8 @@ char *orte_rankfile;
|
||||
#ifdef __WINDOWS__
|
||||
char *orte_ccp_headnode;
|
||||
#endif
|
||||
int orte_num_allocated_nodes = 0;
|
||||
char *orte_node_regex = NULL;
|
||||
|
||||
/* default rank assigment and binding policy */
|
||||
orte_mapping_policy_t orte_default_mapping_policy = 0;
|
||||
@ -1105,38 +1105,4 @@ OBJ_CLASS_INSTANCE(orte_job_map_t,
|
||||
orte_job_map_construct,
|
||||
orte_job_map_destruct);
|
||||
|
||||
static void orte_regex_node_construct(orte_regex_node_t *ptr)
|
||||
{
|
||||
ptr->prefix = NULL;
|
||||
OBJ_CONSTRUCT(&ptr->suffix, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->suffix, sizeof(char));
|
||||
OBJ_CONSTRUCT(&ptr->nodes, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->nodes, sizeof(int32_t));
|
||||
OBJ_CONSTRUCT(&ptr->cnt, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->cnt, sizeof(int32_t));
|
||||
OBJ_CONSTRUCT(&ptr->starting_vpid, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->starting_vpid, sizeof(orte_vpid_t));
|
||||
OBJ_CONSTRUCT(&ptr->ppn, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->ppn, sizeof(int32_t));
|
||||
OBJ_CONSTRUCT(&ptr->nrank, opal_value_array_t);
|
||||
opal_value_array_init(&ptr->nrank, sizeof(orte_node_rank_t));
|
||||
}
|
||||
static void orte_regex_node_destruct(orte_regex_node_t *ptr)
|
||||
{
|
||||
if (NULL != ptr->prefix) {
|
||||
free(ptr->prefix);
|
||||
ptr->prefix = NULL;
|
||||
}
|
||||
OBJ_DESTRUCT(&ptr->suffix);
|
||||
OBJ_DESTRUCT(&ptr->nodes);
|
||||
OBJ_DESTRUCT(&ptr->cnt);
|
||||
OBJ_DESTRUCT(&ptr->starting_vpid);
|
||||
OBJ_DESTRUCT(&ptr->ppn);
|
||||
OBJ_DESTRUCT(&ptr->nrank);
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(orte_regex_node_t,
|
||||
opal_list_item_t,
|
||||
orte_regex_node_construct,
|
||||
orte_regex_node_destruct);
|
||||
|
||||
#endif
|
||||
|
@ -554,19 +554,6 @@ typedef struct {
|
||||
} orte_jmap_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t);
|
||||
|
||||
typedef struct {
|
||||
/* list object */
|
||||
opal_list_item_t super;
|
||||
char *prefix;
|
||||
opal_value_array_t suffix;
|
||||
opal_value_array_t nodes;
|
||||
opal_value_array_t cnt;
|
||||
opal_value_array_t starting_vpid;
|
||||
opal_value_array_t ppn;
|
||||
opal_value_array_t nrank;
|
||||
} orte_regex_node_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_node_t);
|
||||
|
||||
/**
|
||||
* Get a job data object
|
||||
* We cannot just reference a job data object with its jobid as
|
||||
@ -589,7 +576,6 @@ ORTE_DECLSPEC extern bool orte_debug_daemons_file_flag;
|
||||
ORTE_DECLSPEC extern bool orte_leave_session_attached;
|
||||
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
||||
ORTE_DECLSPEC extern bool orted_spin_flag;
|
||||
ORTE_DECLSPEC extern bool orte_daemon_bootstrap;
|
||||
ORTE_DECLSPEC extern char *orte_local_cpu_type;
|
||||
ORTE_DECLSPEC extern char *orte_local_cpu_model;
|
||||
ORTE_DECLSPEC extern char *orte_basename;
|
||||
@ -644,7 +630,6 @@ ORTE_DECLSPEC extern int orte_clean_output;
|
||||
/* Nidmap and job maps */
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap;
|
||||
ORTE_DECLSPEC extern bool orte_use_regexp;
|
||||
ORTE_DECLSPEC extern char *orted_launch_cmd;
|
||||
|
||||
/* list of local children on a daemon */
|
||||
@ -686,6 +671,8 @@ ORTE_DECLSPEC extern char *orte_rankfile;
|
||||
#ifdef __WINDOWS__
|
||||
ORTE_DECLSPEC extern char *orte_ccp_headnode;
|
||||
#endif
|
||||
ORTE_DECLSPEC extern int orte_num_allocated_nodes;
|
||||
ORTE_DECLSPEC extern char *orte_node_regex;
|
||||
|
||||
/* default rank assigment and binding policy */
|
||||
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;
|
||||
|
@ -124,11 +124,6 @@ int orte_register_params(void)
|
||||
orte_debug_daemons_flag = true;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("orte", "daemon_bootstrap",
|
||||
"Bootstrap the connection to the HNP",
|
||||
false, false, (int)false, &value);
|
||||
orte_daemon_bootstrap = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* do we want session output left open? */
|
||||
mca_base_param_reg_int_name("orte", "leave_session_attached",
|
||||
"Whether applications and/or daemons should leave their sessions "
|
||||
@ -238,18 +233,18 @@ int orte_register_params(void)
|
||||
NULL, &orte_ccp_headnode);
|
||||
#endif
|
||||
|
||||
|
||||
/* regex of nodes in system */
|
||||
mca_base_param_reg_string_name("orte", "node_regex",
|
||||
"Regular expression defining nodes in the system",
|
||||
false, false, NULL, &orte_node_regex);
|
||||
|
||||
/* whether or not to keep FQDN hostnames */
|
||||
mca_base_param_reg_int_name("orte", "keep_fqdn_hostnames",
|
||||
"Whether or not to keep FQDN hostnames [default: no]",
|
||||
false, false, (int)false, &value);
|
||||
orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* whether or not to use regular expressions for launch */
|
||||
mca_base_param_reg_int_name("orte", "use_regexp",
|
||||
"Whether or not to use regular expressions for launch [default: no]",
|
||||
false, false, (int)false, &value);
|
||||
orte_use_regexp = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* whether to tag output */
|
||||
mca_base_param_reg_int_name("orte", "tag_output",
|
||||
"Tag all output with [job,rank] (default: false)",
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit orte_db orte_sensor test-time event-threads psm_keygen
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit orte_db orte_sensor test-time event-threads psm_keygen regex
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
48
orte/test/system/regex.c
Обычный файл
48
orte/test/system/regex.c
Обычный файл
@ -0,0 +1,48 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* The most basic of MPI applications
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int rc;
|
||||
char *regex, *save;
|
||||
char **nodes;
|
||||
|
||||
if (argc < 1 || NULL == argv[1]) {
|
||||
fprintf(stderr, "usage: regex <comma-separated list of nodes>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
save = strdup(argv[1]);
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_create(save, &regex))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
fprintf(stderr, "REGEX: %s\n", regex);
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(regex, &nodes))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(regex);
|
||||
regex = opal_argv_join(nodes, ',');
|
||||
opal_argv_free(nodes);
|
||||
if (0 == strcmp(regex, argv[1])) {
|
||||
fprintf(stderr, "EXACT MATCH\n");
|
||||
} else {
|
||||
fprintf(stderr, "ERROR: %s\n", regex);
|
||||
}
|
||||
free(regex);
|
||||
}
|
||||
free(save);
|
||||
}
|
@ -69,8 +69,6 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
|
||||
int32_t cnt;
|
||||
int rc;
|
||||
opal_byte_object_t *bo;
|
||||
int8_t flag;
|
||||
char *regexp;
|
||||
|
||||
if (!initialized) {
|
||||
/* need to construct the global arrays */
|
||||
@ -91,29 +89,6 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* extract the flag indicating the type of info in the buffer */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 < flag) {
|
||||
/* the data is a regular expression - extract and parse it
|
||||
* to get the daemonmap and process map
|
||||
*/
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &regexp, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_decode_maps(regexp, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(regexp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* extract the byte object holding the daemonmap */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
|
||||
|
1107
orte/util/regex.c
1107
orte/util/regex.c
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -25,18 +25,39 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_value_array.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
int start;
|
||||
int cnt;
|
||||
} orte_regex_range_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_range_t);
|
||||
|
||||
typedef struct {
|
||||
/* list object */
|
||||
opal_list_item_t super;
|
||||
char *prefix;
|
||||
char *suffix;
|
||||
int num_digits;
|
||||
opal_list_t ranges;
|
||||
} orte_regex_node_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_node_t);
|
||||
|
||||
/* NOTE: this is a destructive call for the nodes param - the
|
||||
* function will search and replace all commas with '\0'
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_regex_create(char *nodes, char **regexp);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_extract_node_names(char *regexp, char ***names);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn);
|
||||
|
||||
ORTE_DECLSPEC char* orte_regex_encode_maps(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user