Merge pull request #2916 from rhc54/topic/sim
Create an alternative mapping method
Этот коммит содержится в:
Коммит
97287f6568
@ -8,7 +8,7 @@ enable_heterogeneous=no
|
||||
enable_picky=yes
|
||||
enable_debug=yes
|
||||
enable_shared=yes
|
||||
enable_static=no
|
||||
enable_static=yes
|
||||
enable_memchecker=no
|
||||
enable_ipv6=no
|
||||
enable_mpi_fortran=no
|
||||
@ -18,7 +18,10 @@ enable_cxx_exceptions=no
|
||||
enable_oshmem=no
|
||||
enable_mpi_java=no
|
||||
enable_io_romio=no
|
||||
enable_builtin_atomics=no
|
||||
enable_contrib_no_build=libnbc
|
||||
enable_mca_no_build=btl-tcp,btl-sm,rcache-udreg
|
||||
enable_mca_direct=pml-ob1
|
||||
with_memory_manager=no
|
||||
with_tm=no
|
||||
with_verbs=no
|
||||
|
@ -24,7 +24,7 @@ my $ppn = 1;
|
||||
my @csvrow;
|
||||
|
||||
my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op);
|
||||
my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");
|
||||
my @options = ("", "", "", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");
|
||||
my @starterlist = qw(mpirun orterun srun aprun);
|
||||
my @starteroptionlist = ("--novm",
|
||||
"--hnp file:dvm_uri",
|
||||
|
@ -58,6 +58,7 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/sstore/base/base.h"
|
||||
@ -116,6 +117,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
char *param;
|
||||
hwloc_obj_t obj;
|
||||
unsigned i, j;
|
||||
orte_topology_t *t;
|
||||
opal_list_t transports;
|
||||
|
||||
/* my name is set, xfer it to the OPAL layer */
|
||||
@ -333,13 +335,8 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
/* create and store a node object where we are */
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(orte_process_info.nodename);
|
||||
node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
|
||||
/* point our topology to the one detected locally */
|
||||
node->topology = OBJ_NEW(orte_topology_t);
|
||||
node->topology->sig = strdup(orte_topo_signature);
|
||||
node->topology->topo = opal_hwloc_topology;
|
||||
/* add it to the array of known ones */
|
||||
opal_pointer_array_add(orte_node_topologies, node->topology);
|
||||
node->index = ORTE_PROC_MY_NAME->vpid;
|
||||
opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
|
||||
|
||||
/* create and store a proc object for us */
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
@ -496,14 +493,40 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
error = "orte_rtc_base_select";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_rmaps_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_rmaps_base_find_available";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* if we are using static ports, then we need to setup
|
||||
/* if a topology file was given, then the rmaps framework open
|
||||
* will have reset our topology. Ensure we always get the right
|
||||
* one by setting our node topology afterwards
|
||||
*/
|
||||
t = OBJ_NEW(orte_topology_t);
|
||||
t->topo = opal_hwloc_topology;
|
||||
/* generate the signature */
|
||||
orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
|
||||
t->sig = strdup(orte_topo_signature);
|
||||
opal_pointer_array_add(orte_node_topologies, t);
|
||||
node->topology = t;
|
||||
if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
|
||||
opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
|
||||
}
|
||||
|
||||
/* if we were given the host list, then we need to setup
|
||||
* the daemon info so the RML can function properly
|
||||
* without requiring a wireup stage. This must be done
|
||||
* after we enable_comm as that function determines our
|
||||
* own port, which we need in order to construct the nidmap
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
if (NULL != hosts) {
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it - this will update the
|
||||
* routing plan as well
|
||||
|
@ -427,7 +427,8 @@ static int rte_init(void)
|
||||
/* create and store a node object where we are */
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(orte_process_info.nodename);
|
||||
node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
|
||||
node->index = ORTE_PROC_MY_NAME->vpid;
|
||||
opal_pointer_array_set_item(orte_node_pool, 0, node);
|
||||
|
||||
/* create and store a proc object for us */
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
|
@ -545,6 +545,8 @@ static int pack_xcast(orte_grpcomm_signature_t *sig,
|
||||
OBJ_DESTRUCT(&data);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
||||
"MSG SIZE: %lu", buffer->bytes_used));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -67,6 +67,8 @@
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/schizo/schizo.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
@ -74,6 +76,7 @@
|
||||
|
||||
#include "orte/util/context_fns.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
@ -137,7 +140,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
|
||||
}
|
||||
|
||||
/* if we are not using static ports, we need to send the wireup info */
|
||||
if (!orte_static_ports) {
|
||||
if (!orte_static_ports && !orte_fwd_mpirun_port) {
|
||||
/* pack a flag indicating wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
|
||||
@ -234,11 +237,11 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
int rc;
|
||||
orte_std_cntr_t cnt;
|
||||
orte_job_t *jdata=NULL, *daemons;
|
||||
int32_t n, j, k;
|
||||
orte_proc_t *pptr, *dmn;
|
||||
int32_t n, k, m;
|
||||
opal_buffer_t *bptr;
|
||||
orte_app_context_t *app;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *pptr, *dmn;
|
||||
orte_app_context_t *app;
|
||||
bool newmap = false;
|
||||
int8_t flag;
|
||||
|
||||
@ -246,6 +249,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
"%s odls:constructing child list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* set a default response */
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
/* get the daemon job object */
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
@ -286,24 +290,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
if (NULL == orte_get_job_data_object(jdata->jobid)) {
|
||||
/* nope - add it */
|
||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
|
||||
/* connect each proc to its node object */
|
||||
for (j=0; j < jdata->procs->size; j++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, pptr->parent))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
OBJ_RETAIN(dmn->node);
|
||||
pptr->node = dmn->node;
|
||||
/* add proc to node - note that num_procs for the
|
||||
* node was already correctly unpacked, so don't
|
||||
* increment it here */
|
||||
OBJ_RETAIN(pptr);
|
||||
opal_pointer_array_add(dmn->node->procs, pptr);
|
||||
}
|
||||
} else {
|
||||
/* yep - so we can drop this copy */
|
||||
jdata->jobid = ORTE_JOBID_INVALID;
|
||||
@ -351,41 +337,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
for (n=0; n < jdata->procs->size; n++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_UNDEF == pptr->state) {
|
||||
/* not ready for use yet */
|
||||
continue;
|
||||
}
|
||||
/* see if it belongs to us */
|
||||
if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* is this child on our current list of children */
|
||||
if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
|
||||
/* not on the local list */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s adding proc %s to my local list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pptr->name)));
|
||||
/* keep tabs of the number of local procs */
|
||||
jdata->num_local_procs++;
|
||||
/* add this proc to our child list */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
|
||||
opal_pointer_array_add(orte_local_children, pptr);
|
||||
}
|
||||
|
||||
/* if the job is in restart mode, the child must not barrier when launched */
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
|
||||
orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
/* mark that this app_context is being used on this node */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
|
||||
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
|
||||
}
|
||||
}
|
||||
goto COMPLETE;
|
||||
} else {
|
||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
|
||||
}
|
||||
@ -396,16 +347,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
newmap = true;
|
||||
}
|
||||
|
||||
/* if we have a file map, then we need to load it */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&bptr, OPAL_BUFFER)) {
|
||||
if (NULL != orte_dfs.load_file_maps) {
|
||||
orte_dfs.load_file_maps(jdata->jobid, bptr, fm_release, bptr);
|
||||
} else {
|
||||
OBJ_RELEASE(bptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* check the procs */
|
||||
if (orte_no_vm) {
|
||||
/* if we are operating novm, then mpirun will have sent us
|
||||
* the complete array of procs - process it */
|
||||
for (n=0; n < jdata->procs->size; n++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
|
||||
continue;
|
||||
@ -474,20 +418,82 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
|
||||
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
|
||||
}
|
||||
}
|
||||
/* reset the node map flags we used so the next job will start clean */
|
||||
} else {
|
||||
/* create the map - will already have been done for the novm case */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* find our local procs */
|
||||
for (n=0; n < jdata->map->nodes->size; n++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
|
||||
continue;
|
||||
}
|
||||
if (node->index != (int)ORTE_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
for (m=0; m < node->procs->size; m++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, m))) {
|
||||
continue;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
|
||||
/* not on the local list */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s adding proc %s to my local list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pptr->name)));
|
||||
/* keep tabs of the number of local procs */
|
||||
jdata->num_local_procs++;
|
||||
/* add this proc to our child list */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
|
||||
opal_pointer_array_add(orte_local_children, pptr);
|
||||
/* mark that this app_context is being used on this node */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
|
||||
ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* compute and save bindings of local children */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* reset any node map flags we used so the next job will start clean */
|
||||
for (n=0; n < jdata->map->nodes->size; n++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
|
||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
}
|
||||
}
|
||||
|
||||
COMPLETE:
|
||||
/* if we wanted to see the map, now is the time to display it */
|
||||
if (jdata->map->display_map) {
|
||||
orte_rmaps_base_display_map(jdata);
|
||||
}
|
||||
|
||||
/* if we have a file map, then we need to load it */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&bptr, OPAL_BUFFER)) {
|
||||
if (NULL != orte_dfs.load_file_maps) {
|
||||
orte_dfs.load_file_maps(jdata->jobid, bptr, fm_release, bptr);
|
||||
} else {
|
||||
OBJ_RELEASE(bptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* register this job with the PMIx server - need to wait until after we
|
||||
* have computed the #local_procs before calling the function */
|
||||
if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace(jdata, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
/* to save memory, purge the job map of all procs other than
|
||||
* our own - for daemons, this will completely release the
|
||||
* proc structures. For the HNP, the proc structs will
|
||||
* remain in the orte_job_t array */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
REPORT_ERROR:
|
||||
@ -636,6 +642,10 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
|
||||
bool index_argv;
|
||||
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
"%s local:launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* establish our baseline working directory - we will be potentially
|
||||
* bouncing around as we execute various apps, but we will always return
|
||||
* to this place as our default directory
|
||||
@ -804,7 +814,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* does this child belong to this app? */
|
||||
if (j != (int)child->app_idx) {
|
||||
continue;
|
||||
|
@ -325,13 +325,12 @@ static int tcp_component_register(void)
|
||||
} else {
|
||||
mca_oob_tcp_component.tcp6_static_ports = NULL;
|
||||
}
|
||||
if (NULL == mca_oob_tcp_component.tcp_static_ports &&
|
||||
NULL == mca_oob_tcp_component.tcp6_static_ports) {
|
||||
orte_static_ports = false;
|
||||
} else {
|
||||
#endif // OPAL_ENABLE_IPV6
|
||||
|
||||
if (NULL != mca_oob_tcp_component.tcp_static_ports ||
|
||||
NULL != mca_oob_tcp_component.tcp6_static_ports) {
|
||||
orte_static_ports = true;
|
||||
}
|
||||
#endif // OPAL_ENABLE_IPV6
|
||||
|
||||
dyn_port_string = NULL;
|
||||
(void)mca_base_component_var_register(component, "dynamic_ipv4_ports",
|
||||
|
@ -385,7 +385,7 @@ static int create_listen(void)
|
||||
conn = OBJ_NEW(mca_oob_tcp_listener_t);
|
||||
conn->sd = sd;
|
||||
conn->port = ntohs(((struct sockaddr_in*) &inaddr)->sin_port);
|
||||
if (orte_static_ports && 0 == orte_process_info.my_port) {
|
||||
if (0 == orte_process_info.my_port) {
|
||||
/* save the first one */
|
||||
orte_process_info.my_port = conn->port;
|
||||
}
|
||||
|
@ -202,6 +202,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_std_cntr_t nnode;
|
||||
orte_job_t *daemons;
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
char *ltmp;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
@ -350,16 +351,13 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* if we have static ports, we need to ensure that mpirun is
|
||||
/* ensure that mpirun is
|
||||
* on the list. Since alps won't be launching a daemon on it,
|
||||
* it won't have been placed on the list, so create a new
|
||||
* version here that includes it */
|
||||
if (orte_static_ports) {
|
||||
char *ltmp;
|
||||
asprintf(<mp, "%s,%s", orte_process_info.nodename, nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
nodelist_flat = ltmp;
|
||||
}
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
|
@ -207,7 +207,7 @@ static void files_ready(int status, void *cbdata)
|
||||
if (ORTE_SUCCESS != status) {
|
||||
ORTE_FORCED_TERMINATE(status);
|
||||
} else {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
|
||||
}
|
||||
}
|
||||
|
||||
@ -395,22 +395,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
|
||||
/* quick sanity check - is the stdin target within range
|
||||
* of the job?
|
||||
*/
|
||||
if (ORTE_VPID_WILDCARD != jdata->stdin_target &&
|
||||
ORTE_VPID_INVALID != jdata->stdin_target &&
|
||||
jdata->num_procs <= jdata->stdin_target) {
|
||||
/* this request cannot be met */
|
||||
orte_show_help("help-plm-base.txt", "stdin-target-out-of-range", true,
|
||||
ORTE_VPID_PRINT(jdata->stdin_target),
|
||||
ORTE_VPID_PRINT(jdata->num_procs));
|
||||
orte_never_launched = true;
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* If this job is being started by me, then there is nothing
|
||||
* further we need to do as any user directives (e.g., to tie
|
||||
* off IO to /dev/null) will have been included in the launch
|
||||
@ -426,19 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
||||
* about stdin */
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/*
|
||||
* Notify the Global SnapC component regarding new job (even if it was restarted)
|
||||
*/
|
||||
{
|
||||
int rc;
|
||||
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) {
|
||||
/* Silent Failure :/ JJH */
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* if coprocessors were detected, now is the time to
|
||||
* identify who is attached to what host - this info
|
||||
* will be shipped to the daemons in the nidmap. Someday,
|
||||
@ -1457,8 +1428,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, rml_uri);
|
||||
free(rml_uri);
|
||||
|
||||
/* if we have static ports, pass the node list */
|
||||
if (orte_static_ports) {
|
||||
/* pass the node list if one was given*/
|
||||
param = NULL;
|
||||
if (NULL != nodes) {
|
||||
/* convert the nodes to a regex */
|
||||
@ -1475,6 +1445,14 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if requested, pass our port */
|
||||
if (orte_fwd_mpirun_port) {
|
||||
asprintf(¶m, "%d", orte_process_info.my_port);
|
||||
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
|
||||
opal_argv_append(argc, argv, "oob_tcp_static_ipv4_ports");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if output-filename was specified, pass that along */
|
||||
|
@ -341,6 +341,7 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* NEVER update the proc state before activating the state machine - let
|
||||
* the state cbfunc update it as it may need to compare this
|
||||
@ -374,14 +375,14 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
count=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto DEPART;
|
||||
goto CLEANUP;
|
||||
}
|
||||
name.jobid = job;
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto DEPART;
|
||||
goto CLEANUP;
|
||||
}
|
||||
count=1;
|
||||
while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
|
||||
@ -398,11 +399,6 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
goto DEPART;
|
||||
}
|
||||
|
||||
DEPART:
|
||||
/* see if an error occurred - if so, wakeup the HNP so we can exit */
|
||||
if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
|
||||
jdata = NULL;
|
||||
|
@ -1191,7 +1191,6 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_routed.get_routing_list(rtmod, &coll);
|
||||
}
|
||||
|
||||
if (orte_static_ports) {
|
||||
/* create a list of all nodes involved so we can pass it along */
|
||||
char **nodelist = NULL;
|
||||
orte_node_t *n2;
|
||||
@ -1201,15 +1200,11 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
/* we need mpirun to be the first node on this list */
|
||||
if (NULL == nodelist ||
|
||||
0 != strcmp(nodelist[0], orte_process_info.nodename)) {
|
||||
if (0 != strcmp(nodelist[0], orte_process_info.nodename)) {
|
||||
opal_argv_prepend_nosize(&nodelist, orte_process_info.nodename);
|
||||
}
|
||||
nlistflat = opal_argv_join(nodelist, ',');
|
||||
opal_argv_free(nodelist);
|
||||
} else {
|
||||
nlistflat = NULL;
|
||||
}
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||
|
@ -118,7 +118,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
/* check for topology */
|
||||
if (use_local_topology) {
|
||||
/* use our topology */
|
||||
topo = opal_hwloc_topology;
|
||||
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
|
||||
} else if (NULL != files) {
|
||||
if (0 != hwloc_topology_init(&topo)) {
|
||||
orte_show_help("help-ras-simulator.txt",
|
||||
@ -257,17 +257,17 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
|
||||
node->slots_max = 0;
|
||||
} else {
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
node->slots_max = opal_hwloc_base_get_npus(topo, obj);
|
||||
obj = hwloc_get_root_obj(t->topo);
|
||||
node->slots_max = opal_hwloc_base_get_npus(t->topo, obj);
|
||||
}
|
||||
if (NULL == slot_cnt || NULL == slot_cnt[n]) {
|
||||
node->slots = 0;
|
||||
} else {
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
node->slots = opal_hwloc_base_get_npus(topo, obj);
|
||||
obj = hwloc_get_root_obj(t->topo);
|
||||
node->slots = opal_hwloc_base_get_npus(t->topo, obj);
|
||||
}
|
||||
node->topology = OBJ_NEW(orte_topology_t);
|
||||
node->topology->topo = topo;
|
||||
OBJ_RETAIN(t);
|
||||
node->topology = t;
|
||||
opal_output_verbose(1, orte_ras_base_framework.framework_output,
|
||||
"Created Node <%10s> [%3d : %3d]",
|
||||
node->name, node->slots, node->slots_max);
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -99,7 +99,7 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t);
|
||||
/*
|
||||
* Map a job
|
||||
*/
|
||||
ORTE_DECLSPEC void orte_rmaps_base_map_job(int fd, short args, void *cbdata);
|
||||
ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata);
|
||||
|
||||
/**
|
||||
* Utility routines to get/set vpid mapping for the job
|
||||
@ -126,6 +126,8 @@ ORTE_DECLSPEC int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *poli
|
||||
orte_mapping_policy_t mapping,
|
||||
char *spec);
|
||||
|
||||
ORTE_DECLSPEC void orte_rmaps_base_display_map(orte_job_t *jdata);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -414,6 +414,9 @@ static int bind_in_place(orte_job_t *jdata,
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_do_not_launch) {
|
||||
/* if we don't want to launch, then we are just testing the system,
|
||||
* so ignore questions about support capabilities
|
||||
@ -606,6 +609,9 @@ static int bind_to_cpuset(orte_job_t *jdata)
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_do_not_launch) {
|
||||
/* if we don't want to launch, then we are just testing the system,
|
||||
* so ignore questions about support capabilities
|
||||
@ -842,6 +848,9 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != i) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_do_not_launch) {
|
||||
/* if we don't want to launch, then we are just testing the system,
|
||||
* so ignore questions about support capabilities
|
||||
|
@ -42,31 +42,18 @@
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
|
||||
|
||||
/*
|
||||
* Function for selecting one component from all those that are
|
||||
* available.
|
||||
*/
|
||||
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
int orte_rmaps_base_map_job(orte_job_t *jdata)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
orte_node_t *node;
|
||||
int rc, i, ppx;
|
||||
bool did_map, given, pernode;
|
||||
orte_rmaps_base_selected_module_t *mod;
|
||||
orte_job_t *parent;
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_vpid_t nprocs;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
jdata->state = ORTE_JOB_STATE_MAP;
|
||||
|
||||
/* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
|
||||
* ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
|
||||
* DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
|
||||
*/
|
||||
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps: mapping job %s",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
@ -128,10 +115,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
|
||||
/* inform the user of the error */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
return;
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
nprocs += slots;
|
||||
@ -350,9 +335,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
int i;
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
OBJ_RELEASE(caddy);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
return;
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
t0 = node->topology;
|
||||
for (i=1; i < orte_node_pool->size; i++) {
|
||||
@ -385,9 +368,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
*/
|
||||
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) {
|
||||
@ -395,9 +376,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
* for launch as all the resources were busy
|
||||
*/
|
||||
orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_CANNOT_LAUNCH);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we get here without doing the map, or with zero procs in
|
||||
@ -407,9 +386,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
orte_show_help("help-orte-rmaps-base.txt", "failed-map", true,
|
||||
did_map ? "mapped" : "unmapped",
|
||||
jdata->num_procs, jdata->map->num_nodes);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
return ORTE_ERR_INVALID_NUM_PROCS;
|
||||
}
|
||||
|
||||
/* if any node is oversubscribed, then check to see if a binding
|
||||
@ -425,17 +402,15 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (orte_no_vm) {
|
||||
/* compute and save bindings */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* set the offset so shared memory components can potentially
|
||||
@ -452,10 +427,12 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* if we wanted to display the map, now is the time to do it - ignore
|
||||
* daemon job
|
||||
*/
|
||||
if (jdata->map->display_map) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
void orte_rmaps_base_display_map(orte_job_t *jdata)
|
||||
{
|
||||
/* ignore daemon job */
|
||||
char *output=NULL;
|
||||
int i, j, cnt;
|
||||
orte_node_t *node;
|
||||
@ -547,9 +524,3 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
free(output);
|
||||
}
|
||||
}
|
||||
/* set the job state to the next position */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -71,16 +72,16 @@ ORTE_DECLSPEC extern opal_event_t orte_mapping_event;
|
||||
typedef int (*orte_rmaps_base_module_map_fn_t)(orte_job_t *jdata);
|
||||
|
||||
/*
|
||||
* rmaps module version 1.3.0
|
||||
* rmaps module version 3.0.0
|
||||
*/
|
||||
struct orte_rmaps_base_module_1_3_0_t {
|
||||
struct orte_rmaps_base_module_3_0_0_t {
|
||||
/** Mapping function pointer */
|
||||
orte_rmaps_base_module_map_fn_t map_job;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_rmaps_base_module_1_3_0_t orte_rmaps_base_module_1_3_0_t;
|
||||
typedef struct orte_rmaps_base_module_3_0_0_t orte_rmaps_base_module_3_0_0_t;
|
||||
/** Convenience typedef */
|
||||
typedef orte_rmaps_base_module_1_3_0_t orte_rmaps_base_module_t;
|
||||
typedef orte_rmaps_base_module_3_0_0_t orte_rmaps_base_module_t;
|
||||
|
||||
|
||||
/*
|
||||
@ -88,18 +89,18 @@ typedef orte_rmaps_base_module_1_3_0_t orte_rmaps_base_module_t;
|
||||
*/
|
||||
|
||||
/**
|
||||
* rmaps component version 1.3.0
|
||||
* rmaps component version 3.0.0
|
||||
*/
|
||||
struct orte_rmaps_base_component_2_0_0_t {
|
||||
struct orte_rmaps_base_component_3_0_0_t {
|
||||
/** Base MCA structure */
|
||||
mca_base_component_t base_version;
|
||||
/** Base MCA data */
|
||||
mca_base_component_data_t base_data;
|
||||
};
|
||||
/** Convenience typedef */
|
||||
typedef struct orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_2_0_0_t;
|
||||
typedef struct orte_rmaps_base_component_3_0_0_t orte_rmaps_base_component_3_0_0_t;
|
||||
/** Convenience typedef */
|
||||
typedef orte_rmaps_base_component_2_0_0_t orte_rmaps_base_component_t;
|
||||
typedef orte_rmaps_base_component_3_0_0_t orte_rmaps_base_component_t;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -20,8 +21,10 @@
|
||||
#
|
||||
[orte-rmaps-rr:alloc-error]
|
||||
There are not enough slots available in the system to satisfy the %d slots
|
||||
that were requested by the application:
|
||||
%s
|
||||
that were requested:
|
||||
|
||||
application: %s
|
||||
host: %s
|
||||
|
||||
Either request fewer slots for your application, or make more slots available
|
||||
for use.
|
||||
|
@ -59,7 +59,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
|
||||
if (num_slots < (int)app->num_procs) {
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -192,13 +192,13 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
|
||||
* via hostfile/dash-host */
|
||||
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
} else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
/* if we were explicitly told not to oversubscribe, then don't */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -237,7 +237,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
||||
if (num_slots < (int)app->num_procs) {
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -377,13 +377,13 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
||||
* via hostfile/dash-host */
|
||||
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
} else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
/* if we were explicitly told not to oversubscribe, then don't */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -491,7 +491,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
||||
if (num_slots < app->num_procs) {
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -599,13 +599,13 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
||||
* via hostfile/dash-host */
|
||||
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
} else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
/* if we were explicitly told not to oversubscribe, then don't */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -652,7 +652,7 @@ static int byobj_span(orte_job_t *jdata,
|
||||
if (num_slots < (int)app->num_procs) {
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
true, app->num_procs, app->app, orte_process_info.nodename);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
@ -168,6 +168,12 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
|
||||
OPAL_TIMING_EVENT((&tm_rml,"from %s %d bytes",
|
||||
ORTE_NAME_PRINT(&msg->sender), msg->iov.iov_len));
|
||||
|
||||
/* if this message is just to warmup the connection, then drop it */
|
||||
if (ORTE_RML_TAG_WARMUP_CONNECTION == msg->tag) {
|
||||
OBJ_RELEASE(msg);
|
||||
return;
|
||||
}
|
||||
|
||||
/* see if we have a waiting recv for this message */
|
||||
OPAL_LIST_FOREACH(post, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) {
|
||||
/* since names could include wildcards, must use
|
||||
|
@ -172,6 +172,9 @@ BEGIN_C_DECLS
|
||||
/* topology report */
|
||||
#define ORTE_RML_TAG_TOPOLOGY_REPORT 62
|
||||
|
||||
/* warmup connection - simply establishes the connection */
|
||||
#define ORTE_RML_TAG_WARMUP_CONNECTION 63
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
|
||||
|
@ -428,7 +428,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orte_cmd_options.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Disable recovery (resets all recovery options to off)" },
|
||||
|
||||
{ "state_novm_select", '\0', "novm", "novm", 0,
|
||||
{ "orte_no_vm", '\0', "novm", "novm", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs)" },
|
||||
|
||||
@ -449,6 +449,11 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Terminate the DVM" },
|
||||
|
||||
/* fwd mpirun port */
|
||||
{ "orte_fwd_mpirun_port", '\0', "fwd-mpirun-port", "fwd-mpirun-port", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Forward mpirun port to compute node daemons so all will use it" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
|
@ -80,8 +80,6 @@ static orte_job_state_t launch_states[] = {
|
||||
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
|
||||
ORTE_JOB_STATE_DAEMONS_REPORTED,
|
||||
ORTE_JOB_STATE_VM_READY,
|
||||
ORTE_JOB_STATE_MAP,
|
||||
ORTE_JOB_STATE_MAP_COMPLETE,
|
||||
ORTE_JOB_STATE_SYSTEM_PREP,
|
||||
ORTE_JOB_STATE_LAUNCH_APPS,
|
||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||
@ -100,8 +98,6 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_daemons_launched,
|
||||
orte_plm_base_daemons_reported,
|
||||
vm_ready,
|
||||
orte_rmaps_base_map_job,
|
||||
orte_plm_base_mapping_complete,
|
||||
orte_plm_base_complete_setup,
|
||||
orte_plm_base_launch_apps,
|
||||
orte_state_base_local_launch_complete,
|
||||
@ -213,8 +209,9 @@ static void files_ready(int status, void *cbdata)
|
||||
|
||||
if (ORTE_SUCCESS != status) {
|
||||
ORTE_FORCED_TERMINATE(status);
|
||||
return;
|
||||
} else {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
|
||||
}
|
||||
}
|
||||
|
||||
@ -259,7 +256,7 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* flag that daemons were launchd so we will update the nidmap */
|
||||
/* flag that daemons were launched so we will update the nidmap */
|
||||
flag = 1;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
/* construct a nodemap with everything in it */
|
||||
@ -269,6 +266,7 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
if (!orte_static_ports && !orte_fwd_mpirun_port) {
|
||||
/* pack a flag indicating wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
@ -296,6 +294,10 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
free(bo.bytes);
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
} else {
|
||||
flag = 0;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
}
|
||||
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
|
@ -73,8 +73,6 @@ static orte_job_state_t launch_states[] = {
|
||||
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
|
||||
ORTE_JOB_STATE_DAEMONS_REPORTED,
|
||||
ORTE_JOB_STATE_VM_READY,
|
||||
ORTE_JOB_STATE_MAP,
|
||||
ORTE_JOB_STATE_MAP_COMPLETE,
|
||||
ORTE_JOB_STATE_SYSTEM_PREP,
|
||||
ORTE_JOB_STATE_LAUNCH_APPS,
|
||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||
@ -93,8 +91,6 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_daemons_launched,
|
||||
orte_plm_base_daemons_reported,
|
||||
orte_plm_base_vm_ready,
|
||||
orte_rmaps_base_map_job,
|
||||
orte_plm_base_mapping_complete,
|
||||
orte_plm_base_complete_setup,
|
||||
orte_plm_base_launch_apps,
|
||||
orte_state_base_local_launch_complete,
|
||||
|
@ -61,7 +61,6 @@ orte_state_base_module_t orte_state_novm_module = {
|
||||
};
|
||||
|
||||
static void allocation_complete(int fd, short args, void *cbdata);
|
||||
static void map_complete(int fd, short args, void *cbdata);
|
||||
static void vm_ready(int fd, short args, void *cbdata);
|
||||
|
||||
/* defined state machine sequence for no VM - individual
|
||||
@ -75,8 +74,6 @@ static orte_job_state_t launch_states[] = {
|
||||
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
|
||||
ORTE_JOB_STATE_DAEMONS_REPORTED,
|
||||
ORTE_JOB_STATE_VM_READY,
|
||||
ORTE_JOB_STATE_MAP,
|
||||
ORTE_JOB_STATE_MAP_COMPLETE,
|
||||
ORTE_JOB_STATE_SYSTEM_PREP,
|
||||
ORTE_JOB_STATE_LAUNCH_APPS,
|
||||
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
|
||||
@ -96,8 +93,6 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_daemons_launched,
|
||||
orte_plm_base_daemons_reported,
|
||||
vm_ready,
|
||||
orte_rmaps_base_map_job,
|
||||
map_complete,
|
||||
orte_plm_base_complete_setup,
|
||||
orte_plm_base_launch_apps,
|
||||
orte_state_base_local_launch_complete,
|
||||
@ -200,7 +195,7 @@ static void allocation_complete(int fd, short args, void *cbdata)
|
||||
orte_job_t *daemons;
|
||||
orte_topology_t *t;
|
||||
orte_node_t *node;
|
||||
int i;
|
||||
int i, rc;
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
|
||||
@ -213,7 +208,6 @@ static void allocation_complete(int fd, short args, void *cbdata)
|
||||
/* mark that we are not using a VM */
|
||||
orte_set_attribute(&daemons->attributes, ORTE_JOB_NO_VM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
|
||||
|
||||
/* ensure that all nodes point to our topology - we
|
||||
* cannot support hetero nodes with this state machine
|
||||
*/
|
||||
@ -224,28 +218,38 @@ static void allocation_complete(int fd, short args, void *cbdata)
|
||||
}
|
||||
node->topology = t;
|
||||
}
|
||||
if (!orte_managed_allocation) {
|
||||
if (NULL != orte_set_slots &&
|
||||
0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) {
|
||||
for (i=0; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:setting slots for node %s by %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
|
||||
orte_plm_base_set_slots(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* move to the map stage */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
|
||||
/* perform the map */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* after we map, we are ready to launch the daemons */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||
|
||||
done:
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(state);
|
||||
}
|
||||
|
||||
/* after we map, we are ready to launch the daemons */
|
||||
static void map_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
||||
/* move to the map stage */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(state);
|
||||
}
|
||||
|
||||
static void vm_ready(int fd, short args, void *cbdata)
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -13,6 +14,7 @@
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "state_novm.h"
|
||||
@ -26,7 +28,6 @@ const char *orte_state_novm_component_version_string =
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int state_novm_register (void);
|
||||
static int state_novm_open(void);
|
||||
static int state_novm_close(void);
|
||||
static int state_novm_component_query(mca_base_module_t **module, int *priority);
|
||||
@ -50,8 +51,7 @@ orte_state_base_component_t mca_state_novm_component =
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = state_novm_open,
|
||||
.mca_close_component = state_novm_close,
|
||||
.mca_query_component = state_novm_component_query,
|
||||
.mca_register_component_params = state_novm_register
|
||||
.mca_query_component = state_novm_component_query
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
@ -59,23 +59,6 @@ orte_state_base_component_t mca_state_novm_component =
|
||||
},
|
||||
};
|
||||
|
||||
static bool select_me = false;
|
||||
|
||||
static int state_novm_register (void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
select_me = false;
|
||||
ret = mca_base_component_var_register (&mca_state_novm_component.base_version,
|
||||
"select", "Use this component",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL,
|
||||
0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&select_me);
|
||||
return (0 > ret) ? ret : ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_novm_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
@ -88,7 +71,7 @@ static int state_novm_close(void)
|
||||
|
||||
static int state_novm_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (ORTE_PROC_IS_HNP && select_me) {
|
||||
if (ORTE_PROC_IS_HNP && orte_no_vm) {
|
||||
/* set our priority high so we'll be selected if user desires */
|
||||
*priority = 1000;
|
||||
*module = (mca_base_module_t *)&orte_state_novm_module;
|
||||
|
@ -23,6 +23,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
@ -256,6 +257,9 @@ static void track_procs(int fd, short argc, void *cbdata)
|
||||
int rc, i, j;
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
char *rtmod;
|
||||
orte_std_cntr_t index;
|
||||
orte_job_map_t *map;
|
||||
orte_node_t *node;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s state:orted:track_procs called for proc %s state %s",
|
||||
@ -426,18 +430,6 @@ static void track_procs(int fd, short argc, void *cbdata)
|
||||
if (pptr->name.jobid == jdata->jobid) {
|
||||
/* clear the entry in the local children */
|
||||
opal_pointer_array_set_item(orte_local_children, i, NULL);
|
||||
/* find it in the node->procs array */
|
||||
for (j=0; j < pptr->node->procs->size; j++) {
|
||||
if (NULL == (pdata = (orte_proc_t*)opal_pointer_array_get_item(pptr->node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
if (pdata == pptr) {
|
||||
/* remove it */
|
||||
opal_pointer_array_set_item(pptr->node->procs, j, NULL);
|
||||
OBJ_RELEASE(pdata); // maintain accounting
|
||||
break;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(pptr); // maintain accounting
|
||||
}
|
||||
}
|
||||
@ -451,6 +443,47 @@ static void track_procs(int fd, short argc, void *cbdata)
|
||||
opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
|
||||
}
|
||||
|
||||
/* release the resources */
|
||||
if (NULL != jdata->map) {
|
||||
map = jdata->map;
|
||||
for (index = 0; index < map->nodes->size; index++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:orted releasing procs from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name));
|
||||
for (i = 0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (pptr->name.jobid != jdata->jobid) {
|
||||
/* skip procs from another job */
|
||||
continue;
|
||||
}
|
||||
node->slots_inuse--;
|
||||
node->num_procs--;
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:orted releasing proc %s from node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pptr->name), node->name));
|
||||
/* set the entry in the node array to NULL */
|
||||
opal_pointer_array_set_item(node->procs, i, NULL);
|
||||
/* release the proc once for the map entry */
|
||||
OBJ_RELEASE(pptr);
|
||||
}
|
||||
/* set the node location to NULL */
|
||||
opal_pointer_array_set_item(map->nodes, index, NULL);
|
||||
/* maintain accounting */
|
||||
OBJ_RELEASE(node);
|
||||
/* flag that the node is no longer in a map */
|
||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
}
|
||||
OBJ_RELEASE(map);
|
||||
jdata->map = NULL;
|
||||
}
|
||||
|
||||
/* cleanup the job info */
|
||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
||||
OBJ_RELEASE(jdata);
|
||||
|
@ -111,6 +111,11 @@
|
||||
static opal_event_t *pipe_handler;
|
||||
static void shutdown_callback(int fd, short flags, void *arg);
|
||||
static void pipe_closed(int fd, short flags, void *arg);
|
||||
static void rollup(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
static opal_buffer_t *bucket;
|
||||
static int ncollected = 0;
|
||||
|
||||
static char *orte_parent_uri;
|
||||
|
||||
@ -228,6 +233,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
memset(&orted_globals, 0, sizeof(orted_globals));
|
||||
/* initialize the singleton died pipe to an illegal value so we can detect it was set */
|
||||
orted_globals.singleton_died_pipe = -1;
|
||||
bucket = OBJ_NEW(opal_buffer_t);
|
||||
|
||||
/* setup to check common command line options that just report and die */
|
||||
cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
@ -694,6 +700,31 @@ int orte_daemon(int argc, char *argv[])
|
||||
* for it
|
||||
*/
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
orte_process_name_t target;
|
||||
target.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
if (orte_fwd_mpirun_port) {
|
||||
/* setup the rollup callback */
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
|
||||
ORTE_RML_PERSISTENT, rollup, NULL);
|
||||
target.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
/* since we will be waiting for any children to send us
|
||||
* their rollup info before sending to our parent, save
|
||||
* a little time in the launch phase by "warming up" the
|
||||
* connection to our parent while we wait for our children */
|
||||
buffer = OBJ_NEW(opal_buffer_t); // zero-byte message
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
|
||||
ORTE_PROC_MY_PARENT, buffer,
|
||||
ORTE_RML_TAG_WARMUP_CONNECTION,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
goto DONE;
|
||||
}
|
||||
} else {
|
||||
target.vpid = 0;
|
||||
}
|
||||
|
||||
/* send the information to the orted report-back point - this function
|
||||
* will process the data, but also counts the number of
|
||||
* orteds that reported back so the launch procedure can continue.
|
||||
@ -767,9 +798,9 @@ int orte_daemon(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
/* send to the HNP's callback - will be routed if routes are available */
|
||||
/* send it to the designated target */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
|
||||
ORTE_PROC_MY_HNP, buffer,
|
||||
&target, buffer,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -898,3 +929,30 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
static void rollup(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
int nreqd;
|
||||
char *rtmod;
|
||||
int ret;
|
||||
|
||||
/* xfer the contents of the rollup to our bucket */
|
||||
opal_dss.copy_payload(bucket, buffer);
|
||||
ncollected++;
|
||||
|
||||
/* get the number of children, and include ourselves */
|
||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
||||
nreqd = orte_routed.num_routes(rtmod) + 1;
|
||||
if (nreqd == ncollected) {
|
||||
/* relay this on to our parent */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
|
||||
ORTE_PROC_MY_PARENT, bucket,
|
||||
ORTE_RML_TAG_ORTED_CALLBACK,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(bucket);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -82,6 +82,13 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack the flags */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->flags)), 1, ORTE_JOB_FLAGS_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the personality */
|
||||
count = opal_argv_count(jobs[i]->personality);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &count, 1, OPAL_INT32))) {
|
||||
@ -126,8 +133,8 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* and the procs, if we have them */
|
||||
if (0 < jobs[i]->num_procs) {
|
||||
|
||||
if (orte_no_vm && 0 < jobs[i]->num_procs) {
|
||||
for (j=0; j < jobs[i]->procs->size; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) {
|
||||
continue;
|
||||
@ -163,6 +170,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
j=0;
|
||||
} else {
|
||||
/* pack a one to indicate a map is there */
|
||||
j = 1;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)&j, 1, ORTE_STD_CNTR))) {
|
||||
@ -191,13 +199,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the flags */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||
(void*)(&(jobs[i]->flags)), 1, ORTE_JOB_FLAGS_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the attributes that need to be sent */
|
||||
count = 0;
|
||||
OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) {
|
||||
|
@ -187,13 +187,12 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
|
||||
}
|
||||
|
||||
tmp2 = opal_argv_join(src->personality, ',');
|
||||
asprintf(&tmp, "\n%sData for job: %s\tPersonality: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tMPI allowed: %s\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
asprintf(&tmp, "\n%sData for job: %s\tPersonality: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tStdin target: %s\tState: %s\tAbort: %s", pfx2,
|
||||
ORTE_JOBID_PRINT(src->jobid), tmp2,
|
||||
(ORTE_FLAG_TEST(src, ORTE_JOB_FLAG_RECOVERABLE)) ? "ENABLED" : "DISABLED",
|
||||
(orte_get_attribute(&src->attributes, ORTE_JOB_RECOVER_DEFINED, NULL, OPAL_BOOL)) ? "DEFINED" : "DEFAULT",
|
||||
pfx2,
|
||||
(long)src->num_apps,
|
||||
(ORTE_FLAG_TEST(src, ORTE_JOB_FLAG_GANG_LAUNCHED)) ? "YES" : "NO", ORTE_VPID_PRINT(src->stdin_target),
|
||||
(long)src->num_apps, ORTE_VPID_PRINT(src->stdin_target),
|
||||
orte_job_state_to_str(src->state), (ORTE_FLAG_TEST(src, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False");
|
||||
free(tmp2);
|
||||
asprintf(&pfx, "%s\t", pfx2);
|
||||
|
@ -87,6 +87,14 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* unpack the flags */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->flags)), &n, ORTE_JOB_FLAGS_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the personality */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &n, OPAL_INT32))) {
|
||||
@ -138,8 +146,8 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* and the procs, if provided */
|
||||
if (0 < jobs[i]->num_procs) {
|
||||
|
||||
if (orte_no_vm && 0 < jobs[i]->num_procs) {
|
||||
orte_proc_t *proc;
|
||||
for (j=0; j < jobs[i]->num_procs; j++) {
|
||||
n = 1;
|
||||
@ -197,14 +205,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the flags */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||
(&(jobs[i]->flags)), &n, ORTE_JOB_FLAGS_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the attributes */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
|
||||
|
@ -76,11 +76,13 @@ char *orte_mgmt_transport = NULL;
|
||||
char *orte_coll_transport = NULL;
|
||||
int orte_mgmt_conduit = -1;
|
||||
int orte_coll_conduit = -1;
|
||||
bool orte_no_vm = false;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
bool orte_static_ports = false;
|
||||
char *orte_oob_static_ports = NULL;
|
||||
bool orte_standalone_operation = false;
|
||||
bool orte_fwd_mpirun_port = false;
|
||||
|
||||
bool orte_keep_fqdn_hostnames = false;
|
||||
bool orte_have_fqdn_allocation = false;
|
||||
@ -648,7 +650,6 @@ static void orte_job_construct(orte_job_t* job)
|
||||
job->num_local_procs = 0;
|
||||
|
||||
job->flags = 0;
|
||||
ORTE_FLAG_SET(job, ORTE_JOB_FLAG_GANG_LAUNCHED);
|
||||
ORTE_FLAG_SET(job, ORTE_JOB_FLAG_FORWARD_OUTPUT);
|
||||
|
||||
OBJ_CONSTRUCT(&job->attributes, opal_list_t);
|
||||
|
@ -456,11 +456,13 @@ ORTE_DECLSPEC extern char *orte_basename;
|
||||
ORTE_DECLSPEC extern bool orte_coprocessors_detected;
|
||||
ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors;
|
||||
ORTE_DECLSPEC extern char *orte_topo_signature;
|
||||
ORTE_DECLSPEC extern bool orte_no_vm;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
ORTE_DECLSPEC extern char *orte_oob_static_ports;
|
||||
ORTE_DECLSPEC extern bool orte_standalone_operation;
|
||||
ORTE_DECLSPEC extern bool orte_fwd_mpirun_port;
|
||||
|
||||
/* nodename flags */
|
||||
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
||||
|
@ -689,6 +689,15 @@ int orte_register_params(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_stat_history_size);
|
||||
|
||||
orte_no_vm = false;
|
||||
id = mca_base_var_register ("orte", "orte", NULL, "no_vm",
|
||||
"Do not build the VM at start to detect topologies",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_no_vm);
|
||||
/* register a synonym for old name */
|
||||
mca_base_var_register_synonym (id, "orte", "state", "novm", "select", MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
|
||||
|
||||
orte_max_vm_size = -1;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "max_vm_size",
|
||||
"Maximum size of virtual machine - used to subdivide allocation",
|
||||
@ -773,5 +782,11 @@ int orte_register_params(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_stack_trace_wait_timeout);
|
||||
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "fwd_mpirun_port",
|
||||
"Forward the port used by mpirun so all daemons will use it",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_fwd_mpirun_port);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -78,7 +78,6 @@ typedef uint8_t orte_node_flags_t;
|
||||
typedef uint16_t orte_job_flags_t;
|
||||
#define ORTE_JOB_FLAGS_T OPAL_UINT16
|
||||
#define ORTE_JOB_FLAG_UPDATED 0x0001 // job has been updated and needs to be included in the pidmap message
|
||||
#define ORTE_JOB_FLAG_GANG_LAUNCHED 0x0002 // MPI is allowed on this job - i.e., all members of the job were simultaneously launched
|
||||
#define ORTE_JOB_FLAG_RESTARTED 0x0004 // some procs in this job are being restarted
|
||||
#define ORTE_JOB_FLAG_ABORTED 0x0008 // did this job abort?
|
||||
#define ORTE_JOB_FLAG_DEBUGGER_DAEMON 0x0010 // job is launching debugger daemons
|
||||
|
@ -177,23 +177,27 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
||||
char *node;
|
||||
char prefix[ORTE_MAX_NODE_PREFIX];
|
||||
int i, j, n, len, startnum, nodenum, numdigits;
|
||||
bool found, fullname;
|
||||
bool found, fullname, test;
|
||||
char *suffix, *sfx;
|
||||
orte_regex_node_t *ndreg;
|
||||
orte_regex_range_t *range, *rng, *idrng;
|
||||
opal_list_t nodeids, nodenms, dvpids;
|
||||
orte_regex_range_t *range, *rng, *slt, *tp, *flg;
|
||||
opal_list_t nodenms, dvpids, slots, topos, flags;
|
||||
opal_list_item_t *item, *itm2;
|
||||
char **regexargs = NULL, *tmp, *tmp2;
|
||||
orte_node_t *nptr;
|
||||
int rc;
|
||||
|
||||
/* setup the list of results */
|
||||
OBJ_CONSTRUCT(&nodeids, opal_list_t);
|
||||
OBJ_CONSTRUCT(&nodenms, opal_list_t);
|
||||
OBJ_CONSTRUCT(&dvpids, opal_list_t);
|
||||
OBJ_CONSTRUCT(&slots, opal_list_t);
|
||||
OBJ_CONSTRUCT(&topos, opal_list_t);
|
||||
OBJ_CONSTRUCT(&flags, opal_list_t);
|
||||
|
||||
rng = NULL;
|
||||
idrng = NULL;
|
||||
slt = NULL;
|
||||
tp = NULL;
|
||||
flg = NULL;
|
||||
for (n=0; n < orte_node_pool->size; n++) {
|
||||
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
|
||||
continue;
|
||||
@ -202,26 +206,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
||||
if (NULL == nptr->daemon) {
|
||||
continue;
|
||||
}
|
||||
/* deal with the nodeids - these are just the index into
|
||||
* the node array. We pass these to ensure coherence across
|
||||
* the virtual machine */
|
||||
if (NULL == idrng) {
|
||||
/* just starting */
|
||||
idrng = OBJ_NEW(orte_regex_range_t);
|
||||
idrng->start = n;
|
||||
idrng->cnt = 1;
|
||||
opal_list_append(&nodeids, &idrng->super);
|
||||
} else {
|
||||
/* is this the next in line */
|
||||
if (n == (idrng->start + idrng->cnt)) {
|
||||
idrng->cnt++;
|
||||
} else {
|
||||
/* need to start another range */
|
||||
idrng = OBJ_NEW(orte_regex_range_t);
|
||||
idrng->start = n;
|
||||
opal_list_append(&nodeids, &idrng->super);
|
||||
}
|
||||
}
|
||||
/* deal with the daemon vpid - see if it is next in the
|
||||
* current range */
|
||||
if (NULL == rng) {
|
||||
@ -238,9 +222,87 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
||||
/* need to start another range */
|
||||
rng = OBJ_NEW(orte_regex_range_t);
|
||||
rng->start = nptr->daemon->name.vpid;
|
||||
rng->cnt = 1;
|
||||
opal_list_append(&dvpids, &rng->super);
|
||||
}
|
||||
}
|
||||
/* check the #slots */
|
||||
if (NULL == slt) {
|
||||
/* just starting */
|
||||
slt = OBJ_NEW(orte_regex_range_t);
|
||||
slt->start = nptr->daemon->name.vpid;
|
||||
slt->slots = nptr->slots;
|
||||
slt->cnt = 1;
|
||||
opal_list_append(&slots, &slt->super);
|
||||
} else {
|
||||
/* is this the next in line */
|
||||
if (nptr->slots == slt->slots) {
|
||||
slt->cnt++;
|
||||
} else {
|
||||
/* need to start another range */
|
||||
slt = OBJ_NEW(orte_regex_range_t);
|
||||
slt->start = nptr->daemon->name.vpid;
|
||||
slt->slots = nptr->slots;
|
||||
slt->cnt = 1;
|
||||
opal_list_append(&slots, &slt->super);
|
||||
}
|
||||
}
|
||||
/* check the topologies */
|
||||
if (NULL == tp) {
|
||||
if (NULL != nptr->topology) {
|
||||
/* just starting */
|
||||
tp = OBJ_NEW(orte_regex_range_t);
|
||||
tp->start = nptr->daemon->name.vpid;
|
||||
tp->t = nptr->topology;
|
||||
tp->cnt = 1;
|
||||
opal_list_append(&topos, &tp->super);
|
||||
}
|
||||
} else {
|
||||
if (NULL != nptr->topology) {
|
||||
/* is this the next in line */
|
||||
if (tp->t == nptr->topology) {
|
||||
tp->cnt++;
|
||||
} else {
|
||||
/* need to start another range */
|
||||
tp = OBJ_NEW(orte_regex_range_t);
|
||||
tp->start = nptr->daemon->name.vpid;
|
||||
tp->t = nptr->topology;
|
||||
tp->cnt = 1;
|
||||
opal_list_append(&topos, &tp->super);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* check the flags */
|
||||
test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
if (NULL == flg) {
|
||||
/* just starting */
|
||||
flg = OBJ_NEW(orte_regex_range_t);
|
||||
flg->start = nptr->daemon->name.vpid;
|
||||
if (test) {
|
||||
flg->slots = 1;
|
||||
} else {
|
||||
flg->slots = 0;
|
||||
}
|
||||
flg->cnt = 1;
|
||||
opal_list_append(&flags, &flg->super);
|
||||
} else {
|
||||
/* is this the next in line */
|
||||
if ((test && 1 == flg->slots) ||
|
||||
(!test && 0 == flg->slots)) {
|
||||
flg->cnt++;
|
||||
} else {
|
||||
/* need to start another range */
|
||||
flg = OBJ_NEW(orte_regex_range_t);
|
||||
flg->start = nptr->daemon->name.vpid;
|
||||
if (test) {
|
||||
flg->slots = 1;
|
||||
} else {
|
||||
flg->slots = 0;
|
||||
}
|
||||
flg->cnt = 1;
|
||||
opal_list_append(&flags, &flg->super);
|
||||
}
|
||||
}
|
||||
node = nptr->name;
|
||||
/* determine this node's prefix by looking for first non-alpha char */
|
||||
fullname = false;
|
||||
@ -422,41 +484,7 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_LIST_DESTRUCT(&dvpids);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != tmp) {
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* do the same for the indices */
|
||||
tmp = NULL;
|
||||
while (NULL != (item = opal_list_remove_first(&nodeids))) {
|
||||
rng = (orte_regex_range_t*)item;
|
||||
if (1 < rng->cnt) {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d-%d", rng->start, rng->start + rng->cnt - 1);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d-%d", tmp, rng->start, rng->start + rng->cnt - 1);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
} else {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d", rng->start);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d", tmp, rng->start);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(rng);
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&nodeids);
|
||||
|
||||
/* pack the string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_LIST_DESTRUCT(&dvpids);
|
||||
OPAL_LIST_DESTRUCT(&slots);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != tmp) {
|
||||
@ -491,27 +519,171 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
||||
/* pack the string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_LIST_DESTRUCT(&dvpids);
|
||||
OPAL_LIST_DESTRUCT(&slots);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != tmp) {
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* do the same to pass #slots on each node */
|
||||
tmp = NULL;
|
||||
while (NULL != (item = opal_list_remove_first(&slots))) {
|
||||
rng = (orte_regex_range_t*)item;
|
||||
if (1 < rng->cnt) {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d-%d[%d]", rng->start, rng->start + rng->cnt - 1, rng->slots);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d-%d[%d]", tmp, rng->start, rng->start + rng->cnt - 1, rng->slots);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
} else {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d[%d]", rng->start, rng->slots);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d[%d]", tmp, rng->start, rng->slots);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(rng);
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&slots);
|
||||
|
||||
/* pack the string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != tmp) {
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* do the same to pass the flags for each node */
|
||||
tmp = NULL;
|
||||
while (NULL != (item = opal_list_remove_first(&flags))) {
|
||||
rng = (orte_regex_range_t*)item;
|
||||
if (1 < rng->cnt) {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d-%d[%x]", rng->start, rng->start + rng->cnt - 1, rng->slots);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d-%d[%x]", tmp, rng->start, rng->start + rng->cnt - 1, rng->slots);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
} else {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d[%x]", rng->start, rng->slots);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d[%x]", tmp, rng->start, rng->slots);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(rng);
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&flags);
|
||||
|
||||
/* pack the string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != tmp) {
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* handle the topologies - as the most common case by far
|
||||
* is to have homogeneous topologies, we only send them
|
||||
* if something is different */
|
||||
tmp = NULL;
|
||||
if (1 < opal_list_get_size(&topos)) {
|
||||
opal_buffer_t bucket, *bptr;
|
||||
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
|
||||
while (NULL != (item = opal_list_remove_first(&topos))) {
|
||||
rng = (orte_regex_range_t*)item;
|
||||
if (1 < rng->cnt) {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d-%d", rng->start, rng->start + rng->cnt - 1);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d-%d", tmp, rng->start, rng->start + rng->cnt - 1);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
} else {
|
||||
if (NULL == tmp) {
|
||||
asprintf(&tmp, "%d", rng->start);
|
||||
} else {
|
||||
asprintf(&tmp2, "%s,%d", tmp, rng->start);
|
||||
free(tmp);
|
||||
tmp = tmp2;
|
||||
}
|
||||
}
|
||||
/* pack this topology string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(rng);
|
||||
OPAL_LIST_DESTRUCT(&topos);
|
||||
OBJ_DESTRUCT(&bucket);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
/* pack the topology itself */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(rng);
|
||||
OPAL_LIST_DESTRUCT(&topos);
|
||||
OBJ_DESTRUCT(&bucket);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
OBJ_RELEASE(rng);
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&topos);
|
||||
|
||||
/* pack the string */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&bucket);
|
||||
free(tmp);
|
||||
return rc;
|
||||
}
|
||||
free(tmp);
|
||||
|
||||
/* now pack the topologies */
|
||||
bptr = &bucket;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&bucket);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&bucket);
|
||||
} else {
|
||||
/* need to pack the NULL just to terminate the region */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* decode a nodemap for a daemon */
|
||||
int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
{
|
||||
int m, n, rc;
|
||||
int n, nn, rc;
|
||||
orte_node_t *node;
|
||||
size_t k, num_nodes, endpt;
|
||||
size_t k, endpt, start;
|
||||
orte_job_t *daemons;
|
||||
orte_proc_t *dptr;
|
||||
char **nodes, *indices, *dvpids;
|
||||
char **nodes=NULL, *dvpids=NULL, *slots=NULL, *topos=NULL, *flags=NULL;
|
||||
char *ndnames, *rmndr, **tmp;
|
||||
int *nodeids, *dids;
|
||||
opal_list_t dids, slts, flgs;;
|
||||
opal_buffer_t *bptr=NULL;
|
||||
orte_topology_t *t;
|
||||
orte_regex_range_t *rng, *drng, *srng, *frng;
|
||||
|
||||
/* unpack the node regex */
|
||||
n = 1;
|
||||
@ -524,141 +696,226 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* unpack the index regex */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &indices, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(ndnames);
|
||||
return rc;
|
||||
}
|
||||
/* this is not allowed to be NULL */
|
||||
if (NULL == indices) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(ndnames);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
OBJ_CONSTRUCT(&dids, opal_list_t);
|
||||
OBJ_CONSTRUCT(&slts, opal_list_t);
|
||||
OBJ_CONSTRUCT(&flgs, opal_list_t);
|
||||
|
||||
/* unpack the daemon vpid regex */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &dvpids, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(ndnames);
|
||||
free(indices);
|
||||
return rc;
|
||||
goto cleanup;
|
||||
}
|
||||
/* this is not allowed to be NULL */
|
||||
if (NULL == dvpids) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(ndnames);
|
||||
free(indices);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* unpack the slots regex */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &slots, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* this is not allowed to be NULL */
|
||||
if (NULL == slots) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* unpack the flags regex */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flags, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* this is not allowed to be NULL */
|
||||
if (NULL == flags) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* unpack the topos regex - this may not have been
|
||||
* provided (e.g., for a homogeneous machine) */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &topos, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
if (NULL != topos) {
|
||||
/* need to unpack the topologies */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &n, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* if we are the HNP, then we just discard these strings as we already
|
||||
* have a complete picture - but we needed to unpack them in order to
|
||||
* maintain sync in the unpacking order */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
free(ndnames);
|
||||
free(indices);
|
||||
free(dvpids);
|
||||
return ORTE_SUCCESS;
|
||||
rc = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* decompress the regex */
|
||||
nodes = NULL;
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(ndnames, &nodes))) {
|
||||
free(ndnames);
|
||||
free(indices);
|
||||
free(dvpids);
|
||||
return rc;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
free(ndnames);
|
||||
|
||||
if (NULL == nodes) {
|
||||
/* should not happen */
|
||||
free(indices);
|
||||
free(dvpids);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* decompress the index ranges */
|
||||
num_nodes = opal_argv_count(nodes);
|
||||
nodeids = (int*)malloc(num_nodes * sizeof(int));
|
||||
tmp = opal_argv_split(indices, ',');
|
||||
k = 0;
|
||||
for (n=0; NULL != tmp[n]; n++) {
|
||||
/* convert the number - since it might be a range,
|
||||
* save the remainder pointer */
|
||||
nodeids[k++] = strtoul(tmp[n], &rmndr, 10);
|
||||
if (NULL != rmndr) {
|
||||
/* it must be a range - find the endpoint */
|
||||
++rmndr;
|
||||
m = nodeids[k-1] + 1;
|
||||
endpt = strtoul(rmndr, NULL, 10);
|
||||
while (k <= endpt && k < num_nodes) {
|
||||
nodeids[k++] = m++;
|
||||
}
|
||||
--k; // step back to compensate for later increment
|
||||
}
|
||||
++k;
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
free(indices);
|
||||
|
||||
/* decompress the vpids */
|
||||
dids = (int*)malloc(num_nodes * sizeof(int));
|
||||
tmp = opal_argv_split(dvpids, ',');
|
||||
k = 0;
|
||||
for (n=0; NULL != tmp[n]; n++) {
|
||||
rng = OBJ_NEW(orte_regex_range_t);
|
||||
opal_list_append(&dids, &rng->super);
|
||||
/* convert the number - since it might be a range,
|
||||
* save the remainder pointer */
|
||||
dids[k++] = strtoul(tmp[n], &rmndr, 10);
|
||||
if (NULL != rmndr) {
|
||||
rng->start = strtoul(tmp[n], &rmndr, 10);
|
||||
if (NULL == rmndr || 0 == strlen(rmndr)) {
|
||||
rng->endpt = rng->start;
|
||||
} else {
|
||||
/* it must be a range - find the endpoint */
|
||||
++rmndr;
|
||||
endpt = strtoul(rmndr, NULL, 10);
|
||||
m = dids[k-1] + 1;
|
||||
while (k <= endpt && k < num_nodes) {
|
||||
dids[k++] = m++;
|
||||
rng->endpt = strtoul(rmndr, NULL, 10);
|
||||
}
|
||||
--k; // step back to compensate for later increment
|
||||
}
|
||||
++k;
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
free(dvpids);
|
||||
|
||||
/* decompress the slots */
|
||||
tmp = opal_argv_split(slots, ',');
|
||||
for (n=0; NULL != tmp[n]; n++) {
|
||||
rng = OBJ_NEW(orte_regex_range_t);
|
||||
opal_list_append(&slts, &rng->super);
|
||||
/* find the '[' as that delimits the value */
|
||||
rmndr = strchr(tmp[n], '[');
|
||||
if (NULL == rmndr) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
opal_argv_free(tmp);
|
||||
goto cleanup;
|
||||
}
|
||||
*rmndr = '\0';
|
||||
++rmndr;
|
||||
/* convert that number as this is the number of
|
||||
* slots for this range */
|
||||
rng->slots = strtoul(rmndr, NULL, 10);
|
||||
/* convert the starting pt - since it might be a range,
|
||||
* save the remainder pointer */
|
||||
rng->start = strtoul(tmp[n], &rmndr, 10);
|
||||
if (NULL == rmndr || 0 == strlen(rmndr)) {
|
||||
rng->endpt = rng->start;
|
||||
} else {
|
||||
/* it must be a range - find the endpoint */
|
||||
++rmndr;
|
||||
rng->endpt = strtoul(rmndr, NULL, 10);
|
||||
}
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
|
||||
/* decompress the flags */
|
||||
tmp = opal_argv_split(flags, ',');
|
||||
for (n=0; NULL != tmp[n]; n++) {
|
||||
rng = OBJ_NEW(orte_regex_range_t);
|
||||
opal_list_append(&dids, &rng->super);
|
||||
/* find the '[' as that delimits the value */
|
||||
rmndr = strchr(tmp[n], '[');
|
||||
if (NULL == rmndr) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
opal_argv_free(tmp);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto cleanup;
|
||||
}
|
||||
*rmndr = '\0';
|
||||
++rmndr;
|
||||
/* check the value - it is just one character */
|
||||
if ('1' == *rmndr) {
|
||||
rng->slots = 1;
|
||||
} else {
|
||||
rng->slots = 0;
|
||||
}
|
||||
/* convert the starting pt - since it might be a range,
|
||||
* save the remainder pointer */
|
||||
rng->start = strtoul(tmp[n], &rmndr, 10);
|
||||
if (NULL == rmndr || 0 == strlen(rmndr)) {
|
||||
rng->endpt = rng->start;
|
||||
} else {
|
||||
/* it must be a range - find the endpoint */
|
||||
++rmndr;
|
||||
rng->endpt = strtoul(rmndr, NULL, 10);
|
||||
}
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
free(flags);
|
||||
|
||||
/* get the daemon job object */
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
/* since this is a complete picture of all the nodes, reset the
|
||||
* counters and regenerate them */
|
||||
orte_process_info.num_daemons = 0;
|
||||
orte_process_info.num_nodes = 0;
|
||||
daemons->num_procs = 0;
|
||||
|
||||
/* update the node array */
|
||||
drng = (orte_regex_range_t*)opal_list_get_first(&dids);
|
||||
srng = (orte_regex_range_t*)opal_list_get_first(&slts);
|
||||
frng = (orte_regex_range_t*)opal_list_get_first(&flgs);
|
||||
for (n=0; NULL != nodes[n]; n++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeids[n]))) {
|
||||
/* the daemon vpids for these nodes will be in the dids array, so
|
||||
* use those to lookup the nodes */
|
||||
nn = drng->start + n;
|
||||
if (nn == drng->endpt) {
|
||||
drng = (orte_regex_range_t*)opal_list_get_next(&drng->super);
|
||||
}
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nn))) {
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = nodes[n];
|
||||
node->index = nodeids[n];
|
||||
opal_pointer_array_set_item(orte_node_pool, node->index, node);
|
||||
} else if (NULL != node->daemon) {
|
||||
node->index = nn;
|
||||
opal_pointer_array_set_item(orte_node_pool, nn, node);
|
||||
}
|
||||
/* set the number of slots */
|
||||
node->slots = srng->slots;
|
||||
if (srng->endpt == nn) {
|
||||
srng = (orte_regex_range_t*)opal_list_get_next(&srng->super);
|
||||
}
|
||||
/* set the flags */
|
||||
if (0 == frng->slots) {
|
||||
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
} else {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
}
|
||||
if (frng->endpt == nn) {
|
||||
frng = (orte_regex_range_t*)opal_list_get_next(&frng->super);
|
||||
}
|
||||
++orte_process_info.num_nodes;
|
||||
/* if this is me, just ignore the rest as we are all setup */
|
||||
if (nn == (int)ORTE_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != node->daemon) {
|
||||
OBJ_RELEASE(node->daemon);
|
||||
node->daemon = NULL;
|
||||
}
|
||||
++orte_process_info.num_nodes;
|
||||
if (NULL == (dptr = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dids[n]))) {
|
||||
if (NULL == (dptr = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, nn))) {
|
||||
/* create a daemon object for this node */
|
||||
dptr = OBJ_NEW(orte_proc_t);
|
||||
dptr->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
dptr->name.vpid = dids[n];
|
||||
dptr->name.vpid = nn;
|
||||
ORTE_FLAG_SET(dptr, ORTE_PROC_FLAG_ALIVE); // assume the daemon is alive until discovered otherwise
|
||||
opal_pointer_array_set_item(daemons->procs, dids[n], dptr);
|
||||
opal_pointer_array_set_item(daemons->procs, nn, dptr);
|
||||
++daemons->num_procs;
|
||||
} else if (NULL != dptr->node) {
|
||||
OBJ_RELEASE(dptr->node);
|
||||
dptr->node = NULL;
|
||||
}
|
||||
++daemons->num_procs;
|
||||
/* link the node to the daemon */
|
||||
OBJ_RETAIN(dptr);
|
||||
node->daemon = dptr;
|
||||
@ -670,8 +927,83 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
* all the node names themselves. Instead, we just free the
|
||||
* array of string pointers, leaving the strings alone */
|
||||
free(nodes);
|
||||
free(nodeids);
|
||||
free(dids);
|
||||
|
||||
/* if no topology info was passed, then everyone shares our topology */
|
||||
if (NULL == bptr) {
|
||||
orte_topology_t *t;
|
||||
/* our topology is first in the array */
|
||||
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
|
||||
for (n=0; n < orte_node_pool->size; n++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
|
||||
if (NULL == node->topology) {
|
||||
OBJ_RETAIN(t);
|
||||
node->topology = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
char *sig;
|
||||
hwloc_topology_t topo;
|
||||
/* decompress the topology regex */
|
||||
tmp = opal_argv_split(topos, ',');
|
||||
/* there must be a topology definition for each range */
|
||||
for (nn=0; NULL != tmp[nn]; nn++) {
|
||||
/* unpack the signature */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &sig, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_argv_free(tmp);
|
||||
OBJ_RELEASE(bptr);
|
||||
goto cleanup;
|
||||
}
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_argv_free(tmp);
|
||||
OBJ_RELEASE(bptr);
|
||||
free(sig);
|
||||
goto cleanup;
|
||||
}
|
||||
/* see if we already have this topology - could be an update */
|
||||
for (n=0; n < orte_node_topologies->size; n++) {
|
||||
if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(t->sig, sig)) {
|
||||
/* found a match */
|
||||
free(sig);
|
||||
opal_hwloc_base_free_topology(topo);
|
||||
sig = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL != sig) {
|
||||
/* new topology - record it */
|
||||
t = OBJ_NEW(orte_topology_t);
|
||||
t->sig = sig;
|
||||
t->topo = topo;
|
||||
}
|
||||
/* point each of the nodes in the regex to this topology */
|
||||
start = strtoul(tmp[nn], &rmndr, 10);
|
||||
if (NULL != rmndr) {
|
||||
/* it must be a range - find the endpoint */
|
||||
++rmndr;
|
||||
endpt = strtoul(rmndr, NULL, 10);
|
||||
} else {
|
||||
endpt = start;
|
||||
}
|
||||
for (k=start; k <= endpt; k++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) {
|
||||
if (NULL == node->topology) {
|
||||
OBJ_RETAIN(t);
|
||||
node->topology = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(bptr);
|
||||
opal_argv_free(tmp);
|
||||
}
|
||||
|
||||
/* unpdate num procs */
|
||||
if (orte_process_info.num_procs != daemons->num_procs) {
|
||||
@ -700,5 +1032,9 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
cleanup:
|
||||
OPAL_LIST_DESTRUCT(&dids);
|
||||
OPAL_LIST_DESTRUCT(&slts);
|
||||
OPAL_LIST_DESTRUCT(&flgs);
|
||||
return rc;
|
||||
}
|
||||
|
@ -51,6 +51,10 @@ ORTE_DECLSPEC int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer);
|
||||
|
||||
ORTE_DECLSPEC int orte_util_build_daemon_nidmap(char **nodes);
|
||||
|
||||
ORTE_DECLSPEC int orte_util_encode_topologies(opal_buffer_t *buffer);
|
||||
|
||||
ORTE_DECLSPEC int orte_util_decode_topologies(opal_buffer_t *buffer);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -428,7 +428,6 @@ int orte_regex_extract_node_names(char *regexp, char ***names)
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Parse one or more ranges in a set
|
||||
*
|
||||
@ -589,148 +588,7 @@ static int regex_parse_node_range(char *base, char *range, int num_digits, char
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Compute the #procs on each node given a regex of form
|
||||
* "#procs(x#nodes),#procs(x#nodes). In other words, an
|
||||
* expression of "4(x30) will be interpreted to mean four
|
||||
* procs on each of the next 30 nodes.
|
||||
*/
|
||||
int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn)
|
||||
{
|
||||
int *tmp;
|
||||
char *begptr, *endptr, *orig;
|
||||
int i, j, count, reps;
|
||||
|
||||
/* init null answer */
|
||||
*ppn = NULL;
|
||||
|
||||
tmp = (int *) malloc(sizeof(int) * num_nodes);
|
||||
if (NULL == tmp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
memset(tmp, 0, sizeof(int) * num_nodes);
|
||||
|
||||
orig = begptr = strdup(regexp);
|
||||
if (NULL == begptr) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(tmp);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
j = 0;
|
||||
while (begptr) {
|
||||
count = strtol(begptr, &endptr, 10);
|
||||
if ((endptr[0] == '(') && (endptr[1] == 'x')) {
|
||||
reps = strtol((endptr+2), &endptr, 10);
|
||||
if (endptr[0] == ')') {
|
||||
endptr++;
|
||||
}
|
||||
} else {
|
||||
reps = 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < reps && j < num_nodes; i++) {
|
||||
tmp[j++] = count;
|
||||
}
|
||||
|
||||
if (*endptr == ',') {
|
||||
begptr = endptr + 1;
|
||||
} else if (*endptr == '\0' || j >= num_nodes) {
|
||||
break;
|
||||
} else {
|
||||
orte_show_help("help-regex.txt", "regex:bad-ppn", true, regexp);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(tmp);
|
||||
free(orig);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
|
||||
free(orig);
|
||||
|
||||
/* return values */
|
||||
*ppn = tmp;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_regex_extract_name_range(char *regexp, char ***names)
|
||||
{
|
||||
char *tmp, *b, *b2, **rngs, *t, *pre, *post;
|
||||
int i;
|
||||
char c;
|
||||
|
||||
/* protect input */
|
||||
tmp = strdup(regexp);
|
||||
/* look for bracket */
|
||||
if (NULL == (b = strchr(tmp, '['))) {
|
||||
/* just one value */
|
||||
opal_argv_append_nosize(names, regexp);
|
||||
free(tmp);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
if (b == tmp) {
|
||||
/* bracket at very beginning */
|
||||
pre = NULL;
|
||||
} else {
|
||||
*b = '\0';
|
||||
pre = tmp;
|
||||
}
|
||||
/* step over the bracket */
|
||||
b++;
|
||||
/* look for closing bracket */
|
||||
if (NULL == (b2 = strrchr(tmp, ']'))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(tmp);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
*b2 = '\0';
|
||||
b2++;
|
||||
if ('\0' == *b2) {
|
||||
/* bracket was at end */
|
||||
post = NULL;
|
||||
} else {
|
||||
post = b2;
|
||||
}
|
||||
|
||||
/* split on commas */
|
||||
rngs = opal_argv_split(b, ',');
|
||||
for (i=0; NULL != rngs[i]; i++) {
|
||||
/* look for a range */
|
||||
if (NULL == strchr(rngs[i], '-')) {
|
||||
/* just one value */
|
||||
if (NULL == pre && NULL == post) {
|
||||
t = strdup(rngs[i]);
|
||||
} else if (NULL == pre) {
|
||||
asprintf(&t, "%s%s", rngs[i], post);
|
||||
} else if (NULL == post) {
|
||||
asprintf(&t, "%s%s", pre, rngs[i]);
|
||||
} else {
|
||||
asprintf(&t, "%s%s%s", pre, rngs[i], post);
|
||||
}
|
||||
opal_argv_append_nosize(names, t);
|
||||
free(t);
|
||||
} else {
|
||||
/* has to be <char>-<char> */
|
||||
for (c=rngs[i][0]; c <= rngs[i][2]; c++) {
|
||||
if (NULL == pre && NULL == post) {
|
||||
asprintf(&t, "%c", c);
|
||||
} else if (NULL == pre) {
|
||||
asprintf(&t, "%c%s", c, post);
|
||||
} else if (NULL == post) {
|
||||
asprintf(&t, "%s%c", pre, c);
|
||||
} else {
|
||||
asprintf(&t, "%s%c%s", pre, c, post);
|
||||
}
|
||||
opal_argv_append_nosize(names, t);
|
||||
free(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_argv_free(rngs);
|
||||
free(tmp);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/***** CLASS INSTANTIATIONS ****/
|
||||
|
||||
static void range_construct(orte_regex_range_t *ptr)
|
||||
{
|
||||
@ -768,4 +626,3 @@ OBJ_CLASS_INSTANCE(orte_regex_node_t,
|
||||
opal_list_item_t,
|
||||
orte_regex_node_construct,
|
||||
orte_regex_node_destruct);
|
||||
|
||||
|
@ -37,7 +37,10 @@ BEGIN_C_DECLS
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
int start;
|
||||
int endpt;
|
||||
int cnt;
|
||||
int slots;
|
||||
orte_topology_t *t;
|
||||
} orte_regex_range_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_range_t);
|
||||
|
||||
@ -58,9 +61,5 @@ ORTE_DECLSPEC int orte_regex_create(char *nodes, char **regexp);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_extract_node_names(char *regexp, char ***names);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_extract_name_range(char *regexp, char ***names);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user