Merge pull request #2869 from rhc54/topic/staticports
Fix static port and partial allocation operations
Этот коммит содержится в:
Коммит
2b2ea2fed2
@ -8,7 +8,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -245,9 +245,12 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
* lifeline
|
||||
*/
|
||||
if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
|
||||
ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
|
||||
ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state ||
|
||||
ORTE_PROC_STATE_NO_PATH_TO_TARGET == state ||
|
||||
ORTE_PROC_STATE_PEER_UNKNOWN == state ||
|
||||
ORTE_PROC_STATE_FAILED_TO_CONNECT == state) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:orted lifeline lost - exiting",
|
||||
"%s errmgr:orted lifeline lost or unable to communicate - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* set our exit status */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
|
@ -46,7 +46,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(cd);
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s oob:base:send to target %s - %u attempt",
|
||||
"%s oob:base:send to target %s - attempt %u",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&msg->dst), msg->retries);
|
||||
|
||||
|
@ -273,6 +273,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
if (mca_oob_tcp_component.max_recon_attempts < 0 ||
|
||||
peer->num_retries < mca_oob_tcp_component.max_recon_attempts) {
|
||||
struct timeval tv;
|
||||
/* close the current socket */
|
||||
CLOSE_THE_SOCKET(peer->sd);
|
||||
/* reset the addr states */
|
||||
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
|
||||
addr->state = MCA_OOB_TCP_UNCONNECTED;
|
||||
@ -306,6 +308,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
"------------------------------------------------------------",
|
||||
orte_process_info.nodename,
|
||||
(NULL == host) ? "<unknown>" : host);
|
||||
/* close the socket */
|
||||
CLOSE_THE_SOCKET(peer->sd);
|
||||
/* let the TCP component know that this module failed to make
|
||||
* the connection so it can do some bookkeeping and fail back
|
||||
* to the OOB level so another component can try. This will activate
|
||||
@ -350,6 +354,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
} else {
|
||||
peer->state = MCA_OOB_TCP_UNCONNECTED;
|
||||
}
|
||||
/* close the socket */
|
||||
CLOSE_THE_SOCKET(peer->sd);
|
||||
return;
|
||||
} else {
|
||||
opal_output(0,
|
||||
@ -361,6 +367,8 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
opal_net_get_port((struct sockaddr*)&addr->addr),
|
||||
opal_strerror(rc),
|
||||
rc);
|
||||
/* close the socket */
|
||||
CLOSE_THE_SOCKET(peer->sd);
|
||||
ORTE_FORCED_TERMINATE(1);
|
||||
}
|
||||
|
||||
|
@ -385,6 +385,10 @@ static int create_listen(void)
|
||||
conn = OBJ_NEW(mca_oob_tcp_listener_t);
|
||||
conn->sd = sd;
|
||||
conn->port = ntohs(((struct sockaddr_in*) &inaddr)->sin_port);
|
||||
if (orte_static_ports && 0 == orte_process_info.my_port) {
|
||||
/* save the first one */
|
||||
orte_process_info.my_port = conn->port;
|
||||
}
|
||||
opal_list_append(&mca_oob_tcp_component.listeners, &conn->item);
|
||||
/* and to our ports */
|
||||
asprintf(&tconn, "%d", ntohs(((struct sockaddr_in*) &inaddr)->sin_port));
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -350,6 +350,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* if we have static ports, we need to ensure that mpirun is
|
||||
* on the list. Since alps won't be launching a daemon on it,
|
||||
* it won't have been placed on the list, so create a new
|
||||
* version here that includes it */
|
||||
if (orte_static_ports) {
|
||||
char *ltmp;
|
||||
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
nodelist_flat = ltmp;
|
||||
}
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
NULL,
|
||||
|
@ -84,27 +84,35 @@
|
||||
void orte_plm_base_set_slots(orte_node_t *node)
|
||||
{
|
||||
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_CORE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
if (NULL != node->topology && NULL != node->topology->topo) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_CORE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
|
||||
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_SOCKET, 0,
|
||||
OPAL_HWLOC_LOGICAL))) {
|
||||
/* some systems don't report sockets - in this case,
|
||||
* use numanodes */
|
||||
if (NULL != node->topology && NULL != node->topology->topo) {
|
||||
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_SOCKET, 0,
|
||||
OPAL_HWLOC_LOGICAL))) {
|
||||
/* some systems don't report sockets - in this case,
|
||||
* use numanodes */
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
}
|
||||
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
|
||||
if (NULL != node->topology && NULL != node->topology->topo) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_PU, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
if (NULL != node->topology && NULL != node->topology->topo) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
|
||||
HWLOC_OBJ_PU, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
} else {
|
||||
/* must be a number */
|
||||
node->slots = strtol(orte_set_slots, NULL, 10);
|
||||
@ -1436,16 +1444,23 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
free(rml_uri);
|
||||
|
||||
/* if we have static ports, pass the node list */
|
||||
if (orte_static_ports && NULL != nodes) {
|
||||
/* convert the nodes to a regex */
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
if (orte_static_ports) {
|
||||
param = NULL;
|
||||
if (NULL != nodes) {
|
||||
/* convert the nodes to a regex */
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
} else if (NULL != orte_node_regex) {
|
||||
param = strdup(orte_node_regex);
|
||||
}
|
||||
if (NULL != param) {
|
||||
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
|
||||
opal_argv_append(argc, argv, "orte_node_regex");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
|
||||
opal_argv_append(argc, argv, "orte_node_regex");
|
||||
opal_argv_append(argc, argv, param);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if output-filename was specified, pass that along */
|
||||
|
@ -14,7 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -258,6 +258,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* if we have static ports, we need to ensure that mpirun is
|
||||
* on the list. Since lsf won't be launching a daemon on it,
|
||||
* it won't have been placed on the list, so create a new
|
||||
* version here that includes it */
|
||||
if (orte_static_ports) {
|
||||
char *ltmp;
|
||||
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist);
|
||||
free(nodelist);
|
||||
nodelist = ltmp;
|
||||
}
|
||||
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"lsf",
|
||||
|
@ -328,7 +328,8 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
|
||||
static int setup_launch(int *argcptr, char ***argvptr,
|
||||
char *nodename,
|
||||
int *node_name_index1,
|
||||
int *proc_vpid_index, char *prefix_dir)
|
||||
int *proc_vpid_index, char *prefix_dir,
|
||||
char *nodelist)
|
||||
{
|
||||
int argc;
|
||||
char **argv;
|
||||
@ -613,7 +614,7 @@ static int setup_launch(int *argcptr, char ***argvptr,
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"env",
|
||||
proc_vpid_index,
|
||||
NULL);
|
||||
nodelist);
|
||||
|
||||
/* ensure that only the ssh plm is selected on the remote daemon */
|
||||
opal_argv_append_nosize(&argv, "-"OPAL_MCA_CMD_LINE_ID);
|
||||
@ -828,7 +829,7 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1,
|
||||
&proc_vpid_index, prefix))) {
|
||||
&proc_vpid_index, prefix, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&coll);
|
||||
goto cleanup;
|
||||
@ -993,6 +994,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
int port, *portptr;
|
||||
orte_namelist_t *child;
|
||||
char *rtmod;
|
||||
char *nlistflat;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
@ -1153,12 +1155,37 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_routed.get_routing_list(rtmod, &coll);
|
||||
}
|
||||
|
||||
if (orte_static_ports) {
|
||||
/* create a list of all nodes involved so we can pass it along */
|
||||
char **nodelist = NULL;
|
||||
orte_node_t *n2;
|
||||
for (nnode=0; nnode < map->nodes->size; nnode++) {
|
||||
if (NULL != (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
|
||||
opal_argv_append_nosize(&nodelist, n2->name);
|
||||
}
|
||||
}
|
||||
/* we need mpirun to be the first node on this list */
|
||||
if (0 != strcmp(nodelist[0], orte_process_info.nodename)) {
|
||||
opal_argv_prepend_nosize(&nodelist, orte_process_info.nodename);
|
||||
}
|
||||
nlistflat = opal_argv_join(nodelist, ',');
|
||||
opal_argv_free(nodelist);
|
||||
} else {
|
||||
nlistflat = NULL;
|
||||
}
|
||||
|
||||
/* setup the launch */
|
||||
if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
|
||||
&proc_vpid_index, prefix_dir))) {
|
||||
&proc_vpid_index, prefix_dir, nlistflat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
if (NULL != nlistflat) {
|
||||
free(nlistflat);
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
if (NULL != nlistflat) {
|
||||
free(nlistflat);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate through each of the nodes
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -354,6 +354,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
/* add the daemon command (as specified by user) */
|
||||
orte_plm_base_setup_orted_cmd(&argc, &argv);
|
||||
|
||||
/* if we have static ports, we need to ensure that mpirun is
|
||||
* on the list. Since slurm won't be launching a daemon on it,
|
||||
* it won't have been placed on the list, so create a new
|
||||
* version here that includes it */
|
||||
if (orte_static_ports) {
|
||||
char *ltmp;
|
||||
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist_flat);
|
||||
free(nodelist_flat);
|
||||
nodelist_flat = ltmp;
|
||||
}
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurm", &proc_vpid_index,
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -278,6 +278,17 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
nodelist = opal_argv_join(nodeargv, ',');
|
||||
opal_argv_free(nodeargv);
|
||||
|
||||
/* if we have static ports, we need to ensure that mpirun is
|
||||
* on the list. Since Torque won't be launching a daemon on it,
|
||||
* it won't have been placed on the list, so create a new
|
||||
* version here that includes it */
|
||||
if (orte_static_ports) {
|
||||
char *ltmp;
|
||||
asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist);
|
||||
free(nodelist);
|
||||
nodelist = ltmp;
|
||||
}
|
||||
|
||||
/* Add basic orted command line options */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
|
||||
&proc_vpid_index,
|
||||
|
@ -248,7 +248,7 @@ OBJ_CLASS_DECLARATION(orte_self_send_xfer_t);
|
||||
} \
|
||||
} else if (NULL != (m)->cbfunc.buffer) { \
|
||||
/* non-blocking buffer send */ \
|
||||
(m)->cbfunc.buffer((m)->status, &((m)->origin), \
|
||||
(m)->cbfunc.buffer((m)->status, &((m)->dst), \
|
||||
(m)->buffer, \
|
||||
(m)->tag, (m)->cbdata); \
|
||||
} \
|
||||
|
@ -253,25 +253,12 @@ int pmix_server_init(void)
|
||||
kv->type = OPAL_STRING;
|
||||
opal_list_append(&info, &kv->super);
|
||||
}
|
||||
/* tell the server to allow tool connections */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SERVER_TOOL_SUPPORT);
|
||||
kv->type = OPAL_BOOL;
|
||||
kv->data.flag = true;
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* tell the server our temp directory */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL);
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* use the same for the system temp directory - this is
|
||||
* where the system-level tool connections will go */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = strdup(orte_process_info.tmpdir_base);
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* use only one listener */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SINGLE_LISTENER);
|
||||
|
@ -105,25 +105,12 @@ int orte_util_build_daemon_nidmap(char **nodes)
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_HOSTNAME);
|
||||
kv.data.string = strdup("HNP");
|
||||
kv.type = OPAL_STRING;
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.store_local(&proc, &kv))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return rc;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
|
||||
/* the daemon vpids will be assigned in order,
|
||||
* starting with vpid=1 for the first node in
|
||||
* the list
|
||||
*/
|
||||
* starting with vpid=0 for the HNP */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
/* define the vpid for this daemon */
|
||||
proc.vpid = i+1;
|
||||
proc.vpid = i;
|
||||
/* store the hostname for the proc */
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(OPAL_PMIX_HOSTNAME);
|
||||
|
@ -696,10 +696,9 @@ int orte_show_help_norender(const char *filename, const char *topic,
|
||||
ORTE_PROC_MY_HNP, buf,
|
||||
ORTE_RML_TAG_SHOW_HELP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
/* okay, that didn't work, just process locally error, just ignore return */
|
||||
show_help(filename, topic, NULL, ORTE_PROC_MY_NAME);
|
||||
/* okay, that didn't work, output locally */
|
||||
opal_output(orte_help_output, "%s", output);
|
||||
} else {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user