Initial implementation of static ports. Provide an MCA param to specify static port ranges to the OOB - the value can be any
combination of comma-separated values and ranges. Daemons will use the first port in the range; MPI procs will use the other ports in the range, assuming that they know their node rank in time and that enough ports were specified. NOTE: this capability only works under specific conditions. I will outline more about this in a note to devel as the remainder of the implementation progresses. For now, the only environment where this works is SLURM. The linear routed module has also been adjusted to work with static ports so that all messaging flows strictly through the topology, including the initial daemon callback - thus limiting the number of sockets opened by mpirun. This commit was SVN r20390.
Этот коммит содержится в:
родитель
e6398979ef
Коммит
5e6d3ba289
1
orte/mca/ess/env/ess_env_module.c
поставляемый
1
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -331,7 +331,6 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
orte_pmap_t *pmap;
|
||||
|
||||
if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_NODE_RANK_INVALID;
|
||||
}
|
||||
|
||||
|
@ -28,11 +28,21 @@
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_IFADDRS_H
|
||||
#include <ifaddrs.h>
|
||||
#endif
|
||||
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -40,6 +50,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
@ -47,6 +58,7 @@
|
||||
|
||||
static char *get_slurm_nodename(int nodeid);
|
||||
static int slurm_set_name(void);
|
||||
static int build_daemon_nidmap(void);
|
||||
|
||||
static int rte_init(char flags);
|
||||
static int rte_finalize(void);
|
||||
@ -100,6 +112,27 @@ static int rte_init(char flags)
|
||||
error = "orte_ess_base_orted_setup";
|
||||
goto error;
|
||||
}
|
||||
/* if we are using static ports, then we need to setup
|
||||
* the daemon info so the RML can function properly
|
||||
* without requiring a wireup stage
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
/* construct the nidmap arrays */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = build_daemon_nidmap())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
} else if (orte_process_info.tool) {
|
||||
/* otherwise, if I am a tool proc, use that procedure */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
|
||||
@ -295,7 +328,6 @@ static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
orte_pmap_t *pmap;
|
||||
|
||||
if (NULL == (pmap = orte_util_lookup_pmap(proc))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_NODE_RANK_INVALID;
|
||||
}
|
||||
|
||||
@ -431,3 +463,105 @@ get_slurm_nodename(int nodeid)
|
||||
/* All done */
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int build_daemon_nidmap(void)
|
||||
{
|
||||
char **names = NULL;
|
||||
char *slurm_nodelist;
|
||||
orte_nid_t *node;
|
||||
int i, num_nodes;
|
||||
int rc;
|
||||
struct hostent *h;
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t proc;
|
||||
char *uri, *addr;
|
||||
char *proc_name;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"ess:slurm build daemon nidmap"));
|
||||
|
||||
slurm_nodelist = getenv("OMPI_MCA_orte_slurm_nodelist");
|
||||
|
||||
if (NULL == slurm_nodelist) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* split the node list into an argv array */
|
||||
names = opal_argv_split(slurm_nodelist, ',');
|
||||
if (NULL == names) { /* got an error */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
num_nodes = opal_argv_count(names);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"ess:slurm:build:daemon:nidmap found %d nodes", num_nodes));
|
||||
|
||||
/* set the size of the nidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* install the entry for the HNP */
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
node->name = strdup("HNP");
|
||||
node->daemon = 0;
|
||||
/* the arch defaults to our arch so that non-hetero
|
||||
* case will yield correct behavior
|
||||
*/
|
||||
opal_pointer_array_set_item(&orte_nidmap, 0, node);
|
||||
|
||||
/* the daemon vpids will be assigned in order,
|
||||
* starting with vpid=1 for the first node in
|
||||
* the list
|
||||
*/
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
node->name = strdup(names[i]);
|
||||
node->daemon = i+1;
|
||||
/* the arch defaults to our arch so that non-hetero
|
||||
* case will yield correct behavior
|
||||
*/
|
||||
opal_pointer_array_set_item(&orte_nidmap, node->daemon, node);
|
||||
|
||||
opal_output(0, "%s lookup address for node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name);
|
||||
/* lookup the address of this node */
|
||||
if (NULL == (h = gethostbyname(node->name))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_ess_base_output,
|
||||
"ess:slurm:build:daemon:nidmap node %s daemon %d addr %s",
|
||||
node->name, (int)node->daemon, addr));
|
||||
|
||||
/* since we are using static ports, all my fellow daemons will be on my
|
||||
* port. Setup the contact info for each daemon in my hash tables. Note
|
||||
* that this will -not- open a port to those daemons, but will only
|
||||
* define the info necessary for opening such a port if/when I communicate
|
||||
* to them
|
||||
*/
|
||||
/* construct the URI */
|
||||
proc.vpid = node->daemon;
|
||||
orte_util_convert_process_name_to_string(&proc_name, &proc);
|
||||
asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
|
||||
opal_output(0, "contact info %s", uri);
|
||||
opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
|
||||
free(proc_name);
|
||||
free(uri);
|
||||
}
|
||||
|
||||
/* load the hash tables */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
opal_argv_free(names);
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -48,12 +48,15 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||
@ -310,16 +313,25 @@ int mca_oob_tcp_component_open(void)
|
||||
mca_oob_tcp_component.tcp_listen_thread_sds[0] = -1;
|
||||
mca_oob_tcp_component.tcp_listen_thread_sds[1] = -1;
|
||||
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"port_min_v4", "Starting port allowed (IPv4)",
|
||||
false, false,
|
||||
0,
|
||||
&mca_oob_tcp_component.tcp_port_min);
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"port_range_v4", "Range of allowed ports (IPv4)",
|
||||
false, false,
|
||||
64*1024 - 1 - mca_oob_tcp_component.tcp_port_min,
|
||||
&mca_oob_tcp_component.tcp_port_range);
|
||||
mca_base_param_reg_string(&mca_oob_tcp_component.super.oob_base,
|
||||
"static_ports", "Static ports for daemons and procs (IPv4)",
|
||||
false, false,
|
||||
NULL,
|
||||
&str);
|
||||
/* if ports were provided, parse the provided range */
|
||||
if (NULL != str) {
|
||||
orte_static_ports = true;
|
||||
orte_util_parse_range_options(str, &mca_oob_tcp_component.tcp4_static_ports);
|
||||
if (0 == strcmp(mca_oob_tcp_component.tcp4_static_ports[0], "-1")) {
|
||||
opal_argv_free(mca_oob_tcp_component.tcp4_static_ports);
|
||||
mca_oob_tcp_component.tcp4_static_ports = NULL;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
orte_static_ports = false;
|
||||
mca_oob_tcp_component.tcp4_static_ports = NULL;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"disable_family", "Disable IPv4 (4) or IPv6 (6)",
|
||||
false, false,
|
||||
@ -327,15 +339,22 @@ int mca_oob_tcp_component_open(void)
|
||||
&mca_oob_tcp_component.disable_family);
|
||||
#if OPAL_WANT_IPV6
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"port_min_v6", "Starting port allowed (IPv6)",
|
||||
"static_ports_v6", "Static ports for daemons and procs (IPv6)",
|
||||
false, false,
|
||||
0,
|
||||
&mca_oob_tcp_component.tcp6_port_min);
|
||||
mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,
|
||||
"port_range_v6", "Range of allowed ports (IPv6)",
|
||||
false, false,
|
||||
64*1024 - 1 - mca_oob_tcp_component.tcp6_port_min,
|
||||
&mca_oob_tcp_component.tcp6_port_range);
|
||||
NULL,
|
||||
&str);
|
||||
if (NULL != str) {
|
||||
orte_static_ports = true;
|
||||
orte_util_parse_range_options(str, &mca_oob_tcp_component.tcp6_static_ports);
|
||||
if (0 == strcmp(mca_oob_tcp_component.tcp6_static_ports[0], "-1")) {
|
||||
opal_argv_free(mca_oob_tcp_component.tcp6_static_ports);
|
||||
mca_oob_tcp_component.tcp6_static_ports = NULL;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
orte_static_ports = false;
|
||||
mca_oob_tcp_component.tcp6_static_ports = NULL;
|
||||
}
|
||||
mca_oob_tcp_component.tcp6_listen_sd = -1;
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
|
||||
@ -459,7 +478,8 @@ static void mca_oob_tcp_accept(int incoming_sd)
|
||||
static int
|
||||
mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t af_family)
|
||||
{
|
||||
int flags, index, range = 0, port=0;
|
||||
int flags;
|
||||
uint16_t port=0;
|
||||
struct sockaddr_storage inaddr;
|
||||
opal_socklen_t addrlen;
|
||||
|
||||
@ -520,79 +540,145 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
addrlen = sizeof(struct sockaddr_in);
|
||||
#endif
|
||||
|
||||
/* Disable reusing ports */
|
||||
flags = 0;
|
||||
/* If an explicit range of ports was given, find the first open
|
||||
port in the range. Otherwise, tcp_port_min will be 0, which
|
||||
means "pick any port" */
|
||||
if (AF_INET == af_family) {
|
||||
if (orte_process_info.daemon) {
|
||||
/* if static ports were provided, the daemon takes the
|
||||
* first entry in the list - otherwise, we "pick any port"
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp4_static_ports) {
|
||||
port = strtol(mca_oob_tcp_component.tcp4_static_ports[0], NULL, 10);
|
||||
/* save the port for later use */
|
||||
orte_process_info.my_port = port;
|
||||
/* convert it to network-byte-order */
|
||||
port = htons(port);
|
||||
/* flag that we are using static ports */
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else if (orte_process_info.mpi_proc) {
|
||||
/* if static ports were provided, an mpi proc takes its
|
||||
* node_local_rank entry in the list IF it has that info
|
||||
* AND enough ports were provided - otherwise, we "pick any port"
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp4_static_ports) {
|
||||
orte_node_rank_t nrank;
|
||||
/* do I know my node_local_rank yet? */
|
||||
if (ORTE_NODE_RANK_INVALID != (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME)) &&
|
||||
nrank < opal_argv_count(mca_oob_tcp_component.tcp4_static_ports)) {
|
||||
/* any daemon takes the first entry, so we start with the second */
|
||||
port = strtol(mca_oob_tcp_component.tcp4_static_ports[nrank+1], NULL, 10);
|
||||
/* save the port for later use */
|
||||
orte_process_info.my_port = port;
|
||||
/* convert it to network-byte-order */
|
||||
port = htons(port);
|
||||
/* flag that we are using static ports */
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
/* if we are the HNP or a tool, then we must let the
|
||||
* system pick any port
|
||||
*/
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
}
|
||||
|
||||
#if OPAL_WANT_IPV6
|
||||
if (AF_INET6 == af_family) {
|
||||
if(orte_process_info.daemon) {
|
||||
/* if static ports were provided, the daemon takes the
|
||||
* first entry in the list - otherwise, we "pick any port"
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
|
||||
port = strtol(mca_oob_tcp_component.tcp6_static_ports[0], NULL, 10);
|
||||
/* save the port for later use */
|
||||
orte_process_info.my_port = port;
|
||||
/* convert it to network-byte-order */
|
||||
port = htons(port);
|
||||
/* flag that we are using static ports */
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else if (orte_process_info.mpi_proc) {
|
||||
/* if static ports were provided, an mpi proc takes its
|
||||
* node_local_rank entry in the list IF it has that info
|
||||
* AND enough ports were provided - otherwise, we "pick any port"
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
|
||||
orte_node_rank_t nrank;
|
||||
/* do I know my node_local_rank yet? */
|
||||
if (ORTE_NODE_RANK_INVALID != (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME)) &&
|
||||
nrank < opal_argv_count(mca_oob_tcp_component.tcp6_static_ports)) {
|
||||
/* any daemon takes the first entry, so we start with the second */
|
||||
port = strtol(mca_oob_tcp_component.tcp6_static_ports[nrank+1], NULL, 10);
|
||||
/* save the port for later use */
|
||||
orte_process_info.my_port = port;
|
||||
/* convert it to network-byte-order */
|
||||
port = htons(port);
|
||||
/* flag that we are using static ports */
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
} else {
|
||||
/* if we are the HNP or a tool, then we must let the
|
||||
* system pick any port
|
||||
*/
|
||||
port = 0;
|
||||
orte_static_ports = false;
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
|
||||
/* Enable/disable reusing ports */
|
||||
if (orte_static_ports) {
|
||||
flags = 1;
|
||||
} else {
|
||||
flags = 0;
|
||||
}
|
||||
if (setsockopt (*target_sd, SOL_SOCKET, SO_REUSEADDR, (const char *)&flags, sizeof(flags)) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_create_listen: unable to unset the "
|
||||
opal_output(0, "mca_oob_tcp_create_listen: unable to set the "
|
||||
"SO_REUSEADDR option (%s:%d)\n",
|
||||
strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(*target_sd);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* If an explicit range of ports was given, find the first open
|
||||
port in the range. Otherwise, tcp_port_min will be 0, which
|
||||
means "pick any port" */
|
||||
if (AF_INET == af_family) {
|
||||
range = mca_oob_tcp_component.tcp_port_range;
|
||||
port = mca_oob_tcp_component.tcp_port_min;
|
||||
}
|
||||
#if OPAL_WANT_IPV6
|
||||
if (AF_INET6 == af_family) {
|
||||
range = mca_oob_tcp_component.tcp6_port_range;
|
||||
port = mca_oob_tcp_component.tcp6_port_min;
|
||||
}
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
|
||||
#if 0
|
||||
/* flag whether or not static ports are in use so that other
|
||||
* parts of ORTE can act appropriately
|
||||
* LEAVE OFF FOR MOMENT PENDING FURTHER TEST
|
||||
*/
|
||||
if (0 != port) {
|
||||
orte_static_ports = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (index = 0; index < range; index++ ) {
|
||||
if (AF_INET == af_family) {
|
||||
((struct sockaddr_in*) &inaddr)->sin_port = port + index;
|
||||
} else if (AF_INET6 == af_family) {
|
||||
((struct sockaddr_in6*) &inaddr)->sin6_port = port + index;
|
||||
} else {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(bind(*target_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
|
||||
if( (EADDRINUSE == opal_socket_errno) || (EADDRNOTAVAIL == opal_socket_errno) ) {
|
||||
continue;
|
||||
}
|
||||
opal_output(0, "bind() failed: %s (%d)",
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno );
|
||||
CLOSE_THE_SOCKET(*target_sd);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
goto socket_binded;
|
||||
if (AF_INET == af_family) {
|
||||
((struct sockaddr_in*) &inaddr)->sin_port = port;
|
||||
} else if (AF_INET6 == af_family) {
|
||||
((struct sockaddr_in6*) &inaddr)->sin6_port = port;
|
||||
} else {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(bind(*target_sd, (struct sockaddr*)&inaddr, addrlen) < 0) {
|
||||
opal_output(0, "bind() failed: %s (%d)",
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno );
|
||||
CLOSE_THE_SOCKET(*target_sd);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (AF_INET == af_family ) {
|
||||
opal_output(0, "bind() failed: no port available in the range [%d..%d]",
|
||||
mca_oob_tcp_component.tcp_port_min,
|
||||
mca_oob_tcp_component.tcp_port_min + range);
|
||||
}
|
||||
#if OPAL_WANT_IPV6
|
||||
if (AF_INET6 == af_family) {
|
||||
opal_output(0, "bind6() failed: no port available in the range [%d..%d]",
|
||||
mca_oob_tcp_component.tcp6_port_min,
|
||||
mca_oob_tcp_component.tcp6_port_min + range);
|
||||
}
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
|
||||
CLOSE_THE_SOCKET(*target_sd);
|
||||
return ORTE_ERROR;
|
||||
|
||||
socket_binded:
|
||||
/* resolve assigned port */
|
||||
if (getsockname(*target_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_create_listen: getsockname(): %s (%d)",
|
||||
@ -601,12 +687,19 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* record the assigned port */
|
||||
if (AF_INET == af_family) {
|
||||
*target_port = ((struct sockaddr_in*) &inaddr)->sin_port;
|
||||
} else {
|
||||
*target_port = ((struct sockaddr_in6*) &inaddr)->sin6_port;
|
||||
}
|
||||
|
||||
if (0 == port) {
|
||||
/* if we dynamically assigned the port, save it here,
|
||||
* remembering to convert it back from network byte order first
|
||||
*/
|
||||
orte_process_info.my_port = ntohs(*target_port);
|
||||
}
|
||||
|
||||
/* setup listen backlog to maximum allowed by kernel */
|
||||
if(listen(*target_sd, SOMAXCONN) < 0) {
|
||||
opal_output(0, "mca_oob_tcp_component_init: listen(): %s (%d)",
|
||||
|
@ -203,15 +203,13 @@ struct mca_oob_tcp_component_t {
|
||||
opal_event_t tcp_recv_event; /**< event structure for IPv4 recvs */
|
||||
int tcp_listen_sd; /**< listen socket for incoming IPv4 connection requests */
|
||||
unsigned short tcp_listen_port; /**< IPv4 listen port */
|
||||
int tcp_port_min; /**< Minimum allowed port for the OOB listen socket */
|
||||
int tcp_port_range; /**< Range of allowed TCP ports */
|
||||
char** tcp4_static_ports; /**< Static ports - IPV4 */
|
||||
int disable_family; /**< disable AF: 0-nothing, 4-IPv4, 6-IPv6 */
|
||||
#if OPAL_WANT_IPV6
|
||||
opal_event_t tcp6_recv_event; /**< event structure for IPv6 recvs */
|
||||
int tcp6_listen_sd; /**< listen socket for incoming IPv6 connection requests */
|
||||
unsigned short tcp6_listen_port; /**< IPv6 listen port */
|
||||
int tcp6_port_min; /**< Minimum allowed port for the OOB listen socket */
|
||||
int tcp6_port_range; /**< Range of allowed TCP ports */
|
||||
char** tcp6_static_ports; /**< Static port - IPV6 */
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
opal_mutex_t tcp_lock; /**< lock for accessing module state */
|
||||
opal_list_t tcp_events; /**< list of pending events (accepts) */
|
||||
|
@ -516,10 +516,13 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
||||
CLEANUP:
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:orted_report_launch %s for daemon %s",
|
||||
"%s plm:base:orted_report_launch %s for daemon %s at contact %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orted_failed_launch ? "failed" : "completed",
|
||||
ORTE_NAME_PRINT(&mev->sender)));
|
||||
ORTE_NAME_PRINT(&mev->sender), pdatorted[mev->sender.vpid]->rml_uri));
|
||||
|
||||
/* release the message */
|
||||
OBJ_RELEASE(mev);
|
||||
|
||||
if (orted_failed_launch) {
|
||||
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
|
@ -280,7 +280,7 @@ static int plm_slurmd_launch_job(orte_job_t *jdata)
|
||||
|
||||
/* Add basic orted command line options, including debug flags */
|
||||
orte_plm_base_orted_append_basic_args(&argc, &argv,
|
||||
"slurmd",
|
||||
"slurm",
|
||||
&proc_vpid_index,
|
||||
false);
|
||||
|
||||
@ -344,7 +344,7 @@ static int plm_slurmd_launch_job(orte_job_t *jdata)
|
||||
env = opal_argv_copy(orte_launch_environ);
|
||||
|
||||
/* add the nodelist */
|
||||
var = mca_base_param_environ_variable("orte", "slurmd", "nodelist");
|
||||
var = mca_base_param_environ_variable("orte", "slurm", "nodelist");
|
||||
opal_setenv(var, nodelist_flat, true, &env);
|
||||
free(nodelist_flat);
|
||||
free(var);
|
||||
|
@ -319,8 +319,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if this is going to the HNP, send direct */
|
||||
if (ORTE_PROC_MY_HNP->jobid == target->jobid &&
|
||||
/* if we are not using static ports and this is going to the HNP, send direct */
|
||||
if (!orte_static_ports &&
|
||||
ORTE_PROC_MY_HNP->jobid == target->jobid &&
|
||||
ORTE_PROC_MY_HNP->vpid == target->vpid) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing not enabled - going direct",
|
||||
@ -362,7 +363,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
found:
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_routed_base_output,
|
||||
"%s routed_linear_get(%s) --> %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(target),
|
||||
|
@ -591,21 +591,14 @@ int orte_daemon(int argc, char *argv[])
|
||||
*/
|
||||
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
/* if we are using static ports, there is no need to send our
|
||||
* contact info back to HNP - it already knows how to reach us
|
||||
* Instead, just send a zero-byte buffer for barrier purposes
|
||||
/* for now, always include our contact info, even if we are using
|
||||
* static ports. Eventually, this will be removed
|
||||
*/
|
||||
if (!orte_static_ports) {
|
||||
if (orte_debug_daemons_flag) {
|
||||
fprintf(stderr, "Daemon %s not using static ports\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
rml_uri = orte_rml.get_contact_info();
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
rml_uri = orte_rml.get_contact_info();
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(buffer);
|
||||
return ret;
|
||||
}
|
||||
/* send our architecture */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &orte_process_info.arch, 1, OPAL_INT32))) {
|
||||
|
@ -273,10 +273,7 @@ main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* parse the input ranks */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_parse_rank_options(my_globals.ranks, &ranks))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto cleanup;
|
||||
}
|
||||
orte_util_parse_range_options(my_globals.ranks, &ranks);
|
||||
|
||||
/* pull the specified output streams and dump to our stdout */
|
||||
for (i=0; i < opal_argv_count(ranks); i++) {
|
||||
|
@ -35,11 +35,12 @@
|
||||
|
||||
#include "orte/util/parse_options.h"
|
||||
|
||||
int orte_util_parse_rank_options(char *input, char ***output)
|
||||
void orte_util_parse_range_options(char *input, char ***output)
|
||||
{
|
||||
char **r1=NULL, **r2=NULL;
|
||||
int i, vint;
|
||||
orte_vpid_t start, end, vpid;
|
||||
int start, end, n;
|
||||
char nstr[32];
|
||||
|
||||
/* split on commas */
|
||||
r1 = opal_argv_split(input, ',');
|
||||
@ -61,10 +62,11 @@ int orte_util_parse_rank_options(char *input, char ***output)
|
||||
goto cleanup;
|
||||
}
|
||||
start = strtol(r2[0], NULL, 10);
|
||||
end = start + 1;
|
||||
end = start;
|
||||
}
|
||||
for (vpid = start; vpid < end; vpid++) {
|
||||
opal_argv_append_nosize(output, ORTE_VPID_PRINT(vpid));
|
||||
for (n = start; n <= end; n++) {
|
||||
snprintf(nstr, 32, "%d", n);
|
||||
opal_argv_append_nosize(output, nstr);
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,6 +74,4 @@ cleanup:
|
||||
opal_argv_free(r1);
|
||||
opal_argv_free(r2);
|
||||
|
||||
/* All was good */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -29,7 +29,7 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_DECLSPEC int orte_util_parse_rank_options(char *input, char ***output);
|
||||
ORTE_DECLSPEC void orte_util_parse_range_options(char *input, char ***output);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -54,6 +54,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
|
||||
/* .tool = */ false,
|
||||
/* .mpi_proc = */ false,
|
||||
/* .sync_buf = */ NULL,
|
||||
/* .my_port = */ 0,
|
||||
/* .tmpdir_base = */ NULL,
|
||||
/* .top_session_dir = */ NULL,
|
||||
/* .job_session_dir = */ NULL,
|
||||
|
@ -64,6 +64,7 @@ struct orte_proc_info_t {
|
||||
bool tool; /**< I am a tool or not */
|
||||
bool mpi_proc; /**< I am an MPI process */
|
||||
opal_buffer_t *sync_buf; /**< buffer to store sync response */
|
||||
uint16_t my_port; /**< TCP port for out-of-band comm */
|
||||
/* The session directory has the form
|
||||
* <prefix>/<openmpi-sessions-user>/<jobid>/<procid>, where the prefix
|
||||
* can either be provided by the user via the
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user