2004-11-22 03:37:56 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-11-22 03:37:56 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte_config.h"
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2004-09-02 18:45:01 +04:00
|
|
|
#include <sys/types.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_SOCKET_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <sys/socket.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_NETINET_IN_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <netinet/in.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_ARPA_INET_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <arpa/inet.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <string.h>
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_constants.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/if.h"
|
2007-05-17 05:17:59 +04:00
|
|
|
#include "opal/util/net.h"
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
|
|
|
|
#include "orte/mca/ns/ns_types.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "orte/dss/dss.h"
|
|
|
|
|
2004-09-02 18:20:13 +04:00
|
|
|
#include "oob_tcp.h"
|
|
|
|
#include "oob_tcp_addr.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Object constructor: put a freshly created address set into its empty
 * state (no storage, no entries, not yet classified).
 */
static void mca_oob_tcp_addr_construct(mca_oob_tcp_addr_t* addr)
{
    /* no address array has been allocated yet */
    addr->addr_inet  = NULL;
    addr->addr_alloc = 0;
    addr->addr_count = 0;

    /* iteration starts at the first slot, classification still pending */
    addr->addr_next    = 0;
    addr->addr_matched = MCA_OOB_TCP_ADDR_UNCLASSIFIED;

    /* clear the name field byte-wise */
    memset(&addr->addr_name, 0, sizeof(addr->addr_name));
}
|
|
|
|
|
|
|
|
/*
 * Object destructor: release the address array, if one was ever allocated.
 */
static void mca_oob_tcp_addr_destruct(mca_oob_tcp_addr_t* addr)
{
    if (NULL == addr->addr_inet) {
        return;   /* nothing was allocated */
    }
    free(addr->addr_inet);
}
|
|
|
|
|
|
|
|
/*
 * Class instance for mca_oob_tcp_addr_t: parent class opal_object_t,
 * using the constructor and destructor from this file.
 */
OBJ_CLASS_INSTANCE(mca_oob_tcp_addr_t, opal_object_t,
                   mca_oob_tcp_addr_construct, mca_oob_tcp_addr_destruct);
|
|
|
|
|
|
|
|
|
2007-07-20 05:34:02 +04:00
|
|
|
/**
 * Copy the next address to try for a peer into retval.
 *
 * While addr->addr_matched still equals MCA_OOB_TCP_ADDR_UNCLASSIFIED, the
 * peer's addresses are compared against every local device in
 * mca_oob_tcp_component.tcp_available_devices to pick the index to start
 * from (addr->addr_next).  After the selected address has been copied out,
 * addr->addr_next is advanced and wraps to 0 at addr->addr_count, so
 * repeated calls present successive addresses.
 *
 * @param addr    address set to read from
 * @param retval  (OUT) destination; sizeof(struct sockaddr_in) bytes are
 *                written for an AF_INET entry, sizeof(struct sockaddr_in6)
 *                bytes for any other entry, so the buffer must be large
 *                enough for both
 *
 * @return ORTE_SUCCESS on success; ORTE_ERROR if addr or retval is NULL or
 *         the set contains no addresses
 */
int mca_oob_tcp_addr_get_next(mca_oob_tcp_addr_t* addr, struct sockaddr* retval)
{
    /* Accumulates across calls (function-static), shared by all address sets. */
    static uint32_t i_have = MCA_OOB_TCP_ADDR_UNCLASSIFIED; /* my own capabilities */

    /* retval is written unconditionally below, so reject NULL up front */
    if((NULL == addr) || (NULL == retval) || (0 == addr->addr_count)) {
        return ORTE_ERROR;
    }

    if(MCA_OOB_TCP_ADDR_UNCLASSIFIED == addr->addr_matched) {
        orte_std_cntr_t i=0;
        for(i=0; i<addr->addr_count; i++) {
            opal_list_item_t *item;
            for (item = opal_list_get_first(&mca_oob_tcp_component.tcp_available_devices) ;
                 item != opal_list_get_end(&mca_oob_tcp_component.tcp_available_devices) ;
                 item = opal_list_get_next(item)) {
                mca_oob_tcp_device_t *dev = (mca_oob_tcp_device_t*) item;
                uint32_t inmask;

                /* NOTE(review): the result of opal_ifindextomask() is not
                   checked; if it can fail, inmask is used uninitialized in
                   the opal_net_samenetwork() call below - confirm. */
                opal_ifindextomask(dev->if_index, &inmask, sizeof(inmask));

                /* Decide which address to try first; note that we're
                   called multiple times and each time we need to
                   present a different address

                   Precedence rules:

                   - IPv4public has the highest priority
                   - when IPv4private + IPv6, use IPv6 (this should
                     be changed when there is something like a CellID)
                */
                if (true == opal_net_addr_isipv4public ((struct sockaddr*) &dev->if_addr)) {
                    i_have |= MCA_OOB_TCP_ADDR_IPV4public;
                }

                if (true == opal_net_addr_isipv4public ((struct sockaddr*)&addr->addr_inet[i])) {
                    addr->addr_matched |= MCA_OOB_TCP_ADDR_IPV4public;
                }

                /* both sides have a public IPv4 address: start here */
                if ((MCA_OOB_TCP_ADDR_IPV4public ==
                     (i_have & MCA_OOB_TCP_ADDR_IPV4public)) &&
                    (MCA_OOB_TCP_ADDR_IPV4public ==
                     (addr->addr_matched & MCA_OOB_TCP_ADDR_IPV4public))) {
                    addr->addr_next = i;
                    goto done;
                }

                if (AF_INET6 == dev->if_addr.ss_family) {
                    i_have |= MCA_OOB_TCP_ADDR_IPV6;
                }

                /* NOTE(review): unlike the IPv4-public rule above, an IPv6
                   peer address is selected without consulting i_have -
                   confirm this is intended. */
                if (AF_INET6 == addr->addr_inet[i].ss_family) {
                    addr->addr_matched |= MCA_OOB_TCP_ADDR_IPV6;
                    addr->addr_next = i;
                    goto done;
                }

                /* if match on network prefix - start here */
                /* Bug, FIXME: This code is dangerous, it will prefer
                   local addresses even if they point to wrong hosts
                   (the multicluster problem).

                   We need more magic to select the best address

                   adi@2006-09-30
                */
                if(opal_net_samenetwork((struct sockaddr*) &dev->if_addr,
                                        (struct sockaddr*)&addr->addr_inet[i],
                                        inmask)) {
                    addr->addr_matched |= MCA_OOB_TCP_ADDR_MATCHED;
                    addr->addr_next = i;
                    goto done;
                }
            }
        }
    done:
        ; /* NOP */
    }

    /* copy only as many bytes as the selected entry's family is taken to need */
    if (addr->addr_inet[addr->addr_next].ss_family == AF_INET) {
        memcpy(retval, &addr->addr_inet[addr->addr_next],
               sizeof(struct sockaddr_in));
    } else {
        memcpy(retval, &addr->addr_inet[addr->addr_next],
               sizeof(struct sockaddr_in6));
    }

    /* advance for the next call, wrapping at the end of the array */
    if(++addr->addr_next >= addr->addr_count) {
        addr->addr_next = 0;
    }
    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
|
2007-07-20 05:34:02 +04:00
|
|
|
int
|
|
|
|
mca_oob_tcp_addr_insert(mca_oob_tcp_addr_t* addr, const struct sockaddr* inaddr)
|
2004-09-02 18:20:13 +04:00
|
|
|
{
|
|
|
|
if(addr->addr_alloc == 0) {
|
|
|
|
addr->addr_alloc = 2;
|
2007-07-20 05:34:02 +04:00
|
|
|
addr->addr_inet = (struct sockaddr_storage*) malloc(addr->addr_alloc * sizeof(struct sockaddr_storage));
|
2004-09-02 18:20:13 +04:00
|
|
|
} else if(addr->addr_count == addr->addr_alloc) {
|
|
|
|
addr->addr_alloc <<= 1;
|
2007-07-20 05:34:02 +04:00
|
|
|
addr->addr_inet = (struct sockaddr_storage*) realloc(addr->addr_inet, addr->addr_alloc * sizeof(struct sockaddr_storage));
|
|
|
|
}
|
|
|
|
if(NULL == addr->addr_inet) return ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
|
|
|
|
if (inaddr->sa_family == AF_INET) {
|
|
|
|
memcpy(addr->addr_inet+addr->addr_count, inaddr, sizeof(struct sockaddr_in));
|
|
|
|
} else {
|
|
|
|
memcpy(addr->addr_inet+addr->addr_count, inaddr, sizeof(struct sockaddr_in6));
|
2004-09-02 18:20:13 +04:00
|
|
|
}
|
|
|
|
addr->addr_count++;
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-09-02 18:20:13 +04:00
|
|
|
}
|