2004-11-22 03:37:56 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-11-22 03:37:56 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte_config.h"
|
2004-10-22 20:06:05 +04:00
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2004-09-02 18:45:01 +04:00
|
|
|
#include <sys/types.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_SOCKET_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <sys/socket.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_NETINET_IN_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <netinet/in.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
|
|
|
#ifdef HAVE_ARPA_INET_H
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <arpa/inet.h>
|
2004-10-22 20:06:05 +04:00
|
|
|
#endif
|
2004-09-02 18:20:13 +04:00
|
|
|
#include <string.h>
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "orte/orte_constants.h"
|
2005-07-04 05:36:20 +04:00
|
|
|
#include "opal/util/if.h"
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
|
|
|
|
#include "orte/mca/ns/ns_types.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "orte/dss/dss.h"
|
|
|
|
|
2004-09-02 18:20:13 +04:00
|
|
|
#include "oob_tcp.h"
|
|
|
|
#include "oob_tcp_addr.h"
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Constructor for mca_oob_tcp_addr_t: place a newly created object in
 * its empty state (no name, no addresses, iteration not yet started).
 *
 * @param addr  object being constructed
 */
static void mca_oob_tcp_addr_construct(mca_oob_tcp_addr_t* addr)
{
    /* no address array is owned yet */
    addr->addr_inet  = NULL;
    addr->addr_alloc = 0;
    addr->addr_count = 0;

    /* iteration state consumed by mca_oob_tcp_addr_get_next() */
    addr->addr_next    = 0;
    addr->addr_matched = false;

    /* the name is filled in later, e.g. by mca_oob_tcp_addr_unpack() */
    memset(&addr->addr_name, 0, sizeof(addr->addr_name));
}
|
|
|
|
|
|
|
|
/**
 * Destructor for mca_oob_tcp_addr_t: release the address array, if one
 * was ever allocated.
 *
 * @param addr  object being destroyed
 */
static void mca_oob_tcp_addr_destruct(mca_oob_tcp_addr_t* addr)
{
    if(NULL == addr->addr_inet) {
        return;   /* nothing was allocated */
    }
    free(addr->addr_inet);
}
|
|
|
|
|
|
|
|
/*
 * Class instance for mca_oob_tcp_addr_t, with opal_object_t as parent
 * class and the constructor/destructor pair defined in this file.
 */
OBJ_CLASS_INSTANCE(mca_oob_tcp_addr_t,
                   opal_object_t,
                   mca_oob_tcp_addr_construct,
                   mca_oob_tcp_addr_destruct);
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
int mca_oob_tcp_addr_pack(orte_buffer_t* buffer)
|
2004-09-02 18:20:13 +04:00
|
|
|
{
|
|
|
|
uint32_t count = 0;
|
|
|
|
int i;
|
2005-03-14 23:57:21 +03:00
|
|
|
int rc;
|
|
|
|
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 22:34:59 +03:00
|
|
|
rc = orte_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME);
|
2005-03-14 23:57:21 +03:00
|
|
|
if(rc != ORTE_SUCCESS)
|
|
|
|
return rc;
|
2004-09-02 18:20:13 +04:00
|
|
|
|
2005-07-04 05:36:20 +04:00
|
|
|
for(i=opal_ifbegin(); i>0; i=opal_ifnext(i)) {
|
2004-09-02 18:20:13 +04:00
|
|
|
struct sockaddr_in inaddr;
|
2005-07-04 05:36:20 +04:00
|
|
|
opal_ifindextoaddr(i, (struct sockaddr*)&inaddr, sizeof(inaddr));
|
2006-07-12 00:54:49 +04:00
|
|
|
if(opal_ifcount() > 1 &&
|
|
|
|
opal_ifislocalhost((struct sockaddr*) &inaddr))
|
2004-09-02 18:20:13 +04:00
|
|
|
continue;
|
|
|
|
count++;
|
2004-10-01 01:23:10 +04:00
|
|
|
}
|
2006-02-07 06:32:36 +03:00
|
|
|
rc = orte_dss.pack(buffer, &count, 1, ORTE_INT32);
|
2005-03-14 23:57:21 +03:00
|
|
|
if(rc != ORTE_SUCCESS)
|
|
|
|
return rc;
|
2004-09-02 18:20:13 +04:00
|
|
|
|
2005-07-04 05:36:20 +04:00
|
|
|
for(i=opal_ifbegin(); i>0; i=opal_ifnext(i)) {
|
2004-09-02 18:20:13 +04:00
|
|
|
struct sockaddr_in inaddr;
|
2006-10-25 19:09:30 +04:00
|
|
|
uint8_t type;
|
|
|
|
uint32_t ipaddr;
|
|
|
|
uint16_t port;
|
|
|
|
|
2005-07-04 05:36:20 +04:00
|
|
|
opal_ifindextoaddr(i, (struct sockaddr*)&inaddr, sizeof(inaddr));
|
2006-07-12 00:54:49 +04:00
|
|
|
if(opal_ifcount() > 1 &&
|
|
|
|
opal_ifislocalhost((struct sockaddr*) &inaddr))
|
2004-09-02 18:20:13 +04:00
|
|
|
continue;
|
2006-10-25 19:09:30 +04:00
|
|
|
|
|
|
|
switch (inaddr.sin_family) {
|
|
|
|
case AF_INET:
|
|
|
|
type = MCA_OOB_TCP_ADDR_TYPE_AFINET;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
/* shouldn't get here, as opal_if shouldn't allow anything
|
|
|
|
but AFINET. Will need another case once IPv6 code is
|
|
|
|
committed. */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
orte_dss.pack(buffer, &type, 1, ORTE_INT8);
|
|
|
|
|
|
|
|
port = mca_oob_tcp_component.tcp_listen_port;
|
|
|
|
orte_dss.pack(buffer, &port, sizeof(port), ORTE_BYTE);
|
|
|
|
|
|
|
|
/* This will need to be adjusted for IPv6 */
|
|
|
|
ipaddr = (uint32_t) inaddr.sin_addr.s_addr;
|
|
|
|
orte_dss.pack(buffer, &ipaddr, sizeof(ipaddr), ORTE_BYTE);
|
2004-09-02 18:20:13 +04:00
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
return ORTE_SUCCESS;
|
2004-09-02 18:20:13 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/**
 * Build an address object from a buffer written by mca_oob_tcp_addr_pack().
 *
 * Reads the process name, the address count, and then for each address
 * its type byte, listen port and IPv4 address (both stored exactly as
 * they appear in the buffer).
 *
 * @param buffer  buffer to read from
 * @return a new mca_oob_tcp_addr_t owned by the caller, or NULL if the
 *         object or its address array could not be allocated, an unpack
 *         failed, or an unknown address type was encountered
 */
mca_oob_tcp_addr_t* mca_oob_tcp_addr_unpack(orte_buffer_t* buffer)
{
    mca_oob_tcp_addr_t* addr = OBJ_NEW(mca_oob_tcp_addr_t);
    orte_std_cntr_t count;
    orte_std_cntr_t i;
    int rc;

    if(NULL == addr)
        return NULL;

    count = 1;
    rc = orte_dss.unpack(buffer, &addr->addr_name, &count, ORTE_NAME);
    if(rc != ORTE_SUCCESS)
        goto error;

    count = 1;
    rc = orte_dss.unpack(buffer, &addr->addr_count, &count, ORTE_INT32);
    if(rc != ORTE_SUCCESS)
        goto error;

    if(addr->addr_count == 0)
        return addr;

    /* calloc rather than malloc: the count comes off the wire, so let the
       allocator reject a count * size overflow, and make sure the fields
       not assigned below (e.g. sin_zero) are zero rather than garbage */
    addr->addr_inet = (struct sockaddr_in *)calloc(addr->addr_count, sizeof(struct sockaddr_in));
    if(NULL == addr->addr_inet)
        goto error;
    addr->addr_alloc = addr->addr_count;

    for(i=0; i<addr->addr_count; i++) {
        uint8_t type;
        uint32_t ipaddr;
        uint16_t port;

        /* unpack and expand family */
        count = 1;
        rc = orte_dss.unpack(buffer, &type, &count, ORTE_INT8);
        if(rc != ORTE_SUCCESS)
            goto error;

        switch (type) {
        case MCA_OOB_TCP_ADDR_TYPE_AFINET:
            addr->addr_inet[i].sin_family = AF_INET;
            break;
        default:
            /* unknown address type: the rest of the record cannot be parsed */
            goto error;
        }

        /* and the listen port */
        count = sizeof(port);
        rc = orte_dss.unpack(buffer, &port, &count, ORTE_BYTE);
        if(rc != ORTE_SUCCESS)
            goto error;
        addr->addr_inet[i].sin_port = port;

        /* and the address. need to fix for IPv6 */
        count = sizeof(ipaddr);
        rc = orte_dss.unpack(buffer, &ipaddr, &count, ORTE_BYTE);
        if(rc != ORTE_SUCCESS)
            goto error;
        addr->addr_inet[i].sin_addr.s_addr = ipaddr;
    }
    return addr;

error:
    /* releasing the object runs the destructor, which frees addr_inet */
    OBJ_RELEASE(addr);
    return NULL;
}
|
|
|
|
|
|
|
|
|
2005-09-19 18:47:11 +04:00
|
|
|
/**
 * Return the next address to try for a peer, cycling through its
 * address list.
 *
 * On the first call for a given object, the peer's addresses are
 * compared against the local interfaces; iteration starts at the first
 * peer address that shares a network prefix (address & netmask) with a
 * local interface.  Local interfaces are ignored if they are filtered
 * out by tcp_include / tcp_exclude, or if they are localhost while more
 * than one interface exists.  If nothing matches, iteration starts from
 * the current addr_next.  Each call then advances addr_next, wrapping
 * to 0 at the end of the list.
 *
 * @param addr    peer address object
 * @param retval  [out] receives a copy of the selected address
 * @return ORTE_SUCCESS, or ORTE_ERROR if addr or retval is NULL or the
 *         object holds no addresses
 */
int mca_oob_tcp_addr_get_next(mca_oob_tcp_addr_t* addr, struct sockaddr_in* retval)
{
    if(addr == NULL || retval == NULL ||
       addr->addr_count == 0 || addr->addr_inet == NULL)
        return ORTE_ERROR;

    if(addr->addr_matched == false) {
        orte_std_cntr_t i=0;
        for(i=0; i<addr->addr_count; i++) {
            int ifindex;
            for(ifindex=opal_ifbegin(); ifindex>0; ifindex=opal_ifnext(ifindex)) {
                struct sockaddr_in inaddr;
                struct sockaddr_in inmask;
                char name[32];

                /* zero the outputs so a failed lookup leaves defined
                   contents instead of indeterminate stack bytes */
                memset(name, 0, sizeof(name));
                memset(&inaddr, 0, sizeof(inaddr));
                memset(&inmask, 0, sizeof(inmask));

                /* the filters apply to the LOCAL interface, so the name
                   must be looked up by ifindex, not by the peer index i */
                opal_ifindextoname(ifindex, name, sizeof(name));
                if (mca_oob_tcp_component.tcp_include != NULL &&
                    strstr(mca_oob_tcp_component.tcp_include,name) == NULL)
                    continue;
                if (mca_oob_tcp_component.tcp_exclude != NULL &&
                    strstr(mca_oob_tcp_component.tcp_exclude,name) != NULL)
                    continue;

                opal_ifindextoaddr(ifindex, (struct sockaddr*)&inaddr, sizeof(inaddr));
                if(opal_ifcount() > 1 &&
                   opal_ifislocalhost((struct sockaddr*) &inaddr))
                    continue;
                opal_ifindextomask(ifindex, (struct sockaddr*)&inmask, sizeof(inmask));

                /* if match on network prefix - start here */
                if((inaddr.sin_addr.s_addr & inmask.sin_addr.s_addr) ==
                   (addr->addr_inet[i].sin_addr.s_addr & inmask.sin_addr.s_addr)) {
                    addr->addr_next = i;
                    goto done;
                }
            }
        }
done:
        /* the search runs only once per object, whether or not it matched */
        addr->addr_matched = true;
    }

    *retval = addr->addr_inet[addr->addr_next];
    if(++addr->addr_next >= addr->addr_count)
        addr->addr_next = 0;
    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Append a copy of an address to the object's address list, growing the
 * backing array when it is full (initial capacity 2, doubled thereafter).
 *
 * On allocation failure the object is left unchanged: the existing
 * array, its capacity and its count all remain valid.
 *
 * @param addr    address object to extend
 * @param inaddr  address to copy into the list
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if memory could not
 *         be obtained
 */
int mca_oob_tcp_addr_insert(mca_oob_tcp_addr_t* addr, const struct sockaddr_in* inaddr)
{
    if(addr->addr_alloc == 0 || addr->addr_count == addr->addr_alloc) {
        size_t wanted = (addr->addr_alloc == 0) ? 2 : 2 * (size_t)addr->addr_alloc;
        struct sockaddr_in* grown;

        /* grow through a temporary: assigning realloc's result straight
           to addr_inet would leak the old array and leave a NULL pointer
           with a non-zero count if the call failed */
        grown = (struct sockaddr_in *)realloc(addr->addr_inet, wanted * sizeof(struct sockaddr_in));
        if(NULL == grown)
            return ORTE_ERR_OUT_OF_RESOURCE;

        /* commit the new array and capacity only after success */
        addr->addr_inet = grown;
        if(addr->addr_alloc == 0)
            addr->addr_alloc = 2;
        else
            addr->addr_alloc <<= 1;
    }
    if(NULL == addr->addr_inet)
        return ORTE_ERR_OUT_OF_RESOURCE;

    memcpy(addr->addr_inet+addr->addr_count, inaddr, sizeof(struct sockaddr_in));
    addr->addr_count++;
    return ORTE_SUCCESS;
}
|
|
|
|
|