2004-07-08 18:48:34 +04:00
|
|
|
/*
|
2004-11-22 04:38:40 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-11-22 03:37:56 +03:00
|
|
|
* $HEADER$
|
2004-07-08 18:48:34 +04:00
|
|
|
*/
|
|
|
|
|
2004-08-19 03:24:27 +04:00
|
|
|
#include "ompi_config.h"
|
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
#include <string.h>
|
2004-07-08 18:48:34 +04:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
2005-07-04 03:31:27 +04:00
|
|
|
#include "opal/util/output.h"
|
2005-07-15 02:43:01 +04:00
|
|
|
#include "opal/util/sys_info.h"
|
|
|
|
#include "orte/dps/dps.h"
|
|
|
|
#include "orte/mca/oob/oob.h"
|
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/gpr/gpr.h"
|
|
|
|
#include "orte/util/proc_info.h"
|
|
|
|
#include "ompi/proc/proc.h"
|
|
|
|
#include "ompi/mca/pml/pml.h"
|
2005-08-05 22:03:30 +04:00
|
|
|
#include "ompi/datatype/dt_arch.h"
|
2005-07-15 02:43:01 +04:00
|
|
|
#include "ompi/datatype/convertor.h"
|
2004-01-29 18:34:47 +03:00
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
/* Master list of every ompi_proc_t known to this process; entries are
 * appended by the class constructor and removed by the destructor.
 * All access is serialized by ompi_proc_lock. */
static opal_list_t ompi_proc_list;

/* Lock protecting ompi_proc_list (and used around list traversals). */
static opal_mutex_t ompi_proc_lock;

/* The proc structure describing this process itself; set in
 * ompi_proc_init() once the peer list is known. */
ompi_proc_t* ompi_proc_local_proc = NULL;

static void ompi_proc_construct(ompi_proc_t* proc);
static void ompi_proc_destruct(ompi_proc_t* proc);

/* Registers a GPR subscription (serviced later in MPI_INIT) used to
 * learn which peers live on the same node as this process. */
static int setup_registry_callback(void);

/* Subscription callback: marks procs that share our node with
 * OMPI_PROC_FLAG_LOCAL. */
static void callback(orte_gpr_notify_data_t *data, void *cbdata);

/* ompi_proc_t is a list item so instances can live directly on
 * ompi_proc_list. */
OBJ_CLASS_INSTANCE(
    ompi_proc_t,
    opal_list_item_t,
    ompi_proc_construct,
    ompi_proc_destruct
);
|
|
|
|
|
2004-01-16 03:31:58 +03:00
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
/*
 * Class constructor: initialize a proc's fields and publish it on the
 * global proc list.
 */
void ompi_proc_construct(ompi_proc_t* proc)
{
    proc->proc_pml = NULL;
    proc->proc_modex = NULL;
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);

    /* Assume a homogeneous environment until told otherwise: every new
     * proc starts with the local convertor and the local architecture.
     * When the registry callback fires later, heterogeneous peers get
     * their convertors corrected. */
    OBJ_RETAIN( ompi_mpi_local_convertor );
    proc->proc_convertor = ompi_mpi_local_convertor;
    proc->proc_arch = ompi_mpi_local_arch;

    proc->proc_flags = 0;

    /* make the new proc visible to everyone else */
    OPAL_THREAD_LOCK(&ompi_proc_lock);
    opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc);
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);
}
|
|
|
|
|
2004-01-29 18:34:47 +03:00
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
/*
 * Class destructor: release per-proc resources and remove the proc
 * from the global list.
 */
void ompi_proc_destruct(ompi_proc_t* proc)
{
    /* drop the modex data, if any was attached */
    if (NULL != proc->proc_modex) {
        OBJ_RELEASE(proc->proc_modex);
    }

    /* Convertors are created with OBJ_NEW, so a plain OBJ_RELEASE is
     * enough. The local convertor carries an extra reference held by
     * the datatype engine, so it survives until ompi_ddt_finalize;
     * all others are destroyed here once their count drops. */
    OBJ_RELEASE( proc->proc_convertor );

    OPAL_THREAD_LOCK(&ompi_proc_lock);
    opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);

    OBJ_DESTRUCT(&proc->proc_lock);
}
|
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
int ompi_proc_init(void)
|
2004-02-13 16:56:55 +03:00
|
|
|
{
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_process_name_t *peers;
|
|
|
|
size_t i, npeers, self;
|
2004-02-13 16:56:55 +03:00
|
|
|
int rc;
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
OBJ_CONSTRUCT(&ompi_proc_list, opal_list_t);
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t);
|
2004-10-15 01:04:45 +04:00
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
/* get all peers in this job */
|
2005-03-14 23:57:21 +03:00
|
|
|
if(OMPI_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &self))) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "ompi_proc_init: get_peers failed with errno=%d", rc);
|
2004-02-13 16:56:55 +03:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
/* find self */
|
2005-08-05 22:03:30 +04:00
|
|
|
for( i = 0; i < npeers; i++ ) {
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t *proc = OBJ_NEW(ompi_proc_t);
|
2004-07-01 18:49:54 +04:00
|
|
|
proc->proc_name = peers[i];
|
2005-03-14 23:57:21 +03:00
|
|
|
if( i == self ) {
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_local_proc = proc;
|
2005-07-15 02:43:01 +04:00
|
|
|
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
|
2004-03-03 19:44:41 +03:00
|
|
|
}
|
2004-02-13 16:56:55 +03:00
|
|
|
}
|
2005-03-14 23:57:21 +03:00
|
|
|
free(peers);
|
2005-07-15 02:43:01 +04:00
|
|
|
|
|
|
|
/* setup registry callback to find everyone on my local node.
|
|
|
|
Can't do a GPR get because we're in the middle of MPI_INIT,
|
|
|
|
and we're setup for the GPR compound command -- so create a
|
|
|
|
subscription which will be serviced later, at the end of the
|
|
|
|
compound command. */
|
|
|
|
if (ORTE_SUCCESS != (rc = setup_registry_callback())) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2005-08-05 22:03:30 +04:00
|
|
|
/* Here we have to add to the GPR the information about the current architecture.
|
|
|
|
* TODO: george
|
|
|
|
*/
|
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
return OMPI_SUCCESS;
|
2004-02-13 16:56:55 +03:00
|
|
|
}
|
|
|
|
|
2004-12-02 16:28:10 +03:00
|
|
|
int ompi_proc_finalize (void)
|
|
|
|
{
|
|
|
|
ompi_proc_t *proc, *nextproc, *endproc;
|
|
|
|
|
2005-07-03 20:22:16 +04:00
|
|
|
proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
|
|
|
nextproc = (ompi_proc_t*)opal_list_get_next(proc);
|
|
|
|
endproc = (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
2004-12-02 16:28:10 +03:00
|
|
|
|
|
|
|
OBJ_RELEASE(proc);
|
|
|
|
while ( nextproc != endproc ) {
|
2005-07-13 00:25:47 +04:00
|
|
|
proc = nextproc;
|
|
|
|
nextproc = (ompi_proc_t *)opal_list_get_next(proc);
|
|
|
|
OBJ_RELEASE(proc);
|
2004-12-02 16:28:10 +03:00
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&ompi_proc_list);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t** ompi_proc_world(size_t *size)
|
2004-02-13 16:56:55 +03:00
|
|
|
{
|
2005-07-15 02:43:01 +04:00
|
|
|
ompi_proc_t **procs;
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t *proc;
|
2004-02-13 16:56:55 +03:00
|
|
|
size_t count = 0;
|
2005-07-15 02:43:01 +04:00
|
|
|
orte_ns_cmp_bitmask_t mask;
|
|
|
|
orte_process_name_t my_name;
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
/* check bozo case */
|
|
|
|
if (NULL == ompi_proc_local_proc) {
|
2004-02-13 16:56:55 +03:00
|
|
|
return NULL;
|
2005-07-15 02:43:01 +04:00
|
|
|
}
|
|
|
|
mask = ORTE_NS_CMP_JOBID;
|
|
|
|
my_name = ompi_proc_local_proc->proc_name;
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
/* First count how many match this jobid */
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
2005-07-15 02:43:01 +04:00
|
|
|
for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
|
|
|
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
|
|
|
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
|
|
|
|
if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) {
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* allocate an array */
|
|
|
|
procs = (ompi_proc_t**) malloc(count * sizeof(ompi_proc_t*));
|
|
|
|
if (NULL == procs) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now save only the procs that match this jobid */
|
|
|
|
count = 0;
|
|
|
|
for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
|
|
|
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
|
|
|
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
|
|
|
|
if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) {
|
|
|
|
procs[count++] = proc;
|
|
|
|
}
|
2004-02-13 16:56:55 +03:00
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
|
2005-07-15 02:43:01 +04:00
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
*size = count;
|
|
|
|
return procs;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t** ompi_proc_all(size_t* size)
|
2004-02-13 16:56:55 +03:00
|
|
|
{
|
2004-10-18 20:11:14 +04:00
|
|
|
ompi_proc_t **procs =
|
2005-07-03 20:22:16 +04:00
|
|
|
(ompi_proc_t**) malloc(opal_list_get_size(&ompi_proc_list) * sizeof(ompi_proc_t*));
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t *proc;
|
2004-02-13 16:56:55 +03:00
|
|
|
size_t count = 0;
|
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
if (NULL == procs) {
|
2004-02-13 16:56:55 +03:00
|
|
|
return NULL;
|
2005-07-15 02:43:01 +04:00
|
|
|
}
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
2005-07-03 20:22:16 +04:00
|
|
|
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
|
|
|
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
|
|
|
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
|
2004-02-13 16:56:55 +03:00
|
|
|
OBJ_RETAIN(proc);
|
|
|
|
procs[count++] = proc;
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
|
2004-02-13 16:56:55 +03:00
|
|
|
*size = count;
|
|
|
|
return procs;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-06-07 19:33:53 +04:00
|
|
|
ompi_proc_t** ompi_proc_self(size_t* size)
|
2004-02-13 16:56:55 +03:00
|
|
|
{
|
2004-10-18 20:11:14 +04:00
|
|
|
ompi_proc_t **procs = (ompi_proc_t**) malloc(sizeof(ompi_proc_t*));
|
2005-07-15 02:43:01 +04:00
|
|
|
if (NULL == procs) {
|
2004-02-13 16:56:55 +03:00
|
|
|
return NULL;
|
2005-07-15 02:43:01 +04:00
|
|
|
}
|
2004-06-07 19:33:53 +04:00
|
|
|
OBJ_RETAIN(ompi_proc_local_proc);
|
|
|
|
*procs = ompi_proc_local_proc;
|
2004-02-13 16:56:55 +03:00
|
|
|
*size = 1;
|
|
|
|
return procs;
|
|
|
|
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
ompi_proc_t * ompi_proc_find ( const orte_process_name_t * name )
|
2004-05-18 01:28:32 +04:00
|
|
|
{
|
2004-09-17 14:10:24 +04:00
|
|
|
ompi_proc_t *proc, *rproc=NULL;
|
2005-03-14 23:57:21 +03:00
|
|
|
orte_ns_cmp_bitmask_t mask;
|
2004-05-18 01:28:32 +04:00
|
|
|
|
|
|
|
/* return the proc-struct which matches this jobid+process id */
|
2004-07-08 18:48:34 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
2005-07-03 20:22:16 +04:00
|
|
|
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
|
|
|
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
|
|
|
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
|
2005-07-15 02:43:01 +04:00
|
|
|
if (0 == orte_ns.compare(mask, &proc->proc_name, name)) {
|
2005-03-14 23:57:21 +03:00
|
|
|
rproc = proc;
|
|
|
|
break;
|
|
|
|
}
|
2004-05-18 01:28:32 +04:00
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
|
2004-09-17 14:10:24 +04:00
|
|
|
return rproc;
|
2004-05-18 01:28:32 +04:00
|
|
|
}
|
2004-07-01 18:49:54 +04:00
|
|
|
|
2004-08-04 21:05:22 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
 * Like ompi_proc_find(), but when no proc matches 'name' a new one is
 * created (its constructor appends it to the global list). *isnew is
 * set to true only when a proc was created, false on a match.
 */
ompi_proc_t * ompi_proc_find_and_add ( const orte_process_name_t * name, bool* isnew )
{
    ompi_proc_t *cursor;
    ompi_proc_t *result = NULL;
    orte_ns_cmp_bitmask_t mask =
        ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;

    /* return the proc-struct which matches this jobid+process id */
    OPAL_THREAD_LOCK(&ompi_proc_lock);
    for (cursor = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
         cursor != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
         cursor = (ompi_proc_t*)opal_list_get_next(cursor)) {
        if (0 == orte_ns.compare(mask, &cursor->proc_name, name)) {
            *isnew = false;
            result = cursor;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);

    /* unknown: allocate a fresh proc (OBJ_NEW runs the constructor,
       which takes the lock itself to append the proc to the list) */
    if ( NULL == result ) {
        result = OBJ_NEW(ompi_proc_t);
        result->proc_name = *name;
        *isnew = true;
    }
    return result;
}
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
 * Pack the orte process name of each proc in 'proclist' (proclistsize
 * entries) into 'buf'. Returns OMPI_SUCCESS, or the pack error code.
 */
int ompi_proc_get_namebuf ( ompi_proc_t **proclist, int proclistsize, orte_buffer_t* buf)
{
    int idx;

    OPAL_THREAD_LOCK(&ompi_proc_lock);
    for (idx = 0; idx < proclistsize; idx++) {
        int rc = orte_dps.pack(buf, &(proclist[idx]->proc_name), 1, ORTE_NAME);
        if (OMPI_SUCCESS != rc) {
            OPAL_THREAD_UNLOCK(&ompi_proc_lock);
            return rc;
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);
    return OMPI_SUCCESS;
}
|
|
|
|
|
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/*
 * Unpack 'proclistsize' orte names from 'buf' and resolve each to an
 * ompi_proc_t (creating procs, and registering them with the PML, as
 * needed). On success *proclist receives a calloc'ed array that the
 * caller must NOT free - it becomes the remote group storage of a
 * communicator.
 *
 * Bug fix: the previous version leaked the array when an unpack
 * failed; the error path now frees it (nothing has taken ownership of
 * it yet at that point).
 */
int ompi_proc_get_proclist (orte_buffer_t* buf, int proclistsize, ompi_proc_t ***proclist)
{
    int i;
    ompi_proc_t **plist=NULL;
    orte_process_name_t name;
    bool isnew = false;

    /* do not free plist on success *ever*, since it is used in the
       remote group structure of a communicator */
    plist = (ompi_proc_t **) calloc (proclistsize, sizeof (ompi_proc_t *));
    if ( NULL == plist ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for ( i=0; i<proclistsize; i++ ){
        size_t count=1;
        int rc = orte_dps.unpack(buf, &name, &count, ORTE_NAME);
        if(rc != ORTE_SUCCESS) {
            free(plist);   /* fix: was leaked on unpack failure */
            return rc;
        }
        plist[i] = ompi_proc_find_and_add ( &name, &isnew );
        if(isnew) {
            /* newly discovered proc: let the PML know about it */
            MCA_PML_CALL(add_procs(&plist[i], 1));
        }
    }
    *proclist = plist;
    return OMPI_SUCCESS;
}
|
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* As described above, we cannot do a simple GPR get because we're in
|
|
|
|
* the middle of the GPR compound command in MPI_INIT. So setup a
|
|
|
|
* subscription that will be fullfilled later in MPI_INIT.
|
|
|
|
*/
|
|
|
|
static int setup_registry_callback(void)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
char *segment;
|
|
|
|
ompi_proc_t *local = ompi_proc_local();
|
|
|
|
orte_jobid_t jobid;
|
|
|
|
orte_gpr_trigger_t trig, *trig1;
|
|
|
|
orte_gpr_value_t value, *values;
|
|
|
|
orte_gpr_subscription_t sub, *sub1;
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != orte_ns.get_jobid(&jobid, &local->proc_name)) {
|
|
|
|
printf("Badness!\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* find the job segment on the registry */
|
|
|
|
if (ORTE_SUCCESS !=
|
|
|
|
(rc = orte_schema.get_job_segment_name(&segment, jobid))) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
OBJ_CONSTRUCT(&sub, orte_gpr_subscription_t);
|
|
|
|
/* indicate that this is a standard subscription. This indicates
|
|
|
|
that the subscription will be common to all processes. Thus,
|
|
|
|
the resulting data can be consolidated into a
|
|
|
|
process-independent message and broadcast to all processes */
|
|
|
|
if (ORTE_SUCCESS !=
|
|
|
|
(rc = orte_schema.get_std_subscription_name(&(sub.name),
|
2005-07-18 22:49:00 +04:00
|
|
|
OMPI_PROC_SUBSCRIPTION, jobid))) {
|
2005-07-15 02:43:01 +04:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* send data when trigger fires, then delete - no need for further
|
|
|
|
notifications */
|
|
|
|
sub.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
|
|
|
|
|
|
|
|
OBJ_CONSTRUCT(&value, orte_gpr_value_t);
|
|
|
|
values = &value;
|
|
|
|
sub.values = &values;
|
|
|
|
sub.cnt = 1;
|
|
|
|
|
|
|
|
value.addr_mode = ORTE_GPR_TOKENS_OR | ORTE_GPR_KEYS_OR;
|
|
|
|
value.segment = segment;
|
|
|
|
value.tokens = NULL; /* wildcard - look at all containers */
|
|
|
|
value.num_tokens = 0;
|
|
|
|
value.cnt = 2;
|
|
|
|
value.keyvals =
|
|
|
|
(orte_gpr_keyval_t**)malloc(sizeof(orte_gpr_keyval_t*) * 2);
|
|
|
|
if (NULL == value.keyvals) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
value.keyvals[0] = NULL;
|
|
|
|
value.keyvals[1] = NULL;
|
|
|
|
|
|
|
|
value.keyvals[0] = OBJ_NEW(orte_gpr_keyval_t);
|
|
|
|
if (NULL == value.keyvals[0]) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
value.keyvals[0]->key = strdup(ORTE_PROC_NAME_KEY);
|
|
|
|
if (NULL == value.keyvals[0]->key) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
value.keyvals[1] = OBJ_NEW(orte_gpr_keyval_t);
|
|
|
|
if (NULL == value.keyvals[0]) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
value.keyvals[1]->key = strdup(ORTE_NODE_NAME_KEY);
|
|
|
|
if (NULL == value.keyvals[0]->key) {
|
|
|
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2005-08-05 22:03:30 +04:00
|
|
|
/* Here we have to add another key to the registry to be able to get the information
|
|
|
|
* about the remote architectures.
|
|
|
|
* TODO: George.
|
|
|
|
*/
|
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
sub.cbfunc = callback;
|
|
|
|
sub.user_tag = NULL;
|
|
|
|
|
|
|
|
/* setup the trigger information */
|
|
|
|
OBJ_CONSTRUCT(&trig, orte_gpr_trigger_t);
|
|
|
|
if (ORTE_SUCCESS !=
|
|
|
|
(rc = orte_schema.get_std_trigger_name(&(trig.name),
|
|
|
|
ORTE_STG1_TRIGGER, jobid))) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* do the subscription */
|
|
|
|
sub1 = ⊂
|
|
|
|
trig1 = &trig;
|
|
|
|
rc = orte_gpr.subscribe(1, &sub1, 1, &trig1);
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
OBJ_DESTRUCT(&value);
|
|
|
|
sub.values = NULL;
|
|
|
|
OBJ_DESTRUCT(&sub);
|
|
|
|
OBJ_DESTRUCT(&trig);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * This callback is invoked by a subscription during MPI_INIT to let
 * us know what procs are on what hosts. We look at the results and
 * figure out which procs are on the same host as the local proc. For
 * each proc that is on the same host as the local proc, we set that
 * proc's OMPI_PROC_FLAG_LOCAL flag.
 */
static void callback(orte_gpr_notify_data_t *data, void *cbdata)
{
    size_t i, j, k;
    char *str;               /* node name of the proc being examined */
    bool found_name;         /* true once ORTE_PROC_NAME_KEY was seen */
    orte_ns_cmp_bitmask_t mask;
    orte_process_name_t name;
    orte_gpr_value_t **value;
    orte_gpr_keyval_t **keyval;
    ompi_proc_t *proc;

    /* check bozo case */
    if (0 == data->cnt) {
        return;
    }

    /* locks are probably not necessary here, but just be safe anyway */
    OPAL_THREAD_LOCK(&ompi_proc_lock);

    /* loop over the data returned in the subscription; the values
       array may be sparse, so count non-NULL entries with 'k' until
       data->cnt of them have been processed */
    mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    value = (orte_gpr_value_t**)(data->values)->addr;
    for (i = 0, k=0; k < data->cnt &&
             i < (data->values)->size; ++i) {
        if (NULL != value[i]) {
            k++;
            str = NULL;
            found_name = false;
            keyval = value[i]->keyvals;

            /* find the 2 keys that we're looking for */
            for (j = 0; j < value[i]->cnt; ++j) {
                if (strcmp(keyval[j]->key, ORTE_PROC_NAME_KEY) == 0) {
                    /* NOTE(review): get_proc_name_string allocates into
                       str, which is then possibly replaced below and not
                       freed at the end of the iteration - looks like a
                       leak, and if this key arrives AFTER the node-name
                       key, str no longer holds the node name. Confirm
                       the GPR always orders these keys name-then-node. */
                    orte_ns.get_proc_name_string(&str, &keyval[j]->value.proc);
                    name = keyval[j]->value.proc;
                    found_name = true;
                } else if (strcmp(keyval[j]->key, ORTE_NODE_NAME_KEY) == 0) {
                    /* replace any earlier string with the node name */
                    if (NULL != str) {
                        free(str);
                    }
                    str = strdup(keyval[j]->value.strptr);
                }
            }

            /* if we found both keys and the proc is on my local host,
               find it in the master proc list and set the "local" flag */
            if (NULL != str && found_name &&
                0 == strcmp(str, orte_system_info.nodename)) {
                for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
                     proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
                     proc = (ompi_proc_t*)opal_list_get_next(proc)) {
                    if (0 == orte_ns.compare(mask, &name,
                                             &proc->proc_name)) {
                        proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
                    }
                }
            }
        }
        /* And finally here we have to retrieve the remote architectures and create the convertors
         * attached to the remote processors depending on the remote architecture.
         * TODO: George.
         */
    }

    /* unlock */
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);
}
|
|
|
|
|