/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (c) 2007 The Trustees of Indiana University.
 *                    All rights reserved.
 * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
 *                         rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <string.h>
#include <pmi.h>
#if WANT_CRAY_PMI2_EXT
#include <pmi2.h>
#endif

#include "opal/dss/dss.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/common/pmi/common_pmi.h"
#include "opal/mca/db/db.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"

#include "orte/mca/grpcomm/base/base.h"
#include "grpcomm_pmi.h"

/* Static APIs */
static int init(void);
static void finalize(void);
static int xcast(orte_jobid_t job,
                 opal_buffer_t *buffer,
                 orte_rml_tag_t tag);
static int pmi_allgather(orte_grpcomm_collective_t *coll);
static int pmi_barrier(orte_grpcomm_collective_t *coll);
static int modex(orte_grpcomm_collective_t *coll);

/* Module def */
orte_grpcomm_base_module_t orte_grpcomm_pmi_module = {
    init,
    finalize,
    xcast,
    pmi_allgather,
    pmi_barrier,
    modex
};
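/* name of the PMI key-value space (KVS) for this job; obtained in init()
 * and used when committing our modex data */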
static char *pmi_kvs_name=NULL;

/**
 * Initialize the module
 */
static int init(void)
{
    int max_length, rc;

#if WANT_CRAY_PMI2_EXT
    /* TODO -- is this ok */
    max_length = 1024;
#else
    if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
        return ORTE_ERROR;
    }
#endif
    pmi_kvs_name = (char*)malloc(max_length);
    if (NULL == pmi_kvs_name) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

#if WANT_CRAY_PMI2_EXT
    rc = PMI2_Job_GetId(pmi_kvs_name, max_length);
#else
    rc = PMI_KVS_Get_my_name(pmi_kvs_name, max_length);
#endif
    if (PMI_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name");
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}

/**
 * Finalize the module
 */
static void finalize(void)
{
    if (NULL != pmi_kvs_name) {
        free(pmi_kvs_name);
    }
    return;
}

/**
 * A "broadcast-like" function to a job's processes.
 * @param job    The job whose processes are to receive the message
 * @param buffer The data to broadcast
 * @param tag    The RML tag for the message
 */
static int xcast(orte_jobid_t job,
                 opal_buffer_t *buffer,
                 orte_rml_tag_t tag)
{
    /* not used in this module */
    return ORTE_ERR_NOT_SUPPORTED;
}
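
/*
 * Barrier across all procs in the job.  With the Cray PMI2 extensions the
 * KVS Fence call is used in place of a barrier; otherwise PMI_Barrier is
 * called.  When the barrier completes (or when this is the only proc in
 * the job), the collective is marked inactive and any callback is invoked.
 *
 * Illustrative sketch (not part of this file) of how a caller might drive
 * this through the grpcomm framework; the collective id and the explicit
 * "active" handling are assumptions about the caller, not this module:
 *
 *     orte_grpcomm_collective_t *coll = OBJ_NEW(orte_grpcomm_collective_t);
 *     coll->id = my_collective_id;     // assumed caller-chosen id
 *     coll->active = true;             // assumed: flagged before the call
 *     orte_grpcomm.barrier(coll);
 *     while (coll->active) {           // cleared by the barrier when done
 *         opal_progress();
 *     }
 *     OBJ_RELEASE(coll);
 */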
static int pmi_barrier(orte_grpcomm_collective_t *coll)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi entering barrier",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if I am alone, just execute the callback */
    if (1 == orte_process_info.num_procs) {
        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi:barrier only one proc",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        coll->active = false;
        if (NULL != coll->cbfunc) {
            coll->cbfunc(NULL, coll->cbdata);
        }
        return ORTE_SUCCESS;
    }

#if WANT_CRAY_PMI2_EXT
    /* Cray doesn't provide a barrier, so use the Fence function here */
    if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
        OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence");
        return ORTE_ERROR;
    }
#else
    /* use the PMI barrier function */
    if (PMI_SUCCESS != (rc = PMI_Barrier())) {
        OPAL_PMI_ERROR(rc, "PMI_Barrier");
        return ORTE_ERROR;
    }
#endif

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi barrier complete",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    /* execute the callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }

    return ORTE_SUCCESS;
}

static int pmi_allgather(orte_grpcomm_collective_t *coll)
{
    /* not used in this implementation */
    return ORTE_ERR_NOT_SUPPORTED;
}

/*** MODEX SECTION ***/
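/*
 * Modex: make each process's RTE info available to all of its peers.  Our
 * own data was already pushed into the PMI key-value space by the ORTE
 * ess/pmi component; here we commit/fence it and then fetch every peer's
 * entry, storing the individual pieces in the local database.
 */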
2012-04-06 14:23:13 +00:00
|
|
|
static int modex(orte_grpcomm_collective_t *coll)
|
2011-06-20 21:04:46 +00:00
|
|
|
{
    char *cptr, **fields;
    orte_vpid_t v;
    orte_process_name_t name;
    int rc;
    opal_identifier_t *id;
    opal_hwloc_locality_t locality;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;
    bool bound;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* our RTE data was constructed and pushed in the ESS pmi component */

    /* commit our modex info */
#if WANT_CRAY_PMI2_EXT
    PMI2_KVS_Fence();
#else
    {
        int rc;

        if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmi_kvs_name))) {
            OPAL_PMI_ERROR(rc, "PMI_KVS_Commit");
            return ORTE_ERR_FATAL;
        }
        /* Barrier here to ensure all other procs have committed */
        PMI_Barrier();
    }
#endif
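
    /* after the commit/fence, every proc's keys are visible.  The opal db
     * framework operates in "select many" mode: a fetch checks the local
     * hash store first and, if the value is not found there, pulls it from
     * PMI and caches it - so the fetches below transparently reach the
     * global key-value space.
     */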

    /* cycle thru all my peers and collect their RTE info */
    name.jobid = ORTE_PROC_MY_NAME->jobid;
    id = (opal_identifier_t*)&name;
    fields = NULL;
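
    /*
     * Each peer's "RTE" value is a single comma-delimited composite string
     * assembled by the ess/pmi component.  The pieces extracted below are:
     *   fields[0] - RML contact URI
     *   fields[1] - hostname
     *   fields[2] - local rank
     *   fields[3] - node rank
     *   fields[4] - cpuset, present only when the process was bound
     */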
    for (v=0; v < orte_process_info.num_procs; v++) {
        if (v == ORTE_PROC_MY_NAME->vpid) {
            continue;
        }
        name.vpid = v;
        /* fetch the RTE data for this proc */
        if (ORTE_SUCCESS != (rc = opal_db.fetch((*id), "RTE", (void **)&cptr, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* split on commas */
        fields = opal_argv_split(cptr, ',');
        free(cptr);
        /* sanity check */
        if (6 != opal_argv_count(fields)) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            opal_argv_free(fields);
            return ORTE_ERR_BAD_PARAM;
        }

        /* store the composite parts */
        /* first field is the URI */
        if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_RMLURI, fields[0], OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi: proc %s oob endpoint %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name), fields[0]));
        /* set the contact info into the hash table */
        if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(fields[0]))) {
            opal_argv_free(fields);
            return rc;
        }

        /* next is the hostname */
        if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, fields[1], OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }

        /* local rank */
        local_rank = strtoul(fields[2], NULL, 10);
        if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }

        /* node rank */
        node_rank = strtoul(fields[3], NULL, 10);
        if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }

        /* if the process was bound, then there will be another field
         * that contains its cpuset
         */
        if (5 == opal_argv_count(fields)) {
            if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_CPUSET, fields[4], OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                opal_argv_free(fields);
                return rc;
            }
            bound = true;
        } else {
            /* store a placeholder so we know that this value was retrieved,
             * but the proc wasn't bound
             */
            if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_CPUSET, NULL, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                opal_argv_free(fields);
                return rc;
            }
            bound = false;
        }

        /* compute and store the locality as it isn't something that gets pushed to PMI */
        if (0 != strcmp(fields[1], orte_process_info.nodename)) {
            /* the proc is on a different node, so mark it as non-local */
            locality = OPAL_PROC_NON_LOCAL;
        } else if (!bound) {
            /* we share a node, but know nothing more, so just mark
             * the proc as being on the node
             */
            locality = OPAL_PROC_ON_NODE;
        } else {
            /* determine relative location on our node */
            locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                             orte_process_info.cpuset,
                                                             fields[4]);
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((*id), OPAL_DB_INTERNAL, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
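        /* Illustrative note: locality is stored with OPAL_DB_INTERNAL scope only
         * since, per the comment above, it is not something that gets pushed to
         * PMI -- each proc computes it for its peers locally.  Assuming the
         * opal_hwloc locality value is a bitmask (suggested by the OPAL_PROC_*
         * flags), a later consumer could test for shared-node residence with
         * something like:  if (OPAL_PROC_ON_NODE & locality) { ... }
         */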

        /* cleanup */
        opal_argv_free(fields);
        fields = NULL;
    }

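    /* Illustrative note (inference): the collective is marked inactive before
     * the callback fires, and the callback receives a NULL buffer because the
     * collected data has already been deposited in the db rather than returned
     * directly; coll->cbdata is handed back to the caller unchanged.
     */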
    /* execute the callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }
    return rc;
}