1
1

Merge pull request #3934 from rhc54/topic/singleton

Fix the isolated pmix component. Clean up the ess/singleton component …
Этот коммит содержится в:
Ralph Castain 2017-07-19 16:02:37 -05:00 коммит произвёл GitHub
родитель 6cbea90209 543c16b28d
Коммит fca68b070b
7 изменённых файлов: 141 добавлений и 97 удалений

Просмотреть файл

@ -122,12 +122,18 @@ static int isolated_init(opal_list_t *ilist)
{
int rc;
opal_value_t kv;
opal_process_name_t wildcard;
if (0 < isolated_init_count) {
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
++isolated_init_count;
if (1 < isolated_init_count) {
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
return OPAL_SUCCESS;
}
++isolated_init_count;
wildcard.jobid = 1;
wildcard.vpid = OPAL_VPID_WILDCARD;
/* store our name in the opal_proc_t so that
* debug messages will make sense - an upper
@ -178,6 +184,17 @@ static int isolated_init(opal_list_t *ilist)
}
OBJ_DESTRUCT(&kv);
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_MAX_PROCS);
kv.type = OPAL_UINT32;
kv.data.uint32 = 1;
if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard, &kv))) {
OPAL_ERROR_LOG(rc);
OBJ_DESTRUCT(&kv);
goto err_exit;
}
OBJ_DESTRUCT(&kv);
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_JOBID);
kv.type = OPAL_UINT32;
@ -246,30 +263,35 @@ static int isolated_init(opal_list_t *ilist)
}
OBJ_DESTRUCT(&kv);
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
return OPAL_SUCCESS;
err_exit:
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
return rc;
}
/*
 * Finalize the isolated PMIx component.
 *
 * Takes the pmix base lock, decrements opal_pmix_base.initialized, and
 * drops one reference from isolated_init_count.  The hash storage is
 * finalized only when the last reference is released.  The lock is
 * released on every exit path.
 *
 * Returns OPAL_SUCCESS in all cases.
 *
 * NOTE(review): opal_pmix_base.initialized is decremented even when
 * isolated_init_count is already zero - confirm this is intended.
 */
static int isolated_fini(void)
{
    OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
    --opal_pmix_base.initialized;

    /* Nothing to tear down if we were never initialized (the count is
     * left untouched in that case) or if other references remain after
     * dropping ours. */
    if (0 == isolated_init_count || 0 != --isolated_init_count) {
        OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
        return OPAL_SUCCESS;
    }

    /* last reference released - tear down the hash storage */
    opal_pmix_base_hash_finalize();

    OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
    return OPAL_SUCCESS;
}
/*
 * Report whether the isolated component has been initialized.
 *
 * Reads isolated_init_count while holding the pmix base lock.
 *
 * Returns 1 if the init count is positive, 0 otherwise.
 */
static int isolated_initialized(void)
{
    int active;

    OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
    active = (0 < isolated_init_count) ? 1 : 0;
    OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);

    return active;
}
@ -325,13 +347,16 @@ static int isolated_put(opal_pmix_scope_t scope,
{
int rc;
opal_output_verbose(10, opal_pmix_base_framework.framework_output,
"%s pmix:isolated isolated_put key %s scope %d\n",
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:isolated isolated_put key %s scope %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kv->key, scope);
if (!isolated_init_count) {
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
if (0 == isolated_init_count) {
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
return OPAL_ERROR;
}
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
rc = opal_pmix_base_store(&isolated_pname, kv);
@ -340,18 +365,31 @@ static int isolated_put(opal_pmix_scope_t scope,
/*
 * Commit operation for the isolated component.
 *
 * Emits a verbosity-2 trace message and does nothing else.
 *
 * Returns OPAL_SUCCESS.
 */
static int isolated_commit(void)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated commit",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_SUCCESS;
}
/*
 * Blocking fence for the isolated component.
 *
 * Emits a verbosity-2 trace message; neither argument is used.
 *
 * Returns OPAL_SUCCESS.
 */
static int isolated_fence(opal_list_t *procs, int collect_data)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_SUCCESS;
}
/*
 * Non-blocking fence for the isolated component.
 *
 * Emits a verbosity-2 trace message and, if a completion callback was
 * supplied, invokes it immediately (before returning) with OPAL_SUCCESS
 * and the caller's cbdata.  The procs and collect_data arguments are
 * not used.
 *
 * A stale leading "return OPAL_ERR_NOT_IMPLEMENTED;" previously made
 * everything below unreachable; it has been removed so the callback is
 * actually delivered.
 *
 * Returns OPAL_SUCCESS.
 */
static int isolated_fence_nb(opal_list_t *procs, int collect_data,
                             opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
{
    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated fence_nb",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* complete the operation right away */
    if (NULL != cbfunc) {
        cbfunc(OPAL_SUCCESS, cbdata);
    }

    return OPAL_SUCCESS;
}
static int isolated_get(const opal_process_name_t *id,
@ -383,39 +421,60 @@ static int isolated_get(const opal_process_name_t *id,
/*
 * Non-blocking get: not implemented by the isolated component.
 *
 * Emits a verbosity-2 trace message; none of the arguments are used and
 * the callback is never invoked.
 *
 * Returns OPAL_ERR_NOT_IMPLEMENTED.
 */
static int isolated_get_nb(const opal_process_name_t *id, const char *key,
                           opal_list_t *info,
                           opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated get_nb",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_IMPLEMENTED;
}
/*
 * Publish: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; the info list is not used.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_publish(opal_list_t *info)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated publish",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Non-blocking publish: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; the arguments are not used and the
 * callback is never invoked.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_publish_nb(opal_list_t *info,
                               opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated publish_nb",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Lookup: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; neither list is used.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_lookup(opal_list_t *data, opal_list_t *info)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated lookup",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Non-blocking lookup: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; the arguments are not used and the
 * callback is never invoked.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_lookup_nb(char **keys, opal_list_t *info,
                              opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated lookup_nb",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Unpublish: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; the arguments are not used.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_unpublish(char **keys, opal_list_t *info)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated unpublish",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
/*
 * Non-blocking unpublish: not supported by the isolated component.
 *
 * Emits a verbosity-2 trace message; the arguments are not used and the
 * callback is never invoked.
 *
 * Returns OPAL_ERR_NOT_SUPPORTED.
 */
static int isolated_unpublish_nb(char **keys, opal_list_t *info,
                                 opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
{
    opal_output_verbose(2,
                        opal_pmix_base_framework.framework_output,
                        "%s pmix:isolated isolated unpublish_nb",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    return OPAL_ERR_NOT_SUPPORTED;
}
@ -427,6 +486,10 @@ static const char *isolated_get_version(void)
static int isolated_store_local(const opal_process_name_t *proc,
opal_value_t *val)
{
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:isolated isolated store_local",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
opal_pmix_base_store(proc, val);
return OPAL_SUCCESS;

Просмотреть файл

@ -84,8 +84,6 @@ static int rte_init(void)
{
int rc, ret;
char *error = NULL;
opal_value_t *kv;
char *val = NULL;
int u32, *u32ptr;
uint16_t u16, *u16ptr;
orte_process_name_t name;
@ -159,7 +157,7 @@ static int rte_init(void)
} else if (NULL != getenv("SINGULARITY_CONTAINER") ||
mca_ess_singleton_component.isolated) {
/* ensure we use the isolated pmix component */
opal_setenv (OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
} else {
/* we want to use PMIX_NAMESPACE that will be sent by the hnp as a jobid */
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &environ);
@ -169,7 +167,7 @@ static int rte_init(void)
return rc;
}
/* our name was given to us by the HNP */
opal_setenv (OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
}
/* get an async event base - we use the opal_async one so
@ -265,69 +263,13 @@ static int rte_init(void)
* we can use the jobfam and stepid as unique keys
* because they are unique values assigned by the RM
*/
assert (NULL != getenv(OPAL_MCA_PREFIX"orte_precondition_transports"));
/* retrieve our topology */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
&name, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
ret = OPAL_ERROR;
free(val);
error = "setting topology";
goto error;
if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
char *key;
ret = orte_pre_condition_transports(NULL, &key);
if (ORTE_SUCCESS == ret) {
opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &environ);
free(key);
}
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
ret = OPAL_ERROR;
free(val);
hwloc_topology_destroy(opal_hwloc_topology);
error = "setting topology";
goto error;
}
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
/* now load the topology */
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
free(val);
} else {
/* it wasn't passed down to us, so go get it */
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}
/* push it into the PMIx database in case someone
* tries to retrieve it so we avoid an attempt to
* get it again */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
kv->type = OPAL_STRING;
if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
error = "topology export";
goto error;
}
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) {
error = "topology store";
goto error;
}
OBJ_RELEASE(kv);
}
/* use the std app init to complete the procedure */

Просмотреть файл

@ -332,7 +332,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
}
free(key);
} else {
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
ORTE_ERROR_LOG(rc);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
@ -342,7 +342,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
} else {
/* this will also record the transport key attribute in the job object, and
* adds the key envar to each app */
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
ORTE_ERROR_LOG(rc);
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);

Просмотреть файл

@ -578,7 +578,7 @@ int orte_daemon(int argc, char *argv[])
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
/* set the ORTE_JOB_TRANSPORT_KEY from the environment */
orte_pre_condition_transports(jdata);
orte_pre_condition_transports(jdata, NULL);
/* register the singleton's nspace with our PMIx server */
if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {

33
orte/test/mpi/hellocycle.pl Исполняемый файл
Просмотреть файл

@ -0,0 +1,33 @@
#!/usr/bin/env perl
#
use strict;
use warnings;
use Date::Parse;
#
$ENV{OMPI_MCA_btl} = "self";
#
# Print the given iteration count followed by the current local time,
# in the form "<count>: <time string>".
sub prtime {
    my ($remaining) = @_;
    my $stamp = localtime;    # scalar context yields a human-readable string
    print "$remaining: $stamp\n";
}
# Launch ./hello $totalcount times back-to-back.  A timestamp is printed at
# the start, each time the remaining count reaches a multiple of 1000, and
# once more after the loop; finally the launch rate (runs per second) is
# reported.
my $totalcount = 5000;
my $count = $totalcount;
prtime($count);
my $start = time();
while ($count > 0) {
    # child output is discarded; its exit status is deliberately not checked
    system("./hello > /dev/null 2>&1");
    $count--;
    if ($count % 1000 == 0) {
        prtime($count);
    }
}
prtime($count);
my $stop = time();
# time() has one-second resolution, so a very fast run can yield zero
# elapsed seconds; clamp to 1 to avoid a fatal "Illegal division by zero".
my $elapsed = $stop - $start;
$elapsed = 1 if $elapsed < 1;
my $rate = $totalcount / $elapsed;
print "Rate: $rate\n";

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -130,7 +130,7 @@ char* orte_pre_condition_transports_print(uint64_t *unique_key)
}
int orte_pre_condition_transports(orte_job_t *jdata)
int orte_pre_condition_transports(orte_job_t *jdata, char **key)
{
uint64_t unique_key[2];
int n;
@ -164,23 +164,28 @@ int orte_pre_condition_transports(orte_job_t *jdata)
}
/* record it in case this job executes a dynamic spawn */
orte_set_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, string_key, OPAL_STRING);
if (NULL != jdata) {
orte_set_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, string_key, OPAL_STRING);
if (OPAL_SUCCESS != mca_base_var_env_name ("orte_precondition_transports", &cs_env)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(string_key);
return ORTE_ERR_OUT_OF_RESOURCE;
}
for (n=0; n < jdata->apps->size; n++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
continue;
if (OPAL_SUCCESS != mca_base_var_env_name ("orte_precondition_transports", &cs_env)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(string_key);
return ORTE_ERR_OUT_OF_RESOURCE;
}
opal_setenv(cs_env, string_key, true, &app->env);
}
free(cs_env);
free(string_key);
for (n=0; n < jdata->apps->size; n++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
continue;
}
opal_setenv(cs_env, string_key, true, &app->env);
}
free(cs_env);
free(string_key);
} else if (NULL != key) {
*key = string_key;
} else {
free(string_key);
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,7 +33,7 @@
BEGIN_C_DECLS
ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata);
ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata, char **key);
ORTE_DECLSPEC char* orte_pre_condition_transports_print(uint64_t *unique_key);