1
1

For processes that are launched directly under SLURM, PSM requires that the pre_condition_transports MCA parameter be set. This value is normally computed by mpirun and inserted into each process's environment, but that does not work for a direct launch.

So separate out the code that prints that key into its own function, and let each process generate the key itself in a way that ensures all processes get the same result.

This commit was SVN r24646.
This commit is contained in:
Ralph Castain 2011-04-28 13:54:33 +00:00
parent e4732110da
commit 859aaab93d
3 changed files with 83 additions and 43 deletions

View File

@ -51,6 +51,7 @@
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/nidmap.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_wait.h"
@ -109,7 +110,9 @@ static int rte_init(void)
char *regexp, *tasks_per_node;
int *ppn;
bool block=false, cyclic=false;
uint64_t unique_key[2];
char *cs_env, *string_key;
/* init flag */
app_init_complete = false;
slurm20 = false;
@ -149,6 +152,26 @@ static int rte_init(void)
/* now build the jobid */
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
/* setup transport keys in case the MPI layer needs them -
* we can use the SLURM jobid and stepid as unique keys
* because they are unique values assigned by the RM
*/
unique_key[0] = (uint64_t)jobfam;
unique_key[1] = (uint64_t)stepid;
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
asprintf(&envar, "%s=%s", cs_env, string_key);
putenv(envar);
free(envar);
free(cs_env);
free(string_key);
/* get the slurm procid - this will be our vpid */
if (NULL == (envar = getenv("SLURM_PROCID"))) {
error = "could not get SLURM_PROCID";
@ -395,6 +418,8 @@ static int rte_finalize(void)
*/
unsetenv("OMPI_MCA_grpcomm");
unsetenv("OMPI_MCA_routed");
unsetenv("OMPI_MCA_orte_precondition_transports");
/* deconstruct my nidmap and jobmap arrays - this
* function protects itself from being called
* before things were initialized

View File

@ -62,47 +62,11 @@ static inline void orte_pre_condition_transports_use_rand(uint64_t* unique_key)
unique_key[1] = rand();
}
int orte_pre_condition_transports(orte_job_t *jdata)
char* orte_pre_condition_transports_print(uint64_t *unique_key)
{
size_t i, string_key_len, written_len;
char *cs_env, *string_key = NULL, *format = NULL;
uint64_t unique_key[2];
unsigned int *int_ptr;
int n;
orte_app_context_t *app;
#if !defined(__WINDOWS__)
int fd_rand;
size_t bytes_read;
struct stat buf;
/* put the number here - or else create an appropriate string. this just needs to
* eventually be a string variable
*/
if(0 != stat("/dev/urandom", &buf)) {
/* file doesn't exist! */
orte_pre_condition_transports_use_rand(unique_key);
}
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
bytes_read = read(fd_rand, (char *) unique_key, 16);
if(bytes_read != 16) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
close(fd_rand);
}
}
#else
{
unsigned int random_value;
rand_s( &random_value );
unique_key[0] = (uint64_t)random_value;
rand_s( &random_value );
unique_key[1] = (uint64_t)random_value;
}
#endif /* !defined(__WINDOWS__) */
size_t i, string_key_len, written_len;
char *string_key = NULL, *format = NULL;
/* string is two 64 bit numbers printed in hex with a dash between
* and zero padding.
@ -110,7 +74,7 @@ int orte_pre_condition_transports(orte_job_t *jdata)
string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
string_key = (char*) malloc(string_key_len);
if (NULL == string_key) {
return ORTE_ERR_OUT_OF_RESOURCE;
return NULL;
}
string_key[0] = '\0';
@ -147,7 +111,57 @@ int orte_pre_condition_transports(orte_job_t *jdata)
format, int_ptr[i]);
written_len = strlen(string_key);
}
free(format);
return string_key;
}
int orte_pre_condition_transports(orte_job_t *jdata)
{
uint64_t unique_key[2];
int n;
orte_app_context_t *app;
char *string_key, *cs_env;
#if !defined(__WINDOWS__)
int fd_rand;
size_t bytes_read;
struct stat buf;
/* put the number here - or else create an appropriate string. this just needs to
* eventually be a string variable
*/
if(0 != stat("/dev/urandom", &buf)) {
/* file doesn't exist! */
orte_pre_condition_transports_use_rand(unique_key);
}
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
bytes_read = read(fd_rand, (char *) unique_key, 16);
if(bytes_read != 16) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
close(fd_rand);
}
}
#else
{
unsigned int random_value;
rand_s( &random_value );
unique_key[0] = (uint64_t)random_value;
rand_s( &random_value );
unique_key[1] = (uint64_t)random_value;
}
#endif /* !defined(__WINDOWS__) */
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
@ -161,7 +175,6 @@ int orte_pre_condition_transports(orte_job_t *jdata)
}
free(cs_env);
free(format);
free(string_key);
return ORTE_SUCCESS;

View File

@ -34,6 +34,8 @@ BEGIN_C_DECLS
/*
 * Generate a 128-bit transport key for a job and publish it under the
 * "orte_precondition_transports" MCA parameter.
 *
 * The key is read from /dev/urandom where available, with a rand()-based
 * fallback (rand_s() on Windows), and is then converted to its string form
 * with orte_pre_condition_transports_print().
 *
 * NOTE(review): how the resulting environment variable is attached to the
 * app contexts of jdata is not shown alongside this declaration - confirm
 * against the implementation.
 *
 * Returns ORTE_SUCCESS on success, or ORTE_ERR_OUT_OF_RESOURCE if the key
 * string or the environment variable name cannot be created.
 */
ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata);
/*
 * Convert a transport key into its printable string form.
 *
 * unique_key points to two uint64_t values. The result is the two 64-bit
 * numbers printed in hex, zero padded, with a dash between them.
 *
 * Returns a newly malloc'd string that the caller must free(), or NULL if
 * the allocation fails.
 */
ORTE_DECLSPEC char* orte_pre_condition_transports_print(uint64_t *unique_key);
END_C_DECLS
#endif