In the case of direct-launched processes running under SLURM, PSM requires that the orte_precondition_transports MCA param be set. This is normally computed by mpirun and inserted into each proc's environ, but that doesn't work here because direct-launched procs are not started by mpirun.
So separate out the printing of that key into its own function, and let the individual procs generate the key themselves from the SLURM jobid and stepid, which ensures they all get the same result. This commit was SVN r24646.
This commit is contained in:
parent
e4732110da
commit
859aaab93d
@ -51,6 +51,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -109,7 +110,9 @@ static int rte_init(void)
|
||||
char *regexp, *tasks_per_node;
|
||||
int *ppn;
|
||||
bool block=false, cyclic=false;
|
||||
|
||||
uint64_t unique_key[2];
|
||||
char *cs_env, *string_key;
|
||||
|
||||
/* init flag */
|
||||
app_init_complete = false;
|
||||
slurm20 = false;
|
||||
@ -149,6 +152,26 @@ static int rte_init(void)
|
||||
/* now build the jobid */
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the SLURM jobid and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
unique_key[0] = (uint64_t)jobfam;
|
||||
unique_key[1] = (uint64_t)stepid;
|
||||
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
asprintf(&envar, "%s=%s", cs_env, string_key);
|
||||
putenv(envar);
|
||||
free(envar);
|
||||
free(cs_env);
|
||||
free(string_key);
|
||||
|
||||
/* get the slurm procid - this will be our vpid */
|
||||
if (NULL == (envar = getenv("SLURM_PROCID"))) {
|
||||
error = "could not get SLURM_PROCID";
|
||||
@ -395,6 +418,8 @@ static int rte_finalize(void)
|
||||
*/
|
||||
unsetenv("OMPI_MCA_grpcomm");
|
||||
unsetenv("OMPI_MCA_routed");
|
||||
unsetenv("OMPI_MCA_orte_precondition_transports");
|
||||
|
||||
/* deconstruct my nidmap and jobmap arrays - this
|
||||
* function protects itself from being called
|
||||
* before things were initialized
|
||||
|
@ -62,47 +62,11 @@ static inline void orte_pre_condition_transports_use_rand(uint64_t* unique_key)
|
||||
unique_key[1] = rand();
|
||||
}
|
||||
|
||||
int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
char* orte_pre_condition_transports_print(uint64_t *unique_key)
|
||||
{
|
||||
size_t i, string_key_len, written_len;
|
||||
char *cs_env, *string_key = NULL, *format = NULL;
|
||||
uint64_t unique_key[2];
|
||||
unsigned int *int_ptr;
|
||||
int n;
|
||||
orte_app_context_t *app;
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
int fd_rand;
|
||||
size_t bytes_read;
|
||||
struct stat buf;
|
||||
|
||||
/* put the number here - or else create an appropriate string. this just needs to
|
||||
* eventually be a string variable
|
||||
*/
|
||||
if(0 != stat("/dev/urandom", &buf)) {
|
||||
/* file doesn't exist! */
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
}
|
||||
|
||||
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
} else {
|
||||
bytes_read = read(fd_rand, (char *) unique_key, 16);
|
||||
if(bytes_read != 16) {
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
} else {
|
||||
close(fd_rand);
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
unsigned int random_value;
|
||||
rand_s( &random_value );
|
||||
unique_key[0] = (uint64_t)random_value;
|
||||
rand_s( &random_value );
|
||||
unique_key[1] = (uint64_t)random_value;
|
||||
}
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
size_t i, string_key_len, written_len;
|
||||
char *string_key = NULL, *format = NULL;
|
||||
|
||||
/* string is two 64 bit numbers printed in hex with a dash between
|
||||
* and zero padding.
|
||||
@ -110,7 +74,7 @@ int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
|
||||
string_key = (char*) malloc(string_key_len);
|
||||
if (NULL == string_key) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string_key[0] = '\0';
|
||||
@ -147,7 +111,57 @@ int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
format, int_ptr[i]);
|
||||
written_len = strlen(string_key);
|
||||
}
|
||||
|
||||
free(format);
|
||||
|
||||
return string_key;
|
||||
}
|
||||
|
||||
|
||||
int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
{
|
||||
uint64_t unique_key[2];
|
||||
int n;
|
||||
orte_app_context_t *app;
|
||||
char *string_key, *cs_env;
|
||||
|
||||
#if !defined(__WINDOWS__)
|
||||
int fd_rand;
|
||||
size_t bytes_read;
|
||||
struct stat buf;
|
||||
|
||||
/* put the number here - or else create an appropriate string. this just needs to
|
||||
* eventually be a string variable
|
||||
*/
|
||||
if(0 != stat("/dev/urandom", &buf)) {
|
||||
/* file doesn't exist! */
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
}
|
||||
|
||||
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
} else {
|
||||
bytes_read = read(fd_rand, (char *) unique_key, 16);
|
||||
if(bytes_read != 16) {
|
||||
orte_pre_condition_transports_use_rand(unique_key);
|
||||
} else {
|
||||
close(fd_rand);
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
unsigned int random_value;
|
||||
rand_s( &random_value );
|
||||
unique_key[0] = (uint64_t)random_value;
|
||||
rand_s( &random_value );
|
||||
unique_key[1] = (uint64_t)random_value;
|
||||
}
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
|
||||
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
@ -161,7 +175,6 @@ int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
free(cs_env);
|
||||
free(format);
|
||||
free(string_key);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -34,6 +34,8 @@ BEGIN_C_DECLS
|
||||
|
||||
ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC char* orte_pre_condition_transports_print(uint64_t *unique_key);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user