1
1

In the case of direct-launched processes running under SLURM, PSM requires that the orte_precondition_transports MCA param be set. This value is normally computed by mpirun and inserted into each proc's environment, but that doesn't work here because a direct-launched job is not started by mpirun.

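So separate out the printing of that key into its own function (orte_pre_condition_transports_print), and let the individual procs generate the key themselves from the SLURM jobid and stepid, which ensures they all get the same result.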

This commit was SVN r24646.
Этот коммит содержится в:
Ralph Castain 2011-04-28 13:54:33 +00:00
родитель e4732110da
Коммит 859aaab93d
3 изменённых файлов: 83 добавлений и 43 удалений

Просмотреть файл

@ -51,6 +51,7 @@
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/nidmap.h" #include "orte/util/nidmap.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/regex.h" #include "orte/util/regex.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -109,6 +110,8 @@ static int rte_init(void)
char *regexp, *tasks_per_node; char *regexp, *tasks_per_node;
int *ppn; int *ppn;
bool block=false, cyclic=false; bool block=false, cyclic=false;
uint64_t unique_key[2];
char *cs_env, *string_key;
/* init flag */ /* init flag */
app_init_complete = false; app_init_complete = false;
@ -149,6 +152,26 @@ static int rte_init(void)
/* now build the jobid */ /* now build the jobid */
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid); ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
/* setup transport keys in case the MPI layer needs them -
* we can use the SLURM jobid and stepid as unique keys
* because they are unique values assigned by the RM
*/
unique_key[0] = (uint64_t)jobfam;
unique_key[1] = (uint64_t)stepid;
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
asprintf(&envar, "%s=%s", cs_env, string_key);
putenv(envar);
free(envar);
free(cs_env);
free(string_key);
/* get the slurm procid - this will be our vpid */ /* get the slurm procid - this will be our vpid */
if (NULL == (envar = getenv("SLURM_PROCID"))) { if (NULL == (envar = getenv("SLURM_PROCID"))) {
error = "could not get SLURM_PROCID"; error = "could not get SLURM_PROCID";
@ -395,6 +418,8 @@ static int rte_finalize(void)
*/ */
unsetenv("OMPI_MCA_grpcomm"); unsetenv("OMPI_MCA_grpcomm");
unsetenv("OMPI_MCA_routed"); unsetenv("OMPI_MCA_routed");
unsetenv("OMPI_MCA_orte_precondition_transports");
/* deconstruct my nidmap and jobmap arrays - this /* deconstruct my nidmap and jobmap arrays - this
* function protects itself from being called * function protects itself from being called
* before things were initialized * before things were initialized

Просмотреть файл

@ -62,47 +62,11 @@ static inline void orte_pre_condition_transports_use_rand(uint64_t* unique_key)
unique_key[1] = rand(); unique_key[1] = rand();
} }
int orte_pre_condition_transports(orte_job_t *jdata) char* orte_pre_condition_transports_print(uint64_t *unique_key)
{ {
size_t i, string_key_len, written_len;
char *cs_env, *string_key = NULL, *format = NULL;
uint64_t unique_key[2];
unsigned int *int_ptr; unsigned int *int_ptr;
int n; size_t i, string_key_len, written_len;
orte_app_context_t *app; char *string_key = NULL, *format = NULL;
#if !defined(__WINDOWS__)
int fd_rand;
size_t bytes_read;
struct stat buf;
/* put the number here - or else create an appropriate string. this just needs to
* eventually be a string variable
*/
if(0 != stat("/dev/urandom", &buf)) {
/* file doesn't exist! */
orte_pre_condition_transports_use_rand(unique_key);
}
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
bytes_read = read(fd_rand, (char *) unique_key, 16);
if(bytes_read != 16) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
close(fd_rand);
}
}
#else
{
unsigned int random_value;
rand_s( &random_value );
unique_key[0] = (uint64_t)random_value;
rand_s( &random_value );
unique_key[1] = (uint64_t)random_value;
}
#endif /* !defined(__WINDOWS__) */
/* string is two 64 bit numbers printed in hex with a dash between /* string is two 64 bit numbers printed in hex with a dash between
* and zero padding. * and zero padding.
@ -110,7 +74,7 @@ int orte_pre_condition_transports(orte_job_t *jdata)
string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1; string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
string_key = (char*) malloc(string_key_len); string_key = (char*) malloc(string_key_len);
if (NULL == string_key) { if (NULL == string_key) {
return ORTE_ERR_OUT_OF_RESOURCE; return NULL;
} }
string_key[0] = '\0'; string_key[0] = '\0';
@ -147,6 +111,56 @@ int orte_pre_condition_transports(orte_job_t *jdata)
format, int_ptr[i]); format, int_ptr[i]);
written_len = strlen(string_key); written_len = strlen(string_key);
} }
free(format);
return string_key;
}
int orte_pre_condition_transports(orte_job_t *jdata)
{
uint64_t unique_key[2];
int n;
orte_app_context_t *app;
char *string_key, *cs_env;
#if !defined(__WINDOWS__)
int fd_rand;
size_t bytes_read;
struct stat buf;
/* put the number here - or else create an appropriate string. this just needs to
* eventually be a string variable
*/
if(0 != stat("/dev/urandom", &buf)) {
/* file doesn't exist! */
orte_pre_condition_transports_use_rand(unique_key);
}
if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
bytes_read = read(fd_rand, (char *) unique_key, 16);
if(bytes_read != 16) {
orte_pre_condition_transports_use_rand(unique_key);
} else {
close(fd_rand);
}
}
#else
{
unsigned int random_value;
rand_s( &random_value );
unique_key[0] = (uint64_t)random_value;
rand_s( &random_value );
unique_key[1] = (uint64_t)random_value;
}
#endif /* !defined(__WINDOWS__) */
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) { if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -161,7 +175,6 @@ int orte_pre_condition_transports(orte_job_t *jdata)
} }
free(cs_env); free(cs_env);
free(format);
free(string_key); free(string_key);
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -34,6 +34,8 @@ BEGIN_C_DECLS
ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata); ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata);
ORTE_DECLSPEC char* orte_pre_condition_transports_print(uint64_t *unique_key);
END_C_DECLS END_C_DECLS
#endif #endif