* Add ability to conditionally use the name server for vpid reservation in the
RMS and RSH pcms. The RSH pcm was hard-coded to start from 0. The RMS pcm always used the name server. Unfortunately, due to some interesting interactions, using the name server causes failures in the PTL logic because vpids start at 1, not 0, when the reservation code is used. Right now, the default is to start vpids at 0 so that everything still runs out of the box. But this should make it possible for Tim to look at the PTL problems a bit more easily. This commit was SVN r2560.
Этот коммит содержится в:
родитель
9d610a8d7f
Коммит
08dc86af7c
@ -148,8 +148,14 @@ mca_pcm_rms_allocate_resources(mca_ns_base_jobid_t jobid,
|
||||
|
||||
/* For now, just punt on whether we can actually fulfill the request or not */
|
||||
total_procs = (nodes == 0) ? procs : nodes * procs;
|
||||
node_alloc->start =
|
||||
(int) ompi_name_server.reserve_range(jobid, total_procs);
|
||||
if (mca_pcm_rms_use_ns) {
|
||||
node_alloc->start =
|
||||
(int) ompi_name_server.reserve_range(jobid, total_procs);
|
||||
} else {
|
||||
/* BWB - remove the USE_NS code once the failures in PTL / NS
|
||||
due to unexpected offsets are fixed up */
|
||||
node_alloc->start = 0;
|
||||
}
|
||||
node_alloc->nodes = nodes;
|
||||
node_alloc->count = procs;
|
||||
|
||||
|
@ -71,6 +71,6 @@ extern "C" {
|
||||
*/
|
||||
extern int mca_pcm_rms_output;
|
||||
extern ompi_list_t mca_pcm_rms_jobs;
|
||||
|
||||
extern int mca_pcm_rms_use_ns;
|
||||
|
||||
#endif /* MCA_PCM_RMS_H_ */
|
||||
|
@ -80,12 +80,14 @@ ompi_output_stream_t mca_pcm_rms_output_stream = {
|
||||
*/
|
||||
static int mca_pcm_rms_param_priority;
|
||||
static int mca_pcm_rms_param_debug;
|
||||
static int mca_pcm_rms_param_use_ns;
|
||||
|
||||
/*
|
||||
* Module variables
|
||||
*/
|
||||
ompi_list_t mca_pcm_rms_jobs;
|
||||
int mca_pcm_rms_output = 0;
|
||||
int mca_pcm_rms_use_ns;
|
||||
|
||||
int
|
||||
mca_pcm_rms_component_open(void)
|
||||
@ -96,6 +98,9 @@ mca_pcm_rms_component_open(void)
|
||||
mca_pcm_rms_param_priority =
|
||||
mca_base_param_register_int("pcm", "rms", "priority", NULL, 0);
|
||||
|
||||
mca_pcm_rms_param_use_ns =
|
||||
mca_base_param_register_int("pcm", "rms", "use_ns", NULL, 0);
|
||||
|
||||
OBJ_CONSTRUCT(&mca_pcm_rms_jobs, ompi_list_t);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -123,6 +128,8 @@ mca_pcm_rms_init(int *priority,
|
||||
|
||||
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
|
||||
|
||||
mca_base_param_lookup_int(mca_pcm_rms_param_use_ns, &mca_pcm_use_ns);
|
||||
|
||||
*allow_multi_user_threads = false;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
|
@ -58,4 +58,6 @@ extern char *mca_pcm_rsh_agent;
|
||||
|
||||
extern int mca_pcm_rsh_output;
|
||||
|
||||
extern int mca_pcm_rsh_use_ns;
|
||||
|
||||
#endif /* MCA_PCM_RSH_H_ */
|
||||
|
@ -82,6 +82,7 @@ static int mca_pcm_rsh_param_ignore_stderr;
|
||||
static int mca_pcm_rsh_param_priority;
|
||||
static int mca_pcm_rsh_param_agent;
|
||||
static int mca_pcm_rsh_param_debug;
|
||||
static int mca_pcm_rsh_param_use_ns;
|
||||
|
||||
/*
|
||||
* Module variables
|
||||
@ -96,6 +97,7 @@ int mca_pcm_rsh_ignore_stderr;
|
||||
char *mca_pcm_rsh_agent;
|
||||
|
||||
int mca_pcm_rsh_output = 0;
|
||||
int mca_pcm_rsh_use_ns;
|
||||
|
||||
static mca_llm_base_module_t mca_pcm_rsh_llm;
|
||||
|
||||
@ -115,6 +117,8 @@ mca_pcm_rsh_component_open(void)
|
||||
mca_base_param_register_int("pcm", "rsh", "fast", NULL, 1);
|
||||
mca_pcm_rsh_param_ignore_stderr =
|
||||
mca_base_param_register_int("pcm", "rsh", "ignore_stderr", NULL, 0);
|
||||
mca_pcm_rsh_param_use_ns =
|
||||
mca_base_param_register_int("pcm", "rsh", "use_ns", NULL, 0);
|
||||
|
||||
mca_pcm_rsh_param_priority =
|
||||
mca_base_param_register_int("pcm", "rsh", "priority", NULL, 1);
|
||||
@ -157,6 +161,9 @@ mca_pcm_rsh_init(int *priority,
|
||||
*allow_multi_user_threads = true;
|
||||
*have_hidden_threads = false;
|
||||
|
||||
mca_base_param_lookup_int(mca_pcm_rsh_param_use_ns,
|
||||
&mca_pcm_rsh_use_ns);
|
||||
|
||||
ret = mca_llm_base_select("pcm", &mca_pcm_rsh_llm,
|
||||
allow_multi_user_threads,
|
||||
have_hidden_threads);
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "util/output.h"
|
||||
#include "util/argv.h"
|
||||
#include "util/numtostr.h"
|
||||
#include "mca/ns/base/base.h"
|
||||
|
||||
#if 1
|
||||
#define BOOTAGENT "mca_pcm_rsh_bootproxy"
|
||||
@ -90,7 +91,12 @@ mca_pcm_rsh_spawn_procs(mca_ns_base_jobid_t jobid, ompi_list_t *schedlist)
|
||||
}
|
||||
|
||||
/* BWB - make sure vpids are reserved */
|
||||
local_start_vpid = global_start_vpid;
|
||||
local_start_vpid = 0;
|
||||
if (mca_pcm_rsh_use_ns) {
|
||||
global_start_vpid = (int) ompi_name_server.reserve_range(jobid, num_procs);
|
||||
} else {
|
||||
global_start_vpid = 0;
|
||||
}
|
||||
|
||||
for (sched_item = ompi_list_get_first(schedlist) ;
|
||||
sched_item != ompi_list_get_end(schedlist) ;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user