1
1

* Add ability to conditionally use the name server for pid reservation in the
  RMS and RSH pcms.  The RSH pcm was hard-coded to start from 0.  The RMS
  pcm always used the name server.  Unfortunately, due to some interesting
  interactions, using the name server causes failures in the PTL logic
  because vpids start at 1, not 0, when the reservation code is used.

  Right now, the default is to start vpids at 0 so that everything still
  runs out of the box.  But this should make it possible for Tim to look
  at the PTL problems a bit more easily.

This commit was SVN r2560.
Этот коммит содержится в:
Brian Barrett 2004-09-09 15:15:27 +00:00
родитель 9d610a8d7f
Коммит 08dc86af7c
6 изменённых файлов: 32 добавлений и 4 удалений

Просмотреть файл

@ -148,8 +148,14 @@ mca_pcm_rms_allocate_resources(mca_ns_base_jobid_t jobid,
/* For now, just punt on whether we can actually fullfill the request or not */
total_procs = (nodes == 0) ? procs : nodes * procs;
node_alloc->start =
(int) ompi_name_server.reserve_range(jobid, total_procs);
if (mca_pcm_rms_use_ns) {
node_alloc->start =
(int) ompi_name_server.reserve_range(jobid, total_procs);
} else {
/* BWB - remove the USE_NS code once the failures in PTL / NS
due to unexpected offsets are fixed up */
node_alloc->start = 0;
}
node_alloc->nodes = nodes;
node_alloc->count = procs;

Просмотреть файл

@ -71,6 +71,6 @@ extern "C" {
*/
extern int mca_pcm_rms_output;
extern ompi_list_t mca_pcm_rms_jobs;
extern int mca_pcm_rms_use_ns;
#endif /* MCA_PCM_RMS_H_ */

Просмотреть файл

@ -80,12 +80,14 @@ ompi_output_stream_t mca_pcm_rms_output_stream = {
*/
static int mca_pcm_rms_param_priority;
static int mca_pcm_rms_param_debug;
static int mca_pcm_rms_param_use_ns;
/*
* Module variables
*/
ompi_list_t mca_pcm_rms_jobs;
int mca_pcm_rms_output = 0;
int mca_pcm_rms_use_ns;
int
mca_pcm_rms_component_open(void)
@ -96,6 +98,9 @@ mca_pcm_rms_component_open(void)
mca_pcm_rms_param_priority =
mca_base_param_register_int("pcm", "rms", "priority", NULL, 0);
mca_pcm_rms_param_use_ns =
mca_base_param_register_int("pcm", "rms", "use_ns", NULL, 0);
OBJ_CONSTRUCT(&mca_pcm_rms_jobs, ompi_list_t);
return OMPI_SUCCESS;
@ -123,6 +128,8 @@ mca_pcm_rms_init(int *priority,
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
/* Read the "use_ns" parameter (registered in mca_pcm_rms_component_open)
   into the RMS module flag mca_pcm_rms_use_ns, which
   mca_pcm_rms_allocate_resources() tests.  The previous target,
   mca_pcm_use_ns, is not declared anywhere in this component. */
mca_base_param_lookup_int(mca_pcm_rms_param_use_ns, &mca_pcm_rms_use_ns);
*allow_multi_user_threads = false;
*have_hidden_threads = false;

Просмотреть файл

@ -58,4 +58,6 @@ extern char *mca_pcm_rsh_agent;
extern int mca_pcm_rsh_output;
extern int mca_pcm_rsh_use_ns;
#endif /* MCA_PCM_RSH_H_ */

Просмотреть файл

@ -82,6 +82,7 @@ static int mca_pcm_rsh_param_ignore_stderr;
static int mca_pcm_rsh_param_priority;
static int mca_pcm_rsh_param_agent;
static int mca_pcm_rsh_param_debug;
static int mca_pcm_rsh_param_use_ns;
/*
* Module variables
@ -96,6 +97,7 @@ int mca_pcm_rsh_ignore_stderr;
char *mca_pcm_rsh_agent;
int mca_pcm_rsh_output = 0;
int mca_pcm_rsh_use_ns;
static mca_llm_base_module_t mca_pcm_rsh_llm;
@ -115,6 +117,8 @@ mca_pcm_rsh_component_open(void)
mca_base_param_register_int("pcm", "rsh", "fast", NULL, 1);
mca_pcm_rsh_param_ignore_stderr =
mca_base_param_register_int("pcm", "rsh", "ignore_stderr", NULL, 0);
mca_pcm_rsh_param_use_ns =
mca_base_param_register_int("pcm", "rsh", "use_ns", NULL, 0);
mca_pcm_rsh_param_priority =
mca_base_param_register_int("pcm", "rsh", "priority", NULL, 1);
@ -157,6 +161,9 @@ mca_pcm_rsh_init(int *priority,
*allow_multi_user_threads = true;
*have_hidden_threads = false;
mca_base_param_lookup_int(mca_pcm_rsh_param_use_ns,
&mca_pcm_rsh_use_ns);
ret = mca_llm_base_select("pcm", &mca_pcm_rsh_llm,
allow_multi_user_threads,
have_hidden_threads);

Просмотреть файл

@ -28,6 +28,7 @@
#include "util/output.h"
#include "util/argv.h"
#include "util/numtostr.h"
#include "mca/ns/base/base.h"
#if 1
#define BOOTAGENT "mca_pcm_rsh_bootproxy"
@ -90,7 +91,12 @@ mca_pcm_rsh_spawn_procs(mca_ns_base_jobid_t jobid, ompi_list_t *schedlist)
}
/* BWB - make sure vpids are reserved */
local_start_vpid = global_start_vpid;
local_start_vpid = 0;
if (mca_pcm_rsh_use_ns) {
global_start_vpid = (int) ompi_name_server.reserve_range(jobid, num_procs);
} else {
global_start_vpid = 0;
}
for (sched_item = ompi_list_get_first(schedlist) ;
sched_item != ompi_list_get_end(schedlist) ;