* add more selection criteria to the pcm selection code
* remove the ns param switch - always use the ns at this point * clean up some of the evil rms code that wasn't multi-pcm safe. There is still some work to do on this front. This commit was SVN r2779.
Этот коммит содержится в:
родитель
efc09dfc94
Коммит
2dc55f12da
@ -49,14 +49,8 @@ mca_pcm_rms_allocate_resources(struct mca_pcm_base_module_1_0_0_t* me,
|
|||||||
|
|
||||||
/* For now, just punt on whether we can actually fulfill the request or not */
|
/* For now, just punt on whether we can actually fulfill the request or not */
|
||||||
total_procs = (nodes == 0) ? procs : nodes * procs;
|
total_procs = (nodes == 0) ? procs : nodes * procs;
|
||||||
if (mca_pcm_rms_use_ns) {
|
node_alloc->start =
|
||||||
node_alloc->start =
|
|
||||||
(int) ompi_name_server.reserve_range(jobid, total_procs);
|
(int) ompi_name_server.reserve_range(jobid, total_procs);
|
||||||
} else {
|
|
||||||
/* BWB - remove the USE_NS code once the failures in PTL / NS
|
|
||||||
due to unexpected offsets are fixed up */
|
|
||||||
node_alloc->start = 0;
|
|
||||||
}
|
|
||||||
node_alloc->nodes = nodes;
|
node_alloc->nodes = nodes;
|
||||||
node_alloc->count = procs;
|
node_alloc->count = procs;
|
||||||
|
|
||||||
|
@ -95,6 +95,5 @@ extern "C" {
|
|||||||
* Module variables
|
* Module variables
|
||||||
*/
|
*/
|
||||||
extern int mca_pcm_rms_output;
|
extern int mca_pcm_rms_output;
|
||||||
extern int mca_pcm_rms_use_ns;
|
|
||||||
|
|
||||||
#endif /* MCA_PCM_RMS_H_ */
|
#endif /* MCA_PCM_RMS_H_ */
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
#include "mca/pcm/base/base.h"
|
#include "mca/pcm/base/base.h"
|
||||||
#include "mca/llm/base/base.h"
|
#include "mca/llm/base/base.h"
|
||||||
#include "util/path.h"
|
#include "util/path.h"
|
||||||
|
#include "runtime/runtime.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -49,17 +50,6 @@ mca_pcm_base_component_1_0_0_t mca_pcm_rms_component = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct mca_pcm_base_module_1_0_0_t mca_pcm_rms_1_0_0 = {
|
|
||||||
mca_pcm_rms_allocate_resources,
|
|
||||||
mca_pcm_rms_can_spawn,
|
|
||||||
mca_pcm_rms_spawn_procs,
|
|
||||||
mca_pcm_rms_kill_proc,
|
|
||||||
mca_pcm_rms_kill_job,
|
|
||||||
mca_pcm_rms_deallocate_resources,
|
|
||||||
mca_pcm_rms_finalize
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/* need to create output stream to dump in file */
|
/* need to create output stream to dump in file */
|
||||||
ompi_output_stream_t mca_pcm_rms_output_stream = {
|
ompi_output_stream_t mca_pcm_rms_output_stream = {
|
||||||
false, /* lds_is_debugging BWB - change me for release */
|
false, /* lds_is_debugging BWB - change me for release */
|
||||||
@ -81,30 +71,28 @@ ompi_output_stream_t mca_pcm_rms_output_stream = {
|
|||||||
*/
|
*/
|
||||||
static int mca_pcm_rms_param_priority;
|
static int mca_pcm_rms_param_priority;
|
||||||
static int mca_pcm_rms_param_debug;
|
static int mca_pcm_rms_param_debug;
|
||||||
static int mca_pcm_rms_param_use_ns;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Component variables. All of these are shared among the module
|
* Component variables. All of these are shared among the module
|
||||||
* instances, so they don't need to go in a special structure or
|
* instances, so they don't need to go in a special structure or
|
||||||
* anything.
|
* anything.
|
||||||
*/
|
*/
|
||||||
int mca_pcm_rms_output = 0;
|
int mca_pcm_rms_output = -1;
|
||||||
int mca_pcm_rms_use_ns;
|
|
||||||
|
|
||||||
int
|
int
|
||||||
mca_pcm_rms_component_open(void)
|
mca_pcm_rms_component_open(void)
|
||||||
{
|
{
|
||||||
mca_pcm_rms_param_debug =
|
mca_pcm_rms_param_debug =
|
||||||
mca_base_param_register_int("pcm", "rms", "debug", NULL, 100);
|
mca_base_param_register_int("pcm", "rms", "debug", NULL, 100);
|
||||||
|
|
||||||
mca_pcm_rms_param_priority =
|
mca_pcm_rms_param_priority =
|
||||||
mca_base_param_register_int("pcm", "rms", "priority", NULL, 5);
|
mca_base_param_register_int("pcm", "rms", "priority", NULL, 5);
|
||||||
|
|
||||||
mca_pcm_rms_param_use_ns =
|
|
||||||
mca_base_param_register_int("pcm", "rms", "use_ns", NULL, 1);
|
|
||||||
|
|
||||||
mca_pcm_rms_job_list_init();
|
mca_pcm_rms_job_list_init();
|
||||||
|
|
||||||
|
mca_pcm_rms_output = ompi_output_open(&mca_pcm_rms_output_stream);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -127,27 +115,49 @@ mca_pcm_rms_init(int *priority,
|
|||||||
int debug;
|
int debug;
|
||||||
char *prun;
|
char *prun;
|
||||||
int num_cpus;
|
int num_cpus;
|
||||||
|
mca_pcm_base_module_t *me;
|
||||||
|
|
||||||
|
/* debugging gorp */
|
||||||
mca_base_param_lookup_int(mca_pcm_rms_param_debug, &debug);
|
mca_base_param_lookup_int(mca_pcm_rms_param_debug, &debug);
|
||||||
mca_pcm_rms_output = ompi_output_open(&mca_pcm_rms_output_stream);
|
|
||||||
ompi_output_set_verbosity(mca_pcm_rms_output, debug);
|
ompi_output_set_verbosity(mca_pcm_rms_output, debug);
|
||||||
|
|
||||||
|
/* get our priority - if 0, we don't run */
|
||||||
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
|
mca_base_param_lookup_int(mca_pcm_rms_param_priority, priority);
|
||||||
|
if (0 == priority) return NULL;
|
||||||
|
|
||||||
mca_base_param_lookup_int(mca_pcm_rms_param_use_ns, &mca_pcm_rms_use_ns);
|
/* fill in params */
|
||||||
|
|
||||||
*allow_multi_user_threads = true;
|
*allow_multi_user_threads = true;
|
||||||
*have_hidden_threads = false;
|
*have_hidden_threads = false;
|
||||||
|
|
||||||
|
/* check constraints */
|
||||||
|
/* no daemon */
|
||||||
|
if (0 != (constraints & OMPI_RTE_SPAWN_DAEMON)) return NULL;
|
||||||
|
/* no MPI_COMM_SPAWN* */
|
||||||
|
if (0 != (constraints & OMPI_RTE_SPAWN_FROM_MPI)) return NULL;
|
||||||
|
|
||||||
|
/* see if we are an RMS system */
|
||||||
|
/* BWB - is there a better way to do this */
|
||||||
num_cpus = rms_numCpus(NULL);
|
num_cpus = rms_numCpus(NULL);
|
||||||
if (num_cpus <= 0) return NULL;
|
if (num_cpus <= 0) return NULL;
|
||||||
|
|
||||||
/* poke around for prun */
|
|
||||||
prun = ompi_path_env_findv("prun", X_OK, environ, NULL);
|
prun = ompi_path_env_findv("prun", X_OK, environ, NULL);
|
||||||
if (NULL == prun) return NULL;
|
if (NULL == prun) return NULL;
|
||||||
free(prun);
|
free(prun);
|
||||||
|
|
||||||
return &mca_pcm_rms_1_0_0;
|
/* ok, now let's try to fire up */
|
||||||
|
me = malloc(sizeof(mca_pcm_base_module_t));
|
||||||
|
if (NULL == me) return NULL;
|
||||||
|
|
||||||
|
me->pcm_allocate_resources = mca_pcm_rms_allocate_resources;
|
||||||
|
me->pcm_can_spawn = mca_pcm_rms_can_spawn;
|
||||||
|
me->pcm_spawn_procs = mca_pcm_rms_spawn_procs;
|
||||||
|
me->pcm_kill_proc = mca_pcm_rms_kill_proc;
|
||||||
|
me->pcm_kill_job = mca_pcm_rms_kill_job;
|
||||||
|
me->pcm_deallocate_resources = mca_pcm_rms_deallocate_resources;
|
||||||
|
me->pcm_finalize = mca_pcm_rms_finalize;
|
||||||
|
|
||||||
|
return me;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -59,7 +59,6 @@ extern "C" {
|
|||||||
int fast_boot;
|
int fast_boot;
|
||||||
int ignore_stderr;
|
int ignore_stderr;
|
||||||
char* rsh_agent;
|
char* rsh_agent;
|
||||||
int use_ns;
|
|
||||||
};
|
};
|
||||||
typedef struct mca_pcm_rsh_module_t mca_pcm_rsh_module_t;
|
typedef struct mca_pcm_rsh_module_t mca_pcm_rsh_module_t;
|
||||||
|
|
||||||
|
@ -71,13 +71,12 @@ static int mca_pcm_rsh_param_ignore_stderr;
|
|||||||
static int mca_pcm_rsh_param_priority;
|
static int mca_pcm_rsh_param_priority;
|
||||||
static int mca_pcm_rsh_param_agent;
|
static int mca_pcm_rsh_param_agent;
|
||||||
static int mca_pcm_rsh_param_debug;
|
static int mca_pcm_rsh_param_debug;
|
||||||
static int mca_pcm_rsh_param_use_ns;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* component variables
|
* component variables
|
||||||
*/
|
*/
|
||||||
/* debugging output stream */
|
/* debugging output stream */
|
||||||
int mca_pcm_rsh_output = 0;
|
int mca_pcm_rsh_output = -1;
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
@ -96,8 +95,6 @@ mca_pcm_rsh_component_open(void)
|
|||||||
mca_base_param_register_int("pcm", "rsh", "fast", NULL, 1);
|
mca_base_param_register_int("pcm", "rsh", "fast", NULL, 1);
|
||||||
mca_pcm_rsh_param_ignore_stderr =
|
mca_pcm_rsh_param_ignore_stderr =
|
||||||
mca_base_param_register_int("pcm", "rsh", "ignore_stderr", NULL, 0);
|
mca_base_param_register_int("pcm", "rsh", "ignore_stderr", NULL, 0);
|
||||||
mca_pcm_rsh_param_use_ns =
|
|
||||||
mca_base_param_register_int("pcm", "rsh", "use_ns", NULL, 1);
|
|
||||||
|
|
||||||
mca_pcm_rsh_param_priority =
|
mca_pcm_rsh_param_priority =
|
||||||
mca_base_param_register_int("pcm", "rsh", "priority", NULL, 1);
|
mca_base_param_register_int("pcm", "rsh", "priority", NULL, 1);
|
||||||
@ -148,18 +145,16 @@ mca_pcm_rsh_init(int *priority,
|
|||||||
&(me->ignore_stderr));
|
&(me->ignore_stderr));
|
||||||
mca_base_param_lookup_string(mca_pcm_rsh_param_agent,
|
mca_base_param_lookup_string(mca_pcm_rsh_param_agent,
|
||||||
&(me->rsh_agent));
|
&(me->rsh_agent));
|
||||||
mca_base_param_lookup_int(mca_pcm_rsh_param_use_ns,
|
|
||||||
&(me->use_ns));
|
|
||||||
|
|
||||||
*allow_multi_user_threads = true;
|
*allow_multi_user_threads = true;
|
||||||
*have_hidden_threads = false;
|
*have_hidden_threads = false;
|
||||||
|
|
||||||
ret = mca_llm_base_select("pcm", &(me->llm),
|
ret = mca_llm_base_select("rsh", &(me->llm),
|
||||||
allow_multi_user_threads,
|
allow_multi_user_threads,
|
||||||
have_hidden_threads);
|
have_hidden_threads);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
/* well, that can't be good. guess we can't run */
|
/* well, that can't be good. guess we can't run */
|
||||||
ompi_output_verbose(5, mca_pcm_rsh_output, "select: no llm found");
|
ompi_output_verbose(5, mca_pcm_rsh_output, "init: no llm found");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,11 +91,8 @@ mca_pcm_rsh_spawn_procs(struct mca_pcm_base_module_1_0_0_t* me_super,
|
|||||||
|
|
||||||
/* BWB - make sure vpids are reserved */
|
/* BWB - make sure vpids are reserved */
|
||||||
local_start_vpid = 0;
|
local_start_vpid = 0;
|
||||||
if (me->use_ns) {
|
global_start_vpid = (int) ompi_name_server.reserve_range(jobid, num_procs);
|
||||||
global_start_vpid = (int) ompi_name_server.reserve_range(jobid, num_procs);
|
|
||||||
} else {
|
|
||||||
global_start_vpid = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (sched_item = ompi_list_get_first(schedlist) ;
|
for (sched_item = ompi_list_get_first(schedlist) ;
|
||||||
sched_item != ompi_list_get_end(schedlist) ;
|
sched_item != ompi_list_get_end(schedlist) ;
|
||||||
|
@ -21,7 +21,11 @@
|
|||||||
mpiruntime/mpiruntime.h directly */
|
mpiruntime/mpiruntime.h directly */
|
||||||
#include "mpi/runtime/mpiruntime.h"
|
#include "mpi/runtime/mpiruntime.h"
|
||||||
|
|
||||||
|
/* constants for spawn constraints */
|
||||||
#define OMPI_RTE_SPAWN_MULTI_CELL 0x0001
|
#define OMPI_RTE_SPAWN_MULTI_CELL 0x0001
|
||||||
|
#define OMPI_RTE_SPAWN_DAEMON 0x0002
|
||||||
|
#define OMPI_RTE_SPAWN_HIGH_QOS 0x0004
|
||||||
|
#define OMPI_RTE_SPAWN_FROM_MPI 0x0008
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user