1
1

The fork pls now checks the total number of processes to be launched

against the total number of processors.  If not oversubscribing, emit
the MCA environment variable mpi_paffinity_processor with the
processor number to bind the process to.  This parameter is picked up
during MPI_Init (i.e., ompi_mpi_init()) and used to bind the process,
but currently only if the MCA param mpi_paffinity_alone is set to a
nonzero value (i.e., the user asks for it).

This commit was SVN r6906.
Этот коммит содержится в:
Jeff Squyres 2005-08-16 16:23:20 +00:00
родитель 409b9e73b2
Коммит 5e5fd5a8f2

Просмотреть файл

@ -37,26 +37,27 @@
#include "opal/event/event.h" #include "opal/event/event.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "util/sys_info.h" #include "opal/mca/paffinity/base/base.h"
#include "util/univ_info.h" #include "opal/util/sys_info.h"
#include "orte/util/univ_info.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "util/session_dir.h" #include "orte/util/session_dir.h"
#include "runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "mca/iof/iof.h" #include "orte/mca/iof/iof.h"
#include "mca/iof/base/iof_base_setup.h" #include "orte/mca/iof/base/iof_base_setup.h"
#include "mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "mca/ns/ns.h" #include "orte/mca/ns/ns.h"
#include "orte/mca/sds/base/base.h" #include "orte/mca/sds/base/base.h"
#include "mca/pls/pls.h" #include "orte/mca/pls/pls.h"
#include "mca/pls/base/base.h" #include "orte/mca/pls/base/base.h"
#include "mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "mca/gpr/gpr.h" #include "orte/mca/gpr/gpr.h"
#include "mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/base.h"
#include "mca/rmaps/base/rmaps_base_map.h" #include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "mca/soh/soh.h" #include "orte/mca/soh/soh.h"
#include "mca/soh/base/base.h" #include "orte/mca/soh/base/base.h"
#include "mca/pls/fork/pls_fork.h" #include "orte/mca/pls/fork/pls_fork.h"
extern char **environ; extern char **environ;
@ -127,7 +128,9 @@ static int orte_pls_fork_proc(
orte_app_context_t* context, orte_app_context_t* context,
orte_rmaps_base_proc_t* proc, orte_rmaps_base_proc_t* proc,
orte_vpid_t vpid_start, orte_vpid_t vpid_start,
orte_vpid_t vpid_range) orte_vpid_t vpid_range,
bool want_processor,
size_t processor)
{ {
pid_t pid; pid_t pid;
orte_iof_base_io_conf_t opts; orte_iof_base_io_conf_t opts;
@ -201,8 +204,8 @@ static int orte_pls_fork_proc(
} }
if(pid == 0) { if(pid == 0) {
char* param; char *param, *param2;
char* uri; char *uri;
char **environ_copy; char **environ_copy;
#if 0 #if 0
@ -233,6 +236,18 @@ static int orte_pls_fork_proc(
} }
param = mca_base_param_environ_variable("rmgr","bootproxy","jobid"); param = mca_base_param_environ_variable("rmgr","bootproxy","jobid");
opal_unsetenv(param, &environ_copy); opal_unsetenv(param, &environ_copy);
free(param);
/* Set the relative vpid */
if (want_processor) {
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
asprintf(&param2, "%lu", processor);
opal_setenv(param, param2, true, &environ_copy);
free(param);
free(param2);
}
/* setup universe info */ /* setup universe info */
if (NULL != orte_universe_info.name) { if (NULL != orte_universe_info.name) {
@ -357,6 +372,7 @@ int orte_pls_fork_launch(orte_jobid_t jobid)
orte_vpid_t vpid_start; orte_vpid_t vpid_start;
orte_vpid_t vpid_range; orte_vpid_t vpid_range;
int rc; int rc;
size_t num_processors, num_processes;
/* query the allocation for this node */ /* query the allocation for this node */
OBJ_CONSTRUCT(&map, opal_list_t); OBJ_CONSTRUCT(&map, opal_list_t);
@ -373,15 +389,28 @@ int orte_pls_fork_launch(orte_jobid_t jobid)
goto cleanup; goto cleanup;
} }
/* are we oversubscribing? */
opal_paffinity_base_get_num_processors(&rc);
num_processors = (size_t) rc;
for (num_processes = 0, item = opal_list_get_first(&map);
item != opal_list_get_end(&map);
item = opal_list_get_next(item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)item;
num_processes += map->num_procs;
}
/* attempt to launch each of the apps */ /* attempt to launch each of the apps */
for(item = opal_list_get_first(&map); for (item = opal_list_get_first(&map);
item != opal_list_get_end(&map); item != opal_list_get_end(&map);
item = opal_list_get_next(item)) { item = opal_list_get_next(item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)item; orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)item;
size_t i; size_t i;
for(i=0; i<map->num_procs; i++) { for (i=0; i<map->num_procs; i++) {
rc = orte_pls_fork_proc(map->app, map->procs[i], vpid_start, vpid_range); rc = orte_pls_fork_proc(map->app, map->procs[i], vpid_start,
if(ORTE_SUCCESS != rc) { vpid_range,
(num_processes > num_processors) ?
false : true, i);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto cleanup; goto cleanup;
} }