1
1

Per the discussion on the devel list, move the binding of processes to processors from MPI_Init to process start. This involves:

1. replacing mpi_paffinity_alone with opal_paffinity_alone - for back-compatibility, I have aliased mpi_paffinity_alone to the new param name. This causes a mild abstraction break in the opal/mca/paffinity framework - per the devel discussion...live with it. :-) I also moved the ompi_xxx global variable that tracked maffinity setup so it could be properly closed in MPI_Finalize to the opal/mca/maffinity framework to avoid an abstraction break.

2. Added code to the odls/default module to perform paffinity binding and maffinity init between process fork and exec. This has been tested on IU's odin cluster and works for both MPI and non-MPI apps.

3. Revise MPI_Init to detect if affinity has already been set, and to attempt to set it if not already done. I have *not* tested this as I haven't yet figured out a way to do so - I couldn't get slurm to perform cpu bindings, even though it supposedly does do so.

This has only been lightly tested and would definitely benefit from a wider range of evaluation...

This commit was SVN r21209.
Этот коммит содержится в:
Ralph Castain 2009-05-12 02:18:35 +00:00
родитель fa839f4a30
Коммит d396f0a6fc
15 изменённых файлов: 217 добавлений и 126 удалений

Просмотреть файл

@ -38,7 +38,7 @@ WARNING: Cannot set both the MCA parameters mpi_leave_pinned and
mpi_leave_pinned_pipeline to "true". Defaulting to mpi_leave_pinned
ONLY.
[mpi_init:startup:paffinity-unavailable]
The MCA parameter "mpi_paffinity_alone" was set to a nonzero value,
The MCA parameter "opal_paffinity_alone" was set to a nonzero value,
but Open MPI was unable to bind MPI_COMM_WORLD rank %s to a processor.
Typical causes for this problem include:

Просмотреть файл

@ -57,10 +57,6 @@ OMPI_DECLSPEC extern int ompi_mpi_thread_provided;
/** Identifier of the main thread */
OMPI_DECLSPEC extern struct opal_thread_t *ompi_mpi_main_thread;
/** Did we setup maffinity in MPI_INIT (and therefore need to shut
it down during MPI_FINALIZE)? */
OMPI_DECLSPEC extern bool ompi_mpi_maffinity_setup;
/** Do we want to be warned on fork or not? */
OMPI_DECLSPEC extern bool ompi_warn_on_fork;

Просмотреть файл

@ -144,7 +144,7 @@ int ompi_mpi_finalize(void)
opal_progress_event_users_increment();
/* If maffinity was setup, tear it down */
if (ompi_mpi_maffinity_setup) {
if (opal_maffinity_setup) {
opal_maffinity_base_close();
}

Просмотреть файл

@ -280,6 +280,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
int param, value;
struct timeval ompistart, ompistop;
char *event_val = NULL;
opal_paffinity_base_cpu_set_t mask;
#if 0
/* see comment below about sched_yield */
int num_processors;
@ -395,56 +396,60 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
orte_process_info.pid);
}
/* Setup process affinity. First check to see if a slot list was
specified. If so, use it. If no slot list was specified,
that's not an error -- just fall through and try the next
paffinity scheme. */
ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid);
if (OPAL_SUCCESS == ret) {
paffinity_enabled = true;
}
/* If an error occurred in the slot list setup (other than "there
was not slot list specified"), bail. */
else if (OPAL_ERR_NOT_FOUND != ret) {
error = "opal_paffinity_base_slot_list_set() returned an error";
goto error;
}
/* It's an error if multiple paffinity schemes were specified */
if (paffinity_enabled && ompi_mpi_paffinity_alone) {
ret = OMPI_ERR_BAD_PARAM;
error = "Multiple processor affinity schemes specified (can only specify one)";
goto error;
}
/* Otherwise, if mpi_paffinity_alone was set, use that scheme */
else if (ompi_mpi_paffinity_alone) {
opal_paffinity_base_cpu_set_t mask;
int phys_cpu;
orte_node_rank_t nrank;
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
error = "Could not get node rank - cannot set processor affinity";
goto error;
/* if it hasn't already been done, setup process affinity.
* First check to see if a slot list was
* specified. If so, use it. If no slot list was specified,
* that's not an error -- just fall through and try the next
* paffinity scheme.
*/
ret = opal_paffinity_base_get(&mask);
if (OPAL_ERR_NOT_FOUND == ret) {
/* the system is capable of doing processor affinity, but it
* has not yet been set - see if a slot_list was given
*/
if (NULL != opal_paffinity_base_slot_list) {
/* It's an error if multiple paffinity schemes were specified */
if (opal_paffinity_alone) {
ret = OMPI_ERR_BAD_PARAM;
error = "Multiple processor affinity schemes specified (can only specify one)";
goto error;
}
ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list);
if (OPAL_ERR_NOT_FOUND != ret) {
error = "opal_paffinity_base_slot_list_set() returned an error";
goto error;
}
paffinity_enabled = true;
} else if (opal_paffinity_alone) {
/* no slot_list, but they asked for paffinity */
int phys_cpu;
orte_node_rank_t nrank;
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
error = "Could not get node rank - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
error = "Could not get physical processor id - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
ret = opal_paffinity_base_set(mask);
if (OPAL_SUCCESS != ret) {
error = "Setting processor affinity failed";
goto error;
}
paffinity_enabled = true;
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
error = "Could not get physical processor id - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
ret = opal_paffinity_base_set(mask);
if (OPAL_SUCCESS != ret) {
error = "Setting processor affinity failed";
goto error;
}
paffinity_enabled = true;
}
/* If we were able to set processor affinity, try setting up
memory affinity */
if (paffinity_enabled) {
if (!opal_maffinity_setup && paffinity_enabled) {
if (OPAL_SUCCESS == opal_maffinity_base_open() &&
OPAL_SUCCESS == opal_maffinity_base_select()) {
ompi_mpi_maffinity_setup = true;
opal_maffinity_setup = true;
}
}

Просмотреть файл

@ -52,7 +52,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
bool ompi_debug_no_free_handles = false;
bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_paffinity_alone = false;
bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true;
@ -261,12 +260,6 @@ int ompi_mpi_register_params(void)
true);
}
mca_base_param_reg_int_name("mpi", "paffinity_alone",
"If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0",
false, false,
(int) ompi_mpi_paffinity_alone, &value);
ompi_mpi_paffinity_alone = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("mpi", "warn_on_fork",
"If nonzero, issue a warning if program forks under conditions that could cause system errors",
false, false,

Просмотреть файл

@ -95,15 +95,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params;
*/
OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
/**
* If this value is true, assume that this ORTE job is the only job
* running on the nodes that have been allocated to it, and bind
* processes to the processor ID corresponding to their node local
* rank (if you COMM_SPAWN on to empty processors on the same node,
* the NLR will start at N, not 0).
*/
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
/**
* Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory).

Просмотреть файл

@ -154,7 +154,12 @@ OPAL_DECLSPEC extern opal_list_t opal_maffinity_base_components_opened;
* Debugging output stream
*/
extern int opal_maffinity_base_output;
/**
* Flag to indicate whether or not maffinity was setup
*/
OPAL_DECLSPEC extern bool opal_maffinity_setup;
END_C_DECLS
#endif /* OPAL_BASE_MAFFINITY_H */

Просмотреть файл

@ -42,7 +42,7 @@
int opal_maffinity_base_output = -1;
bool opal_maffinity_base_components_opened_valid = false;
opal_list_t opal_maffinity_base_components_opened;
bool opal_maffinity_setup = false;
/*
* Function for finding and opening either all MCA components, or the one

Просмотреть файл

@ -28,36 +28,32 @@
#include <sys/types.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
BEGIN_C_DECLS
/**
* Struct used with opal_maffinity_base_module_set_fn_t. It
* describes a section of memory (starting address and length).
* This is really the same thing as an iovec, but we include a
* separate type for it for at least 2 reasons:
*
* 1. Some OS's iovec definitions are exceedingly lame (e.g.,
* Solaris 9 has the length argument as an int, instead of a
* size_t).
*
* 2. We reserve the right to expand/change this struct in the
* future.
*/
struct opal_maffinity_base_segment_t {
/** Starting address of segment */
void *mbs_start_addr;
/** Length of segment */
size_t mbs_len;
};
/**
* Convenience typedef
*/
typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t;
/**
* Struct used with opal_maffinity_base_module_set_fn_t. It
* describes a section of memory (starting address and length).
* This is really the same thing as an iovec, but we include a
* separate type for it for at least 2 reasons:
*
* 1. Some OS's iovec definitions are exceedingly lame (e.g.,
* Solaris 9 has the length argument as an int, instead of a
* size_t).
*
* 2. We reserve the right to expand/change this struct in the
* future.
*/
struct opal_maffinity_base_segment_t {
/** Starting address of segment */
void *mbs_start_addr;
/** Length of segment */
size_t mbs_len;
};
/**
* Convenience typedef
*/
typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
END_C_DECLS
#endif /* OPAL_MAFFINITY_TYPES_H */

Просмотреть файл

@ -237,13 +237,20 @@ OPAL_DECLSPEC extern opal_list_t opal_paffinity_base_components_opened;
/**
* Assigning slot_list to process
*/
OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank);
OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank, char *slot_str);
/**
* Debugging output stream
*/
OPAL_DECLSPEC extern int opal_paffinity_base_output;
/**
* Flag indicating whether or not processor affinity is to be enabled
*/
OPAL_DECLSPEC extern bool opal_paffinity_alone;
OPAL_DECLSPEC extern char *opal_paffinity_base_slot_list;
END_C_DECLS
#endif /* OPAL_BASE_PAFFINITY_H */

Просмотреть файл

@ -43,7 +43,8 @@
OPAL_DECLSPEC int opal_paffinity_base_output = -1;
bool opal_paffinity_base_components_opened_valid = false;
opal_list_t opal_paffinity_base_components_opened;
bool opal_paffinity_alone = false;
char *opal_paffinity_base_slot_list;
/*
* Function for finding and opening either all MCA components, or the one
@ -51,7 +52,7 @@ opal_list_t opal_paffinity_base_components_opened;
*/
int opal_paffinity_base_open(void)
{
int value;
int value, id;
/* Debugging / verbose output */
@ -65,11 +66,19 @@ int opal_paffinity_base_open(void)
opal_paffinity_base_output = -1;
}
id = mca_base_param_reg_int_name("opal", "paffinity_alone",
"If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0",
false, false,
0, NULL);
mca_base_param_reg_syn_name(id, "mpi", "paffinity_alone", true);
mca_base_param_lookup_int(id, &value);
opal_paffinity_alone = OPAL_INT_TO_BOOL(value);
opal_paffinity_base_components_opened_valid = false;
mca_base_param_reg_string_name("opal", "paffinity_base_slot_list",
"Used to set list of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)",
true, false, NULL, NULL);
true, false, NULL, &opal_paffinity_base_slot_list);
/* Open up all available components */

Просмотреть файл

@ -512,29 +512,22 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
return OPAL_SUCCESS;
}
int opal_paffinity_base_slot_list_set(long rank)
int opal_paffinity_base_slot_list_set(long rank, char *slot_str)
{
char *slot_str = NULL;
char **item;
char **socket_core;
int item_cnt, socket_core_cnt, rc;
bool logical_map;
rc = mca_base_param_find("opal", NULL, "paffinity_base_slot_list");
/* If there was not slot list specified, return a specific error
code indicating that */
if (rc <= 0) {
return OPAL_ERR_NOT_FOUND;
}
if (OPAL_SUCCESS == mca_base_param_lookup_string(rc, &slot_str)) {
if (NULL == slot_str) {
return OPAL_ERR_NOT_FOUND;
}
}
if (0 == strcmp("", slot_str)){
if (NULL == slot_str){
return OPAL_ERR_BAD_PARAM;
}
/* if the slot string is empty, that is an error */
if (0 == strlen(slot_str)) {
return OPAL_ERR_BAD_PARAM;
}
/* check for diag request to avoid repeatedly doing so */
if (4 < opal_output_get_verbosity(opal_paffinity_base_output)) {
diag_requested = true;

Просмотреть файл

@ -1285,16 +1285,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
free(value);
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
if ( NULL != child->slot_list ) {
asprintf(&value, "%s", child->slot_list);
opal_setenv(param, value, true, &app->env);
free(value);
} else {
opal_unsetenv(param, &app->env);
}
free(param);
/* if we are timing things, record when we are going to launch this proc */
if (orte_timing) {
gettimeofday(&child->starttime, NULL);

Просмотреть файл

@ -55,3 +55,36 @@ Max value allowed: %ld
This may be resolved by increasing the number of available node id's by
re-configuring Open MPI with the --enable-jumbo-clusters option, and then
re-running the application
#
[odls-default:multiple-paffinity-schemes]
Multiple processor affinity schemes were specified (can only specify one):
Slot list: %s
opal_paffinity_alone: true
Please specify only the one desired method.
#
[odls-default:slot-list-failed]
We were unable to successfully process/set the requested processor
affinity settings:
Specified slot list: %s
Error: %s
This could mean that a non-existent processor was specified, or
that the specification had improper syntax.
#
[odls-default:invalid-node-rank]
An invalid node rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[odls-default:invalid-phys-cpu]
An invalid physical processor id was returned when attempting to
set processor affinity. This is probably something that should be
reported to the OMPI developers - your system may not support
this functionality.
#
[odls-default:failed-set-paff]
An attempt to set processor affinity has failed - please check to
ensure that your system supports such functionality. If so, then
this is probably something that should be reported to the OMPI developers.

Просмотреть файл

@ -67,10 +67,14 @@
#endif
#endif /* HAVE_SCHED_YIELD */
#include "opal/mca/maffinity/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/base/iof_base_setup.h"
#include "orte/util/name_fns.h"
@ -178,6 +182,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
sigset_t sigs;
int i, p[2];
pid_t pid;
bool paffinity_enabled = false;
if (NULL != child) {
/* should pull this information from MPIRUN instead of going with
@ -259,7 +264,75 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
exit(1);
}
/* Setup process affinity. First check to see if a slot list was
* specified. If so, use it. If no slot list was specified,
* that's not an error -- just fall through and try the next
* paffinity scheme.
*/
if (NULL != child->slot_list) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got slot_list %s for child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
child->slot_list, ORTE_NAME_PRINT(child->name)));
if (opal_paffinity_alone) {
/* It's an error if multiple paffinity schemes were specified */
orte_show_help("help-odls-default.txt",
"odls-default:multiple-paffinity-schemes", true, child->slot_list);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
if (OPAL_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list))) {
orte_show_help("help-odls-default.txt",
"odls-default:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc));
write(p[1], &rc, sizeof(int));
exit(1);
}
}
/* Otherwise, if opal_paffinity_alone was set, use that scheme */
else if (opal_paffinity_alone) {
opal_paffinity_base_cpu_set_t mask;
int phys_cpu;
orte_node_rank_t nrank;
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork setting paffinity for child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-node-rank", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
write(p[1], &rc, sizeof(int));
exit(1);
}
paffinity_enabled = true;
}
/* If we were able to set processor affinity, try setting up
* memory affinity
*/
if (paffinity_enabled) {
if (OPAL_SUCCESS == opal_maffinity_base_open() &&
OPAL_SUCCESS == opal_maffinity_base_select()) {
opal_maffinity_setup = true;
}
}
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;