1
1

Per the discussion on the devel list, move the binding of processes to processors from MPI_Init to process start. This involves:

1. replacing mpi_paffinity_alone with opal_paffinity_alone - for back-compatibility, I have aliased mpi_paffinity_alone to the new param name. This causes a mild abstraction break in the opal/mca/paffinity framework - per the devel discussion...live with it. :-) I also moved the ompi_xxx global variable that tracked maffinity setup so it could be properly closed in MPI_Finalize to the opal/mca/maffinity framework to avoid an abstraction break.

2. Added code to the odls/default module to perform paffinity binding and maffinity init between process fork and exec. This has been tested on IU's odin cluster and works for both MPI and non-MPI apps.

3. Revise MPI_Init to detect if affinity has already been set, and to attempt to set it if not already done. I have *not* tested this as I haven't yet figured out a way to do so - I couldn't get slurm to perform cpu bindings, even though it supposedly does do so.

This has only been lightly tested and would definitely benefit from a wider range of evaluation...

This commit was SVN r21209.
Этот коммит содержится в:
Ralph Castain 2009-05-12 02:18:35 +00:00
родитель fa839f4a30
Коммит d396f0a6fc
15 изменённых файлов: 217 добавлений и 126 удалений

Просмотреть файл

@ -38,7 +38,7 @@ WARNING: Cannot set both the MCA parameters mpi_leave_pinned and
mpi_leave_pinned_pipeline to "true". Defaulting to mpi_leave_pinned mpi_leave_pinned_pipeline to "true". Defaulting to mpi_leave_pinned
ONLY. ONLY.
[mpi_init:startup:paffinity-unavailable] [mpi_init:startup:paffinity-unavailable]
The MCA parameter "mpi_paffinity_alone" was set to a nonzero value, The MCA parameter "opal_paffinity_alone" was set to a nonzero value,
but Open MPI was unable to bind MPI_COMM_WORLD rank %s to a processor. but Open MPI was unable to bind MPI_COMM_WORLD rank %s to a processor.
Typical causes for this problem include: Typical causes for this problem include:

Просмотреть файл

@ -57,10 +57,6 @@ OMPI_DECLSPEC extern int ompi_mpi_thread_provided;
/** Identifier of the main thread */ /** Identifier of the main thread */
OMPI_DECLSPEC extern struct opal_thread_t *ompi_mpi_main_thread; OMPI_DECLSPEC extern struct opal_thread_t *ompi_mpi_main_thread;
/** Did we setup maffinity in MPI_INIT (and therefore need to shut
it down during MPI_FINALIZE)? */
OMPI_DECLSPEC extern bool ompi_mpi_maffinity_setup;
/** Do we want to be warned on fork or not? */ /** Do we want to be warned on fork or not? */
OMPI_DECLSPEC extern bool ompi_warn_on_fork; OMPI_DECLSPEC extern bool ompi_warn_on_fork;

Просмотреть файл

@ -144,7 +144,7 @@ int ompi_mpi_finalize(void)
opal_progress_event_users_increment(); opal_progress_event_users_increment();
/* If maffinity was setup, tear it down */ /* If maffinity was setup, tear it down */
if (ompi_mpi_maffinity_setup) { if (opal_maffinity_setup) {
opal_maffinity_base_close(); opal_maffinity_base_close();
} }

Просмотреть файл

@ -280,6 +280,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
int param, value; int param, value;
struct timeval ompistart, ompistop; struct timeval ompistart, ompistop;
char *event_val = NULL; char *event_val = NULL;
opal_paffinity_base_cpu_set_t mask;
#if 0 #if 0
/* see comment below about sched_yield */ /* see comment below about sched_yield */
int num_processors; int num_processors;
@ -395,56 +396,60 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
orte_process_info.pid); orte_process_info.pid);
} }
/* Setup process affinity. First check to see if a slot list was /* if it hasn't already been done, setup process affinity.
specified. If so, use it. If no slot list was specified, * First check to see if a slot list was
that's not an error -- just fall through and try the next * specified. If so, use it. If no slot list was specified,
paffinity scheme. */ * that's not an error -- just fall through and try the next
ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid); * paffinity scheme.
if (OPAL_SUCCESS == ret) { */
paffinity_enabled = true; ret = opal_paffinity_base_get(&mask);
} if (OPAL_ERR_NOT_FOUND == ret) {
/* If an error occurred in the slot list setup (other than "there /* the system is capable of doing processor affinity, but it
was not slot list specified"), bail. */ * has not yet been set - see if a slot_list was given
else if (OPAL_ERR_NOT_FOUND != ret) { */
error = "opal_paffinity_base_slot_list_set() returned an error"; if (NULL != opal_paffinity_base_slot_list) {
goto error; /* It's an error if multiple paffinity schemes were specified */
} if (opal_paffinity_alone) {
/* It's an error if multiple paffinity schemes were specified */ ret = OMPI_ERR_BAD_PARAM;
if (paffinity_enabled && ompi_mpi_paffinity_alone) { error = "Multiple processor affinity schemes specified (can only specify one)";
ret = OMPI_ERR_BAD_PARAM; goto error;
error = "Multiple processor affinity schemes specified (can only specify one)"; }
goto error; ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list);
} if (OPAL_ERR_NOT_FOUND != ret) {
/* Otherwise, if mpi_paffinity_alone was set, use that scheme */ error = "opal_paffinity_base_slot_list_set() returned an error";
else if (ompi_mpi_paffinity_alone) { goto error;
opal_paffinity_base_cpu_set_t mask; }
int phys_cpu; paffinity_enabled = true;
orte_node_rank_t nrank; } else if (opal_paffinity_alone) {
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) { /* no slot_list, but they asked for paffinity */
error = "Could not get node rank - cannot set processor affinity"; int phys_cpu;
goto error; orte_node_rank_t nrank;
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
error = "Could not get node rank - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
error = "Could not get physical processor id - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
ret = opal_paffinity_base_set(mask);
if (OPAL_SUCCESS != ret) {
error = "Setting processor affinity failed";
goto error;
}
paffinity_enabled = true;
} }
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
error = "Could not get physical processor id - cannot set processor affinity";
goto error;
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
ret = opal_paffinity_base_set(mask);
if (OPAL_SUCCESS != ret) {
error = "Setting processor affinity failed";
goto error;
}
paffinity_enabled = true;
} }
/* If we were able to set processor affinity, try setting up /* If we were able to set processor affinity, try setting up
memory affinity */ memory affinity */
if (paffinity_enabled) { if (!opal_maffinity_setup && paffinity_enabled) {
if (OPAL_SUCCESS == opal_maffinity_base_open() && if (OPAL_SUCCESS == opal_maffinity_base_open() &&
OPAL_SUCCESS == opal_maffinity_base_select()) { OPAL_SUCCESS == opal_maffinity_base_select()) {
ompi_mpi_maffinity_setup = true; opal_maffinity_setup = true;
} }
} }

Просмотреть файл

@ -52,7 +52,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
bool ompi_debug_no_free_handles = false; bool ompi_debug_no_free_handles = false;
bool ompi_mpi_show_mca_params = false; bool ompi_mpi_show_mca_params = false;
char *ompi_mpi_show_mca_params_file = NULL; char *ompi_mpi_show_mca_params_file = NULL;
bool ompi_mpi_paffinity_alone = false;
bool ompi_mpi_abort_print_stack = false; bool ompi_mpi_abort_print_stack = false;
int ompi_mpi_abort_delay = 0; int ompi_mpi_abort_delay = 0;
bool ompi_mpi_keep_peer_hostnames = true; bool ompi_mpi_keep_peer_hostnames = true;
@ -261,12 +260,6 @@ int ompi_mpi_register_params(void)
true); true);
} }
mca_base_param_reg_int_name("mpi", "paffinity_alone",
"If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0",
false, false,
(int) ompi_mpi_paffinity_alone, &value);
ompi_mpi_paffinity_alone = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("mpi", "warn_on_fork", mca_base_param_reg_int_name("mpi", "warn_on_fork",
"If nonzero, issue a warning if program forks under conditions that could cause system errors", "If nonzero, issue a warning if program forks under conditions that could cause system errors",
false, false, false, false,

Просмотреть файл

@ -95,15 +95,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params;
*/ */
OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file; OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
/**
* If this value is true, assume that this ORTE job is the only job
* running on the nodes that have been allocated to it, and bind
* processes to the processor ID corresponding to their node local
* rank (if you COMM_SPAWN on to empty processors on the same node,
* the NLR will start at N, not 0).
*/
OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
/** /**
* Whether we should keep the string hostnames of all the MPI * Whether we should keep the string hostnames of all the MPI
* process peers around or not (eats up a good bit of memory). * process peers around or not (eats up a good bit of memory).

Просмотреть файл

@ -154,7 +154,12 @@ OPAL_DECLSPEC extern opal_list_t opal_maffinity_base_components_opened;
* Debugging output stream * Debugging output stream
*/ */
extern int opal_maffinity_base_output; extern int opal_maffinity_base_output;
/**
* Flag to indicate whether or not maffinity was setup
*/
OPAL_DECLSPEC extern bool opal_maffinity_setup;
END_C_DECLS END_C_DECLS
#endif /* OPAL_BASE_MAFFINITY_H */ #endif /* OPAL_BASE_MAFFINITY_H */

Просмотреть файл

@ -42,7 +42,7 @@
int opal_maffinity_base_output = -1; int opal_maffinity_base_output = -1;
bool opal_maffinity_base_components_opened_valid = false; bool opal_maffinity_base_components_opened_valid = false;
opal_list_t opal_maffinity_base_components_opened; opal_list_t opal_maffinity_base_components_opened;
bool opal_maffinity_setup = false;
/* /*
* Function for finding and opening either all MCA components, or the one * Function for finding and opening either all MCA components, or the one

Просмотреть файл

@ -28,36 +28,32 @@
#include <sys/types.h> #include <sys/types.h>
#if defined(c_plusplus) || defined(__cplusplus) BEGIN_C_DECLS
extern "C" {
#endif
/** /**
* Struct used with opal_maffinity_base_module_set_fn_t. It * Struct used with opal_maffinity_base_module_set_fn_t. It
* describes a section of memory (starting address and length). * describes a section of memory (starting address and length).
* This is really the same thing as an iovec, but we include a * This is really the same thing as an iovec, but we include a
* separate type for it for at least 2 reasons: * separate type for it for at least 2 reasons:
* *
* 1. Some OS's iovec definitions are exceedingly lame (e.g., * 1. Some OS's iovec definitions are exceedingly lame (e.g.,
* Solaris 9 has the length argument as an int, instead of a * Solaris 9 has the length argument as an int, instead of a
* size_t). * size_t).
* *
* 2. We reserve the right to expand/change this struct in the * 2. We reserve the right to expand/change this struct in the
* future. * future.
*/ */
struct opal_maffinity_base_segment_t { struct opal_maffinity_base_segment_t {
/** Starting address of segment */ /** Starting address of segment */
void *mbs_start_addr; void *mbs_start_addr;
/** Length of segment */ /** Length of segment */
size_t mbs_len; size_t mbs_len;
}; };
/** /**
* Convenience typedef * Convenience typedef
*/ */
typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t; typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t;
#if defined(c_plusplus) || defined(__cplusplus) END_C_DECLS
}
#endif
#endif /* OPAL_MAFFINITY_TYPES_H */ #endif /* OPAL_MAFFINITY_TYPES_H */

Просмотреть файл

@ -237,13 +237,20 @@ OPAL_DECLSPEC extern opal_list_t opal_paffinity_base_components_opened;
/** /**
* Assigning slot_list to process * Assigning slot_list to process
*/ */
OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank); OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank, char *slot_str);
/** /**
* Debugging output stream * Debugging output stream
*/ */
OPAL_DECLSPEC extern int opal_paffinity_base_output; OPAL_DECLSPEC extern int opal_paffinity_base_output;
/**
* Flag indicating whether or not processor affinity is to be enabled
*/
OPAL_DECLSPEC extern bool opal_paffinity_alone;
OPAL_DECLSPEC extern char *opal_paffinity_base_slot_list;
END_C_DECLS END_C_DECLS
#endif /* OPAL_BASE_PAFFINITY_H */ #endif /* OPAL_BASE_PAFFINITY_H */

Просмотреть файл

@ -43,7 +43,8 @@
OPAL_DECLSPEC int opal_paffinity_base_output = -1; OPAL_DECLSPEC int opal_paffinity_base_output = -1;
bool opal_paffinity_base_components_opened_valid = false; bool opal_paffinity_base_components_opened_valid = false;
opal_list_t opal_paffinity_base_components_opened; opal_list_t opal_paffinity_base_components_opened;
bool opal_paffinity_alone = false;
char *opal_paffinity_base_slot_list;
/* /*
* Function for finding and opening either all MCA components, or the one * Function for finding and opening either all MCA components, or the one
@ -51,7 +52,7 @@ opal_list_t opal_paffinity_base_components_opened;
*/ */
int opal_paffinity_base_open(void) int opal_paffinity_base_open(void)
{ {
int value; int value, id;
/* Debugging / verbose output */ /* Debugging / verbose output */
@ -65,11 +66,19 @@ int opal_paffinity_base_open(void)
opal_paffinity_base_output = -1; opal_paffinity_base_output = -1;
} }
id = mca_base_param_reg_int_name("opal", "paffinity_alone",
"If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0",
false, false,
0, NULL);
mca_base_param_reg_syn_name(id, "mpi", "paffinity_alone", true);
mca_base_param_lookup_int(id, &value);
opal_paffinity_alone = OPAL_INT_TO_BOOL(value);
opal_paffinity_base_components_opened_valid = false; opal_paffinity_base_components_opened_valid = false;
mca_base_param_reg_string_name("opal", "paffinity_base_slot_list", mca_base_param_reg_string_name("opal", "paffinity_base_slot_list",
"Used to set list of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)", "Used to set list of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)",
true, false, NULL, NULL); true, false, NULL, &opal_paffinity_base_slot_list);
/* Open up all available components */ /* Open up all available components */

Просмотреть файл

@ -512,29 +512,22 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
int opal_paffinity_base_slot_list_set(long rank) int opal_paffinity_base_slot_list_set(long rank, char *slot_str)
{ {
char *slot_str = NULL;
char **item; char **item;
char **socket_core; char **socket_core;
int item_cnt, socket_core_cnt, rc; int item_cnt, socket_core_cnt, rc;
bool logical_map; bool logical_map;
rc = mca_base_param_find("opal", NULL, "paffinity_base_slot_list"); if (NULL == slot_str){
/* If there was not slot list specified, return a specific error
code indicating that */
if (rc <= 0) {
return OPAL_ERR_NOT_FOUND;
}
if (OPAL_SUCCESS == mca_base_param_lookup_string(rc, &slot_str)) {
if (NULL == slot_str) {
return OPAL_ERR_NOT_FOUND;
}
}
if (0 == strcmp("", slot_str)){
return OPAL_ERR_BAD_PARAM; return OPAL_ERR_BAD_PARAM;
} }
/* if the slot string is empty, that is an error */
if (0 == strlen(slot_str)) {
return OPAL_ERR_BAD_PARAM;
}
/* check for diag request to avoid repeatedly doing so */ /* check for diag request to avoid repeatedly doing so */
if (4 < opal_output_get_verbosity(opal_paffinity_base_output)) { if (4 < opal_output_get_verbosity(opal_paffinity_base_output)) {
diag_requested = true; diag_requested = true;

Просмотреть файл

@ -1285,16 +1285,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
free(value); free(value);
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
if ( NULL != child->slot_list ) {
asprintf(&value, "%s", child->slot_list);
opal_setenv(param, value, true, &app->env);
free(value);
} else {
opal_unsetenv(param, &app->env);
}
free(param);
/* if we are timing things, record when we are going to launch this proc */ /* if we are timing things, record when we are going to launch this proc */
if (orte_timing) { if (orte_timing) {
gettimeofday(&child->starttime, NULL); gettimeofday(&child->starttime, NULL);

Просмотреть файл

@ -55,3 +55,36 @@ Max value allowed: %ld
This may be resolved by increasing the number of available node id's by This may be resolved by increasing the number of available node id's by
re-configuring Open MPI with the --enable-jumbo-clusters option, and then re-configuring Open MPI with the --enable-jumbo-clusters option, and then
re-running the application re-running the application
#
[odls-default:multiple-paffinity-schemes]
Multiple processor affinity schemes were specified (can only specify one):
Slot list: %s
opal_paffinity_alone: true
Please specify only the one desired method.
#
[odls-default:slot-list-failed]
We were unable to successfully process/set the requested processor
affinity settings:
Specified slot list: %s
Error: %s
This could mean that a non-existent processor was specified, or
that the specification had improper syntax.
#
[odls-default:invalid-node-rank]
An invalid node rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[odls-default:invalid-phys-cpu]
An invalid physical processor id was returned when attempting to
set processor affinity. This is probably something that should be
reported to the OMPI developers - your system may not support
this functionality.
#
[odls-default:failed-set-paff]
An attempt to set processor affinity has failed - please check to
ensure that your system supports such functionality. If so, then
this is probably something that should be reported to the OMPI developers.

Просмотреть файл

@ -67,10 +67,14 @@
#endif #endif
#endif /* HAVE_SCHED_YIELD */ #endif /* HAVE_SCHED_YIELD */
#include "opal/mca/maffinity/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/base/iof_base_setup.h" #include "orte/mca/iof/base/iof_base_setup.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
@ -178,6 +182,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
sigset_t sigs; sigset_t sigs;
int i, p[2]; int i, p[2];
pid_t pid; pid_t pid;
bool paffinity_enabled = false;
if (NULL != child) { if (NULL != child) {
/* should pull this information from MPIRUN instead of going with /* should pull this information from MPIRUN instead of going with
@ -259,7 +264,75 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
exit(1); exit(1);
} }
/* Setup process affinity. First check to see if a slot list was
* specified. If so, use it. If no slot list was specified,
* that's not an error -- just fall through and try the next
* paffinity scheme.
*/
if (NULL != child->slot_list) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got slot_list %s for child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
child->slot_list, ORTE_NAME_PRINT(child->name)));
if (opal_paffinity_alone) {
/* It's an error if multiple paffinity schemes were specified */
orte_show_help("help-odls-default.txt",
"odls-default:multiple-paffinity-schemes", true, child->slot_list);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
if (OPAL_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list))) {
orte_show_help("help-odls-default.txt",
"odls-default:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc));
write(p[1], &rc, sizeof(int));
exit(1);
}
}
/* Otherwise, if opal_paffinity_alone was set, use that scheme */
else if (opal_paffinity_alone) {
opal_paffinity_base_cpu_set_t mask;
int phys_cpu;
orte_node_rank_t nrank;
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork setting paffinity for child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-node-rank", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
write(p[1], &rc, sizeof(int));
exit(1);
}
paffinity_enabled = true;
}
/* If we were able to set processor affinity, try setting up
* memory affinity
*/
if (paffinity_enabled) {
if (OPAL_SUCCESS == opal_maffinity_base_open() &&
OPAL_SUCCESS == opal_maffinity_base_select()) {
opal_maffinity_setup = true;
}
}
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) { } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
/* tie stdin/out/err/internal to /dev/null */ /* tie stdin/out/err/internal to /dev/null */
int fdnull; int fdnull;