diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index 65498aec96..67dd78ebf0 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -38,7 +38,7 @@ WARNING: Cannot set both the MCA parameters mpi_leave_pinned and mpi_leave_pinned_pipeline to "true". Defaulting to mpi_leave_pinned ONLY. [mpi_init:startup:paffinity-unavailable] -The MCA parameter "mpi_paffinity_alone" was set to a nonzero value, +The MCA parameter "opal_paffinity_alone" was set to a nonzero value, but Open MPI was unable to bind MPI_COMM_WORLD rank %s to a processor. Typical causes for this problem include: diff --git a/ompi/runtime/mpiruntime.h b/ompi/runtime/mpiruntime.h index 5483073467..d98482a1bd 100644 --- a/ompi/runtime/mpiruntime.h +++ b/ompi/runtime/mpiruntime.h @@ -57,10 +57,6 @@ OMPI_DECLSPEC extern int ompi_mpi_thread_provided; /** Identifier of the main thread */ OMPI_DECLSPEC extern struct opal_thread_t *ompi_mpi_main_thread; -/** Did we setup maffinity in MPI_INIT (and therefore need to shut - it down during MPI_FINALIZE)? */ -OMPI_DECLSPEC extern bool ompi_mpi_maffinity_setup; - /** Do we want to be warned on fork or not? 
*/ OMPI_DECLSPEC extern bool ompi_warn_on_fork; diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index fd4e3600de..9408f42fe0 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -144,7 +144,7 @@ int ompi_mpi_finalize(void) opal_progress_event_users_increment(); /* If maffinity was setup, tear it down */ - if (ompi_mpi_maffinity_setup) { + if (opal_maffinity_setup) { opal_maffinity_base_close(); } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index a778b016d2..d12378c5da 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -280,6 +280,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) int param, value; struct timeval ompistart, ompistop; char *event_val = NULL; + opal_paffinity_base_cpu_set_t mask; #if 0 /* see comment below about sched_yield */ int num_processors; @@ -395,56 +396,60 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) orte_process_info.pid); } - /* Setup process affinity. First check to see if a slot list was - specified. If so, use it. If no slot list was specified, - that's not an error -- just fall through and try the next - paffinity scheme. */ - ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid); - if (OPAL_SUCCESS == ret) { - paffinity_enabled = true; - } - /* If an error occurred in the slot list setup (other than "there - was not slot list specified"), bail. 
*/ - else if (OPAL_ERR_NOT_FOUND != ret) { - error = "opal_paffinity_base_slot_list_set() returned an error"; - goto error; - } - /* It's an error if multiple paffinity schemes were specified */ - if (paffinity_enabled && ompi_mpi_paffinity_alone) { - ret = OMPI_ERR_BAD_PARAM; - error = "Multiple processor affinity schemes specified (can only specify one)"; - goto error; - } - /* Otherwise, if mpi_paffinity_alone was set, use that scheme */ - else if (ompi_mpi_paffinity_alone) { - opal_paffinity_base_cpu_set_t mask; - int phys_cpu; - orte_node_rank_t nrank; - if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) { - error = "Could not get node rank - cannot set processor affinity"; - goto error; + /* if it hasn't already been done, setup process affinity. + * First check to see if a slot list was + * specified. If so, use it. If no slot list was specified, + * that's not an error -- just fall through and try the next + * paffinity scheme. + */ + ret = opal_paffinity_base_get(&mask); + if (OPAL_ERR_NOT_FOUND == ret) { + /* the system is capable of doing processor affinity, but it + * has not yet been set - see if a slot_list was given + */ + if (NULL != opal_paffinity_base_slot_list) { + /* It's an error if multiple paffinity schemes were specified */ + if (opal_paffinity_alone) { + ret = OMPI_ERR_BAD_PARAM; + error = "Multiple processor affinity schemes specified (can only specify one)"; + goto error; + } + ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list); + if (OPAL_SUCCESS != ret) { + error = "opal_paffinity_base_slot_list_set() returned an error"; + goto error; + } + paffinity_enabled = true; + } else if (opal_paffinity_alone) { + /* no slot_list, but they asked for paffinity */ + int phys_cpu; + orte_node_rank_t nrank; + if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) { + error = "Could not get node rank - cannot set processor affinity"; + 
goto error; + } + OPAL_PAFFINITY_CPU_ZERO(mask); + phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank); + if (0 > phys_cpu) { + error = "Could not get physical processor id - cannot set processor affinity"; + goto error; + } + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + ret = opal_paffinity_base_set(mask); + if (OPAL_SUCCESS != ret) { + error = "Setting processor affinity failed"; + goto error; + } + paffinity_enabled = true; } - OPAL_PAFFINITY_CPU_ZERO(mask); - phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank); - if (0 > phys_cpu) { - error = "Could not get physical processor id - cannot set processor affinity"; - goto error; - } - OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); - ret = opal_paffinity_base_set(mask); - if (OPAL_SUCCESS != ret) { - error = "Setting processor affinity failed"; - goto error; - } - paffinity_enabled = true; } /* If we were able to set processor affinity, try setting up memory affinity */ - if (paffinity_enabled) { + if (!opal_maffinity_setup && paffinity_enabled) { if (OPAL_SUCCESS == opal_maffinity_base_open() && OPAL_SUCCESS == opal_maffinity_base_select()) { - ompi_mpi_maffinity_setup = true; + opal_maffinity_setup = true; } } diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index a88d491c72..e13805315f 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -52,7 +52,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0; bool ompi_debug_no_free_handles = false; bool ompi_mpi_show_mca_params = false; char *ompi_mpi_show_mca_params_file = NULL; -bool ompi_mpi_paffinity_alone = false; bool ompi_mpi_abort_print_stack = false; int ompi_mpi_abort_delay = 0; bool ompi_mpi_keep_peer_hostnames = true; @@ -261,12 +260,6 @@ int ompi_mpi_register_params(void) true); } - mca_base_param_reg_int_name("mpi", "paffinity_alone", - "If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0", - 
false, false, - (int) ompi_mpi_paffinity_alone, &value); - ompi_mpi_paffinity_alone = OPAL_INT_TO_BOOL(value); - mca_base_param_reg_int_name("mpi", "warn_on_fork", "If nonzero, issue a warning if program forks under conditions that could cause system errors", false, false, diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index c307efdb86..71036b8824 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -95,15 +95,6 @@ OMPI_DECLSPEC extern bool ompi_mpi_show_mca_params; */ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file; -/** - * If this value is true, assume that this ORTE job is the only job - * running on the nodes that have been allocated to it, and bind - * processes to the processor ID corresponding to their node local - * rank (if you COMM_SPAWN on to empty processors on the same node, - * the NLR will start at N, not 0). - */ -OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone; - /** * Whether we should keep the string hostnames of all the MPI * process peers around or not (eats up a good bit of memory). 
diff --git a/opal/mca/maffinity/base/base.h b/opal/mca/maffinity/base/base.h index 073fb4e840..f78f86b36b 100644 --- a/opal/mca/maffinity/base/base.h +++ b/opal/mca/maffinity/base/base.h @@ -154,7 +154,12 @@ OPAL_DECLSPEC extern opal_list_t opal_maffinity_base_components_opened; * Debugging output stream */ extern int opal_maffinity_base_output; - + +/** + * Flag to indicate whether or not maffinity was setup + */ +OPAL_DECLSPEC extern bool opal_maffinity_setup; + END_C_DECLS #endif /* OPAL_BASE_MAFFINITY_H */ diff --git a/opal/mca/maffinity/base/maffinity_base_open.c b/opal/mca/maffinity/base/maffinity_base_open.c index d777a2b6df..9114e94722 100644 --- a/opal/mca/maffinity/base/maffinity_base_open.c +++ b/opal/mca/maffinity/base/maffinity_base_open.c @@ -42,7 +42,7 @@ int opal_maffinity_base_output = -1; bool opal_maffinity_base_components_opened_valid = false; opal_list_t opal_maffinity_base_components_opened; - +bool opal_maffinity_setup = false; /* * Function for finding and opening either all MCA components, or the one diff --git a/opal/mca/maffinity/maffinity_types.h b/opal/mca/maffinity/maffinity_types.h index 27e724f245..1acc7fe093 100644 --- a/opal/mca/maffinity/maffinity_types.h +++ b/opal/mca/maffinity/maffinity_types.h @@ -28,36 +28,32 @@ #include <stddef.h> -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif +BEGIN_C_DECLS - /** - * Struct used with opal_maffinity_base_module_set_fn_t. It - * describes a section of memory (starting address and length). - * This is really the same thing as an iovec, but we include a - * separate type for it for at least 2 reasons: - * - * 1. Some OS's iovec definitions are exceedingly lame (e.g., - * Solaris 9 has the length argument as an int, instead of a - * size_t). - * - * 2. We reserve the right to expand/change this struct in the - * future. 
- */ - struct opal_maffinity_base_segment_t { - /** Starting address of segment */ - void *mbs_start_addr; - /** Length of segment */ - size_t mbs_len; - }; - /** - * Convenience typedef - */ - typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t; +/** + * Struct used with opal_maffinity_base_module_set_fn_t. It + * describes a section of memory (starting address and length). + * This is really the same thing as an iovec, but we include a + * separate type for it for at least 2 reasons: + * + * 1. Some OS's iovec definitions are exceedingly lame (e.g., + * Solaris 9 has the length argument as an int, instead of a + * size_t). + * + * 2. We reserve the right to expand/change this struct in the + * future. + */ +struct opal_maffinity_base_segment_t { + /** Starting address of segment */ + void *mbs_start_addr; + /** Length of segment */ + size_t mbs_len; +}; +/** + * Convenience typedef + */ +typedef struct opal_maffinity_base_segment_t opal_maffinity_base_segment_t; -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif +END_C_DECLS #endif /* OPAL_MAFFINITY_TYPES_H */ diff --git a/opal/mca/paffinity/base/base.h b/opal/mca/paffinity/base/base.h index 4b585d75f7..a83dcc5a95 100644 --- a/opal/mca/paffinity/base/base.h +++ b/opal/mca/paffinity/base/base.h @@ -237,13 +237,20 @@ OPAL_DECLSPEC extern opal_list_t opal_paffinity_base_components_opened; /** * Assigning slot_list to process */ -OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank); +OPAL_DECLSPEC int opal_paffinity_base_slot_list_set(long rank, char *slot_str); /** * Debugging output stream */ OPAL_DECLSPEC extern int opal_paffinity_base_output; +/** + * Flag indicating whether or not processor affinity is to be enabled + */ +OPAL_DECLSPEC extern bool opal_paffinity_alone; + +OPAL_DECLSPEC extern char *opal_paffinity_base_slot_list; + END_C_DECLS #endif /* OPAL_BASE_PAFFINITY_H */ diff --git a/opal/mca/paffinity/base/paffinity_base_open.c 
b/opal/mca/paffinity/base/paffinity_base_open.c index 509e726a81..0e1e49950d 100644 --- a/opal/mca/paffinity/base/paffinity_base_open.c +++ b/opal/mca/paffinity/base/paffinity_base_open.c @@ -43,7 +43,8 @@ OPAL_DECLSPEC int opal_paffinity_base_output = -1; bool opal_paffinity_base_components_opened_valid = false; opal_list_t opal_paffinity_base_components_opened; - +bool opal_paffinity_alone = false; +char *opal_paffinity_base_slot_list; /* * Function for finding and opening either all MCA components, or the one @@ -51,7 +52,7 @@ opal_list_t opal_paffinity_base_components_opened; */ int opal_paffinity_base_open(void) { - int value; + int value, id; /* Debugging / verbose output */ @@ -65,11 +66,19 @@ int opal_paffinity_base_open(void) opal_paffinity_base_output = -1; } + id = mca_base_param_reg_int_name("opal", "paffinity_alone", + "If nonzero, assume that this job is the only (set of) process(es) running on each node and bind processes to processors, starting with processor ID 0", + false, false, + 0, NULL); + mca_base_param_reg_syn_name(id, "mpi", "paffinity_alone", true); + mca_base_param_lookup_int(id, &value); + opal_paffinity_alone = OPAL_INT_TO_BOOL(value); + opal_paffinity_base_components_opened_valid = false; mca_base_param_reg_string_name("opal", "paffinity_base_slot_list", "Used to set list of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)", - true, false, NULL, NULL); + true, false, NULL, &opal_paffinity_base_slot_list); /* Open up all available components */ diff --git a/opal/mca/paffinity/base/paffinity_base_service.c b/opal/mca/paffinity/base/paffinity_base_service.c index 8cb0171ba6..d8b386ebc8 100644 --- a/opal/mca/paffinity/base/paffinity_base_service.c +++ b/opal/mca/paffinity/base/paffinity_base_service.c @@ -512,29 +512,22 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i return OPAL_SUCCESS; } -int opal_paffinity_base_slot_list_set(long rank) +int 
opal_paffinity_base_slot_list_set(long rank, char *slot_str) { - char *slot_str = NULL; char **item; char **socket_core; int item_cnt, socket_core_cnt, rc; bool logical_map; - rc = mca_base_param_find("opal", NULL, "paffinity_base_slot_list"); - /* If there was not slot list specified, return a specific error - code indicating that */ - if (rc <= 0) { - return OPAL_ERR_NOT_FOUND; - } - - if (OPAL_SUCCESS == mca_base_param_lookup_string(rc, &slot_str)) { - if (NULL == slot_str) { - return OPAL_ERR_NOT_FOUND; - } - } - if (0 == strcmp("", slot_str)){ + if (NULL == slot_str){ return OPAL_ERR_BAD_PARAM; } + + /* if the slot string is empty, that is an error */ + if (0 == strlen(slot_str)) { + return OPAL_ERR_BAD_PARAM; + } + /* check for diag request to avoid repeatedly doing so */ if (4 < opal_output_get_verbosity(opal_paffinity_base_output)) { diag_requested = true; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index f82f735e0d..3ab9ab9379 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1285,16 +1285,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); free(value); - param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list"); - if ( NULL != child->slot_list ) { - asprintf(&value, "%s", child->slot_list); - opal_setenv(param, value, true, &app->env); - free(value); - } else { - opal_unsetenv(param, &app->env); - } - free(param); - /* if we are timing things, record when we are going to launch this proc */ if (orte_timing) { gettimeofday(&child->starttime, NULL); diff --git a/orte/mca/odls/default/help-odls-default.txt b/orte/mca/odls/default/help-odls-default.txt index 22e44208c5..a4a3647b8e 100644 --- a/orte/mca/odls/default/help-odls-default.txt +++ b/orte/mca/odls/default/help-odls-default.txt @@ -55,3 +55,36 @@ Max value allowed: %ld This may be 
resolved by increasing the number of available node id's by re-configuring Open MPI with the --enable-jumbo-clusters option, and then re-running the application +# +[odls-default:multiple-paffinity-schemes] +Multiple processor affinity schemes were specified (can only specify one): + +Slot list: %s +opal_paffinity_alone: true + +Please specify only the one desired method. +# +[odls-default:slot-list-failed] +We were unable to successfully process/set the requested processor +affinity settings: + +Specified slot list: %s +Error: %s + +This could mean that a non-existent processor was specified, or +that the specification had improper syntax. +# +[odls-default:invalid-node-rank] +An invalid node rank was obtained - this is probably something +that should be reported to the OMPI developers. +# +[odls-default:invalid-phys-cpu] +An invalid physical processor id was returned when attempting to +set processor affinity. This is probably something that should be +reported to the OMPI developers - your system may not support +this functionality. +# +[odls-default:failed-set-paff] +An attempt to set processor affinity has failed - please check to +ensure that your system supports such functionality. If so, then +this is probably something that should be reported to the OMPI developers. 
diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 69ab94e1e8..f0bd094021 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -67,10 +67,14 @@ #endif #endif /* HAVE_SCHED_YIELD */ +#include "opal/mca/maffinity/base/base.h" +#include "opal/mca/paffinity/base/base.h" + #include "orte/util/show_help.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" #include "orte/mca/iof/base/iof_base_setup.h" #include "orte/util/name_fns.h" @@ -178,6 +182,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, sigset_t sigs; int i, p[2]; pid_t pid; + bool paffinity_enabled = false; if (NULL != child) { /* should pull this information from MPIRUN instead of going with @@ -259,7 +264,75 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, exit(1); } - + /* Setup process affinity. First check to see if a slot list was + * specified. If so, use it. If no slot list was specified, + * that's not an error -- just fall through and try the next + * paffinity scheme. 
+ */ + if (NULL != child->slot_list) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork got slot_list %s for child %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + child->slot_list, ORTE_NAME_PRINT(child->name))); + if (opal_paffinity_alone) { + /* It's an error if multiple paffinity schemes were specified */ + orte_show_help("help-odls-default.txt", + "odls-default:multiple-paffinity-schemes", true, child->slot_list); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + if (OPAL_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list))) { + orte_show_help("help-odls-default.txt", + "odls-default:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc)); + write(p[1], &rc, sizeof(int)); + exit(1); + } + } + /* Otherwise, if opal_paffinity_alone was set, use that scheme */ + else if (opal_paffinity_alone) { + opal_paffinity_base_cpu_set_t mask; + int phys_cpu; + orte_node_rank_t nrank; + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork setting paffinity for child %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-node-rank", true); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + OPAL_PAFFINITY_CPU_ZERO(mask); + phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank); + if (0 > phys_cpu) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-phys-cpu", true); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) { + orte_show_help("help-odls-default.txt", + "odls-default:failed-set-paff", true); + write(p[1], &rc, sizeof(int)); + exit(1); + } + paffinity_enabled = true; + } + /* If we were able to set processor affinity, try 
setting up + * memory affinity + */ + if (paffinity_enabled) { + if (OPAL_SUCCESS == opal_maffinity_base_open() && + OPAL_SUCCESS == opal_maffinity_base_select()) { + opal_maffinity_setup = true; + } + } + } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) { /* tie stdin/out/err/internal to /dev/null */ int fdnull;