diff --git a/opal/mca/paffinity/paffinity.h b/opal/mca/paffinity/paffinity.h
index 509505f920..85afd1e6e3 100644
--- a/opal/mca/paffinity/paffinity.h
+++ b/opal/mca/paffinity/paffinity.h
@@ -108,6 +108,11 @@
 #define OPAL_PROC_ON_LOCAL_CU(n)       ((n) & OPAL_PROC_ON_CU)
 #define OPAL_PROC_ON_LOCAL_CLUSTER(n)  ((n) & OPAL_PROC_ON_CLUSTER)
 
+/* Process binding modes */
+#define OPAL_PAFFINITY_DO_NOT_BIND      0x01
+#define OPAL_PAFFINITY_BIND_TO_CORE     0x02
+#define OPAL_PAFFINITY_BIND_TO_SOCKET   0x04
+#define OPAL_PAFFINITY_BIND_TO_BOARD    0x08
 
 /* ******************************************************************** */
 
diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c
index 8a067becd0..1d957ff6f4 100644
--- a/orte/mca/odls/base/odls_base_default_fns.c
+++ b/orte/mca/odls/base/odls_base_default_fns.c
@@ -54,6 +54,7 @@
 #include "orte/mca/ess/base/base.h"
 #include "orte/mca/plm/base/base.h"
 #include "orte/mca/routed/base/base.h"
+#include "orte/mca/rmaps/rmaps_types.h"
 #include "orte/util/context_fns.h"
 #include "orte/util/name_fns.h"
@@ -326,6 +327,24 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
         return rc;
     }
 
+    /* pack the map & binding policy for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* pack the cpus_per_rank for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* pack the stride for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
     /* pack the control flags for this job */
     if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
         ORTE_ERROR_LOG(rc);
@@ -744,6 +763,24 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
         ORTE_ERROR_LOG(rc);
         goto REPORT_ERROR;
     }
+    /* unpack the mapping policy for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
+    /* unpack the cpus/rank for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
+    /* unpack the stride for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
     /* unpack the control flags for the job */
     cnt=1;
     if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
@@ -1745,7 +1782,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
             }
         }
 
-        rc = fork_local(app, child, app->env, jobdat->controls, jobdat->stdin_target);
+        rc = fork_local(app, child, app->env, jobdat);
         /* reacquire lock so we don't double unlock...
*/ OPAL_THREAD_LOCK(&orte_odls_globals.mutex); if (ORTE_SUCCESS != rc) { @@ -1791,12 +1828,22 @@ CLEANUP: "%s odls:launch reporting job %s launch status", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job))); - /* pack the launch results */ - if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) { - ORTE_ERROR_LOG(ret); - } - if (!launch_failed) { + /* if the launch failed, we need to flag all the procs from this job + * that didn't launch as having failed, or else we will hang + */ + if (launch_failed) { + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == jobdat->jobid && + ORTE_PROC_STATE_LAUNCHED >= child->state) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + } + } + } else { /* if the launch succeeded, check to see if we need to * co-locate any debugger daemons so that they get launched * before we report anything to the HNP. This ensures that @@ -1813,13 +1860,16 @@ CLEANUP: ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output")); - fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, - orte_odls_globals.debugger->controls, ORTE_VPID_INVALID); + fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, orte_odls_globals.debugger); orte_odls_globals.debugger_launched = true; } - } + /* pack the launch results */ + if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) { + ORTE_ERROR_LOG(ret); + } + /* if we are the HNP, then we would rather not send this to ourselves - * instead, we queue it up for local processing */ diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index 8fc2815b1b..7a61e54f24 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -107,6 +107,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr) ptr->launch_msg_processed = false; ptr->apps = NULL; ptr->num_apps = 0; + ptr->policy = 0; + ptr->cpus_per_rank = 1; + ptr->stride = 1; ptr->controls = 0; ptr->stdin_target = ORTE_VPID_INVALID; ptr->total_slots_alloc = 0; @@ -232,6 +235,12 @@ int orte_odls_base_open(void) opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e"); } + /* see if the user wants us to report bindings */ + mca_base_param_reg_int_name("odls", "base_report_bindings", + "Report process bindings [default: no]", + false, false, (int)false, &i); + orte_odls_globals.report_bindings = OPAL_INT_TO_BOOL(i); + /* Open up all available components */ if (ORTE_SUCCESS != diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h index e562ace102..05a6052909 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -64,6 +64,8 @@ typedef struct { opal_list_t xterm_ranks; /* the xterm cmd to be used */ char **xtermcmd; + /* whether or not to report bindings */ + bool report_bindings; } orte_odls_globals_t; ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals; @@ -89,8 +91,7 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data, typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context, orte_odls_child_t *child, char **environ_copy, - orte_job_controls_t controls, - orte_vpid_t stdin_target); + orte_odls_job_t *jobdat); ORTE_DECLSPEC int orte_odls_base_default_launch_local(orte_jobid_t job, 
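The pack sequence in orte_odls_base_default_get_add_procs_data() and the unpack sequence in orte_odls_base_default_construct_child_list() above form a positional wire protocol: both sides must agree on field order and DSS type for every value. A minimal sketch of that invariant, using hypothetical helper names (the real code inlines these calls at the points shown in the hunks above):

    /* Sketch only: pack and unpack must mirror each other exactly;
     * the helper names are illustrative. */
    static int pack_map_info(opal_buffer_t *buf, orte_job_map_t *map)
    {
        int rc;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &map->policy, 1, ORTE_MAPPING_POLICY))) {
            return rc;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &map->cpus_per_rank, 1, OPAL_INT16))) {
            return rc;
        }
        return opal_dss.pack(buf, &map->stride, 1, OPAL_INT16);
    }

    static int unpack_map_info(opal_buffer_t *buf, orte_odls_job_t *jobdat)
    {
        int rc;
        int32_t cnt = 1;  /* in: max values to unpack; out: values actually unpacked */
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
            return rc;
        }
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
            return rc;
        }
        cnt = 1;
        return opal_dss.unpack(buf, &jobdat->stride, &cnt, OPAL_INT16);
    }

Any field added to one side without the other desynchronizes every subsequent unpack, which is why the new policy/cpus_per_rank/stride fields are inserted at the same relative position in both functions.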
diff --git a/orte/mca/odls/default/help-odls-default.txt b/orte/mca/odls/default/help-odls-default.txt index a4a3647b8e..a2a7195c72 100644 --- a/orte/mca/odls/default/help-odls-default.txt +++ b/orte/mca/odls/default/help-odls-default.txt @@ -78,6 +78,10 @@ that the specification had improper syntax. An invalid node rank was obtained - this is probably something that should be reported to the OMPI developers. # +[odls-default:invalid-local-rank] +An invalid local rank was obtained - this is probably something +that should be reported to the OMPI developers. +# [odls-default:invalid-phys-cpu] An invalid physical processor id was returned when attempting to set processor affinity. This is probably something that should be diff --git a/orte/mca/odls/default/odls_default.h b/orte/mca/odls/default/odls_default.h index ab646d5e52..bea7ac949f 100644 --- a/orte/mca/odls/default/odls_default.h +++ b/orte/mca/odls/default/odls_default.h @@ -43,6 +43,9 @@ int orte_odls_default_component_query(mca_base_module_t **module, int *priority) extern orte_odls_base_module_t orte_odls_default_module; ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_default_component; +/* dedicated debug output flag */ +ORTE_MODULE_DECLSPEC extern bool orte_odls_default_report_bindings; + END_C_DECLS #endif /* ORTE_ODLS_H */ diff --git a/orte/mca/odls/default/odls_default_component.c b/orte/mca/odls/default/odls_default_component.c index 96cf4cb7a4..75d829bc19 100644 --- a/orte/mca/odls/default/odls_default_component.c +++ b/orte/mca/odls/default/odls_default_component.c @@ -35,6 +35,9 @@ #include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/default/odls_default.h" +/* instantiate a module-global variable */ +bool orte_odls_default_report_bindings; + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -66,7 +69,6 @@ orte_odls_base_component_t mca_odls_default_component = { int orte_odls_default_component_open(void) { - /* nothing to do */ return ORTE_SUCCESS; } diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 1a4de89103..06ec66cc31 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -176,8 +176,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_sta static int odls_default_fork_local_proc(orte_app_context_t* context, orte_odls_child_t *child, char **environ_copy, - orte_job_controls_t controls, - orte_vpid_t stdin_target) + orte_odls_job_t *jobdat) { orte_iof_base_io_conf_t opts; int rc; @@ -185,7 +184,13 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, int i, p[2]; pid_t pid; bool paffinity_enabled = false; - + opal_paffinity_base_cpu_set_t mask; + orte_node_rank_t nrank; + int16_t n; + orte_local_rank_t lrank; + int target_socket, npersocket; + int logical_cpu, phys_core, phys_cpu; + if (NULL != child) { /* should pull this information from MPIRUN instead of going with default */ @@ -193,7 +198,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* do we want to setup stdin? 
 */
     if (NULL != child &&
-        (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target)) {
+        (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) {
         opts.connect_stdin = true;
     } else {
         opts.connect_stdin = false;
     }
@@ -265,7 +270,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
             write(p[1], &i, sizeof(int));
             exit(1);
         }
-
+
         /* Setup process affinity.  First check to see if a slot list was
          * specified.  If so, use it.  If no slot list was specified,
          * that's not an error -- just fall through and try the next
@@ -291,39 +296,144 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
                 exit(1);
             }
         }
-        /* Otherwise, if opal_paffinity_alone was set, use that scheme */
-        else if (opal_paffinity_alone) {
-            opal_paffinity_base_cpu_set_t mask;
-            int phys_cpu;
-            orte_node_rank_t nrank;
-            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
-                                 "%s odls:default:fork setting paffinity for child %s",
+        /* Otherwise, if opal_paffinity_alone was set and a binding is specified, use that scheme */
+        else if (opal_paffinity_alone && !(ORTE_BIND_TO_NONE & jobdat->policy)) {
+            OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                 "%s odls:default:fork setting paffinity for child %s using policy %04x",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 ORTE_NAME_PRINT(child->name)));
-            if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
-                orte_show_help("help-odls-default.txt",
-                               "odls-default:invalid-node-rank", true);
-                rc = ORTE_ERR_FATAL;
-                write(p[1], &rc, sizeof(int));
-                exit(1);
+                                 ORTE_NAME_PRINT(child->name), jobdat->policy));
+            if (ORTE_BIND_TO_CORE & jobdat->policy) {
+                /* we want to bind this proc to a specific core, or multiple cores
+                 * if the cpus_per_rank is > 1
+                 */
+                OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                     "%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     ORTE_NAME_PRINT(child->name),
+                                     (int)jobdat->cpus_per_rank, (int)jobdat->stride));
+                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
+                    orte_show_help("help-odls-default.txt",
+                                   "odls-default:invalid-node-rank", true);
+                    rc = ORTE_ERR_FATAL;
+                    write(p[1], &rc, sizeof(int));
+                    exit(1);
+                }
+                OPAL_PAFFINITY_CPU_ZERO(mask);
+                /* my starting core has to be offset by cpus_per_rank */
+                logical_cpu = nrank * jobdat->cpus_per_rank;
+                for (n=0; n < jobdat->cpus_per_rank; n++) {
+                    phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu);
+                    if (0 > phys_cpu) {
+                        orte_show_help("help-odls-default.txt",
+                                       "odls-default:invalid-phys-cpu", true);
+                        rc = ORTE_ERR_FATAL;
+                        write(p[1], &rc, sizeof(int));
+                        exit(1);
+                    }
+                    OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
+                    logical_cpu += jobdat->stride;
+                }
+                if (orte_odls_globals.report_bindings) {
+                    opal_output(0, "%s odls:default:fork binding child %s to cpus %04lx",
+                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                ORTE_NAME_PRINT(child->name), mask.bitmask[0]);
+                }
+                if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
+                    orte_show_help("help-odls-default.txt",
+                                   "odls-default:failed-set-paff", true);
+                    write(p[1], &rc, sizeof(int));
+                    exit(1);
+                }
+                paffinity_enabled = true;
+            } else if (ORTE_BIND_TO_SOCKET & jobdat->policy) {
+                /* bind this proc to a socket */
+                OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                     "%s odls:default:fork binding child %s to socket",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     ORTE_NAME_PRINT(child->name)));
+                /* layout this process across the sockets based on
+                 * the provided mapping policy
+                 */
+                if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
+                    orte_show_help("help-odls-default.txt",
+                                   "odls-default:invalid-local-rank", true);
+                    rc = ORTE_ERR_FATAL;
+                    write(p[1], &rc, sizeof(int));
+                    exit(1);
+                }
+                if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
+                    /* we need to balance the children from this job across the sockets */
+                    npersocket = jobdat->num_local_procs / orte_default_num_sockets_per_board;
+                    if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
+                        target_socket = opal_paffinity_base_get_physical_socket_id(lrank % npersocket);
+                    } else {
+                        target_socket = opal_paffinity_base_get_physical_socket_id(lrank / npersocket);
+                    }
+                    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                         "%s odls:default:fork npersocket %d target socket %d",
+                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                         npersocket, target_socket));
+                } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
+                    /* this corresponds to a mapping policy where
+                     * local rank 0 goes on socket 0, and local
+                     * rank 1 goes on socket 1, etc. - round robin
+                     * until all ranks are mapped
+                     *
+                     * NOTE: we already know our number of sockets
+                     * from when we initialized
+                     */
+                    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                         "bysocket lrank %d numsocks %d logical socket %d", (int)lrank,
+                                         (int)orte_default_num_sockets_per_board,
+                                         (int)(lrank % orte_default_num_sockets_per_board)));
+                    target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_default_num_sockets_per_board);
+                } else {
+                    /* use a byslot-like policy where local rank 0 goes on
+                     * socket 0, and local rank 1 goes on socket 0, etc.,
+                     * filling each socket in turn until all ranks are mapped
+                     */
+                    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                         "byslot lrank %d numsocks %d logical socket %d", (int)lrank,
+                                         (int)orte_default_num_sockets_per_board,
+                                         (int)(lrank / orte_default_num_cores_per_socket)));
+                    target_socket = opal_paffinity_base_get_physical_socket_id(lrank / orte_default_num_cores_per_socket);
+                }
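/* To make the socket-selection arithmetic above concrete, with illustrative
 * numbers: assume 2 sockets/board and 4 cores/socket, with 8 local procs.
 *
 *   bysocket: lrank % 2 -> lranks 0,2,4,6 land on socket 0 and
 *                          lranks 1,3,5,7 on socket 1 (round robin)
 *   byslot:   lrank / 4 -> lranks 0-3 land on socket 0 and
 *                          lranks 4-7 on socket 1 (fill, then move on)
 *
 * opal_paffinity_base_get_physical_socket_id() then converts that logical
 * socket index into the physical id used below to enumerate its cores.
 */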
policy + */ + if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-local-rank", true); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + if (ORTE_MAPPING_NPERXXX & jobdat->policy) { + /* we need to balance the children from this job across the sockets */ + npersocket = jobdat->num_local_procs / orte_default_num_sockets_per_board; + if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + target_socket = opal_paffinity_base_get_physical_socket_id(lrank % npersocket); + } else { + target_socket = opal_paffinity_base_get_physical_socket_id(lrank / npersocket); + } + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:default:fork npersocket %d target socket %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + npersocket, target_socket)); + } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + /* this corresponds to a mapping policy where + * local rank 0 goes on socket 0, and local + * rank 1 goes on socket 1, etc. - round robin + * until all ranks are mapped + * + * NOTE: we already know our number of sockets + * from when we initialized + */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "bysocket lrank %d numsocks %d logical socket %d", (int)lrank, + (int)orte_default_num_sockets_per_board, + (int)(lrank % orte_default_num_sockets_per_board))); + target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_default_num_sockets_per_board); + } else { + /* use a byslot-like policy where local rank 0 goes on + * socket 0, and local rank 1 goes on socket 0, etc. + * following round-robing until all ranks mapped + */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "byslot lrank %d numsocks %d logical socket %d", (int)lrank, + (int)orte_default_num_sockets_per_board, + (int)(lrank / orte_default_num_cores_per_socket))); + target_socket = opal_paffinity_base_get_physical_socket_id(lrank / orte_default_num_cores_per_socket); + } + OPAL_PAFFINITY_CPU_ZERO(mask); + for (n=0; n < orte_default_num_cores_per_socket; n++) { + phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n); + if (0 > phys_core) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-phys-cpu", true); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-phys-cpu", true); + rc = ORTE_ERR_FATAL; + write(p[1], &rc, sizeof(int)); + exit(1); + } + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + target_socket, n, phys_cpu)); + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + } + if (orte_odls_globals.report_bindings) { + opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]); + } + if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) { + orte_show_help("help-odls-default.txt", + "odls-default:failed-set-paff", true); + write(p[1], &rc, sizeof(int)); + exit(1); + } + paffinity_enabled = true; } - OPAL_PAFFINITY_CPU_ZERO(mask); - phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank); - if (0 > phys_cpu) { - orte_show_help("help-odls-default.txt", - "odls-default:invalid-phys-cpu", true); - rc = ORTE_ERR_FATAL; - write(p[1], &rc, sizeof(int)); - 
exit(1); - } - OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); - if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) { - orte_show_help("help-odls-default.txt", - "odls-default:failed-set-paff", true); - write(p[1], &rc, sizeof(int)); - exit(1); - } - paffinity_enabled = true; } /* If we were able to set processor affinity, try setting up * memory affinity @@ -335,15 +445,15 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, } } - } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) { - /* tie stdin/out/err/internal to /dev/null */ - int fdnull; - for (i=0; i < 3; i++) { - fdnull = open("/dev/null", O_RDONLY, 0); - if(fdnull > i) { - dup2(fdnull, i); - } - close(fdnull); + } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + /* tie stdin/out/err/internal to /dev/null */ + int fdnull; + for (i=0; i < 3; i++) { + fdnull = open("/dev/null", O_RDONLY, 0); + if(fdnull > i) { + dup2(fdnull, i); + } + close(fdnull); } fdnull = open("/dev/null", O_RDONLY, 0); if(fdnull > opts.p_internal[1]) { @@ -396,7 +506,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, exit(1); } else { - if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) { + if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { /* connect endpoints IOF */ rc = orte_iof_base_setup_parent(child->name, &opts); if(ORTE_SUCCESS != rc) { @@ -447,7 +557,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, "%s odls:default:fork got code %d back from child", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i)); close(p[0]); - return i; + return ORTE_ERR_FAILED_TO_START; } } diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 7ef030b303..ed59881ced 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -101,27 +101,30 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t); * List object to locally store job related info */ typedef struct orte_odls_job_t { - opal_list_item_t super; /* required to place this on a list */ - orte_job_state_t state; /* state of the job */ - orte_jobid_t jobid; /* jobid for this data */ - bool launch_msg_processed; /* launch msg has been fully processed */ - orte_app_context_t **apps; /* app_contexts for this job */ - orte_std_cntr_t num_apps; /* number of app_contexts */ - orte_job_controls_t controls; /* control flags for job */ - orte_vpid_t stdin_target; /* where stdin is to go */ - orte_std_cntr_t total_slots_alloc; - orte_std_cntr_t num_nodes; /* number of nodes involved in the job */ - orte_vpid_t num_procs; - int32_t num_local_procs; - char *regexp; /* the regular expression describing the job */ - opal_byte_object_t *pmap; /* local copy of pidmap byte object */ - opal_buffer_t collection_bucket; - opal_buffer_t local_collection; - orte_grpcomm_coll_t collective_type; - int32_t num_contributors; - int num_participating; - int num_collected; - struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ + opal_list_item_t super; /* required to place this on a list */ + orte_job_state_t state; /* state of the job */ + orte_jobid_t jobid; /* jobid for this data */ + bool launch_msg_processed; /* launch msg has been fully processed */ + orte_app_context_t **apps; /* app_contexts for this job */ + orte_std_cntr_t num_apps; /* number of app_contexts */ + orte_mapping_policy_t policy; /* mapping policy */ + int16_t cpus_per_rank; /* number of cpus/rank */ + int16_t stride; /* step size between cores of multi-core/rank 
procs */ + orte_job_controls_t controls; /* control flags for job */ + orte_vpid_t stdin_target; /* where stdin is to go */ + orte_std_cntr_t total_slots_alloc; + orte_std_cntr_t num_nodes; /* number of nodes involved in the job */ + orte_vpid_t num_procs; + int32_t num_local_procs; + char *regexp; /* the regular expression describing the job */ + opal_byte_object_t *pmap; /* local copy of pidmap byte object */ + opal_buffer_t collection_bucket; + opal_buffer_t local_collection; + orte_grpcomm_coll_t collective_type; + int32_t num_contributors; + int num_participating; + int num_collected; + struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ } orte_odls_job_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t); diff --git a/orte/mca/odls/process/odls_process_module.c b/orte/mca/odls/process/odls_process_module.c index aad29e8f2b..6d0649e6f5 100644 --- a/orte/mca/odls/process/odls_process_module.c +++ b/orte/mca/odls/process/odls_process_module.c @@ -95,8 +95,7 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_s static int odls_process_fork_local_proc(orte_app_context_t* context, orte_odls_child_t *child, char **environ_copy, - orte_job_controls_t controls, - orte_vpid_t stdin_target) + orte_odls_job_t *jobdat) { pid_t pid; orte_iof_base_io_conf_t opts; @@ -124,7 +123,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, opts.usepty = OPAL_ENABLE_PTY_SUPPORT; /* do we want to setup stdin? */ - if (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target) { + if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) { opts.connect_stdin = true; } else { opts.connect_stdin = false; diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c index cd85c164b4..e5b7cddd97 100644 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ b/orte/mca/plm/base/plm_base_rsh_support.c @@ -50,6 +50,7 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmaps/rmaps_types.h" #include "orte/runtime/orte_globals.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_wait.h" diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index a1cb544451..af26d1d877 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -31,6 +31,8 @@ #include "opal/class/opal_list.h" #include "opal/mca/mca.h" +#include "orte/runtime/orte_globals.h" + #include "orte/mca/rmaps/rmaps.h" BEGIN_C_DECLS @@ -56,14 +58,18 @@ typedef struct { opal_list_t available_components; /** selected module */ orte_rmaps_base_module_t *active_module; - /* user specified mapping policy */ - uint8_t policy; /** whether or not we allow oversubscription of nodes */ bool oversubscribe; - /** do we want one ppn if num_procs not specified */ - bool pernode; /** number of ppn for n_per_node mode */ int npernode; + /* number of procs/board */ + int nperboard; + /* number of procs/socket */ + int npersocket; + /* cpus per rank */ + int cpus_per_rank; + /* stride */ + int stride; /* do not allow use of the localhost */ bool no_use_local; /* display the map after it is computed */ diff --git a/orte/mca/rmaps/base/rmaps_base_common_mappers.c b/orte/mca/rmaps/base/rmaps_base_common_mappers.c index fb9956f21f..d7df9f125e 100644 --- a/orte/mca/rmaps/base/rmaps_base_common_mappers.c +++ b/orte/mca/rmaps/base/rmaps_base_common_mappers.c @@ -123,15 +123,14 @@ 
opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, ort
  */
 int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                                opal_list_t *node_list, orte_vpid_t num_procs,
-                               orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
-                               orte_vpid_t ppn)
+                               opal_list_item_t *cur_node_item)
 {
     int rc=ORTE_SUCCESS;
     int i;
     orte_node_t *node;
     opal_list_item_t *next;
     orte_vpid_t num_alloc = 0;
-    int num_slots_to_take;
+    int num_procs_to_assign, num_possible_procs;
 
     /* This loop continues until all procs have been mapped or we run
        out of resources. We determine that we have "run out of
@@ -185,21 +184,37 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
          * to do so after oversubscribing).
          */
         if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
-            num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
+            if (0 == node->slots_alloc) {
+                num_procs_to_assign = 1;
+            } else {
+                num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
+                if (0 == num_possible_procs) {
+                    num_procs_to_assign = 1;
+                } else {
+                    num_procs_to_assign = num_possible_procs;
+                }
+            }
         } else {
-            num_slots_to_take = node->slots_alloc - node->slots_inuse;
+            num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
+            if (0 == num_possible_procs) {
+                num_procs_to_assign = 1;
+            } else {
+                num_procs_to_assign = num_possible_procs;
+            }
         }
 
         /* check if we are in npernode mode - if so, then set the num_procs_to_assign
         * to the npernode value
         */
-        if (jdata->map->pernode) {
-            num_slots_to_take = jdata->map->npernode;
+        if (0 < jdata->map->npernode) {
+            num_procs_to_assign = jdata->map->npernode;
         }
 
-        for( i = 0; i < num_slots_to_take; ++i) {
-            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
-                                                                 node_list, jdata->map->oversubscribe, true))) {
+        for( i = 0; i < num_procs_to_assign; ++i) {
+            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
+                                                                 jdata->map->cpus_per_rank, app->idx,
+                                                                 node_list, jdata->map->oversubscribe,
+                                                                 true, NULL))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error - we just need to break from the loop
                 * since the node is fully used up.
For now, just don't report @@ -220,8 +235,7 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app, } /* if we have fully used up this node, then break from the loop */ - if (ORTE_ERR_NODE_FULLY_USED == rc || - (orte_rmaps_base.loadbalance && node->num_procs >= ppn)) { + if (ORTE_ERR_NODE_FULLY_USED == rc) { break; } } @@ -231,17 +245,13 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app, * node is NOT max'd out * */ - if (i < (num_slots_to_take-1) && ORTE_ERR_NODE_FULLY_USED != rc && - (orte_rmaps_base.loadbalance && node->num_procs < ppn)) { + if (i < (num_procs_to_assign-1) && ORTE_ERR_NODE_FULLY_USED != rc) { continue; } cur_node_item = next; } -complete: - /* update the starting vpid */ - vpid_start += num_procs; - +complete: /* save the bookmark */ jdata->bookmark = (orte_node_t*)cur_node_item; @@ -250,7 +260,7 @@ complete: int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_vpid_t num_procs, - orte_vpid_t vpid_start, opal_list_item_t *cur_node_item) + opal_list_item_t *cur_node_item) { int rc = ORTE_SUCCESS; opal_list_item_t *next; @@ -297,8 +307,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app, /* Allocate a slot on this node */ node = (orte_node_t*) cur_node_item; - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx, - node_list, jdata->map->oversubscribe, true))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx, + node_list, jdata->map->oversubscribe, true, NULL))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error - we just need to break from the loop * since the node is fully used up. For now, just don't report diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index a83aafca6f..f813752183 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -67,9 +67,12 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) return ORTE_ERR_OUT_OF_RESOURCE; } /* load it with the system defaults */ - map->policy = orte_rmaps_base.policy; - map->pernode = orte_rmaps_base.pernode; + map->policy = orte_default_mapping_policy; map->npernode = orte_rmaps_base.npernode; + map->nperboard = orte_rmaps_base.nperboard; + map->npersocket = orte_rmaps_base.npersocket; + map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; + map->stride = orte_rmaps_base.stride; map->oversubscribe = orte_rmaps_base.oversubscribe; map->display_map = orte_rmaps_base.display_map; /* assign the map object to this job */ diff --git a/orte/mca/rmaps/base/rmaps_base_open.c b/orte/mca/rmaps/base/rmaps_base_open.c index b2bd9b44af..a02eaa4e01 100644 --- a/orte/mca/rmaps/base/rmaps_base_open.c +++ b/orte/mca/rmaps/base/rmaps_base_open.c @@ -30,7 +30,9 @@ #include "opal/util/output.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" +#include "opal/mca/paffinity/paffinity.h" +#include "orte/runtime/orte_globals.h" #include "orte/mca/rmaps/base/rmaps_private.h" @@ -92,39 +94,74 @@ int orte_rmaps_base_open(void) /* Are we scheduling by node or by slot? */ param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy", - "Scheduling Policy for RMAPS. [slot | node]", + "Scheduling Policy for RMAPS. 
[slot (default) | socket | board | node]", false, false, "unspec", &policy); - if (0 == strcmp(policy, "unspec")) { - orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */ + if (0 == strcmp(policy, "socket")) { + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET); + } else if (0 == strcmp(policy, "board")) { + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD); } else if (0 == strcmp(policy, "node")) { - orte_rmaps_base.policy = ORTE_RMAPS_BYNODE; - } else { - orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */ + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE); } + /* if nothing was specified, leave it alone - we already set it + * in orterun + */ - /* Do we want one ppn if num_procs not specified */ + /* check for procs/xxx directives */ param = mca_base_param_reg_int_name("rmaps", "base_pernode", "Launch one ppn as directed", false, false, (int)false, &value); - orte_rmaps_base.pernode = OPAL_INT_TO_BOOL(value); - - /* if pernode is set, we do not allow npernode to also be set - instead - * we default the npernode value to 1 - */ - if (orte_rmaps_base.pernode) { + if (value) { orte_rmaps_base.npernode = 1; - } else { - /* Do we want n ppn */ - param = mca_base_param_reg_int_name("rmaps", "base_n_pernode", - "Launch n procs/node", - false, false, 0, &value); - orte_rmaps_base.npernode = value; - if (0 < orte_rmaps_base.npernode) { - orte_rmaps_base.pernode = true; - } } + /* #procs/node */ + param = mca_base_param_reg_int_name("rmaps", "base_n_pernode", + "Launch n procs/node", + false, false, -1, &value); + if (0 < value) { + orte_rmaps_base.npernode = value; + } + + /* #procs/board */ + param = mca_base_param_reg_int_name("rmaps", "base_n_perboard", + "Launch n procs/board", + false, false, -1, &orte_rmaps_base.nperboard); + if (0 < orte_rmaps_base.nperboard) { + ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX); + } + + /* #procs/socket */ + param = mca_base_param_reg_int_name("rmaps", "base_n_persocket", + "Launch n procs/socket", + false, false, -1, &orte_rmaps_base.npersocket); + if (0 < orte_rmaps_base.npersocket) { + ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX); + } + + /* Do we want to loadbalance the job */ + param = mca_base_param_reg_int_name("rmaps", "base_loadbalance", + "Balance total number of procs across all allocated nodes", + false, false, (int)false, &value); + orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value); + + /* #cpus/rank to use */ + param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_rank", + "Number of cpus to use for each rank [1-2**15 (default=1)]", + false, false, 1, &value); + orte_rmaps_base.cpus_per_rank = value; + /* if the cpus/rank > 1, then we have to bind to cores */ + if (1 < orte_rmaps_base.cpus_per_rank) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE); + } + + /* stride to use */ + param = mca_base_param_reg_int_name("rmaps", "base_stride", + "When binding multiple cores to a rank, the step size to use between cores [1-2**15 (default: 1)]", + false, false, 1, &value); + orte_rmaps_base.stride = value; + /* did the user provide a slot list? */ param = mca_base_param_reg_string_name("rmaps", "base_slot_list", "List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]", @@ -136,7 +173,7 @@ int orte_rmaps_base_open(void) "If false, allow scheduling MPI applications on the same node as mpirun (default). 
If true, do not schedule any MPI applications on the same node as mpirun", false, false, (int)false, &value); if (value) { - orte_rmaps_base.policy |= ORTE_RMAPS_NO_USE_LOCAL; + orte_default_mapping_policy |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? */ @@ -150,16 +187,6 @@ int orte_rmaps_base_open(void) orte_rmaps_base.oversubscribe = true; } - /* Do we want to loadbalance the job */ - param = mca_base_param_reg_int_name("rmaps", "base_loadbalance", - "Balance total number of procs across all allocated nodes", - false, false, (int)false, &value); - orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value); - /* if we are doing npernode or pernode, then we cannot loadbalance */ - if (orte_rmaps_base.pernode) { - orte_rmaps_base.loadbalance = false; - } - /* should we display the map after determining it? */ mca_base_param_reg_int_name("rmaps", "base_display_map", "Whether to display the process map after it is computed", diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 25b1cbe56f..07c2829b24 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -41,7 +41,7 @@ * Query the registry for all nodes allocated to a specified app_context */ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots, - orte_app_context_t *app, uint8_t policy) + orte_app_context_t *app, orte_mapping_policy_t policy) { opal_list_item_t *item, *next; orte_node_t *node; @@ -169,7 +169,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* If the "no local" option was set, then remove the local node * from the list */ - if (policy & ORTE_RMAPS_NO_USE_LOCAL) { + if (policy & ORTE_MAPPING_NO_USE_LOCAL) { /* we don't need to check through the entire list as * the head node - if it is on the list at all - will * always be in the first position @@ -267,9 +267,9 @@ PROCESS: * in the mapper */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, - "%s rmaps:base: mapping proc %s to node %s", + "%s rmaps:base: mapping proc for job %s to node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), + ORTE_JOBID_PRINT(proc->name.jobid), (NULL == node->name) ? "NULL" : node->name)); if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) { @@ -289,88 +289,56 @@ PROCESS: */ int orte_rmaps_base_claim_slot(orte_job_t *jdata, orte_node_t *current_node, - orte_vpid_t vpid, - char *slot_list, + int32_t cpus_per_rank, orte_std_cntr_t app_idx, opal_list_t *nodes, bool oversubscribe, - bool remove_from_list) + bool remove_from_list, + orte_proc_t **returnproc) { - orte_proc_t *proc, *proc_from_job; + orte_proc_t *proc; bool oversub; int rc; - int n; - - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, - "%s rmaps:base:claim_slot: checking for existence of vpid %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_VPID_PRINT(vpid))); - /* does this proc already exist within the job? */ - proc = NULL; - for (n=0; n < jdata->procs->size; n++) { - if (NULL == (proc_from_job = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { - continue; - } - if (proc_from_job->name.vpid == vpid) { - /* already have it! 
 */
-                proc = proc_from_job;
-
-                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                                     "%s rmaps:base:claim_slot: found existing proc %s",
-                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                     ORTE_NAME_PRINT(&proc->name)));
-
-                if (NULL != proc->slot_list) {
-                    /* cleanout stale info */
-                    free(proc->slot_list);
-                }
-                break;
-            }
-        }
-        if (NULL == proc) {
-            /* need to create mapped_proc object */
+    /* if we were given a proc, just use it */
+    if (NULL != returnproc && NULL != *returnproc) {
+        proc = *returnproc;
+    } else {
+        /* create mapped_proc object */
         proc = OBJ_NEW(orte_proc_t);
         if (NULL == proc) {
             ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
             return ORTE_ERR_OUT_OF_RESOURCE;
         }
-        /* create the process name */
+        /* set the jobid */
         proc->name.jobid = jdata->jobid;
-        proc->name.vpid = vpid;
+        /* we do not set the vpid here - this will be done
+         * during a second phase
+         */
         proc->app_idx = app_idx;
         OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                              "%s rmaps:base:claim_slot: created new proc %s",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(&proc->name)));
-        /* add this proc to the job's data - we don't have to worry here
-         * about keeping the array left-justified as all vpids
-         * from 0 to num_procs will be filled
-         */
-        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
-                                                              (int)vpid,
-                                                              (void*)proc))) {
-            ORTE_ERROR_LOG(rc);
-            OBJ_RELEASE(proc);
-            return rc;
+
+        /* provide returned proc, if requested */
+        if (NULL != returnproc) {
+            *returnproc = proc;
         }
     }
-
+
     OBJ_RETAIN(current_node);  /* maintain accounting on object */
-    if ( NULL != slot_list) {
-        proc->slot_list = strdup(slot_list);
-    }
     proc->node = current_node;
     proc->nodename = current_node->name;
 
     OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                         "%s rmaps:base:claim_slot mapping rank %d in job %s to node %s",
+                         "%s rmaps:base:claim_slot mapping proc in job %s to node %s",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         vpid, ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
+                         ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
 
-    /* Be sure to demarcate this slot as claimed for the node */
-    current_node->slots_inuse++;
+    /* Be sure to demarcate the slots for this proc as claimed from the node */
+    current_node->slots_inuse += cpus_per_rank;
 
     /* see if this node is oversubscribed now */
     if (current_node->slots_inuse > current_node->slots) {
@@ -415,8 +383,68 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
     return ORTE_SUCCESS;
 }
 
+int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
+{
+    orte_job_map_t *map;
+    orte_vpid_t vpid;
+    int i, j;
+    orte_node_t *node;
+    orte_proc_t *proc;
+    int rc;
+
+    map = jdata->map;
+
+    if (ORTE_MAPPING_BYSLOT & map->policy ||
+        ORTE_MAPPING_BYSOCKET & map->policy ||
+        ORTE_MAPPING_BYBOARD & map->policy) {
+        /* assign the ranks sequentially */
+        vpid = 0;
+        for (i=0; i < map->nodes->size; i++) {
+            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+                continue;
+            }
+            for (j=0; j < node->procs->size; j++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                    continue;
+                }
+                proc->name.vpid = vpid++;
+                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
+                                                                      proc->name.vpid, proc))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+        return ORTE_SUCCESS;
+    }
+
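/* To illustrate the two ranking schemes, with illustrative numbers: given
 * two mapped nodes A and B holding two procs each,
 *
 *   sequential (byslot/bysocket/byboard, above): A gets vpids 0,1 and
 *                                                B gets vpids 2,3
 *   bynode (below): vpid starts at the node index and advances by
 *                   num_nodes, so A gets vpids 0,2 and B gets vpids 1,3
 *
 * in both cases each proc is also stored in jdata->procs at index vpid.
 */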
+    if (ORTE_MAPPING_BYNODE & map->policy) {
+        /* assign the ranks round-robin across nodes */
+        for (i=0; i < map->nodes->size; i++) {
+            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+                continue;
+            }
+            vpid = i;
+            for (j=0; j < node->procs->size; j++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                    continue;
+                }
+                proc->name.vpid = vpid;
+                vpid += map->num_nodes;
+                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
+                                                                      proc->name.vpid, proc))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+        return ORTE_SUCCESS;
+    }
 
-int orte_rmaps_base_compute_usage(orte_job_t *jdata)
+    return ORTE_ERR_NOT_IMPLEMENTED;
+}
+
+int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
 {
     orte_std_cntr_t i;
     int j, k;
@@ -501,8 +529,8 @@
  * we don't, then it would be possible for procs to conflict
  * when opening static ports, should that be enabled.
  */
-void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
-                                  orte_node_t *newnode, orte_proc_t *newproc)
+void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
+                                        orte_node_t *newnode, orte_proc_t *newproc)
 {
     int k;
     orte_node_rank_t node_rank;
diff --git a/orte/mca/rmaps/base/rmaps_private.h b/orte/mca/rmaps/base/rmaps_private.h
index 53c92df25c..96c175a189 100644
--- a/orte/mca/rmaps/base/rmaps_private.h
+++ b/orte/mca/rmaps/base/rmaps_private.h
@@ -61,7 +61,7 @@ int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
 ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
                                                    orte_std_cntr_t *total_num_slots,
                                                    orte_app_context_t *app,
-                                                   uint8_t policy);
+                                                   orte_mapping_policy_t policy);
 
 ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
 ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
@@ -72,17 +72,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
 ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                                              orte_node_t *current_node,
-                                             orte_vpid_t vpid,
-                                             char *slot_list,
+                                             int32_t cpus_per_rank,
                                              orte_std_cntr_t app_idx,
                                              opal_list_t *nodes,
                                              bool oversubscribe,
-                                             bool remove_from_list);
+                                             bool remove_from_list,
+                                             orte_proc_t **returnproc);
 
-ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_t *jdata);
+ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
 
-ORTE_DECLSPEC void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
-                                                orte_node_t *newnode, orte_proc_t *newproc);
+ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
+
+ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
+                                                      orte_node_t *newnode, orte_proc_t *newproc);
 
 ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
@@ -93,12 +95,11 @@ ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *
 ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                                              opal_list_t *node_list, orte_vpid_t num_procs,
-                                             orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
-                                             orte_vpid_t ppn);
+                                             opal_list_item_t *cur_node_item);
 
 ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
                                              opal_list_t *node_list, orte_vpid_t num_procs,
-                                             orte_vpid_t vpid_start, opal_list_item_t *cur_node_item);
+                                             opal_list_item_t *cur_node_item);
 
 END_C_DECLS
diff --git a/orte/mca/rmaps/load_balance/Makefile.am b/orte/mca/rmaps/load_balance/Makefile.am
new file mode 100644
index 0000000000..42eb13827b
--- /dev/null
+++ b/orte/mca/rmaps/load_balance/Makefile.am
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#
University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-orte-rmaps-lb.txt + +sources = \ + rmaps_lb.c \ + rmaps_lb.h \ + rmaps_lb_component.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_rmaps_load_balance_DSO +component_noinst = +component_install = mca_rmaps_load_balance.la +else +component_noinst = libmca_rmaps_load_balance.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_rmaps_load_balance_la_SOURCES = $(sources) +mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_rmaps_load_balance_la_SOURCES =$(sources) +libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmaps/load_balance/configure.params b/orte/mca/rmaps/load_balance/configure.params new file mode 100644 index 0000000000..3513f8d956 --- /dev/null +++ b/orte/mca/rmaps/load_balance/configure.params @@ -0,0 +1,24 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt b/orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt new file mode 100644 index 0000000000..2b7941d88a --- /dev/null +++ b/orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt @@ -0,0 +1,53 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open RTE's orterun. +# +[orte-rmaps-rr:alloc-error] +There are not enough slots available in the system to satisfy the %d slots +that were requested by the application: + %s + +Either request fewer slots for your application, or make more slots available +for use. 
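The n-per-node and n-per-socket messages defined in this file are driven by a simple capacity check in the mappers: launching N processes at n per node requires ceil(N/n) nodes. A sketch of how such a check would raise the n-per-node message defined below (the helper and its argument values are illustrative; orte_show_help() and the topic key are the ones used in this file):

    /* Sketch: fail an n-per-node request that exceeds the allocation.
     * Hypothetical helper; the show_help arguments follow the %d
     * placeholders in the message text. */
    static int check_npernode_fit(int num_procs, int npernode,
                                  int num_nodes, int num_slots)
    {
        int nodes_needed = (num_procs + npernode - 1) / npernode;  /* ceil division */
        if (nodes_needed > num_nodes) {
            orte_show_help("help-orte-rmaps-lb.txt",
                           "orte-rmaps-rr:n-per-node-and-too-many-procs",
                           true, num_procs, npernode, num_nodes, num_slots);
            return ORTE_ERR_FATAL;
        }
        return ORTE_SUCCESS;
    }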
+[orte-rmaps-rr:multi-apps-and-zero-np] +RMAPS found multiple applications to be launched, with +at least one that failed to specify the number of processes to execute. +When specifying multiple applications, you must specify how many processes +of each to launch via the -np argument. + +[orte-rmaps-rr:per-node-and-too-many-procs] +There are not enough nodes in your allocation to satisfy your request to launch +%d processes on a per-node basis - only %d nodes were available. + +Either request fewer processes, or obtain a larger allocation. +[orte-rmaps-rr:n-per-node-and-too-many-procs] +There are not enough nodes in your allocation to satisfy your request to launch +%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available. + +Either request fewer processes, or obtain a larger allocation. +[orte-rmaps-rr:n-per-node-and-not-enough-slots] +There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available. + +Either request fewer processes/node, or obtain a larger allocation. + +[orte-rmaps-rr:no-np-and-user-map] +You have specified a rank-to-node/slot mapping, but failed to provide +the number of processes to be executed. For some reason, this information +could not be obtained from the mapping you provided, so we cannot continue +with executing the specified application. diff --git a/orte/mca/rmaps/load_balance/rmaps_lb.c b/orte/mca/rmaps/load_balance/rmaps_lb.c new file mode 100644 index 0000000000..1ee9df3c07 --- /dev/null +++ b/orte/mca/rmaps/load_balance/rmaps_lb.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif /* HAVE_STRING_H */
+
+#include "opal/mca/base/mca_base_param.h"
+
+#include "orte/util/show_help.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/rmaps/base/rmaps_private.h"
+#include "orte/mca/rmaps/base/base.h"
+#include "rmaps_lb.h"
+
+static int switchyard(orte_job_t *jdata);
+
+orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
+    switchyard
+};
+
+/* Local functions */
+static int npernode(orte_job_t *jdata);
+static int nperboard(orte_job_t *jdata);
+static int npersocket(orte_job_t *jdata);
+static int loadbalance(orte_job_t *jdata);
+
+static int switchyard(orte_job_t *jdata)
+{
+    int rc;
+
+    if (0 < orte_rmaps_base.npernode) {
+        rc = npernode(jdata);
+    } else if (0 < orte_rmaps_base.nperboard) {
+        rc = nperboard(jdata);
+    } else if (0 < orte_rmaps_base.npersocket) {
+        rc = npersocket(jdata);
+    } else {
+        rc = loadbalance(jdata);
+    }
+
+    if (ORTE_SUCCESS != rc) {
+        return rc;
+    }
+
+    /* compute vpids and add proc objects to the job */
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* compute and save local ranks */
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* define the daemons that we will use for this job */
+    if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+
+/* place specified #procs on each node, up to the specified total
+ * number of procs (if one was given).
+ */ +static int npernode(orte_job_t *jdata) +{ + orte_app_context_t *app; + int i, j, rc=ORTE_SUCCESS; + opal_list_t node_list; + opal_list_item_t *item; + orte_std_cntr_t num_slots; + orte_node_t *node; + int total_procs, np; + + /* setup the node list */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + + /* loop through the app_contexts */ + for(i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + /* use the number of procs if one was given */ + if (0 < app->num_procs) { + np = app->num_procs; + } else { + np = INT_MAX; + } + total_procs = 0; + /* for each app_context, we have to get the list of nodes that it can + * use since that can now be modified with a hostfile and/or -host + * option + */ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, + jdata->map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + /* loop through the list of nodes */ + while (NULL != (item = opal_list_remove_first(&node_list))) { + node = (orte_node_t*)item; + /* put the specified number of procs on each node */ + for (j=0; j < orte_rmaps_base.npernode && total_procs < np; j++) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, + jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, + false, NULL))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have + * more procs to place, then that is an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc || + j < orte_rmaps_base.npernode-1) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(node); + goto error; + } + } + total_procs++; + } + OBJ_RELEASE(node); + } + } + jdata->num_procs = total_procs; + +error: + while (NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&node_list); + return rc; +} + +static int nperboard(orte_job_t *jdata) +{ + orte_app_context_t *app; + int i, j, k, rc=ORTE_SUCCESS; + opal_list_t node_list; + opal_list_item_t *item; + orte_std_cntr_t num_slots; + orte_node_t *node; + int total_procs, np; + + /* setup the node list */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + + /* loop through the app_contexts */ + for(i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + /* use the number of procs if one was given */ + if (0 < app->num_procs) { + np = app->num_procs; + } else { + np = INT_MAX; + } + total_procs = 0; + /* for each app_context, we have to get the list of nodes that it can + * use since that can now be modified with a hostfile and/or -host + * option + */ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, + jdata->map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + /* loop through the list of nodes */ + while (NULL != (item = opal_list_remove_first(&node_list))) { + node = (orte_node_t*)item; + /* loop through the number of boards in this node */ + for (k=0; k < node->boards && total_procs < np; k++) { + /* put the specified number of procs on each board */ + for (j=0; j < orte_rmaps_base.nperboard && total_procs < np; j++) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, + jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, + false, NULL))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have + * more procs to place, then that is an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc || + j < orte_rmaps_base.nperboard-1) { + 
ORTE_ERROR_LOG(rc); + OBJ_RELEASE(node); + goto error; + } + } + total_procs++; + } + } + OBJ_RELEASE(node); + } + } + jdata->num_procs = total_procs; + +error: + while (NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&node_list); + return rc; +} + + +static int npersocket(orte_job_t *jdata) +{ + orte_app_context_t *app; + int i, j, k, n, rc=ORTE_SUCCESS; + opal_list_t node_list; + opal_list_item_t *item; + orte_std_cntr_t num_slots; + orte_node_t *node; + int total_procs, np; + + /* setup the node list */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + + /* loop through the app_contexts */ + for(i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + /* use the number of procs if one was given */ + if (0 < app->num_procs) { + np = app->num_procs; + } else { + np = INT_MAX; + } + total_procs = 0; + /* for each app_context, we have to get the list of nodes that it can + * use since that can now be modified with a hostfile and/or -host + * option + */ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, + jdata->map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + /* loop through the list of nodes */ + while (NULL != (item = opal_list_remove_first(&node_list))) { + node = (orte_node_t*)item; + /* loop through the number of boards in this node */ + for (k=0; k < node->boards && total_procs < np; k++) { + /* loop through the number of sockets/board */ + for (n=0; n < node->sockets_per_board && total_procs < np; n++) { + /* put the specified number of procs on each socket */ + for (j=0; j < orte_rmaps_base.npersocket && total_procs < np; j++) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, + jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, + false, NULL))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have + * more procs to place, then that is an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc || + j < orte_rmaps_base.npersocket-1) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(node); + goto error; + } + } + /* track the number of procs */ + total_procs++; + } + } + } + OBJ_RELEASE(node); + } + } + jdata->num_procs = total_procs; + +error: + while (NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&node_list); + return rc; +} + + +/* + * Create a load balanced mapping for the job by assigning a constant #procs/node, with + * leftovers being spread one/node starting from the first node. 
+ */ +static int loadbalance(orte_job_t *jdata) +{ + orte_app_context_t *app; + int i, j; + opal_list_t node_list; + orte_std_cntr_t num_nodes, num_slots; + int rc=ORTE_SUCCESS, total_procs; + int ppn = 0; + opal_list_item_t *item, *start; + orte_node_t *node; + + /* setup */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + + /* compute total #procs we are going to add and the total number of nodes available */ + for(i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + /* get the nodes and #slots available for this app_context */ + if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, + jdata->map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + if (0 == app->num_procs) { + /* set the num_procs to the #slots */ + app->num_procs = num_slots; + } + num_nodes = opal_list_get_size(&node_list); + /* compute the base ppn */ + ppn = app->num_procs / num_nodes; + /* if a bookmark exists from some prior mapping, set us to start there */ + start = orte_rmaps_base_get_starting_point(&node_list, jdata); + /* loop through the list of nodes until we either assign all the procs + * or return to the starting point + */ + total_procs = 0; + item = start; + do { + node = (orte_node_t*)item; + /* put the specified number of procs on each node */ + for (j=0; j < ppn; j++) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, + jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, + false, NULL))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have + * more procs to place, then that is an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc || + j < ppn-1) { + ORTE_ERROR_LOG(rc); + goto error; + } + } + total_procs++; + } + /* move to next node */ + if (opal_list_get_end(&node_list) == opal_list_get_next(item)) { + item = opal_list_get_first(&node_list); + } + else { + item = opal_list_get_next(item); + } + } while (item != start); + + /* save the bookmark */ + jdata->bookmark = node; + + /* if we haven't assigned all the procs, then loop through the list + * again, assigning 1 per node until all are assigned + */ + item = start; + while (total_procs < app->num_procs) { + node = (orte_node_t*)item; + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, + jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, + false, NULL))) { + /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */ + if (ORTE_ERR_NODE_FULLY_USED != rc) { + ORTE_ERROR_LOG(rc); + goto error; + } + } + total_procs++; + /* move to next node */ + if (opal_list_get_end(&node_list) == opal_list_get_next(item)) { + item = opal_list_get_first(&node_list); + } + else { + item = opal_list_get_next(item); + } + } + /* save the bookmark */ + jdata->bookmark = node; + + /* cleanup */ + while (NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + } + /* record the number of procs */ + jdata->num_procs = total_procs; + +error: + while(NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&node_list); + + return rc; +} + diff --git a/orte/mca/rmaps/load_balance/rmaps_lb.h b/orte/mca/rmaps/load_balance/rmaps_lb.h new file mode 100644 index 0000000000..1635ac87b1 --- /dev/null +++ b/orte/mca/rmaps/load_balance/rmaps_lb.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. 
All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Resource Mapping + */ +#ifndef ORTE_RMAPS_LB_H +#define ORTE_RMAPS_LB_H + +#include "orte_config.h" +#include "orte/mca/rmaps/rmaps.h" + +BEGIN_C_DECLS + +ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_load_balance_component; +extern orte_rmaps_base_module_t orte_rmaps_load_balance_module; + + +END_C_DECLS + +#endif diff --git a/orte/mca/rmaps/load_balance/rmaps_lb_component.c b/orte/mca/rmaps/load_balance/rmaps_lb_component.c new file mode 100644 index 0000000000..abdc3007df --- /dev/null +++ b/orte/mca/rmaps/load_balance/rmaps_lb_component.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/mca/rmaps/base/base.h" +#include "rmaps_lb.h" + +/* + * Local functions + */ + +static int orte_rmaps_lb_open(void); +static int orte_rmaps_lb_close(void); +static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority); + + +orte_rmaps_base_component_t mca_rmaps_load_balance_component = { + { + ORTE_RMAPS_BASE_VERSION_2_0_0, + + "load_balance", /* MCA component name */ + ORTE_MAJOR_VERSION, /* MCA component major version */ + ORTE_MINOR_VERSION, /* MCA component minor version */ + ORTE_RELEASE_VERSION, /* MCA component release version */ + orte_rmaps_lb_open, /* component open */ + orte_rmaps_lb_close, /* component close */ + orte_rmaps_lb_query /* component query */ + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } +}; + + +/** + * component open/close/init function + */ +static int orte_rmaps_lb_open(void) +{ + return ORTE_SUCCESS; +} + + +static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority) +{ + /* the RMAPS framework is -only- opened on HNP's, + * so no need to check for that here + */ + + /* if load balancing, or any nperxxx, was requested, then we must be selected */ + if (orte_rmaps_base.loadbalance || + 0 < orte_rmaps_base.npernode || + 0 < orte_rmaps_base.nperboard || + 0 < orte_rmaps_base.npersocket) { + *priority = 1000; /* must be selected */ + *module = (mca_base_module_t *)&orte_rmaps_load_balance_module; + return ORTE_SUCCESS; + } + + /* otherwise, ignore us */ + *priority = 0; + *module = NULL; + return ORTE_ERROR; +} + +/** + * Close all subsystems. 
+ */ + +static int orte_rmaps_lb_close(void) +{ + return ORTE_SUCCESS; +} + + diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 3d4434dc89..e068e0d599 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -72,6 +72,7 @@ static int map_app_by_node(orte_app_context_t* app, opal_list_item_t *next; orte_node_t *node; orte_std_cntr_t num_alloc = 0; + orte_proc_t *proc; /* This loop continues until all procs have been mapped or we run out of resources. We determine that we have "run out of @@ -118,8 +119,8 @@ static int map_app_by_node(orte_app_context_t* app, /* Allocate a slot on this node */ node = (orte_node_t*) cur_node_item; /* pass the base slot list in case it was provided */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx, - nodes, jdata->map->oversubscribe, true))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, + nodes, jdata->map->oversubscribe, true, &proc))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error - we just need to break from the loop * since the node is fully used up. For now, just don't report @@ -130,6 +131,9 @@ static int map_app_by_node(orte_app_context_t* app, return rc; } } + if (NULL != orte_rmaps_base.slot_list) { + proc->slot_list = strdup(orte_rmaps_base.slot_list); + } ++num_alloc; cur_node_item = next; } @@ -150,6 +154,7 @@ static int map_app_by_slot(orte_app_context_t* app, orte_std_cntr_t i, num_slots_to_take, num_alloc = 0; orte_node_t *node; opal_list_item_t *next; + orte_proc_t *proc; /* This loop continues until all procs have been mapped or we run out of resources. We determine that we have "run out of @@ -211,7 +216,7 @@ static int map_app_by_slot(orte_app_context_t* app, /* check if we are in npernode mode - if so, then set the num_slots_to_take * to the num_per_node */ - if (jdata->map->pernode) { + if (0 < jdata->map->npernode) { num_slots_to_take = jdata->map->npernode; } @@ -223,8 +228,8 @@ static int map_app_by_slot(orte_app_context_t* app, continue; } /* pass the base slot list in case it was provided */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx, - nodes, jdata->map->oversubscribe, true))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, + nodes, jdata->map->oversubscribe, true, &proc))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error - we just need to break from the loop * since the node is fully used up. 
For now, just don't report @@ -235,6 +240,9 @@ static int map_app_by_slot(orte_app_context_t* app, return rc; } } + if (NULL != orte_rmaps_base.slot_list) { + proc->slot_list = strdup(orte_rmaps_base.slot_list); + } /* Update the rank */ ++num_alloc; /* track #slots taken */ @@ -279,6 +287,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) orte_rmaps_rank_file_map_t *rfmap; orte_std_cntr_t slots_per_node, relative_index, tmp_cnt; int rc; + orte_proc_t *proc; /* convenience def */ map = jdata->map; @@ -303,7 +312,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) } /* likewise, we only support pernode options for a single app_context */ - if (map->pernode && 1 < jdata->num_apps) { + if (0 < map->npernode && 1 < jdata->num_apps) { orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; @@ -349,7 +358,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list); /* we already checked for sanity, so these are okay to just do here */ - if (map->pernode && map->npernode == 1) { + if (map->npernode == 1) { /* there are three use-cases that we need to deal with: * (a) if -np was not provided, then we just use the number of nodes * (b) if -np was provided AND #procs > #nodes, then error out @@ -365,7 +374,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) rc = ORTE_ERR_SILENT; goto error; } - } else if (map->pernode && map->npernode > 1) { + } else if (map->npernode > 1) { /* first, let's check to see if there are enough slots/node to * meet the request - error out if not */ @@ -447,8 +456,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name); return ORTE_ERR_SILENT; } - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rank, rfmap->slot_list, - app->idx, &node_list, jdata->map->oversubscribe, true))) { + proc = NULL; + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, + &node_list, jdata->map->oversubscribe, true, &proc))) { if (ORTE_ERR_NODE_FULLY_USED != rc) { /* if this is a true error and not the node just being * full, then report the error and abort @@ -457,6 +467,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) return rc; } } + proc->slot_list = strdup(rfmap->slot_list); jdata->num_procs++; } /* update the starting point */ @@ -517,7 +528,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) /* if no bookmark, then just start at the beginning of the list */ cur_node_item = opal_list_get_first(&node_list); } - if (map->policy & ORTE_RMAPS_BYNODE) { + if (map->policy & ORTE_MAPPING_BYNODE) { rc = map_app_by_node(app, jdata, vpid_start, &node_list); } else { rc = map_app_by_slot(app, jdata, vpid_start, &node_list); @@ -542,8 +553,14 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) /* update the job's number of procs */ jdata->num_procs = total_procs; + /* compute vpids and add proc objects to the job */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index 017e04c5c2..a671803dc9 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ 
b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -37,7 +37,6 @@ * Local variable */ static opal_list_item_t *cur_node_item = NULL; -static orte_vpid_t vpid_start = 0; static char *orte_getline(FILE *fp); @@ -51,24 +50,22 @@ static int rr_map_default(orte_job_t *jdata, orte_app_context_t *app, cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata); /* now perform the mapping */ - if (ORTE_RMAPS_BYNODE & jdata->map->policy) { + if (ORTE_MAPPING_BYNODE & jdata->map->policy) { if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list, - num_procs, vpid_start, - cur_node_item))) { + num_procs, cur_node_item))) { ORTE_ERROR_LOG(rc); return rc; } } else { if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list, - num_procs, vpid_start, - cur_node_item, 0))) { + num_procs, cur_node_item))) { ORTE_ERROR_LOG(rc); return rc; } } - /* update the starting vpid */ - vpid_start += num_procs; + /* update number of procs */ + jdata->num_procs += num_procs; return ORTE_SUCCESS; } @@ -123,7 +120,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) float avgload, minload; orte_node_t *node, *nd=NULL, *oldnode; orte_rmaps_res_ftgrp_t *ftgrp, *target; - orte_vpid_t totprocs, lowprocs; + orte_vpid_t totprocs, lowprocs, num_assigned; FILE *fp; char *ftinput; int grp; @@ -275,8 +272,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nd->name)); /* put proc on the found node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx, - NULL, jdata->map->oversubscribe, false))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx, + NULL, jdata->map->oversubscribe, false, &proc))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error */ @@ -290,7 +287,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) /* update the node and local ranks so static ports can * be properly selected if active */ - orte_rmaps_base_update_usage(jdata, oldnode, nd, proc); + orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); continue; } /* if we did find a target, re-map the proc to the lightest loaded @@ -313,8 +310,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name)); OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */ /* put proc on the found node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx, - NULL, jdata->map->oversubscribe, false))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx, + NULL, jdata->map->oversubscribe, false, &proc))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error */ @@ -328,7 +325,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) /* update the node and local ranks so static ports can * be properly selected if active */ - orte_rmaps_base_update_usage(jdata, oldnode, nd, proc); + orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); } /* define the daemons that we will use for this job */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { @@ -354,7 +351,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) ORTE_JOBID_PRINT(jdata->jobid))); /* start at the beginning... 
*/ - vpid_start = 0; jdata->num_procs = 0; map = jdata->map; @@ -363,6 +359,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } + num_assigned = 0; /* for each app_context, we have to get the list of nodes that it can * use since that can now be modified with a hostfile and/or -host * option @@ -434,7 +431,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, "%s rmaps:resilient: no available fault group - mapping rr", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-vpid_start))) { + if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-num_assigned))) { goto error; } goto cleanup; @@ -455,8 +452,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), target->ftgrp, nd->name)); /* put proc on that node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, vpid_start, NULL, app->idx, - &node_list, jdata->map->oversubscribe, false))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, false, NULL))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error */ @@ -466,7 +463,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) } } /* track number of procs mapped */ - vpid_start++; + num_assigned++; /* flag this fault group as used */ target->used = true; @@ -484,6 +481,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) } cleanup: + /* track number of procs */ + jdata->num_procs += app->num_procs; /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ @@ -493,11 +492,14 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } - /* update the number of procs in the job */ - jdata->num_procs = vpid_start; + /* compute vpids and add proc objects to the job */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } - /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) { + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h index a1cc4db76b..7dd63e99b9 100644 --- a/orte/mca/rmaps/rmaps_types.h +++ b/orte/mca/rmaps/rmaps_types.h @@ -25,32 +25,27 @@ #include "opal/class/opal_pointer_array.h" +#include "orte/runtime/orte_globals.h" + /* * General MAP types - instanced in runtime/orte_globals_class_instances.h */ BEGIN_C_DECLS -/* - * Define flags indicating the policy used to perform the map - */ -#define ORTE_RMAPS_NOPOL 0x00 -#define ORTE_RMAPS_BYNODE 0x01 -#define ORTE_RMAPS_BYSLOT 0x02 -#define ORTE_RMAPS_BYUSER 0x04 -#define ORTE_RMAPS_NO_USE_LOCAL 0x08 - - /* * Structure that represents the mapping of a job to an * allocated set of resources. 
*/ struct orte_job_map_t { opal_object_t super; - /* save the mapping configuration */ - uint8_t policy; - bool pernode; - orte_std_cntr_t npernode; + /* user-specified mapping params */ + orte_mapping_policy_t policy; + int npernode; + int nperboard; + int npersocket; + int16_t cpus_per_rank; + int16_t stride; bool oversubscribe; bool display_map; bool cpu_lists; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 76ccb09264..69b0e35215 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -48,56 +48,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) int i; opal_list_t node_list; opal_list_item_t *item; - orte_vpid_t vpid_start; orte_std_cntr_t num_nodes, num_slots; int rc; - orte_std_cntr_t slots_per_node; - int ppn = 0; opal_list_item_t *cur_node_item; /* start at the beginning... */ - vpid_start = 0; jdata->num_procs = 0; - /* if loadbalancing is requested, then we need to compute - * the #procs/node - note that this cannot be done - * if we are doing pernode or if #procs was not given - */ - if (orte_rmaps_base.loadbalance && !jdata->map->pernode) { - float res; - /* compute total #procs we are going to add */ - for(i=0; i < jdata->apps->size; i++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - continue; - } - if (0 == app->num_procs) { - /* can't do it - tell user and quit */ - orte_show_help("help-orte-rmaps-rr.txt", - "orte-rmaps-rr:loadbalance-and-zero-np", - true); - rc = ORTE_ERR_SILENT; - goto error; - } - ppn += app->num_procs; - } - /* get the total avail nodes and the number - * of procs already using them - */ - num_nodes=0; - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == opal_pointer_array_get_item(orte_node_pool, i)) { - continue; - } - num_nodes++; - } - /* compute the balance */ - res = ((float)ppn / num_nodes); - ppn = ppn / num_nodes; - if (0 < (res-ppn)) { - ppn++; - } - } - /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { @@ -130,83 +87,22 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) /* if a bookmark exists from some prior mapping, set us to start there */ cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata); - if (jdata->map->pernode && jdata->map->npernode == 1) { - /* there are three use-cases that we need to deal with: - * (a) if -np was not provided, then we just use the number of nodes - * (b) if -np was provided AND #procs > #nodes, then error out - * (c) if -np was provided AND #procs <= #nodes, then launch - * the specified #procs one/node. 
In this case, we just - * leave app->num_procs alone - */ - if (0 == app->num_procs) { - app->num_procs = num_nodes; - } else if (app->num_procs > num_nodes) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs", - true, app->num_procs, num_nodes, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - } else if (jdata->map->pernode && jdata->map->npernode > 1) { - /* first, let's check to see if there are enough slots/node to - * meet the request - error out if not - */ - slots_per_node = num_slots / num_nodes; - if (jdata->map->npernode > slots_per_node) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots", - true, jdata->map->npernode, slots_per_node, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - /* there are three use-cases that we need to deal with: - * (a) if -np was not provided, then we just use the n/node * #nodes - * (b) if -np was provided AND #procs > (n/node * #nodes), then error out - * (c) if -np was provided AND #procs <= (n/node * #nodes), then launch - * the specified #procs n/node. In this case, we just - * leave app->num_procs alone - */ - if (0 == app->num_procs) { - /* set the num_procs to equal the specified num/node * the number of nodes */ - app->num_procs = jdata->map->npernode * num_nodes; - } else if (app->num_procs > (jdata->map->npernode * num_nodes)) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs", - true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - } else if (0 == app->num_procs) { - if (jdata->map->policy & ORTE_RMAPS_BYUSER) { - /* we can't handle this - it should have been set when we got - * the map info. If it wasn't, then we can only error out - */ - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map", - true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - /** set the num_procs to equal the number of slots on these mapped nodes */ + if (0 == app->num_procs) { + /* set the num_procs to equal the number of slots on these mapped nodes */ app->num_procs = num_slots; } - /** track the total number of processes we mapped */ + /* track the total number of processes we mapped */ jdata->num_procs += app->num_procs; /* Make assignments */ - if (jdata->map->policy & ORTE_RMAPS_BYUSER) { - rc = ORTE_ERR_NOT_IMPLEMENTED; - goto error; - } else if (jdata->map->policy & ORTE_RMAPS_BYNODE) { + if (jdata->map->policy & ORTE_MAPPING_BYNODE) { rc = orte_rmaps_base_map_bynode(jdata, app, &node_list, - app->num_procs, vpid_start, - cur_node_item); + app->num_procs, cur_node_item); } else { rc = orte_rmaps_base_map_byslot(jdata, app, &node_list, - app->num_procs, vpid_start, - cur_node_item, ppn); + app->num_procs, cur_node_item); } - - /* update the starting vpid for the next app_context */ - vpid_start += app->num_procs; - if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto error; @@ -221,8 +117,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } - /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) { + /* compute vpids and add proc objects to the job */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } diff --git 
a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index 83749daf94..c429b05df0 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -59,14 +59,15 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) orte_job_map_t *map; orte_app_context_t *app; orte_std_cntr_t i, j; - opal_list_item_t *item, *next, *cur_node_item; - orte_node_t *node, *nd; + opal_list_item_t *item; + orte_node_t *node, *nd, *save; orte_vpid_t vpid; orte_std_cntr_t num_nodes; int rc; opal_list_t *default_node_list=NULL; opal_list_t *node_list=NULL; - + orte_proc_t *proc; + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, "%s rmaps:seq mapping job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -87,6 +88,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) /* start at the beginning... */ vpid = 0; jdata->num_procs = 0; + if (NULL != default_node_list) { + save = (orte_node_t*)opal_list_get_first(default_node_list); + } /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->num_apps; i++) { @@ -103,12 +107,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) ORTE_ERROR_LOG(rc); goto error; } + nd = (orte_node_t*)opal_list_get_first(node_list); } else { node_list = default_node_list; + nd = save; } /* check for nolocal and remove the head node, if required */ - if (map->policy & ORTE_RMAPS_NO_USE_LOCAL) { + if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) { for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item) ) { @@ -132,43 +138,17 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) return ORTE_ERR_SILENT; } - /* if a bookmark exists from some prior mapping, set us to start there */ - cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata); - /* if num_procs wasn't specified, set it now */ if (0 == app->num_procs) { app->num_procs = num_nodes; } for (i=0; i < app->num_procs; i++) { - /* see if any nodes remain unused and available. We need to do this check - * each time since we may remove nodes from the list (as they become fully - * used) as we cycle through the loop - */ - if(0 >= opal_list_get_size(node_list) ) { - /* Everything is at max usage! :( */ - orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:alloc-error", - true, app->num_procs, app->app); - return ORTE_ERR_SILENT; - } - - /* Save the next node we can use before claiming slots, since - * we may need to prune the nodes list removing overused nodes. - * Wrap around to beginning if we are at the end of the list - */ - if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) { - next = opal_list_get_first(node_list); - } - else { - next = opal_list_get_next(cur_node_item); - } - /* find this node on the global array - this is necessary so * that our mapping gets saved on that array as the objects * returned by the hostfile function are -not- on the array */ node = NULL; - nd = (orte_node_t*)cur_node_item; for (j=0; j < orte_node_pool->size; j++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { continue; @@ -186,42 +166,46 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) goto error; } - /* assign next vpid to this node - do NOT allow claim_slot to remove + /* assign proc to this node - do NOT allow claim_slot to remove * an oversubscribed node from the list! 
*/ if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, - vpid, NULL, app->idx, + jdata->map->cpus_per_rank, app->idx, node_list, jdata->map->oversubscribe, - false))) { + false, &proc))) { if (ORTE_ERR_NODE_FULLY_USED != rc) { ORTE_ERROR_LOG(rc); goto error; } } - /* increment the vpid */ - vpid++; + /* assign the vpid */ + proc->name.vpid = vpid++; + /* add to the jdata proc array */ + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + goto error; + } /* move to next node */ - cur_node_item = next; + nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd); } /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; - /* update the bookmark */ - jdata->bookmark = (orte_node_t*)cur_node_item; - /* cleanup the node list if it came from this app_context */ if (node_list != default_node_list) { - while(NULL != (item = opal_list_remove_first(node_list))) { + while (NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); + } else { + save = nd; } } - /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) { + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/topo/rmaps_topo.c b/orte/mca/rmaps/topo/rmaps_topo.c index e3a114906a..b8f6d743f4 100644 --- a/orte/mca/rmaps/topo/rmaps_topo.c +++ b/orte/mca/rmaps/topo/rmaps_topo.c @@ -110,8 +110,8 @@ static int map_app_by_node( /* Allocate a slot on this node */ node = (orte_node_t*) cur_node_item; - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx, - nodes, jdata->map->oversubscribe, true))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, + nodes, jdata->map->oversubscribe, true, NULL))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error - we just need to break from the loop * since the node is fully used up. For now, just don't report @@ -212,13 +212,13 @@ static int map_app_by_slot( /* check if we are in npernode mode - if so, then set the num_slots_to_take * to the num_per_node */ - if (jdata->map->pernode) { + if (0 < jdata->map->npernode) { num_slots_to_take = jdata->map->npernode; } for( i = 0; i < num_slots_to_take; ++i) { - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx, - nodes, jdata->map->oversubscribe, true))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, + nodes, jdata->map->oversubscribe, true, NULL))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error - we just need to break from the loop * since the node is fully used up. 
For now, just don't report @@ -426,7 +426,7 @@ static int topo_map(orte_job_t *jdata) } proceed: - if (map->pernode && map->npernode == 1) { + if (map->npernode == 1) { /* there are three use-cases that we need to deal with: * (a) if -np was not provided, then we just use the number of nodes * (b) if -np was provided AND #procs > #nodes, then error out @@ -442,7 +442,7 @@ static int topo_map(orte_job_t *jdata) rc = ORTE_ERR_SILENT; goto error; } - } else if (map->pernode && map->npernode > 1) { + } else if (map->npernode > 1) { /* first, let's check to see if there are enough slots/node to * meet the request - error out if not */ @@ -473,11 +473,11 @@ static int topo_map(orte_job_t *jdata) /** set the num_procs to equal the number of slots on these mapped nodes - if user has specified "-bynode", then set it to the number of nodes */ - if (map->policy & ORTE_RMAPS_BYNODE) { + if (map->policy & ORTE_MAPPING_BYNODE) { app->num_procs = num_nodes; - } else if (map->policy & ORTE_RMAPS_BYSLOT) { + } else if (map->policy & ORTE_MAPPING_BYSLOT) { app->num_procs = num_slots; - } else if (map->policy & ORTE_RMAPS_BYUSER) { + } else { /* we can't handle this - it should have been set when we got * the map info. If it wasn't, then we can only error out */ @@ -492,10 +492,7 @@ static int topo_map(orte_job_t *jdata) jdata->num_procs += app->num_procs; /* Make assignments */ - if (map->policy == ORTE_RMAPS_BYUSER) { - rc = ORTE_ERR_NOT_IMPLEMENTED; - goto error; - } else if (map->policy == ORTE_RMAPS_BYNODE) { + if (map->policy == ORTE_MAPPING_BYNODE) { rc = map_app_by_node(app, jdata, vpid_start, &node_list); } else { rc = map_app_by_slot(app, jdata, vpid_start, &node_list); @@ -522,7 +519,7 @@ static int topo_map(orte_job_t *jdata) } /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/runtime/data_type_support/orte_dt_copy_fns.c b/orte/runtime/data_type_support/orte_dt_copy_fns.c index 7477bf549c..b4a80713f1 100644 --- a/orte/runtime/data_type_support/orte_dt_copy_fns.c +++ b/orte/runtime/data_type_support/orte_dt_copy_fns.c @@ -280,7 +280,6 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_ /* copy data into it */ (*dest)->policy = src->policy; - (*dest)->pernode = src->pernode; (*dest)->npernode = src->npernode; (*dest)->oversubscribe = src->oversubscribe; (*dest)->display_map = src->display_map; diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index 6791ebd229..096fc67cfb 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -407,6 +407,15 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src, return rc; } + /* do not pack the local board, socket, and core info */ + + /* pack the cpu set info */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, + (void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* do not pack the username */ } return ORTE_SUCCESS; @@ -814,13 +823,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src, for (i=0; i < num_vals; i++) { /* pack the policy used to generate it */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack the pernode flag */ - if (ORTE_SUCCESS 
!= (rc = opal_dss_pack_buffer(buffer, &(maps[i]->pernode), 1, OPAL_BOOL))) { + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); return rc; } } diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index c08e05a661..ed29230638 100644 --- a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -362,6 +362,11 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_ } } + asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2, + (long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket); + free(tmp); + tmp = tmp2; + if (NULL == src->daemon) { asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2, "Not defined", src->daemon_launched ? "True" : "False"); @@ -377,8 +382,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_ free(tmp); tmp = tmp2; - asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2, - (long)src->slots_alloc, (long)src->slots_max); + asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld\tCpu set: %s", tmp, pfx2, + (long)src->slots_alloc, (long)src->slots_max, + (NULL == src->cpu_set) ? "NULL" : src->cpu_set); free(tmp); tmp = tmp2; @@ -644,9 +650,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat asprintf(&pfx, "%s\t", pfx2); if (orte_devel_level_output) { - asprintf(&tmp, "\n%sMap generated by mapping policy: %x\n%s\tPernode: %s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", - pfx2, src->policy, pfx2, - (src->pernode) ? "TRUE" : "FALSE", (long)src->npernode, + asprintf(&tmp, "\n%sMap generated by mapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", + pfx2, src->policy, pfx2, (long)src->npernode, + (src->oversubscribe) ? "TRUE" : "FALSE", (src->cpu_lists) ?
"TRUE" : "FALSE"); diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 7504bb088b..33f39b533c 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -422,6 +422,16 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest, return rc; } + /* do not unpack the board, socket, and core info */ + + /* unpack the cpu set */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &(nodes[i]->cpu_set), &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* do not unpack the username */ } return ORTE_SUCCESS; @@ -883,15 +893,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, /* unpack the policy */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &(maps[i]->policy), &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the pernode flag */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &(maps[i]->pernode), &n, OPAL_BOOL))) { + &(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 24d21c57b4..aac8e256df 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -27,6 +27,7 @@ #endif #include "opal/mca/base/mca_base_param.h" +#include "opal/mca/paffinity/paffinity.h" #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/class/opal_pointer_array.h" @@ -132,6 +133,17 @@ bool orte_orted_exit_with_barrier = true; /* report launch progress */ bool orte_report_launch_progress = false; +/* cluster hardware info */ +uint8_t orte_default_num_boards; +uint8_t orte_default_num_sockets_per_board; +uint8_t orte_default_num_cores_per_socket; + +/* allocation specification */ +char *orte_default_cpu_set; + +/* default rank assigment and binding policy */ +orte_mapping_policy_t orte_default_mapping_policy = 0; + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; @@ -670,6 +682,16 @@ static void orte_node_construct(orte_node_t* node) node->slots_inuse = 0; node->slots_alloc = 0; node->slots_max = 0; + + node->boards = orte_default_num_boards; + node->sockets_per_board = orte_default_num_sockets_per_board; + node->cores_per_socket = orte_default_num_cores_per_socket; + if (NULL != orte_default_cpu_set) { + node->cpu_set = strdup(orte_default_cpu_set); + } else { + node->cpu_set = NULL; + } + node->username = NULL; } @@ -702,6 +724,10 @@ static void orte_node_destruct(orte_node_t* node) } OBJ_RELEASE(node->procs); + if (NULL != node->cpu_set) { + free(node->cpu_set); + node->cpu_set = NULL; + } if (NULL != node->username) { free(node->username); node->username = NULL; @@ -871,9 +897,12 @@ OBJ_CLASS_INSTANCE(orte_jmap_t, static void orte_job_map_construct(orte_job_map_t* map) { - map->policy = ORTE_RMAPS_BYSLOT; /* default to byslot mapping as per orterun options */ - map->pernode = false; + map->policy = 0; map->npernode = 0; + map->nperboard = 0; + map->npersocket = 0; + map->cpus_per_rank = 1; + map->stride = 1; map->oversubscribe = true; /* default to allowing oversubscribe */ map->display_map = false; map->cpu_lists = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 1832172153..44f69e08d2 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -38,7 +38,6 @@ #include "opal/class/opal_value_array.h" #include "orte/mca/plm/plm_types.h" -#include 
"orte/mca/rmaps/rmaps_types.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" #include "orte/runtime/runtime.h" @@ -141,6 +140,7 @@ typedef struct orte_job_t orte_job_t; * defining it - resolves potential circular definition */ struct orte_proc_t; +struct orte_job_map_t; /************/ /** @@ -241,6 +241,14 @@ typedef struct { specified limit. For example, if we have two processors, we may want to allow up to four processes but no more. */ orte_std_cntr_t slots_max; + /* number of physical boards in the node - defaults to 1 */ + uint8_t boards; + /* number of sockets on each board - defaults to 1 */ + uint8_t sockets_per_board; + /* number of cores per socket - defaults to 1 */ + uint8_t cores_per_socket; + /* cpus on this node that are assigned for our use */ + char *cpu_set; /** Username on this node, if specified */ char *username; } orte_node_t; @@ -258,6 +266,31 @@ typedef uint8_t orte_job_controls_t; #define ORTE_JOB_CONTROL_FORWARD_COMM 0x20 #define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40 +typedef uint16_t orte_mapping_policy_t; +#define ORTE_MAPPING_POLICY OPAL_UINT16 +/* put the rank assignment method in the upper 8 bits */ +#define ORTE_MAPPING_NOPOL 0x0100 +#define ORTE_MAPPING_BYNODE 0x0200 +#define ORTE_MAPPING_BYSLOT 0x0400 +#define ORTE_MAPPING_BYSOCKET 0x0800 +#define ORTE_MAPPING_BYBOARD 0x1000 +#define ORTE_MAPPING_NO_USE_LOCAL 0x2000 +#define ORTE_MAPPING_NPERXXX 0x4000 +/* nice macro for setting these */ +#define ORTE_SET_MAPPING_POLICY(pol) \ + orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol); +#define ORTE_ADD_MAPPING_POLICY(pol) \ + orte_default_mapping_policy |= (pol); + +/* put the binding policy in the lower 8 bits, using the paffinity values */ +#define ORTE_BIND_TO_NONE (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND +#define ORTE_BIND_TO_CORE (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE +#define ORTE_BIND_TO_SOCKET (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET +#define ORTE_BIND_TO_BOARD (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD +/* nice macro for setting these */ +#define ORTE_SET_BINDING_POLICY(pol) \ + orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol); + /* error manager callback function */ typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata); @@ -285,7 +318,7 @@ typedef struct { /* array of pointers to procs in this job */ opal_pointer_array_t *procs; /* map of the job */ - orte_job_map_t *map; + struct orte_job_map_t *map; /* bookmark for where we are in mapping - this * indicates the node where we stopped */ @@ -531,6 +564,17 @@ ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier; /* whether or not to report launch progress */ ORTE_DECLSPEC extern bool orte_report_launch_progress; +/* cluster hardware info */ +ORTE_DECLSPEC extern uint8_t orte_default_num_boards; +ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board; +ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket; + +/* allocation specification */ +ORTE_DECLSPEC extern char *orte_default_cpu_set; + +/* default rank assigment and binding policy */ +ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy; + #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index f6a0c9dab3..2060d3bfd4 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -28,6 +28,7 @@ #include #include "opal/mca/base/mca_base_param.h" +#include "opal/mca/paffinity/base/base.h" #include 
"opal/util/output.h" #include "orte/util/proc_info.h" @@ -38,6 +39,7 @@ int orte_register_params(void) { int value, tmp; + char *strval; mca_base_param_reg_int_name("orte", "base_help_aggregate", "If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.", @@ -297,6 +299,48 @@ int orte_register_params(void) orte_startup_timeout = 2000; /* default to 2 seconds */ } } + + /* cluster hardware info */ + mca_base_param_reg_int_name("orte", "num_boards", + "Number of processor boards/node (1-256) [default: 1]", + false, false, 1, &value); + orte_default_num_boards = (uint8_t)value; + if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) { + value = 1; + } + mca_base_param_reg_int_name("orte", "num_sockets", + "Number of sockets/board (1-256) [default: auto-sensed by mpirun or 1]", + false, false, value, &value); + orte_default_num_sockets_per_board = (uint8_t)value; + if (OPAL_SUCCESS != opal_paffinity_base_get_core_info(0, &value)) { + value = 1; + } + mca_base_param_reg_int_name("orte", "num_cores", + "Number of cores/socket (1-256) [default: auto-sensed by mpirun or 1]", + false, false, value, &value); + orte_default_num_cores_per_socket = (uint8_t)value; + + /* cpu allocation specification */ + mca_base_param_reg_string_name("orte", "cpu_set", + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]", + false, false, NULL, &orte_default_cpu_set); + + /* binding specification - this will be overridden by any cmd line directive, and + * ignored unless opal_paffinity_alone is set + */ + mca_base_param_reg_string_name("orte", "process_binding", + "Policy for binding processes [core | socket | board (default: none)]", + false, false, NULL, &strval); + if (NULL != strval) { + if (0 == strcmp(strval, "socket")) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET); + } else if (0 == strcmp(strval, "board")) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD); + } else if (0 == strcmp(strval, "core")) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE); + } + } + #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS; diff --git a/orte/tools/orterun/debuggers.c b/orte/tools/orterun/debuggers.c index 2f9e25309a..215c026f2f 100644 --- a/orte/tools/orterun/debuggers.c +++ b/orte/tools/orterun/debuggers.c @@ -120,6 +120,7 @@ #include "orte/mca/rml/rml_types.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/rmaps/rmaps_types.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/util/show_help.h" @@ -512,7 +513,6 @@ static void check_debugger(int fd, short event, void *arg) * one debugger daemon on each node */ jdata->map = OBJ_NEW(orte_job_map_t); - jdata->map->pernode = true; jdata->map->npernode = 1; /* add it to the global job pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 750f68b199..fa6bca2684 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -50,6 +50,7 @@ #include "opal/event/event.h" #include "opal/mca/installdirs/installdirs.h" #include "opal/mca/base/base.h" +#include "opal/mca/paffinity/base/base.h" #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/util/basename.h" @@ 
-255,10 +256,16 @@ static opal_cmd_line_init_t cmd_line_init[] = { /* Mapping options */ { NULL, NULL, NULL, '\0', "bynode", "bynode", 0, &orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to allocate/map processes round-robin by node" }, + "Whether to assign processes round-robin by node" }, { NULL, NULL, NULL, '\0', "byslot", "byslot", 0, &orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to allocate/map processes round-robin by slot (the default)" }, + "Whether to assign processes round-robin by slot (the default)" }, + { NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0, + &orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to assign processes round-robin by socket" }, + { NULL, NULL, NULL, '\0', "byboard", "byboard", 0, + &orterun_globals.by_board, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" }, { "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" }, @@ -286,7 +293,30 @@ static opal_cmd_line_init_t cmd_line_init[] = { { "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Do not run any MPI applications on the local node" }, - + { "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each rank [default=1]" }, + { "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per board on all allocated nodes" }, + { "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per socket on all allocated nodes" }, + + /* binding options */ + { NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0, + &orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to bind processes to specific cores (the default)" }, + { NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0, + &orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to bind processes to specific boards (meaningless on 1 board/node)" }, + { NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0, + &orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to bind processes to sockets" }, + { "rmaps", "base", "stride", '\0', "stride", "stride", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "When binding multiple cores to a rank, the step size to use between cores [default: 1]" }, + /* Allocation options */ { "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, @@ -294,6 +324,20 @@ static opal_cmd_line_init_t cmd_line_init[] = { { "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, + { "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, + + /* cluster hardware info */ + { "orte", "num", "boards", '\0', "num-boards", "num-boards", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of processor boards/node (1-256) [default: 1]"}, + { "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1, + NULL,
OPAL_CMD_LINE_TYPE_INT, + "Number of sockets/board (1-256) [default: 1]"}, + { "orte", "num", "cores", '\0', "num-cores", "num-cores", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cores/socket (1-256) [default: 1]"}, /* mpiexec-like arguments */ { NULL, NULL, NULL, '\0', "wdir", "wdir", 1, @@ -468,6 +512,7 @@ int orterun(int argc, char *argv[]) ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } + /* check what user wants us to do with stdin */ if (0 == strcmp(orterun_globals.stdin_target, "all")) { jdata->stdin_target = ORTE_VPID_WILDCARD; @@ -1144,6 +1189,11 @@ static int init_globals(void) orterun_globals.quiet = false; orterun_globals.by_node = false; orterun_globals.by_slot = false; + orterun_globals.by_board = false; + orterun_globals.by_socket = false; + orterun_globals.bind_to_core = false; + orterun_globals.bind_to_board = false; + orterun_globals.bind_to_socket = false; orterun_globals.debugger = false; orterun_globals.num_procs = 0; if( NULL != orterun_globals.env_val ) @@ -1171,8 +1221,6 @@ static int init_globals(void) static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) { - int id; - /* print version if requested. Do this before check for help so that --version --help works as one might expect. */ if (orterun_globals.version && @@ -1237,31 +1285,30 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs); } - /* Allocate and map by node or by slot? Shortcut for setting an - MCA param. */ - - /* Don't initialize the MCA parameter here unless we have to, - * since it really should be initialized in rmaps_base_open */ - if (orterun_globals.by_node || orterun_globals.by_slot) { - char *policy = NULL; - id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy", - "Scheduling policy for RMAPS. 
[slot | node]", - false, false, "slot", &policy); - - if (orterun_globals.by_node) { - orterun_globals.by_slot = false; - mca_base_param_set_string(id, "node"); - } else { - orterun_globals.by_slot = true; - mca_base_param_set_string(id, "slot"); - } - free(policy); + /* extract any rank assignment policy directives */ + if (orterun_globals.by_node) { + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE); + } else if (orterun_globals.by_board) { + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD); + } else if (orterun_globals.by_socket) { + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET); + } else { + /* byslot is the default */ + ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT); } - else { - /* Default */ - orterun_globals.by_slot = true; + + /* extract any binding policy directives - they will + * be ignored unless paffinity_alone is set + */ + if (orterun_globals.bind_to_socket) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET); + } else if (orterun_globals.bind_to_board) { + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD); + } else { + /* default to by-core */ + ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE); } - + return ORTE_SUCCESS; } diff --git a/orte/tools/orterun/orterun.h b/orte/tools/orterun/orterun.h index a0c560bb4f..fe5673ccd3 100644 --- a/orte/tools/orterun/orterun.h +++ b/orte/tools/orterun/orterun.h @@ -43,6 +43,11 @@ struct orterun_globals_t { bool exit; bool by_node; bool by_slot; + bool by_board; + bool by_socket; + bool bind_to_core; + bool bind_to_board; + bool bind_to_socket; bool debugger; int num_procs; char *env_val; diff --git a/orte/util/hostfile/help-hostfile.txt b/orte/util/hostfile/help-hostfile.txt index 4ef7470e69..7d21ddee43 100644 --- a/orte/util/hostfile/help-hostfile.txt +++ b/orte/util/hostfile/help-hostfile.txt @@ -93,3 +93,19 @@ The requested number of empty hosts was not available - the system was short by Please recheck your allocation - further information is available on the orte_hosts man page. 
+[boards] +Open RTE detected a bad parameter in the hostfile: + %s +The boards parameter is less than 0: + boards=%d +[sockets] +Open RTE detected a bad parameter in the hostfile: + %s +The sockets parameter is less than 0: + sockets=%d +[cores] +Open RTE detected a bad parameter in the hostfile: + %s +The cores parameter is less than 0: + cores=%d + diff --git a/orte/util/hostfile/hostfile.c b/orte/util/hostfile/hostfile.c index 6387a34327..f4fe51b6f2 100644 --- a/orte/util/hostfile/hostfile.c +++ b/orte/util/hostfile/hostfile.c @@ -261,6 +261,49 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc node->username = hostfile_parse_string(); break; + case ORTE_HOSTFILE_BOARDS: + rc = hostfile_parse_int(); + if (rc < 0) { + orte_show_help("help-hostfile.txt", "boards", + true, + cur_hostfile_name, rc); + OBJ_RELEASE(node); + return ORTE_ERROR; + } + node->boards = rc; + break; + + case ORTE_HOSTFILE_SOCKETS_PER_BOARD: + rc = hostfile_parse_int(); + if (rc < 0) { + orte_show_help("help-hostfile.txt", "sockets", + true, + cur_hostfile_name, rc); + OBJ_RELEASE(node); + return ORTE_ERROR; + } + node->sockets_per_board = rc; + break; + + case ORTE_HOSTFILE_CORES_PER_SOCKET: + rc = hostfile_parse_int(); + if (rc < 0) { + orte_show_help("help-hostfile.txt", "cores", + true, + cur_hostfile_name, rc); + OBJ_RELEASE(node); + return ORTE_ERROR; + } + node->cores_per_socket = rc; + break; + + case ORTE_HOSTFILE_CPU_SET: + if (NULL != node->cpu_set) { + free(node->cpu_set); + } + node->cpu_set = hostfile_parse_string(); + break; + case ORTE_HOSTFILE_COUNT: case ORTE_HOSTFILE_CPU: case ORTE_HOSTFILE_SLOTS: diff --git a/orte/util/hostfile/hostfile_lex.h b/orte/util/hostfile/hostfile_lex.h index 85da2875f4..3839225ad1 100644 --- a/orte/util/hostfile/hostfile_lex.h +++ b/orte/util/hostfile/hostfile_lex.h @@ -55,22 +55,26 @@ extern orte_hostfile_value_t orte_util_hostfile_value; #define YY_NO_UNPUT 1 #define YY_SKIP_YYWRAP 1 -#define ORTE_HOSTFILE_DONE 0 -#define ORTE_HOSTFILE_ERROR 1 -#define ORTE_HOSTFILE_QUOTED_STRING 2 -#define ORTE_HOSTFILE_EQUAL 3 -#define ORTE_HOSTFILE_INT 4 -#define ORTE_HOSTFILE_STRING 5 -#define ORTE_HOSTFILE_CPU 6 -#define ORTE_HOSTFILE_COUNT 7 -#define ORTE_HOSTFILE_SLOTS 8 -#define ORTE_HOSTFILE_SLOTS_MAX 9 -#define ORTE_HOSTFILE_USERNAME 10 -#define ORTE_HOSTFILE_IPV4 11 -#define ORTE_HOSTFILE_HOSTNAME 12 -#define ORTE_HOSTFILE_NEWLINE 13 -#define ORTE_HOSTFILE_IPV6 14 -#define ORTE_HOSTFILE_SLOT 15 -#define ORTE_HOSTFILE_RELATIVE 16 +#define ORTE_HOSTFILE_DONE 0 +#define ORTE_HOSTFILE_ERROR 1 +#define ORTE_HOSTFILE_QUOTED_STRING 2 +#define ORTE_HOSTFILE_EQUAL 3 +#define ORTE_HOSTFILE_INT 4 +#define ORTE_HOSTFILE_STRING 5 +#define ORTE_HOSTFILE_CPU 6 +#define ORTE_HOSTFILE_COUNT 7 +#define ORTE_HOSTFILE_SLOTS 8 +#define ORTE_HOSTFILE_SLOTS_MAX 9 +#define ORTE_HOSTFILE_USERNAME 10 +#define ORTE_HOSTFILE_IPV4 11 +#define ORTE_HOSTFILE_HOSTNAME 12 +#define ORTE_HOSTFILE_NEWLINE 13 +#define ORTE_HOSTFILE_IPV6 14 +#define ORTE_HOSTFILE_SLOT 15 +#define ORTE_HOSTFILE_RELATIVE 16 +#define ORTE_HOSTFILE_BOARDS 17 +#define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18 +#define ORTE_HOSTFILE_CORES_PER_SOCKET 19 +#define ORTE_HOSTFILE_CPU_SET 20 #endif diff --git a/orte/util/hostfile/hostfile_lex.l b/orte/util/hostfile/hostfile_lex.l index e2d1cb67a1..90cbc63082 100644 --- a/orte/util/hostfile/hostfile_lex.l +++ b/orte/util/hostfile/hostfile_lex.l @@ -120,6 +120,33 @@ username { orte_util_hostfile_value.sval = yytext; "user_name" { orte_util_hostfile_value.sval = 
yytext; return ORTE_HOSTFILE_USERNAME; } +boards { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_BOARDS; } + +sockets { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_SOCKETS_PER_BOARD; } + +sockets_per_board { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_SOCKETS_PER_BOARD; } + +"sockets-per-board" { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_SOCKETS_PER_BOARD; } + +cores { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_CORES_PER_SOCKET; } + +cores_per_socket { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_CORES_PER_SOCKET; } + +"cores-per-socket" { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_CORES_PER_SOCKET; } + +cpu_set { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_CPU_SET; } + +"cpu-set" { orte_util_hostfile_value.sval = yytext; + return ORTE_HOSTFILE_CPU_SET; } + \+n[0-9]+ { orte_util_hostfile_value.sval = yytext; return ORTE_HOSTFILE_RELATIVE; } \+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext; diff --git a/orte/util/regex.c b/orte/util/regex.c index 622dd76b03..9cc5d01ce4 100644 --- a/orte/util/regex.c +++ b/orte/util/regex.c @@ -47,6 +47,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/rmaps/rmaps_types.h" #include "orte/util/show_help.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" @@ -472,20 +473,14 @@ char* orte_regex_encode_maps(orte_job_t *jdata) char suffix, sfx; orte_app_context_t *app; - /* this is only supported with regular maps - i.e., when - * the mapping is byslot or bynode. Irregular maps cannot - * be expressed in a regular expression - * - * Also only supported for one app_context - */ - if (jdata->map->policy & ORTE_RMAPS_BYUSER || - jdata->num_apps > 1) { + /* this is only for one app_context */ + if (jdata->num_apps > 1) { return NULL; } /* determine the mapping policy */ byslot = true; - if (jdata->map->policy & ORTE_RMAPS_BYNODE) { + if (jdata->map->policy & ORTE_MAPPING_BYNODE) { byslot = false; }
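Example invocations exercising the options added above. These are illustrative sketches only: the flag names come from the cmd_line_init table and the hostfile tokens from hostfile_lex.l, but ./a.out, the node name, and the cpu numbers are placeholders, and the binding directives are honored only when processor affinity is active (the params code notes they are ignored unless paffinity_alone is set).

    # assign ranks round-robin by socket, binding each rank to its socket
    mpirun -np 8 --bysocket --bind-to-socket ./a.out

    # two ranks per socket, two cpus per rank, stepping two cores between ranks
    mpirun --npersocket 2 --cpus-per-rank 2 --stride 2 ./a.out

    # describe the hardware when it cannot be auto-sensed
    mpirun -np 16 --num-boards 1 --num-sockets 2 --num-cores 4 ./a.out

A hostfile can carry the same per-node information using the new tokens:

    node01 slots=8 sockets=2 cores=4 cpu_set=0-3,8-11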