From 322f6c5056fc484edc34f0e8eb7d536bf13366b5 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 22 Mar 2018 20:50:47 -0500 Subject: [PATCH 1/4] Fix a breakage in the ranking system While it may be faster to reverse the order of the assignment loops, it also results in the wrong answer Signed-off-by: Ralph Castain --- orte/mca/plm/base/plm_base_launch_support.c | 14 ++++- orte/mca/ras/base/ras_base_node.c | 22 ++++++- orte/mca/ras/simulator/ras_sim_module.c | 7 ++- orte/mca/rmaps/base/rmaps_base_binding.c | 12 ++-- orte/mca/rmaps/base/rmaps_base_map_job.c | 33 ++++++++++- orte/mca/rmaps/base/rmaps_base_ranking.c | 65 +++++++++++---------- 6 files changed, 111 insertions(+), 42 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index eb91100b24..92b76d4097 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -190,9 +190,17 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata) ORTE_ACQUIRE_OBJECT(caddy); - /* move the state machine along */ - caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; - ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + /* if we don't want to launch, then we at least want + * to map so we can see where the procs would have + * gone - so skip to the mapping state */ + if (orte_do_not_launch) { + caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP); + } else { + /* move the state machine along */ + caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + } /* cleanup */ OBJ_RELEASE(caddy); diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index e24e2a6bab..8e8c8f10c2 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -50,6 +50,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) bool hnp_alone = true, skiphnp = false; orte_attribute_t *kv; char **alias=NULL, **nalias; + orte_proc_t *daemon; + orte_job_t *djob; /* get the number of nodes */ num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes); @@ -76,6 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) return rc; } + /* if we are not launching, get the daemon job */ + djob = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + /* get the hnp node's info */ hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); @@ -189,6 +194,21 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) ORTE_ERROR_LOG(rc); return rc; } + if (orte_do_not_launch) { + /* create a daemon for this node since we won't be launching + * and the mapper needs to see a daemon - this is used solely + * for testing the mappers */ + daemon = OBJ_NEW(orte_proc_t); + daemon->name.jobid = ORTE_PROC_MY_NAME->jobid; + daemon->name.vpid = node->index; + daemon->state = ORTE_PROC_STATE_RUNNING; + OBJ_RETAIN(node); + daemon->node = node; + opal_pointer_array_set_item(djob->procs, daemon->name.vpid, daemon); + djob->num_procs++; + OBJ_RETAIN(daemon); + node->daemon = daemon; + } /* update the total slots in the job */ orte_ras_base.total_slots_alloc += node->slots; /* check if we have fqdn names in the allocation */ diff --git a/orte/mca/ras/simulator/ras_sim_module.c b/orte/mca/ras/simulator/ras_sim_module.c index dd7eea91c8..f12a3b4275 100644 --- a/orte/mca/ras/simulator/ras_sim_module.c +++ b/orte/mca/ras/simulator/ras_sim_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * @@ -23,6 +23,7 @@ #include "opal/mca/hwloc/hwloc-internal.h" #include "opal/util/argv.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" @@ -179,6 +180,10 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes) support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo); support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind; support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind; + /* pass it thru the filter so we create the summaries required by the mappers */ + if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) { + ORTE_ERROR_LOG(ORTE_ERROR); + } /* add it to our array */ t = OBJ_NEW(orte_topology_t); t->topo = topo; diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index df37999475..0ead042f5a 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -12,7 +12,7 @@ * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -246,7 +246,7 @@ static int bind_downwards(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { - int j; + int j, rc; orte_job_map_t *map; orte_proc_t *proc; hwloc_obj_t trg_obj, nxt_obj; @@ -367,7 +367,10 @@ static int bind_downwards(orte_job_t *jdata, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name); } else { - opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset); + rc = opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset); + if (OPAL_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } opal_output(orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s[%s] TO %s: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -841,7 +844,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } - if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) { + if (!orte_no_vm && !orte_do_not_launch && + (int)ORTE_PROC_MY_NAME->vpid != node->index) { continue; } if (!orte_do_not_launch) { diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 33c1f11a97..925c2305db 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -417,7 +417,33 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) } } - if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + if (orte_do_not_launch) { + /* compute the ranks and add the proc objects + * to the jdata->procs array */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + /* compute and save location assignments */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + /* compute and save bindings */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { + ORTE_ERROR_LOG(rc); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + } else if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { /* compute and save location assignments */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { ORTE_ERROR_LOG(rc); @@ -454,6 +480,11 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) } } + if (orte_do_not_launch) { + /* display the devel map */ + orte_rmaps_base_display_map(jdata); + } + /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index 2d4e364cc2..9a3ec3a7f6 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -379,35 +379,34 @@ static int rank_by(orte_job_t *jdata, all_done = false; while (!all_done && cnt < app->num_procs) { all_done = true; - /* cycle across the objects */ - for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); - /* find the next proc for this job and app_context */ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already ranked */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already ranked */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* cycle across the objects */ + for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); /* protect against bozo case */ locale = NULL; if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { @@ -429,7 +428,8 @@ static int rank_by(orte_job_t *jdata, } cnt++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); + "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s", + j, i, ORTE_VPID_PRINT(proc->name.vpid)); /* insert the proc into the jdata array */ if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { OBJ_RELEASE(pptr); @@ -440,7 +440,8 @@ static int rank_by(orte_job_t *jdata, OBJ_DESTRUCT(&objs); return rc; } - /* flag that one was mapped */ + num_ranked++; + /* flag that one was mapped */ all_done = false; /* track where the highest vpid landed - this is our * new bookmark From 3a93b535ecef36bab3d69f24df3a9eba450b780e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 25 Mar 2018 16:12:41 -0700 Subject: [PATCH 2/4] Silence the flood of OSC/RDMA warnings Fixes #4950 Signed-off-by: Ralph Castain --- ompi/mca/osc/rdma/osc_rdma.h | 4 ++-- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 22 +++++++++------------ ompi/mca/osc/rdma/osc_rdma_active_target.c | 5 ++--- ompi/mca/osc/rdma/osc_rdma_comm.c | 6 +++++- ompi/mca/osc/rdma/osc_rdma_passive_target.c | 3 ++- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 8287163680..a33e0f332f 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -12,7 +12,7 @@ * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -568,7 +568,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) OPAL_THREAD_SCOPED_LOCK(&sync->lock, OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) { - fprintf (stderr, "Flushing aggregation %p, peeer %p\n", aggregation, aggregation->peer); + fprintf (stderr, "Flushing aggregation %p, peer %p\n", (void*)aggregation, (void*)aggregation->peer); ompi_osc_rdma_peer_aggregate_flush (aggregation->peer); }); } diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 4ccc68db6b..aa48af5c22 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,6 +50,7 @@ struct ompi_osc_rdma_event_t { typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t; +#if 0 static void *ompi_osc_rdma_event_put (int fd, int flags, void *context) { ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context; @@ -112,7 +113,7 @@ static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca return OMPI_SUCCESS; } - +#endif static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype, void *result_buffer, int result_count, ompi_datatype_t *result_datatype, @@ -188,10 +189,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); unsigned long len = target_count * target_datatype->super.size; - ompi_osc_rdma_frag_t *frag = NULL; - volatile bool complete = false; char *ptr = NULL; int ret; @@ -523,7 +521,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const ompi_osc_rdma_module_t *module = sync->module; int32_t atomic_flags = module->selected_btl->btl_atomic_flags; int ret, btl_op, flags; - int64_t origin, result; + int64_t origin; if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || @@ -590,13 +588,13 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi new_value = old_value; if (&ompi_mpi_op_replace.op == op) { - memcpy ((void *)((intptr_t) &new_value) + offset, origin_addr, extent); + memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent); } else if (&ompi_mpi_op_no_op.op != op) { - ompi_op_reduce (op, (void *) origin_addr, (void *)((intptr_t) &new_value) + offset, 1, dt); + ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt); } ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle, - old_value, new_value, 0, &new_value); + old_value, new_value, 0, (int64_t*)&new_value); if (OPAL_SUCCESS != ret || new_value == old_value) { break; } @@ -605,7 +603,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi } while (1); if (result_addr) { - memcpy (result_addr, (void *)((intptr_t) &new_value) + offset, extent); + memcpy (result_addr, (void *)((intptr_t) &new_value + offset), extent); } if (OPAL_SUCCESS == ret) { @@ -696,11 +694,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); - unsigned long offset, aligned_len, len = datatype->super.size; + unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; - ompi_osc_rdma_request_t *request; volatile bool complete = false; /* drop the const. this code will not attempt to change the value */ char *ptr = (char *) source_addr; diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index b4fb3dec64..dd52e4938e 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -16,7 +16,7 @@ * Copyright (c) 2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017-2018 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -80,7 +80,7 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b { ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", pending_op, status); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); if (pending_op->op_result) { memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); @@ -296,7 +296,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t **peers; - int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_state_t *state = module->state; int ret = OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 0d506374c9..fda90e9122 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. @@ -492,6 +492,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee return ret; } +#if 0 static void ompi_osc_rdma_aggregate_append (ompi_osc_rdma_aggregation_t *aggregation, ompi_osc_rdma_request_t *request, void *source_buffer, size_t size) { @@ -550,13 +551,16 @@ static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_r return OMPI_SUCCESS; } +#endif int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; +#if 0 ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate; +#endif mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c index 37b1bee257..dc11c5e31d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_passive_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -12,6 +12,7 @@ * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2018 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -202,7 +203,7 @@ int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdm } while (0); ); - return OMPI_SUCCESS; + return ret; } int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win) From fd704d87084c51adefdaad59518e68900c5fb0ae Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 25 Mar 2018 16:20:38 -0700 Subject: [PATCH 3/4] Add NEWS item Signed-off-by: Ralph Castain --- NEWS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index a3dcc274ce..da3792e609 100644 --- a/NEWS +++ b/NEWS @@ -19,7 +19,7 @@ Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. Copyright (c) 2012 Sandia National Laboratories. All rights reserved. Copyright (c) 2012 University of Houston. All rights reserved. Copyright (c) 2013 NVIDIA Corporation. All rights reserved. -Copyright (c) 2013-2017 Intel, Inc. All rights reserved. +Copyright (c) 2013-2018 Intel, Inc. All rights reserved. Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. $COPYRIGHT$ @@ -71,6 +71,7 @@ Master (not on release branches yet) - Remove IB XRC support from the OpenIB BTL due to lack of support. - Remove support for big endian PowerPC. - Remove support for XL compilers older than v13.1 +- Fix rank-by algorithms to properly rank by object and span 3.0.0 -- September, 2017 ------------------------ From d644f7ee2610ffb106515f3a11b3903470a1d2e3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 26 Mar 2018 16:06:46 -0700 Subject: [PATCH 4/4] Correctly fix the ranking policy Shorten the loops as much as possible - if someone wants to further optimize, they are welcome to do so. Signed-off-by: Ralph Castain --- orte/mca/rmaps/base/rmaps_base_ranking.c | 116 ++++++++++++----------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index 9a3ec3a7f6..9eaea79ccf 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -304,15 +304,15 @@ static int rank_by(orte_job_t *jdata, { orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, m, n, rc; + int num_objs, i, j, m, n, rc, nn; orte_vpid_t num_ranked=0; orte_node_t *node; orte_proc_t *proc, *pptr; - orte_vpid_t vpid; + orte_vpid_t vpid, np; int cnt; opal_pointer_array_t objs; - bool all_done; hwloc_obj_t locale; + orte_app_idx_t napp; if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { return rank_span(jdata, target, cache_level); @@ -333,20 +333,21 @@ static int rank_by(orte_job_t *jdata, */ vpid = 0; - for (n=0; n < jdata->apps->size; n++) { + for (n=0, napp=0; napp < jdata->num_apps && n < jdata->apps->size; n++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { continue; } - + napp++; /* setup the pointer array */ OBJ_CONSTRUCT(&objs, opal_pointer_array_t); opal_pointer_array_init(&objs, 2, INT_MAX, 2); cnt = 0; - for (m=0; m < jdata->map->nodes->size; m++) { + for (m=0, nn=0; nn < jdata->map->num_nodes && m < jdata->map->nodes->size; m++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { continue; } + nn++; /* get the number of objects - only consider those we can actually use */ num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, @@ -376,13 +377,20 @@ static int rank_by(orte_job_t *jdata, * Perhaps someday someone will come up with a more efficient * algorithm, but this works for now. */ - all_done = false; - while (!all_done && cnt < app->num_procs) { - all_done = true; - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + i = 0; + while (cnt < app->num_procs) { + /* get the next object */ + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + if (NULL == obj) { + break; + } + /* scan across the procs and find the one that is on this object */ + np = 0; + for (j=0; np < node->num_procs && j < node->procs->size && cnt < app->num_procs; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } + np++; /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, @@ -404,53 +412,48 @@ static int rank_by(orte_job_t *jdata, ORTE_NAME_PRINT(&proc->name), num_ranked); continue; } - /* cycle across the objects */ - for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); - /* protect against bozo case */ - locale = NULL; - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - /* ignore procs not on this object */ - if (NULL == locale || - !hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc at position %d is not on object %d", - j, i); - continue; - } - /* assign the vpid */ - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s", - j, i, ORTE_VPID_PRINT(proc->name.vpid)); - /* insert the proc into the jdata array */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { - OBJ_RELEASE(pptr); - } - OBJ_RETAIN(proc); - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&objs); - return rc; - } - num_ranked++; - /* flag that one was mapped */ - all_done = false; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; } + /* ignore procs not on this object */ + if (NULL == locale || + !hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: proc at position %d is not on object %d", + j, i); + continue; + } + /* assign the vpid */ + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s", + j, i, ORTE_VPID_PRINT(proc->name.vpid)); + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&objs); + return rc; + } + num_ranked++; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; } + i++; } } /* cleanup */ @@ -474,6 +477,9 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata) map = jdata->map; + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "RANKING POLICY: %s", orte_rmaps_base_print_ranking(map->ranking)); + /* start with the rank-by object options - if the object isn't * included in the topology, then we obviously cannot rank by it. * However, if this was the default ranking policy (as opposed to