From bd8b4f7f1eff729d53c0a8b090d9f979c21230b2 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 6 Apr 2012 14:23:13 +0000 Subject: [PATCH] Sorry for mid-day commit, but I had promised on the call to do this upon my return. Roll in the ORTE state machine. Remove last traces of opal_sos. Remove UTK epoch code. Please see the various emails about the state machine change for details. I'll send something out later with more info on the new arch. This commit was SVN r26242. --- contrib/hg/build-hgignore.pl | 2 + contrib/platform/iu/odin/debug-nopmi | 28 + contrib/platform/iu/odin/debug-nopmi.conf | 85 + ompi/attribute/attribute.c | 5 +- ompi/communicator/comm_cid.c | 5 +- ompi/datatype/ompi_datatype.h | 3 + ompi/errhandler/errcode-internal.h | 10 +- ompi/mca/bml/bml.h | 5 +- ompi/mca/bml/r2/bml_r2.c | 4 +- ompi/mca/bml/r2/bml_r2_ft.c | 23 +- ompi/mca/btl/ofud/btl_ofud_component.c | 3 +- ompi/mca/btl/openib/btl_openib.c | 11 +- ompi/mca/btl/openib/btl_openib_component.c | 28 +- ompi/mca/btl/openib/btl_openib_endpoint.c | 7 +- ompi/mca/btl/openib/btl_openib_failover.c | 6 +- ompi/mca/btl/openib/btl_openib_ini.c | 9 +- .../openib/connect/btl_openib_connect_base.c | 8 +- .../openib/connect/btl_openib_connect_oob.c | 3 +- .../connect/btl_openib_connect_rdmacm.c | 5 +- .../openib/connect/btl_openib_connect_xoob.c | 5 +- ompi/mca/btl/portals/btl_portals.c | 3 +- ompi/mca/btl/portals/btl_portals_frag.h | 4 +- ompi/mca/btl/portals/btl_portals_send.c | 3 +- ompi/mca/btl/tcp/btl_tcp_component.c | 5 +- ompi/mca/btl/ugni/btl_ugni_put.c | 1 - ompi/mca/btl/wv/btl_wv.c | 9 +- ompi/mca/btl/wv/btl_wv_component.c | 28 +- ompi/mca/btl/wv/btl_wv_endpoint.c | 7 +- ompi/mca/btl/wv/btl_wv_ini.c | 9 +- ompi/mca/btl/wv/connect/btl_wv_connect_base.c | 9 +- ompi/mca/btl/wv/connect/btl_wv_connect_oob.c | 3 +- ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 15 - ompi/mca/dpm/base/dpm_base_select.c | 5 +- ompi/mca/dpm/orte/dpm_orte.c | 152 +- ompi/mca/osc/pt2pt/osc_pt2pt_sync.c | 9 +- 
ompi/mca/osc/rdma/osc_rdma_comm.c | 8 +- ompi/mca/osc/rdma/osc_rdma_sync.c | 8 +- ompi/mca/pml/base/pml_base_select.c | 5 +- ompi/mca/pml/bfo/pml_bfo.c | 8 +- ompi/mca/pml/bfo/pml_bfo_failover.c | 6 +- ompi/mca/pml/bfo/pml_bfo_recvreq.c | 6 +- ompi/mca/pml/bfo/pml_bfo_recvreq.h | 4 +- ompi/mca/pml/bfo/pml_bfo_sendreq.c | 11 +- ompi/mca/pml/bfo/pml_bfo_sendreq.h | 6 +- ompi/mca/pml/csum/pml_csum.c | 8 +- ompi/mca/pml/csum/pml_csum_recvreq.c | 6 +- ompi/mca/pml/csum/pml_csum_recvreq.h | 4 +- ompi/mca/pml/csum/pml_csum_sendreq.c | 10 +- ompi/mca/pml/csum/pml_csum_sendreq.h | 6 +- ompi/mca/pml/dr/pml_dr_sendreq.c | 4 +- ompi/mca/pml/ob1/pml_ob1.c | 8 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 6 +- ompi/mca/pml/ob1/pml_ob1_recvreq.h | 4 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 8 +- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 6 +- ompi/mca/pubsub/base/pubsub_base_select.c | 5 +- ompi/mca/pubsub/orte/pubsub_orte.c | 3 +- ompi/mca/rcache/rb/rcache_rb.c | 4 +- .../pessimist/vprotocol_pessimist_eventlog.c | 3 +- ompi/mpi/c/unpublish_name.c | 8 +- ompi/proc/proc.c | 7 +- ompi/runtime/ompi_mpi_finalize.c | 14 +- ompi/runtime/ompi_mpi_init.c | 27 +- ompi/tools/ompi-server/ompi-server.c | 7 +- ompi/tools/ompi_info/components.c | 13 +- ompi/tools/ompi_info/ompi_info.c | 1 + opal/mca/base/mca_base_components_open.c | 7 +- opal/mca/compress/base/compress_base_open.c | 5 +- opal/mca/crs/base/crs_base_open.c | 5 +- opal/mca/event/base/base.h | 5 +- opal/mca/event/base/event_base_close.c | 3 - opal/mca/event/base/event_base_open.c | 7 +- opal/mca/event/event.h | 13 + opal/mca/event/libevent2013/configure.m4 | 4 +- opal/mca/event/libevent2013/libevent/event.c | 11 - opal/mca/event/libevent2013/libevent2013.h | 94 +- .../event/libevent2013/libevent2013_module.c | 83 +- opal/mca/hwloc/base/hwloc_base_util.c | 1 + opal/runtime/opal_finalize.c | 6 +- opal/runtime/opal_init.c | 8 +- opal/util/Makefile.am | 4 +- opal/util/error.c | 11 +- opal/util/opal_sos.c | 535 ---- opal/util/opal_sos.h | 441 ---- 
opal/util/stacktrace.c | 4 +- orte/Makefile.am | 1 - orte/config/orte_configure_options.m4 | 31 +- orte/include/orte/types.h | 47 +- orte/mca/errmgr/app/Makefile.am | 36 - orte/mca/errmgr/app/errmgr_app.c | 280 --- orte/mca/errmgr/app/errmgr_app.h | 35 - orte/mca/errmgr/app/errmgr_app_component.c | 89 - orte/mca/errmgr/base/errmgr_base_fns.c | 35 +- orte/mca/errmgr/base/errmgr_base_open.c | 5 +- orte/mca/errmgr/base/errmgr_base_tool.c | 3 - orte/mca/errmgr/base/errmgr_private.h | 12 +- orte/mca/errmgr/default_app/configure.m4 | 2 +- .../errmgr/default_app/errmgr_default_app.c | 55 +- orte/mca/errmgr/default_hnp/configure.m4 | 2 +- .../errmgr/default_hnp/errmgr_default_hnp.c | 1433 ++++------- orte/mca/errmgr/default_orted/configure.m4 | 2 +- .../default_orted/errmgr_default_orted.c | 722 +++--- orte/mca/errmgr/errmgr.h | 25 +- orte/mca/errmgr/hnp/Makefile.am | 38 - orte/mca/errmgr/hnp/errmgr_hnp.c | 2182 ----------------- orte/mca/errmgr/hnp/errmgr_hnp.h | 135 - orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 1033 -------- orte/mca/errmgr/hnp/errmgr_hnp_component.c | 201 -- orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 1517 ------------ orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt | 71 - orte/mca/errmgr/orted/Makefile.am | 38 - orte/mca/errmgr/orted/configure.m4 | 19 - orte/mca/errmgr/orted/errmgr_orted.c | 1157 --------- orte/mca/errmgr/orted/errmgr_orted.h | 35 - .../mca/errmgr/orted/errmgr_orted_component.c | 84 - .../errmgr/orted/help-orte-errmgr-orted.txt | 14 - orte/mca/ess/alps/ess_alps_module.c | 4 - orte/mca/ess/base/base.h | 8 +- orte/mca/ess/base/ess_base_fns.c | 10 + orte/mca/ess/base/ess_base_open.c | 1 - orte/mca/ess/base/ess_base_select.c | 18 - orte/mca/ess/base/ess_base_std_app.c | 42 +- orte/mca/ess/base/ess_base_std_orted.c | 198 +- orte/mca/ess/base/ess_base_std_tool.c | 2 + orte/mca/ess/cnos/ess_cnos_module.c | 1 - orte/mca/ess/env/ess_env_module.c | 2 - orte/mca/ess/ess.h | 19 +- orte/mca/ess/generic/ess_generic_module.c | 4 - 
orte/mca/ess/hnp/ess_hnp_module.c | 208 +- orte/mca/ess/lsf/ess_lsf_module.c | 3 - orte/mca/ess/pmi/ess_pmi_component.c | 2 +- orte/mca/ess/pmi/ess_pmi_module.c | 17 +- .../ess_portals4_shmem_module.c | 1 - orte/mca/ess/singleton/ess_singleton_module.c | 2 - orte/mca/ess/slurm/ess_slurm_module.c | 2 - orte/mca/ess/slurmd/ess_slurmd_module.c | 9 +- orte/mca/ess/tm/ess_tm_module.c | 2 - orte/mca/ess/tool/ess_tool_module.c | 1 - orte/mca/filem/base/filem_base_receive.c | 22 +- orte/mca/filem/rsh/filem_rsh_module.c | 3 - orte/mca/grpcomm/bad/grpcomm_bad_module.c | 272 +- orte/mca/grpcomm/base/Makefile.am | 8 +- orte/mca/grpcomm/base/base.h | 69 +- .../mca/grpcomm/base/grpcomm_base_allgather.c | 263 -- orte/mca/grpcomm/base/grpcomm_base_app_fns.c | 220 -- orte/mca/grpcomm/base/grpcomm_base_coll.c | 923 ------- orte/mca/grpcomm/base/grpcomm_base_modex.c | 303 ++- orte/mca/grpcomm/base/grpcomm_base_open.c | 70 +- orte/mca/grpcomm/base/grpcomm_base_receive.c | 686 ++++++ orte/mca/grpcomm/base/grpcomm_base_xcast.c | 221 ++ orte/mca/grpcomm/cnos/grpcomm_cnos_module.c | 54 +- orte/mca/grpcomm/grpcomm.h | 13 +- orte/mca/grpcomm/grpcomm_types.h | 89 +- .../heartbeat => grpcomm/hier}/.ompi_ignore | 0 orte/mca/grpcomm/hier/grpcomm_hier_module.c | 4 - orte/mca/grpcomm/pmi/grpcomm_pmi_module.c | 71 +- .../grpcomm_portals4_shmem_module.c | 55 +- orte/mca/iof/base/base.h | 28 +- orte/mca/iof/base/iof_base_close.c | 3 - orte/mca/iof/base/iof_base_open.c | 9 +- orte/mca/iof/base/iof_base_output.c | 4 +- orte/mca/iof/hnp/iof_hnp.c | 64 +- orte/mca/iof/hnp/iof_hnp_read.c | 47 +- orte/mca/iof/hnp/iof_hnp_receive.c | 62 +- orte/mca/iof/orted/iof_orted.c | 25 +- orte/mca/iof/orted/iof_orted_read.c | 32 +- orte/mca/iof/orted/iof_orted_receive.c | 39 +- orte/mca/iof/tool/iof_tool.c | 7 +- orte/mca/iof/tool/iof_tool_receive.c | 50 +- orte/mca/notifier/base/notifier_base_select.c | 11 +- orte/mca/notifier/hnp/Makefile.am | 4 + orte/mca/notifier/hnp/notifier_hnp.h | 4 - 
orte/mca/notifier/hnp/notifier_hnp_module.c | 154 +- orte/mca/notifier/hnp/notifier_hnp_recv.c | 215 +- .../mca/notifier/hnp/orte_notifier_hnp.txt | 2 +- orte/mca/notifier/notifier.h | 27 +- orte/mca/odls/base/base.h | 7 - orte/mca/odls/base/odls_base_close.c | 28 +- orte/mca/odls/base/odls_base_default_fns.c | 1510 ++++-------- orte/mca/odls/base/odls_base_open.c | 147 +- orte/mca/odls/base/odls_base_state.c | 3 - orte/mca/odls/base/odls_private.h | 47 +- orte/mca/odls/default/odls_default_module.c | 74 +- orte/mca/odls/odls.h | 8 +- orte/mca/odls/odls_types.h | 81 +- orte/mca/odls/process/odls_process_module.c | 29 +- orte/mca/oob/tcp/oob_tcp.c | 127 +- orte/mca/oob/tcp/oob_tcp.h | 8 +- orte/mca/oob/tcp/oob_tcp_msg.c | 12 +- orte/mca/oob/tcp/oob_tcp_peer.c | 89 +- orte/mca/oob/tcp/oob_tcp_peer.h | 8 +- orte/mca/oob/tcp/oob_tcp_ping.c | 18 +- orte/mca/oob/tcp/oob_tcp_send.c | 36 +- orte/mca/plm/alps/plm_alps_module.c | 173 +- orte/mca/plm/base/base.h | 8 + orte/mca/plm/base/plm_base_close.c | 8 - orte/mca/plm/base/plm_base_jobid.c | 4 +- orte/mca/plm/base/plm_base_launch_support.c | 746 +++--- orte/mca/plm/base/plm_base_open.c | 11 - orte/mca/plm/base/plm_base_orted_cmds.c | 314 +-- orte/mca/plm/base/plm_base_proxy.c | 53 +- orte/mca/plm/base/plm_base_receive.c | 634 ++--- orte/mca/plm/base/plm_private.h | 31 +- orte/mca/plm/ccp/plm_ccp_module.c | 2 +- orte/mca/plm/lsf/plm_lsf_module.c | 135 +- orte/mca/plm/plm.h | 4 +- orte/mca/plm/plm_types.h | 149 +- orte/mca/plm/process/plm_process_module.c | 381 +-- orte/mca/plm/rsh/plm_rsh.h | 6 +- orte/mca/plm/rsh/plm_rsh_component.c | 6 - orte/mca/plm/rsh/plm_rsh_module.c | 749 +++--- orte/mca/plm/slurm/plm_slurm_module.c | 211 +- orte/mca/plm/tm/plm_tm_module.c | 262 +- orte/mca/ras/base/base.h | 6 +- orte/mca/ras/base/ras_base_allocate.c | 121 +- orte/mca/ras/base/ras_base_node.c | 2 + orte/mca/ras/base/ras_base_open.c | 11 +- orte/mca/ras/base/ras_private.h | 67 +- orte/mca/ras/ras.h | 17 +- 
orte/mca/rmaps/base/Makefile.am | 4 +- orte/mca/rmaps/base/base.h | 7 + orte/mca/rmaps/base/rmaps_base_close.c | 2 + orte/mca/rmaps/base/rmaps_base_get_job_map.c | 88 - orte/mca/rmaps/base/rmaps_base_map_job.c | 76 +- orte/mca/rmaps/base/rmaps_base_open.c | 12 +- orte/mca/rmaps/base/rmaps_base_ranking.c | 33 +- orte/mca/rmaps/base/rmaps_base_support_fns.c | 5 - orte/mca/rmaps/base/rmaps_private.h | 11 +- orte/mca/rmaps/ppr/rmaps_ppr.c | 6 +- orte/mca/rmaps/rank_file/rmaps_rank_file.c | 7 +- orte/mca/rmaps/resilient/rmaps_resilient.c | 7 +- orte/mca/rmaps/rmaps.h | 18 +- orte/mca/rmaps/round_robin/rmaps_rr.c | 9 +- orte/mca/rmaps/seq/rmaps_seq.c | 6 +- orte/mca/rml/base/rml_base_components.c | 32 +- orte/mca/rml/base/rml_base_contact.c | 8 +- orte/mca/rml/base/rml_base_receive.c | 68 +- orte/mca/rml/oob/rml_oob_component.c | 2 +- orte/mca/rml/rml.h | 10 + orte/mca/rml/rml_types.h | 77 +- orte/mca/routed/base/Makefile.am | 2 +- orte/mca/routed/base/base.h | 7 + orte/mca/routed/base/routed_base_components.c | 3 - orte/mca/routed/base/routed_base_fns.c | 379 +++ .../routed/base/routed_base_register_sync.c | 195 -- orte/mca/routed/binomial/routed_binomial.c | 170 +- orte/mca/routed/cm/routed_cm.c | 157 +- orte/mca/routed/debruijn/routed_debruijn.c | 99 +- orte/mca/routed/direct/routed_direct.c | 97 +- orte/mca/routed/linear/routed_linear.c | 144 +- orte/mca/routed/radix/routed_radix.c | 123 +- orte/mca/routed/routed.h | 34 +- orte/mca/routed/routed_types.h | 2 +- orte/mca/sensor/file/sensor_file.c | 173 +- orte/mca/sensor/ft_tester/sensor_ft_tester.c | 28 +- orte/mca/sensor/heartbeat/sensor_heartbeat.c | 109 +- orte/mca/sensor/resusage/sensor_resusage.c | 78 +- orte/mca/snapc/base/snapc_base_fns.c | 2 - orte/mca/snapc/full/snapc_full_global.c | 6 - orte/mca/snapc/full/snapc_full_local.c | 9 +- orte/mca/snapc/full/snapc_full_module.c | 2 - orte/mca/sstore/base/sstore_base_fns.c | 3 - .../sstore/central/sstore_central_global.c | 1 - 
.../mca/sstore/central/sstore_central_local.c | 3 - orte/mca/sstore/stage/sstore_stage_global.c | 3 - orte/mca/sstore/stage/sstore_stage_local.c | 6 - orte/mca/state/Makefile.am | 31 + orte/mca/{errmgr => state}/app/.windows | 0 orte/mca/state/app/Makefile.am | 35 + orte/mca/state/app/state_app.c | 78 + orte/mca/state/app/state_app.h | 36 + orte/mca/state/app/state_app_component.c | 84 + orte/mca/state/base/Makefile.am | 19 + orte/mca/state/base/base.h | 55 + orte/mca/state/base/state_base_close.c | 44 + orte/mca/state/base/state_base_fns.c | 362 +++ orte/mca/state/base/state_base_open.c | 108 + orte/mca/state/base/state_base_select.c | 59 + orte/mca/state/base/state_private.h | 84 + orte/mca/{errmgr/orted => state/hnp}/.windows | 0 orte/mca/state/hnp/Makefile.am | 35 + .../{errmgr/app => state/hnp}/configure.m4 | 8 +- orte/mca/state/hnp/state_hnp.c | 520 ++++ orte/mca/state/hnp/state_hnp.h | 36 + orte/mca/state/hnp/state_hnp_component.c | 84 + orte/mca/state/orted/.windows | 12 + orte/mca/state/orted/Makefile.am | 35 + .../{errmgr/hnp => state/orted}/configure.m4 | 8 +- orte/mca/state/orted/state_orted.c | 451 ++++ orte/mca/state/orted/state_orted.h | 36 + orte/mca/state/orted/state_orted_component.c | 84 + orte/mca/state/state.h | 273 +++ orte/mca/state/state_types.h | 77 + orte/orted/orted_comm.c | 404 +-- orte/orted/orted_main.c | 219 +- .../data_type_support/orte_dt_compare_fns.c | 43 +- .../data_type_support/orte_dt_copy_fns.c | 41 +- .../data_type_support/orte_dt_packing_fns.c | 74 +- .../data_type_support/orte_dt_print_fns.c | 32 +- .../data_type_support/orte_dt_size_fns.c | 12 +- .../data_type_support/orte_dt_support.h | 32 +- .../data_type_support/orte_dt_unpacking_fns.c | 79 +- orte/runtime/orte_data_server.c | 82 +- orte/runtime/orte_finalize.c | 4 +- orte/runtime/orte_globals.c | 129 +- orte/runtime/orte_globals.h | 89 +- orte/runtime/orte_init.c | 49 +- orte/runtime/orte_quit.c | 149 +- orte/runtime/orte_quit.h | 6 +- orte/runtime/orte_wait.c | 228 +- 
orte/runtime/orte_wait.h | 229 +- orte/test/mpi/hello.c | 2 +- orte/test/system/Makefile | 2 +- orte/test/system/event-threads.c | 6 +- orte/test/system/evpri-test.c | 108 + orte/test/system/evthread-test.c | 159 +- orte/test/system/oob_stress.c | 1 - orte/test/system/opal-evpri-test.c | 107 + orte/test/system/orte_mcast.c | 2 +- orte/test/system/orte_ring.c | 2 - orte/test/system/orte_sensor.c | 2 +- orte/test/system/orte_spawn.c | 2 - orte/test/system/orte_spin.c | 2 +- orte/test/system/test-time.c | 4 +- orte/threads/Makefile.am | 20 - orte/threads/condition.h | 199 -- orte/threads/mutex.h | 69 - orte/threads/thread.c | 37 - orte/threads/threads.h | 131 - orte/tools/Makefile.am | 2 +- orte/tools/orte-info/components.c | 17 +- orte/tools/orte-info/orte-info.c | 1 + orte/tools/orte-ps/orte-ps.c | 7 +- orte/tools/orte-top/orte-top.c | 77 +- orte/tools/orterun/Makefile.am | 2 +- orte/tools/orterun/orterun.c | 104 +- orte/util/comm/comm.c | 203 +- orte/util/comm/comm.h | 3 - orte/util/error_strings.c | 287 +-- orte/util/error_strings.h | 2 - orte/util/hnp_contact.c | 1 - orte/util/name_fns.c | 148 +- orte/util/name_fns.h | 41 +- orte/util/nidmap.c | 214 +- orte/util/nidmap.h | 17 - orte/util/proc_info.c | 37 +- orte/util/proc_info.h | 11 +- orte/util/show_help.c | 12 +- test/util/Makefile.am | 11 +- test/util/opal_sos.c | 178 -- test/util/orte_session_dir.c | 1 - 351 files changed, 10449 insertions(+), 21019 deletions(-) create mode 100644 contrib/platform/iu/odin/debug-nopmi create mode 100644 contrib/platform/iu/odin/debug-nopmi.conf delete mode 100644 opal/util/opal_sos.c delete mode 100644 opal/util/opal_sos.h delete mode 100644 orte/mca/errmgr/app/Makefile.am delete mode 100644 orte/mca/errmgr/app/errmgr_app.c delete mode 100644 orte/mca/errmgr/app/errmgr_app.h delete mode 100644 orte/mca/errmgr/app/errmgr_app_component.c delete mode 100644 orte/mca/errmgr/hnp/Makefile.am delete mode 100644 orte/mca/errmgr/hnp/errmgr_hnp.c delete mode 100644 
orte/mca/errmgr/hnp/errmgr_hnp.h delete mode 100644 orte/mca/errmgr/hnp/errmgr_hnp_autor.c delete mode 100644 orte/mca/errmgr/hnp/errmgr_hnp_component.c delete mode 100644 orte/mca/errmgr/hnp/errmgr_hnp_crmig.c delete mode 100644 orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt delete mode 100644 orte/mca/errmgr/orted/Makefile.am delete mode 100644 orte/mca/errmgr/orted/configure.m4 delete mode 100644 orte/mca/errmgr/orted/errmgr_orted.c delete mode 100644 orte/mca/errmgr/orted/errmgr_orted.h delete mode 100644 orte/mca/errmgr/orted/errmgr_orted_component.c delete mode 100644 orte/mca/errmgr/orted/help-orte-errmgr-orted.txt delete mode 100644 orte/mca/grpcomm/base/grpcomm_base_allgather.c delete mode 100644 orte/mca/grpcomm/base/grpcomm_base_app_fns.c delete mode 100644 orte/mca/grpcomm/base/grpcomm_base_coll.c create mode 100644 orte/mca/grpcomm/base/grpcomm_base_receive.c create mode 100644 orte/mca/grpcomm/base/grpcomm_base_xcast.c rename orte/mca/{sensor/heartbeat => grpcomm/hier}/.ompi_ignore (100%) rename opal/util/opal_sos_reporter.txt => orte/mca/notifier/hnp/orte_notifier_hnp.txt (94%) delete mode 100644 orte/mca/rmaps/base/rmaps_base_get_job_map.c create mode 100644 orte/mca/routed/base/routed_base_fns.c delete mode 100644 orte/mca/routed/base/routed_base_register_sync.c create mode 100644 orte/mca/state/Makefile.am rename orte/mca/{errmgr => state}/app/.windows (100%) create mode 100644 orte/mca/state/app/Makefile.am create mode 100644 orte/mca/state/app/state_app.c create mode 100644 orte/mca/state/app/state_app.h create mode 100644 orte/mca/state/app/state_app_component.c create mode 100644 orte/mca/state/base/Makefile.am create mode 100644 orte/mca/state/base/base.h create mode 100644 orte/mca/state/base/state_base_close.c create mode 100644 orte/mca/state/base/state_base_fns.c create mode 100644 orte/mca/state/base/state_base_open.c create mode 100644 orte/mca/state/base/state_base_select.c create mode 100644 orte/mca/state/base/state_private.h rename 
orte/mca/{errmgr/orted => state/hnp}/.windows (100%) create mode 100644 orte/mca/state/hnp/Makefile.am rename orte/mca/{errmgr/app => state/hnp}/configure.m4 (54%) create mode 100644 orte/mca/state/hnp/state_hnp.c create mode 100644 orte/mca/state/hnp/state_hnp.h create mode 100644 orte/mca/state/hnp/state_hnp_component.c create mode 100644 orte/mca/state/orted/.windows create mode 100644 orte/mca/state/orted/Makefile.am rename orte/mca/{errmgr/hnp => state/orted}/configure.m4 (54%) create mode 100644 orte/mca/state/orted/state_orted.c create mode 100644 orte/mca/state/orted/state_orted.h create mode 100644 orte/mca/state/orted/state_orted_component.c create mode 100644 orte/mca/state/state.h create mode 100644 orte/mca/state/state_types.h create mode 100644 orte/test/system/evpri-test.c create mode 100644 orte/test/system/opal-evpri-test.c delete mode 100644 orte/threads/Makefile.am delete mode 100644 orte/threads/condition.h delete mode 100644 orte/threads/mutex.h delete mode 100644 orte/threads/thread.c delete mode 100644 orte/threads/threads.h delete mode 100644 test/util/opal_sos.c diff --git a/contrib/hg/build-hgignore.pl b/contrib/hg/build-hgignore.pl index e2c07561e1..89c50b0163 100755 --- a/contrib/hg/build-hgignore.pl +++ b/contrib/hg/build-hgignore.pl @@ -32,6 +32,8 @@ my @globals = qw/.libs *.orig *.rej *.class +*.xcscheme +*.plist .git* .DS_Store stamp-h[1-9] diff --git a/contrib/platform/iu/odin/debug-nopmi b/contrib/platform/iu/odin/debug-nopmi new file mode 100644 index 0000000000..f146468200 --- /dev/null +++ b/contrib/platform/iu/odin/debug-nopmi @@ -0,0 +1,28 @@ +enable_opal_multi_threads=no +enable_dlopen=no +enable_pty_support=no +with_blcr=no +with_openib=no +with_memory_manager=no +enable_mem_debug=yes +enable_mem_profile=no +enable_debug_symbols=yes +enable_binaries=yes +with_devel_headers=yes +enable_heterogeneous=no +enable_picky=yes +enable_debug=yes +enable_shared=yes +enable_static=no +with_slurm=yes +with_pmi=no 
+enable_contrib_no_build=libnbc,vt +enable_visibility=yes +enable_memchecker=no +enable_ipv6=no +enable_mpi_f77=no +enable_mpi_f90=no +enable_mpi_cxx=no +enable_mpi_cxx_seek=no +enable_mca_no_build=pml-dr,pml-crcp2,crcp +enable_io_romio=no diff --git a/contrib/platform/iu/odin/debug-nopmi.conf b/contrib/platform/iu/odin/debug-nopmi.conf new file mode 100644 index 0000000000..2116035dff --- /dev/null +++ b/contrib/platform/iu/odin/debug-nopmi.conf @@ -0,0 +1,85 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is the default system-wide MCA parameters defaults file. +# Specifically, the MCA parameter "mca_param_files" defaults to a +# value of +# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" +# (this file is the latter of the two). So if the default value of +# mca_param_files is not changed, this file is used to set system-wide +# MCA parameters. This file can therefore be used to set system-wide +# default MCA parameters for all users. Of course, users can override +# these values if they want, but this file is an excellent location +# for setting system-specific MCA parameters for those users who don't +# know / care enough to investigate the proper values for them. + +# Note that this file is only applicable where it is visible (in a +# filesystem sense). 
Specifically, MPI processes each read this file +# during their startup to determine what default values for MCA +# parameters should be used. mpirun does not bundle up the values in +# this file from the node where it was run and send them to all nodes; +# the default value decisions are effectively distributed. Hence, +# these values are only applicable on nodes that "see" this file. If +# $sysconf is a directory on a local disk, it is likely that changes +# to this file will need to be propagated to other nodes. If $sysconf +# is a directory that is shared via a networked filesystem, changes to +# this file will be visible to all nodes that share this $sysconf. + +# The format is straightforward: one per line, mca_param_name = +# rvalue. Quoting is ignored (so if you use quotes or escape +# characters, they'll be included as part of the value). For example: + +# Disable run-time MPI parameter checking +# mpi_param_check = 0 + +# Note that the value "~/" will be expanded to the current user's home +# directory. For example: + +# Change component loading path +# component_path = /usr/local/lib/openmpi:~/my_openmpi_components + +# See "ompi_info --param all all" for a full listing of Open MPI MCA +# parameters available and their default values. 
+# + +# Basic behavior to smooth startup +mca_component_show_load_errors = 0 +mpi_param_check = 0 +orte_abort_timeout = 10 +hwloc_base_mem_bind_failure_action = silent + +## Protect the shared file systems + +## Add the interface for out-of-band communication +## and set it up +oob_tcp_peer_retries = 120 +oob_tcp_disable_family = IPv6 +#oob_tcp_connect_timeout=600 + +## Define the MPI interconnects +btl = sm,tcp,self + +## Setup shared memory +btl_sm_free_list_max = 768 + +## Setup TCP +btl_tcp_if_include = ib0 + +## Configure the PML +pml_ob1_use_early_completion = 0 diff --git a/ompi/attribute/attribute.c b/ompi/attribute/attribute.c index ce8f19e918..9b733c2687 100644 --- a/ompi/attribute/attribute.c +++ b/ompi/attribute/attribute.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -195,7 +197,6 @@ #include "ompi/attribute/attribute.h" #include "opal/class/opal_bitmap.h" #include "opal/threads/mutex.h" -#include "opal/util/opal_sos.h" #include "ompi/constants.h" #include "ompi/datatype/ompi_datatype.h" @@ -1151,7 +1152,7 @@ static int get_value(opal_hash_table_t *attr_hash, int key, (void**) &keyval); OPAL_THREAD_UNLOCK(&keyval_hash_lock); - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { return MPI_KEYVAL_INVALID; } diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index bbd5a9fa9d..af9f69fdd8 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -14,6 +14,8 @@ * Copyright (c) 2007 Voltaire All rights reserved. * Copyright (c) 2006-2010 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. 
All rights + * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * $COPYRIGHT$ * @@ -32,7 +34,6 @@ #include "ompi/constants.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_list.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/base.h" #include "ompi/request/request.h" @@ -145,7 +146,7 @@ int ompi_comm_cid_init (void) ompi_comm_world_thread_level_mult = 1; break; } - } else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { if (ompi_mpi_thread_multiple) { ompi_comm_world_thread_level_mult = 1; } diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 5016980e34..fdd320b055 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -32,6 +32,9 @@ #ifdef HAVE_STRING_H #include #endif +#ifdef HAVE_LIMITS_H +#include +#endif #include "ompi/constants.h" #include "opal/class/opal_pointer_array.h" diff --git a/ompi/errhandler/errcode-internal.h b/ompi/errhandler/errcode-internal.h index f9d022209b..183f2ec65e 100644 --- a/ompi/errhandler/errcode-internal.h +++ b/ompi/errhandler/errcode-internal.h @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #include "ompi/constants.h" #include "opal/class/opal_object.h" #include "opal/class/opal_pointer_array.h" -#include "opal/util/opal_sos.h" #define OMPI_MAX_ERROR_STRING 64 @@ -51,18 +52,13 @@ OMPI_DECLSPEC extern opal_pointer_array_t ompi_errcodes_intern; OMPI_DECLSPEC extern int ompi_errcode_intern_lastused; /** - * Return the MPI errcode for a given internal error code. This - * function guarantees to return a non-OPAL_SOS-encoded error code. 
- */ + * Return the MPI errcode for a given internal error code. */ static inline int ompi_errcode_get_mpi_code(int errcode) { int ret = MPI_ERR_UNKNOWN; int i; ompi_errcode_intern_t *errc; - /* Transmogrify, if necessary */ - errcode = OPAL_SOS_GET_ERROR_CODE(errcode); - /* If the errcode is >= 0, then it's already an MPI error code, so just return it. */ if (errcode >= 0) { diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 9314fd953d..fb621b1c3a 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,7 +33,6 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/btl/btl.h" @@ -273,7 +274,7 @@ static inline int mca_bml_base_send( mca_bml_base_btl_t* bml_btl, des->des_context = (void*) bml_btl; rc = btl->btl_send(btl, bml_btl->btl_endpoint, des, tag); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_RESOURCE_BUSY) + if (rc == OMPI_ERR_RESOURCE_BUSY) rc = OMPI_SUCCESS; return rc; diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index ac5b85b874..b4d58762b1 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -404,7 +404,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, } if (mca_bml_r2.show_unreach_errors && - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_UNREACH == ret) { orte_show_help("help-mca-bml-r2.txt", "unreachable proc", true, diff --git a/ompi/mca/bml/r2/bml_r2_ft.c b/ompi/mca/bml/r2/bml_r2_ft.c index 164dd4b841..72e73accc6 100644 --- a/ompi/mca/bml/r2/bml_r2_ft.c +++ b/ompi/mca/bml/r2/bml_r2_ft.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -25,12 +25,16 @@ #include #include +#include "opal/runtime/opal_progress.h" + +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/util/proc_info.h" + #include "ompi/runtime/ompi_cr.h" #include "ompi/mca/bml/base/base.h" #include "ompi/mca/btl/base/base.h" #include "ompi/mca/bml/base/bml_base_btl.h" #include "ompi/mca/pml/base/base.h" -#include "orte/mca/grpcomm/grpcomm.h" #include "ompi/proc/proc.h" #include "bml_r2.h" @@ -47,6 +51,7 @@ int mca_bml_r2_ft_event(int state) int loc_state; int param_type = -1; char *param_list = NULL; + orte_grpcomm_collective_t coll; if(OPAL_CRS_CHECKPOINT == state) { /* Do nothing for now */ @@ -153,10 +158,15 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. 
*/ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); return ret; } + while (coll.active) { + opal_progress(); + } /* * Re-open the BTL framework to get the full list of components. @@ -226,10 +236,15 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); return ret; } + while (coll.active) { + opal_progress(); + } /* * Re-open the BTL framework to get the full list of components. diff --git a/ompi/mca/btl/ofud/btl_ofud_component.c b/ompi/mca/btl/ofud/btl_ofud_component.c index a3e9659eee..ccbb63b63c 100644 --- a/ompi/mca/btl/ofud/btl_ofud_component.c +++ b/ompi/mca/btl/ofud/btl_ofud_component.c @@ -12,6 +12,8 @@ * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +36,6 @@ #include "ompi/mca/btl/btl.h" #include "opal/mca/timer/base/base.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "orte/mca/errmgr/errmgr.h" #include "ompi/mca/btl/base/base.h" diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 7cdcf6e008..e39e6e44ec 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. @@ -34,7 +34,6 @@ #include "opal/class/opal_bitmap.h" #include "opal/util/output.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "opal/include/opal_stdint.h" #include "ompi/mca/btl/btl.h" @@ -303,7 +302,7 @@ static int create_srq(mca_btl_openib_module_t *openib_btl) /* Check if our device supports modify srq ability */ rc = check_if_device_support_modify_srq(openib_btl); - if(OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_NOT_SUPPORTED == rc) { device_support_modify_srq = false; } else if(OMPI_SUCCESS != rc) { mca_btl_openib_show_init_error(__FILE__, __LINE__, @@ -494,7 +493,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl, endpoint->rem_info.rem_vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { orte_show_help("help-mpi-btl-openib.txt", "error in device init", true, orte_process_info.nodename, @@ -1625,7 +1624,7 @@ int mca_btl_openib_put( 
mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; @@ -1696,7 +1695,7 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 8648b8482e..5180d7da46 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. 
@@ -1125,8 +1125,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) if (OMPI_SUCCESS != rc) { /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_openib_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1161,8 +1161,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_openib_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1658,11 +1658,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) device->ib_dev_attr.vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ goto error; } - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { /* If we didn't find a matching device in the INI files, output a warning that we're using default values (unless overridden that we don't want to see these warnings) */ @@ -1679,7 +1679,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) be set indicating that it does not have good values */ ret = ompi_btl_openib_ini_query(0, 0, &default_values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious 
error, propagate it upwards */ goto error; } @@ -1841,7 +1841,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) device, &mpool_resources); if(NULL == device->mpool){ /* Don't print an error message here -- we'll get one from - mpool_create anyway (OPAL_SOS would be good here...) */ + mpool_create anyway */ goto error; } @@ -1899,7 +1899,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) if (OMPI_SUCCESS != ret) { /* Out of bounds error indicates that we hit max btl number * don't propagate the error to the caller */ - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { ret = OMPI_SUCCESS; } break; @@ -2830,7 +2830,7 @@ btl_openib_component_init(int *num_btl_modules, /* If we get NOT_SUPPORTED, then no CPC was found for this port. But that's not a fatal error -- just keep going; let's see if we find any usable openib modules or not. */ - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_SUPPORTED == ret) { continue; } else if (OMPI_SUCCESS != ret) { /* All others *are* fatal. Note that we already did a @@ -2994,7 +2994,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep) error upward. 
*/ rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag)); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc && - OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) { + OMPI_ERR_RESOURCE_BUSY != rc)) { OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return rc; } @@ -3023,7 +3023,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } @@ -3036,7 +3036,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index abf980aaf4..eb0396f064 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. 
@@ -36,7 +36,6 @@ #include "opal_stdint.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" @@ -714,7 +713,7 @@ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep, rc = mca_btl_openib_endpoint_post_send(ep, frag); } OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) { + if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) { rc = OMPI_SUCCESS; } @@ -898,7 +897,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma( )); } rc = mca_btl_openib_endpoint_send(endpoint, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; MCA_BTL_IB_FRAG_RETURN(frag); diff --git a/ompi/mca/btl/openib/btl_openib_failover.c b/ompi/mca/btl/openib/btl_openib_failover.c index 47c5ec5adc..17d78bf7e1 100644 --- a/ompi/mca/btl/openib/btl_openib_failover.c +++ b/ompi/mca/btl/openib/btl_openib_failover.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,8 +30,6 @@ #include "btl_openib_proc.h" #include "btl_openib_failover.h" -#include "opal/util/opal_sos.h" - static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, struct mca_btl_base_module_t* module, bool errout); @@ -691,7 +691,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr)); } rc = mca_btl_openib_endpoint_send(newep, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) { return; } diff --git a/ompi/mca/btl/openib/btl_openib_ini.c b/ompi/mca/btl/openib/btl_openib_ini.c index b9d0a89df5..f13156d416 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.c +++ b/ompi/mca/btl/openib/btl_openib_ini.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #endif #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "btl_openib.h" @@ -133,13 +134,13 @@ int ompi_btl_openib_ini_init(void) /* Note that NOT_FOUND and SUCCESS are not fatal errors and we keep going. 
Other errors are treated as fatal */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { break; } str = colon + 1; } /* Parse the last file if we didn't have a fatal error above */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { ret = parse_file(str); } @@ -150,7 +151,7 @@ int ompi_btl_openib_ini_init(void) /* Return SUCCESS unless we got a fatal error */ initialized = true; - return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ? + return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ? OMPI_SUCCESS : ret; } diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c index db098f83cb..8caf672242 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -28,7 +30,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" /* * Array of all possible connection functions @@ -219,7 +220,7 @@ int ompi_btl_openib_connect_base_init(void) opal_output(-1, "found available cpc (SUCCESS init): %s", all[i]->cbc_name); continue; - } else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + } else if (OMPI_ERR_NOT_SUPPORTED == rc) { continue; } else { return rc; @@ -265,8 +266,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t * strcat(msg, available[i]->cbc_name); rc = available[i]->cbc_query(btl, &cpcs[cpc_index]); - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc || OMPI_ERR_UNREACH == rc) { continue; } else if (OMPI_SUCCESS != rc) { free(cpcs); diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index 8c54783495..0ca94cf030 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved. * Copyright (c) 2009-2011 IBM Corporation. All rights reserved. 
@@ -30,7 +30,6 @@ #include "orte/util/show_help.h" #include "opal/util/error.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index 2b57793b75..6eb59a7211 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -4,6 +4,8 @@ * Copyright (c) 2008 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -46,7 +48,6 @@ #include "opal/util/output.h" #include "opal/util/error.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" #include "btl_openib_fd.h" @@ -1932,7 +1933,7 @@ out3: out1: free(*cpc); out: - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc) { opal_output_verbose(5, mca_btl_base_output, "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped", ibv_get_device_name(openib_btl->device->ib_dev), diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c index d964f6fe3c..7bc3eaf37d 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c @@ -5,6 +5,8 @@ * Copyright (c) 2010-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -21,7 +23,6 @@ #include "opal/util/output.h" #include "orte/util/show_help.h" #include "orte/util/name_fns.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" @@ -698,10 +699,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces BTL_VERBOSE(("Searching for ep and proc with follow parameters:" "jobid %d, vpid %d, " - "epoch %d, " "sid %" PRIx64 ", lid %d", process_name->jobid, process_name->vpid, - ORTE_EPOCH_GET(process_name), subnet_id, lid)); diff --git a/ompi/mca/btl/portals/btl_portals.c b/ompi/mca/btl/portals/btl_portals.c index d970f3ac41..d756b45f32 100644 --- a/ompi/mca/btl/portals/btl_portals.c +++ b/ompi/mca/btl/portals/btl_portals.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #include "ompi/constants.h" #include "ompi/mca/btl/btl.h" #include "opal/datatype/opal_convertor.h" -#include "opal/util/opal_sos.h" #include "btl_portals.h" #include "btl_portals_endpoint.h" diff --git a/ompi/mca/btl/portals/btl_portals_frag.h b/ompi/mca/btl/portals/btl_portals_frag.h index 797884048d..b4fe43af94 100644 --- a/ompi/mca/btl/portals/btl_portals_frag.h +++ b/ompi/mca/btl/portals/btl_portals_frag.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -67,7 +69,7 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_recv_t); ompi_free_list_item_t *item; \ OMPI_FREE_LIST_GET(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \ frag = (mca_btl_portals_frag_t*) item; \ - if (OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \ + if (rc == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \ OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc); \ } \ } diff --git a/ompi/mca/btl/portals/btl_portals_send.c b/ompi/mca/btl/portals/btl_portals_send.c index 819c8f17d8..c09fb915c4 100644 --- a/ompi/mca/btl/portals/btl_portals_send.c +++ b/ompi/mca/btl/portals/btl_portals_send.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,6 @@ #include "ompi/constants.h" #include "opal/datatype/opal_convertor.h" -#include "opal/util/opal_sos.h" #include "btl_portals.h" #include "btl_portals_send.h" diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index bd30e74eb0..8b1866ba96 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -12,6 +12,8 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Laboratory + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -53,7 +55,6 @@ #include "opal/util/output.h" #include "opal/util/argv.h" #include "opal/util/net.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "orte/types.h" @@ -1055,7 +1056,7 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, } #if OPAL_WANT_IPV6 if((ret = mca_btl_tcp_component_create_listen(AF_INET6)) != OMPI_SUCCESS) { - if (!(OMPI_ERR_IN_ERRNO == OPAL_SOS_GET_ERROR_CODE(ret) && + if (!(OMPI_ERR_IN_ERRNO == ret && EAFNOSUPPORT == opal_socket_errno)) { opal_output (0, "mca_btl_tcp_component: IPv6 listening socket failed\n"); return 0; diff --git a/ompi/mca/btl/ugni/btl_ugni_put.c b/ompi/mca/btl/ugni/btl_ugni_put.c index 4e38194892..1f67a9e291 100644 --- a/ompi/mca/btl/ugni/btl_ugni_put.c +++ b/ompi/mca/btl/ugni/btl_ugni_put.c @@ -13,7 +13,6 @@ #include "opal/include/opal_stdint.h" #include "btl_ugni_rdma.h" -#include "opal/util/opal_sos.h" /** * Initiate a put operation. diff --git a/ompi/mca/btl/wv/btl_wv.c b/ompi/mca/btl/wv/btl_wv.c index b03d312815..197a72b64c 100644 --- a/ompi/mca/btl/wv/btl_wv.c +++ b/ompi/mca/btl/wv/btl_wv.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -32,7 +32,6 @@ #include "opal/class/opal_bitmap.h" #include "opal/util/output.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -309,7 +308,7 @@ static int mca_btl_wv_tune_endpoint(mca_btl_wv_module_t* wv_btl, endpoint->rem_info.rem_vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { orte_show_help("help-mpi-btl-wv.txt", "error in device init", true, orte_process_info.nodename, @@ -1347,7 +1346,7 @@ int mca_btl_wv_put(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; @@ -1406,7 +1405,7 @@ int mca_btl_wv_get(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; diff --git a/ompi/mca/btl/wv/btl_wv_component.c b/ompi/mca/btl/wv/btl_wv_component.c index e7d47c0409..12dc8c8e25 100644 --- a/ompi/mca/btl/wv/btl_wv_component.c +++ b/ompi/mca/btl/wv/btl_wv_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -784,8 +784,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device) if (OMPI_SUCCESS != rc) { /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_wv_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -820,8 +820,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device) /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_wv_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1312,11 +1312,11 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) device->ib_dev_attr.VendorPartId, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ goto error; } - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { /* If we didn't find a matching device in the INI files, output a warning that we're using default values (unless overridden that we don't want to see these warnings) */ @@ -1333,7 +1333,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) be set indicating that it does not have good values */ ret = ompi_btl_wv_ini_query(0, 0, &default_values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ 
goto error; } @@ -1429,7 +1429,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) device, &mpool_resources); if(NULL == device->mpool){ /* Don't print an error message here -- we'll get one from - mpool_create anyway (OPAL_SOS would be good here...) */ + mpool_create anyway */ goto error; } @@ -1481,7 +1481,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) if (OMPI_SUCCESS != ret) { /* Out of bounds error indicates that we hit max btl number * don't propagate the error to the caller */ - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { ret = OMPI_SUCCESS; } break; @@ -2313,7 +2313,7 @@ btl_wv_component_init(int *num_btl_modules, /* If we get NOT_SUPPORTED, then no CPC was found for this port. But that's not a fatal error -- just keep going; let's see if we find any usable wv modules or not. */ - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_SUPPORTED == ret) { continue; } else if (OMPI_SUCCESS != ret) { /* All others *are* fatal. Note that we already did a @@ -2469,7 +2469,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep) error upward. 
*/ rc = mca_btl_wv_endpoint_post_send(ep, to_send_frag(frag)); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc && - OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) { + OMPI_ERR_RESOURCE_BUSY != rc)) { OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return rc; } @@ -2497,7 +2497,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_wv_get((mca_btl_base_module_t *)wv_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } @@ -2510,7 +2510,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_wv_put((mca_btl_base_module_t *)wv_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/btl/wv/btl_wv_endpoint.c b/ompi/mca/btl/wv/btl_wv_endpoint.c index 6319238438..2a37877e50 100644 --- a/ompi/mca/btl/wv/btl_wv_endpoint.c +++ b/ompi/mca/btl/wv/btl_wv_endpoint.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. 
@@ -33,7 +33,6 @@ #include "opal_stdint.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" @@ -617,7 +616,7 @@ int mca_btl_wv_endpoint_send(mca_btl_base_endpoint_t* ep, rc = mca_btl_wv_endpoint_post_send(ep, frag); } OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) { + if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) { rc = OMPI_SUCCESS; } @@ -801,7 +800,7 @@ static int mca_btl_wv_endpoint_send_eager_rdma( )); } rc = mca_btl_wv_endpoint_send(endpoint, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; MCA_BTL_IB_FRAG_RETURN(frag); diff --git a/ompi/mca/btl/wv/btl_wv_ini.c b/ompi/mca/btl/wv/btl_wv_ini.c index 8ee2f171e7..dd593e9913 100644 --- a/ompi/mca/btl/wv/btl_wv_ini.c +++ b/ompi/mca/btl/wv/btl_wv_ini.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #endif #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "btl_wv.h" @@ -127,13 +128,13 @@ int ompi_btl_wv_ini_init(void) /* Note that NOT_FOUND and SUCCESS are not fatal errors and we keep going. 
Other errors are treated as fatal */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { break; } str = colon + 1; } /* Parse the last file if we didn't have a fatal error above */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { ret = parse_file(str); } @@ -144,7 +145,7 @@ int ompi_btl_wv_ini_init(void) /* Return SUCCESS unless we got a fatal error */ initialized = true; - return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ? + return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ? OMPI_SUCCESS : ret; } diff --git a/ompi/mca/btl/wv/connect/btl_wv_connect_base.c b/ompi/mca/btl/wv/connect/btl_wv_connect_base.c index 7c09c28c9b..2030144d7c 100644 --- a/ompi/mca/btl/wv/connect/btl_wv_connect_base.c +++ b/ompi/mca/btl/wv/connect/btl_wv_connect_base.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -19,7 +21,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" /* * Array of all possible connection functions @@ -183,7 +184,7 @@ int ompi_btl_wv_connect_base_init(void) opal_output(-1, "found available cpc (SUCCESS init): %s", all[i]->cbc_name); continue; - } else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + } else if (OMPI_ERR_NOT_SUPPORTED == rc) { continue; } else { return rc; @@ -229,8 +230,8 @@ int ompi_btl_wv_connect_base_select_for_local_port(mca_btl_wv_module_t *btl) strcat(msg, available[i]->cbc_name); rc = available[i]->cbc_query(btl, &cpcs[cpc_index]); - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc || + OMPI_ERR_UNREACH == rc) { continue; } else if (OMPI_SUCCESS != rc) { free(cpcs); diff --git a/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c b/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c index 39ed2e2b69..07e2ad1e35 100644 --- a/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c +++ b/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. 
@@ -28,7 +28,6 @@ #include "orte/util/show_help.h" #include "opal/util/error.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c index 33649ae2d4..e8bea87cc4 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c @@ -702,7 +702,6 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t, void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) { peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; peer_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); @@ -730,7 +729,6 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; peer_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { HOKE_TRAFFIC_MSG_REF_RETURN(item); @@ -840,7 +838,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->matched = INVALID_INT; msg_ref->done = INVALID_INT; @@ -868,7 +865,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->matched = INVALID_INT; msg_ref->done = INVALID_INT; @@ -902,7 +898,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m msg_ref->proc_name.jobid = 
ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->done = INVALID_INT; msg_ref->active = INVALID_INT; @@ -934,7 +929,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->done = INVALID_INT; msg_ref->active = INVALID_INT; @@ -954,7 +948,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); } void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { @@ -962,7 +955,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); } @@ -1034,7 +1026,6 @@ do { \ \ msg_ref->proc_name.jobid = p_jobid; \ msg_ref->proc_name.vpid = p_vpid; \ - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \ \ msg_ref->matched = 0; \ msg_ref->done = 0; \ @@ -1063,7 +1054,6 @@ do { \ \ msg_ref->proc_name.jobid = p_jobid; \ msg_ref->proc_name.vpid = p_vpid; \ - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \ } @@ -1466,7 +1456,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs( new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; - ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super)); } @@ -3375,7 +3364,6 @@ static int 
traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m if( NULL == from_peer_ref && NULL != to_peer_ref ) { (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; - ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); } return exit_status; @@ -5281,7 +5269,6 @@ static int send_bookmarks(int peer_idx) */ peer_name.jobid = ORTE_PROC_MY_NAME->jobid; peer_name.vpid = peer_idx; - ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); if( NULL == (peer_ref = find_peer(peer_name))) { opal_output(mca_crcp_bkmrk_component.super.output_handle, @@ -5342,7 +5329,6 @@ static int recv_bookmarks(int peer_idx) peer_name.jobid = ORTE_PROC_MY_NAME->jobid; peer_name.vpid = peer_idx; - ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, OMPI_CRCP_COORD_BOOKMARK_TAG, @@ -5524,7 +5510,6 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref, HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; - ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); d_msg_ack->complete = false; opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); diff --git a/ompi/mca/dpm/base/dpm_base_select.c b/ompi/mca/dpm/base/dpm_base_select.c index f22f9d0ec6..09764183cd 100644 --- a/ompi/mca/dpm/base/dpm_base_select.c +++ b/ompi/mca/dpm/base/dpm_base_select.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -41,7 +42,7 @@ int ompi_dpm_base_select(void) (mca_base_module_t **) &best_module, (mca_base_component_t **) &best_component))) { /* it is okay not to find any executable components */ - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { ret = OPAL_SUCCESS; } goto cleanup; diff --git a/ompi/mca/dpm/orte/dpm_orte.c b/ompi/mca/dpm/orte/dpm_orte.c index 71d54fd15f..613d73c611 100644 --- a/ompi/mca/dpm/orte/dpm_orte.c +++ b/ompi/mca/dpm/orte/dpm_orte.c @@ -12,6 +12,8 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +30,6 @@ #include "opal/util/argv.h" #include "opal/util/opal_getcwd.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" @@ -65,7 +66,6 @@ static orte_process_name_t carport; static void recv_cb(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); -static void process_cb(int fd, short event, void *data); /* API functions */ static int init(void); @@ -104,6 +104,13 @@ ompi_dpm_base_module_t ompi_dpm_orte_module = { finalize }; +static void rml_cbfunc(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + OBJ_RELEASE(buffer); +} + /* * Init the module @@ -136,7 +143,11 @@ static int connect_accept ( ompi_communicator_t *comm, int root, int i,j, new_proc_len; ompi_group_t *new_group_pointer; - + orte_grpcomm_coll_id_t id; + orte_grpcomm_collective_t modex; + opal_list_item_t *item; + orte_namelist_t *nm; + OPAL_OUTPUT_VERBOSE((1, ompi_dpm_base_output, "%s dpm:orte:connect_accept with port %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -179,6 +190,65 @@ static int connect_accept ( ompi_communicator_t *comm, int root, opal_progress_event_users_increment(); if ( rank == root ) { + if (send_first) { + /* Get a collective id for the modex we need later on - we + * have to get a globally unique id for this purpose as + * multiple threads can do simultaneous connect/accept, + * and the same processes can be engaged in multiple + * connect/accepts at the same time. 
Only one side + * needs to do this, so have it be send_first + */ + nbuf = OBJ_NEW(opal_buffer_t); + if (NULL == nbuf) { + return OMPI_ERROR; + } + /* send the request - doesn't have to include any data */ + rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, nbuf, ORTE_RML_TAG_COLL_ID_REQ, 0, rml_cbfunc, NULL); + /* wait for the id */ + recv_completed = false; + cabuf = OBJ_NEW(opal_buffer_t); + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID, + ORTE_RML_NON_PERSISTENT, recv_cb, NULL); + /* wait for response */ + while (!recv_completed) { + opal_progress(); + } + i=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cabuf); + return OMPI_ERROR; + } + OBJ_RELEASE(cabuf); + /* send it to my peer on the other side */ + nbuf = OBJ_NEW(opal_buffer_t); + if (NULL == nbuf) { + return OMPI_ERROR; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + rc = orte_rml.send_buffer_nb(&port, nbuf, tag, 0, rml_cbfunc, NULL); + } else { + /* wait to recv the collective id */ + recv_completed = false; + cabuf = OBJ_NEW(opal_buffer_t); + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, + ORTE_RML_NON_PERSISTENT, recv_cb, NULL); + /* wait for response */ + while (!recv_completed) { + opal_progress(); + } + i=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cabuf); + return OMPI_ERROR; + } + OBJ_RELEASE(cabuf); + } + /* Generate the message buffer containing the number of processes and the list of participating processes */ nbuf = OBJ_NEW(opal_buffer_t); @@ -186,6 +256,12 @@ static int connect_accept ( ompi_communicator_t *comm, int root, return OMPI_ERROR; } + /* pass the collective id so we can all use it */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + if 
(OPAL_SUCCESS != (rc = opal_dss.pack(nbuf, &size, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); goto exit; @@ -244,7 +320,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root, rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, ORTE_RML_NON_PERSISTENT, recv_cb, NULL); /* wait for response */ - ORTE_PROGRESSED_WAIT(recv_completed, 0, 1); + while (!recv_completed) { + opal_progress(); + } OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept got data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -259,7 +337,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root, rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, ORTE_RML_NON_PERSISTENT, recv_cb, NULL); /* wait for response */ - ORTE_PROGRESSED_WAIT(recv_completed, 0, 1); + while (!recv_completed) { + opal_progress(); + } /* now send our info */ OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept sending info to %s", @@ -324,6 +404,13 @@ static int connect_accept ( ompi_communicator_t *comm, int root, goto exit; } + /* unload the collective id */ + num_vals = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(nrbuf, &id, &num_vals, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + num_vals = 1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(nrbuf, &rsize, &num_vals, OPAL_INT))) { ORTE_ERROR_LOG(rc); @@ -360,7 +447,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < rsize ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = rprocs[i]->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept send first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -369,7 +456,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < group->grp_proc_count ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = ompi_group_peer_lookup(group, i)->proc_name; 
- opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept send first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -380,7 +467,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < group->grp_proc_count ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = ompi_group_peer_lookup(group, i)->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept recv first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -389,7 +476,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < rsize ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = rprocs[i]->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept recv first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -402,10 +489,28 @@ static int connect_accept ( ompi_communicator_t *comm, int root, "%s dpm:orte:connect_accept executing modex", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_grpcomm.modex(&all_procs))) { + /* setup the modex */ + OBJ_CONSTRUCT(&modex, orte_grpcomm_collective_t); + modex.id = id; + /* copy across the list of participants */ + for (item = opal_list_get_first(&all_procs); + item != opal_list_get_end(&all_procs); + item = opal_list_get_next(item)) { + nm = (orte_namelist_t*)item; + name = OBJ_NEW(orte_namelist_t); + name->name = nm->name; + opal_list_append(&modex.participants, &name->super); + } + + /* perform it */ + if (OMPI_SUCCESS != (rc = orte_grpcomm.modex(&modex))) { ORTE_ERROR_LOG(rc); goto exit; } + while (modex.active) { + opal_progress(); + } + OBJ_DESTRUCT(&modex); OPAL_OUTPUT_VERBOSE((3, 
ompi_dpm_base_output, "%s dpm:orte:connect_accept modex complete", @@ -1521,33 +1626,12 @@ static void recv_cb(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release when processed - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_cb); - - -} -static void process_cb(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - /* copy the payload to the global buffer */ - opal_dss.copy_payload(cabuf, mev->buffer); + opal_dss.copy_payload(cabuf, buffer); /* flag the identity of the remote proc */ - carport.jobid = mev->sender.jobid; - carport.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); - - /* release the event */ - OBJ_RELEASE(mev); + carport.jobid = sender->jobid; + carport.vpid = sender->vpid; /* flag complete */ recv_completed = true; diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c index da77c1cf84..92345fa2a6 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,6 @@ #include "mpi.h" #include "opal/runtime/opal_progress.h" #include "opal/threads/mutex.h" -#include "opal/util/opal_sos.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/osc/base/base.h" @@ -122,7 +123,7 @@ ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); @@ -267,7 +268,7 @@ ompi_osc_pt2pt_module_complete(ompi_win_t *win) ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); @@ -490,7 +491,7 @@ ompi_osc_pt2pt_module_unlock(int target, ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 75d9bc821c..3642295cc4 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -7,7 +7,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -137,7 +137,7 @@ ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), @@ -209,7 +209,7 @@ ompi_osc_rdma_module_get(void *origin_addr, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), @@ -278,7 +278,7 @@ ompi_osc_rdma_module_put(void *origin_addr, int origin_count, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c index e792299eb0..12b3c0e00c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_sync.c +++ b/ompi/mca/osc/rdma/osc_rdma_sync.c @@ -7,7 +7,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ @@ -140,7 +140,7 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) opal_list_remove_first(&(module->m_copy_pending_sendreqs)); ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), (opal_list_item_t*)req); } else if (OMPI_SUCCESS != ret) { return ret; @@ -355,7 +355,7 @@ ompi_osc_rdma_module_complete(ompi_win_t *win) (ompi_osc_rdma_sendreq_t*) item; ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), item); break; } else if (OMPI_SUCCESS != ret) { @@ -589,7 +589,7 @@ ompi_osc_rdma_module_unlock(int target, (ompi_osc_rdma_sendreq_t*) item; ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), item); break; } else if (OMPI_SUCCESS != ret) { diff --git a/ompi/mca/pml/base/pml_base_select.c b/ompi/mca/pml/base/pml_base_select.c index 1906ba7085..fd00dcb756 100644 --- a/ompi/mca/pml/base/pml_base_select.c +++ b/ompi/mca/pml/base/pml_base_select.c @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,7 +27,6 @@ #include "opal/class/opal_list.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" #include "opal/runtime/opal_progress.h" #include "opal/mca/mca.h" @@ -354,7 +355,7 @@ mca_pml_base_pml_check_selected(const char *my_pml, (void**) &remote_pml, &size); /* if modex isn't implemented, then just assume all is well... */ - if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_IMPLEMENTED == ret) { opal_output_verbose( 10, mca_pml_base_output, "check:select: modex not implemented"); return OMPI_SUCCESS; diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c index 6eb5b7c755..879d4b5047 100644 --- a/ompi/mca/pml/bfo/pml_bfo.c +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -14,6 +14,8 @@ * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -587,7 +589,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.pckt_pending, (opal_list_item_t*)pckt); @@ -608,7 +610,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl) #else /* PML_BFO */ pckt->hdr.hdr_fin.hdr_fail); #endif /* PML_BFO */ - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -640,7 +642,7 @@ void mca_pml_bfo_process_pending_rdma(void) } else { rc = mca_pml_bfo_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.c b/ompi/mca/pml/bfo/pml_bfo_failover.c index ede5c8bf4f..fee412f9db 100644 --- a/ompi/mca/pml/bfo/pml_bfo_failover.c +++ b/ompi/mca/pml/bfo/pml_bfo_failover.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -839,7 +841,7 @@ void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq, /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); - if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc))) + if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc)) return; } add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); @@ -897,7 +899,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des) rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); if (OMPI_SUCCESS == rc) { return; - } else if (OMPI_ERR_OUT_OF_RESOURCE == (OPAL_SOS_GET_ERROR_CODE(rc))) { + } else if (OMPI_ERR_OUT_OF_RESOURCE == rc) { opal_output_verbose(30, mca_pml_bfo_output, "Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d", (int)sendreq->req_endpoint->btl_eager.arr_size); diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.c b/ompi/mca/pml/bfo/pml_bfo_recvreq.c index f032ade8bd..033f460a0e 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -56,7 +58,7 @@ void mca_pml_bfo_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_bfo_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -433,7 +435,7 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.h b/ompi/mca/pml/bfo/pml_bfo_recvreq.h index 69d12b0434..be9e1c441e 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -369,7 +371,7 @@ static inline int mca_pml_bfo_recv_request_schedule_exclusive( do { rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.c b/ompi/mca/pml/bfo/pml_bfo_sendreq.c index 63549a57d8..547514b971 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,7 +59,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_BFO_SEND_PENDING_SCHEDULE: rc = mca_pml_bfo_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -70,7 +72,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_BFO_SEND_PENDING_START, true); } else { rc = mca_pml_bfo_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -618,8 +620,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq, } return OMPI_SUCCESS; } - - if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_RESOURCE_BUSY == rc) { /* No more resources. Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; } @@ -1311,7 +1312,7 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.h b/ompi/mca/pml/bfo/pml_bfo_sendreq.h index 52d8b896b7..3ef8900458 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -293,7 +295,7 @@ mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq) int rc; do { rc = mca_pml_bfo_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -458,7 +460,7 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); diff --git a/ompi/mca/pml/csum/pml_csum.c b/ompi/mca/pml/csum/pml_csum.c index 7935ce3971..4ae381e398 100644 --- a/ompi/mca/pml/csum/pml_csum.c +++ b/ompi/mca/pml/csum/pml_csum.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
@@ -586,7 +586,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.pckt_pending, (opal_list_item_t*)pckt); @@ -599,7 +599,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_fin.hdr_des, pckt->order, pckt->hdr.hdr_fin.hdr_fail); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -631,7 +631,7 @@ void mca_pml_csum_process_pending_rdma(void) } else { rc = mca_pml_csum_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.c b/ompi/mca/pml/csum/pml_csum_recvreq.c index e0bc7e8d89..1ab808b963 100644 --- a/ompi/mca/pml/csum/pml_csum_recvreq.c +++ b/ompi/mca/pml/csum/pml_csum_recvreq.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
@@ -83,7 +83,7 @@ void mca_pml_csum_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_csum_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -425,7 +425,7 @@ int mca_pml_csum_recv_request_get_frag( mca_pml_csum_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.rdma_pending, diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.h b/ompi/mca/pml/csum/pml_csum_recvreq.h index b62d96a116..3245b82c2e 100644 --- a/ompi/mca/pml/csum/pml_csum_recvreq.h +++ b/ompi/mca/pml/csum/pml_csum_recvreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -346,7 +348,7 @@ static inline int mca_pml_csum_recv_request_schedule_exclusive( do { rc = mca_pml_csum_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.c b/ompi/mca/pml/csum/pml_csum_sendreq.c index aac1a2c462..758181eff6 100644 --- a/ompi/mca/pml/csum/pml_csum_sendreq.c +++ b/ompi/mca/pml/csum/pml_csum_sendreq.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. 
All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * $COPYRIGHT$ @@ -66,7 +66,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_CSUM_SEND_PENDING_SCHEDULE: rc = mca_pml_csum_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -79,7 +79,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_CSUM_SEND_PENDING_START, true); } else { rc = mca_pml_csum_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -590,7 +590,7 @@ int mca_pml_csum_send_request_start_copy( mca_pml_csum_send_request_t* sendreq, } return OMPI_SUCCESS; } - switch(OPAL_SOS_GET_ERROR_CODE(rc)) { + switch(rc) { case OMPI_ERR_RESOURCE_BUSY: /* No more resources. 
Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; @@ -1256,7 +1256,7 @@ int mca_pml_csum_send_request_put_frag( mca_pml_csum_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.h b/ompi/mca/pml/csum/pml_csum_sendreq.h index bdaeb7e841..f2e84840f7 100644 --- a/ompi/mca/pml/csum/pml_csum_sendreq.h +++ b/ompi/mca/pml/csum/pml_csum_sendreq.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
* $COPYRIGHT$ @@ -287,7 +287,7 @@ mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendre int rc; do { rc = mca_pml_csum_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -434,7 +434,7 @@ mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true); diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.c b/ompi/mca/pml/dr/pml_dr_sendreq.c index e0e3bdb3b4..16ce9f6266 100644 --- a/ompi/mca/pml/dr/pml_dr_sendreq.c +++ b/ompi/mca/pml/dr/pml_dr_sendreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Mellanox Technologies. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -102,7 +104,7 @@ static void mca_pml_dr_error_completion( mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - switch(OPAL_SOS_GET_ERROR_CODE(status)) { + switch(status) { case OMPI_ERR_UNREACH: /** * peer is no longer reachable through this btl diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index bbc84ec985..ff2aa14d3e 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -14,6 +14,8 @@ * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -558,7 +560,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.pckt_pending, (opal_list_item_t*)pckt); @@ -571,7 +573,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_fin.hdr_des, pckt->order, pckt->hdr.hdr_fin.hdr_fail); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -603,7 +605,7 @@ void mca_pml_ob1_process_pending_rdma(void) } else { rc = mca_pml_ob1_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index e303d9e496..5beb4d1a2a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -12,6 +12,8 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -54,7 +56,7 @@ void mca_pml_ob1_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -391,7 +393,7 @@ int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 182e3d2d28..2646f2fd9b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -347,7 +349,7 @@ static inline int mca_pml_ob1_recv_request_schedule_exclusive( do { rc = mca_pml_ob1_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 71bfa7c382..4c0b6d1cf1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -58,7 +58,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_OB1_SEND_PENDING_SCHEDULE: rc = mca_pml_ob1_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -71,7 +71,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_OB1_SEND_PENDING_START, true); } else { rc = mca_pml_ob1_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -550,7 +550,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, return OMPI_SUCCESS; } - if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_RESOURCE_BUSY == rc) { /* No more resources. 
Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; } @@ -1192,7 +1192,7 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index cae9570cf4..9ef7e818b8 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -285,7 +287,7 @@ mca_pml_ob1_send_request_schedule_exclusive(mca_pml_ob1_send_request_t* sendreq) int rc; do { rc = mca_pml_ob1_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -444,7 +446,7 @@ mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true); diff --git a/ompi/mca/pubsub/base/pubsub_base_select.c b/ompi/mca/pubsub/base/pubsub_base_select.c index a69e1f17b7..301e0110d7 100644 --- a/ompi/mca/pubsub/base/pubsub_base_select.c +++ 
b/ompi/mca/pubsub/base/pubsub_base_select.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -41,7 +42,7 @@ int ompi_pubsub_base_select(void) (mca_base_module_t **) &best_module, (mca_base_component_t **) &best_component))) { /* it is okay not to find any executable components */ - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { ret = OPAL_SUCCESS; } goto cleanup; diff --git a/ompi/mca/pubsub/orte/pubsub_orte.c b/ompi/mca/pubsub/orte/pubsub_orte.c index a87cb4a0e0..4ad80f00f2 100644 --- a/ompi/mca/pubsub/orte/pubsub_orte.c +++ b/ompi/mca/pubsub/orte/pubsub_orte.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +28,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/rcache/rb/rcache_rb.c b/ompi/mca/rcache/rb/rcache_rb.c index 5087378300..535decad3a 100644 --- a/ompi/mca/rcache/rb/rcache_rb.c +++ b/ompi/mca/rcache/rb/rcache_rb.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -121,7 +123,7 @@ int mca_rcache_rb_insert ( if(flags & MCA_MPOOL_FLAGS_CACHE) { rc = mca_rcache_rb_mru_insert( (mca_rcache_rb_module_t*) rcache, reg); if(OMPI_SUCCESS != rc) { - if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { /* * If the registration is too big for the rcache, * don't cache it and reset the flags so the upper level diff --git a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c index 515e0b305e..e6ccaf3678 100644 --- a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c +++ b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2004-2011 The Trustees of the University of Tennessee. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -10,7 +12,6 @@ #include "ompi_config.h" #include "vprotocol_pessimist_eventlog.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" diff --git a/ompi/mpi/c/unpublish_name.c b/ompi/mpi/c/unpublish_name.c index 24a135545b..e7ecb782ea 100644 --- a/ompi/mpi/c/unpublish_name.c +++ b/ompi/mpi/c/unpublish_name.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,8 +27,6 @@ #include "ompi/info/info.h" #include "ompi/mca/pubsub/pubsub.h" -#include "opal/util/opal_sos.h" - #if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES #pragma weak MPI_Unpublish_name = PMPI_Unpublish_name #endif @@ -68,13 +68,13 @@ int MPI_Unpublish_name(char *service_name, MPI_Info info, */ rc = ompi_pubsub.unpublish(service_name, info); if ( OMPI_SUCCESS != rc ) { - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_FOUND == rc) { /* service couldn't be found */ OPAL_CR_EXIT_LIBRARY(); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_SERVICE, FUNC_NAME); } - if (OMPI_ERR_PERM == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_PERM == rc) { /* this process didn't own the specified service */ OPAL_CR_EXIT_LIBRARY(); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ACCESS, diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 98b048a1d9..b4f8204a6e 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +28,6 @@ #include "opal/threads/mutex.h" #include "opal/dss/dss.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -108,7 +109,6 @@ int ompi_proc_init(void) proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; proc->proc_name.vpid = i; - ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); if (i == ORTE_PROC_MY_NAME->vpid) { ompi_proc_local_proc = proc; @@ -170,7 +170,7 @@ int ompi_proc_complete_init(void) break; #endif } - } else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { proc->proc_arch = opal_local_arch; } else { errcode = ret; @@ -362,7 +362,6 @@ int ompi_proc_refresh(void) { /* Does not change: proc->proc_name.vpid */ proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; - ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); /* Make sure to clear the local flag before we set it below */ proc->proc_flags = 0; diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index 3ec30856fc..976e69b66a 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 
@@ -87,6 +87,7 @@ #endif #include "ompi/runtime/ompi_cr.h" + int ompi_mpi_finalize(void) { int ret, value; @@ -94,6 +95,7 @@ int ompi_mpi_finalize(void) opal_list_item_t *item; struct timeval ompistart, ompistop; bool timing = false; + orte_grpcomm_collective_t *coll; /* Be a bit social if an erroneous program calls MPI_FINALIZE in two different threads, otherwise we may deadlock in @@ -229,11 +231,19 @@ int ompi_mpi_finalize(void) MPI barrier doesn't ensure that all messages have been transmitted before exiting, so the possibility of a stranded message exists. */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_fini_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) { ORTE_ERROR_LOG(ret); return ret; } + /* wait for barrier to complete */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); + /* check for timing request - get stop time and report elapsed time if so */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 529f36eb47..438acdae8a 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -101,6 +101,7 @@ #include "ompi/runtime/ompi_cr.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/name_fns.h" /* This is required for the boundaries of the hash tables used to store * the F90 types returned by the MPI_Type_create_f90_XXX functions. @@ -290,6 +291,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) struct timeval ompistart, ompistop; char *event_val = NULL; bool orte_setup = false; + orte_grpcomm_collective_t *coll; /* bitflag of the thread level support provided. To be used * for the modex in order to work in heterogeneous environments. 
*/ @@ -547,10 +549,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* exchange connection info - this function also acts as a barrier * as it will not return until the exchange is complete */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_modex; + if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) { error = "orte_grpcomm_modex failed"; goto error; } + /* wait for modex to complete - this may be moved anywhere in mpi_init + * so long as it occurs prior to calling a function that needs + * the modex info! + */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); @@ -897,7 +909,7 @@ MOVEON: /* If we got "unreachable", then print a specific error message. Otherwise, if we got some other failure, fall through to print a generic message. */ - if (OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_UNREACH == ret) { orte_show_help("help-mpi-runtime", "mpi_init:startup:pml-add-procs-fail", true); error = NULL; @@ -934,11 +946,18 @@ MOVEON: } /* wait for everyone to reach this point */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_init_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) { error = "orte_grpcomm_barrier failed"; goto error; } - + /* wait for barrier to complete */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); + /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { diff --git a/ompi/tools/ompi-server/ompi-server.c b/ompi/tools/ompi-server/ompi-server.c index 735f71a1f5..0e0f997238 100644 --- a/ompi/tools/ompi-server/ompi-server.c +++ 
b/ompi/tools/ompi-server/ompi-server.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -46,7 +46,6 @@ #include "opal/mca/base/base.h" #include "opal/util/cmd_line.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/util/show_help.h" #include "opal/util/daemon_init.h" #include "opal/runtime/opal.h" @@ -287,7 +286,9 @@ int main(int argc, char *argv[]) } /* wait to hear we are done */ - opal_event_dispatch(opal_event_base); + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } /* should never get here, but if we do... */ diff --git a/ompi/tools/ompi_info/components.c b/ompi/tools/ompi_info/components.c index 49bb325c03..ad3cfe6822 100644 --- a/ompi/tools/ompi_info/components.c +++ b/ompi/tools/ompi_info/components.c @@ -101,6 +101,8 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/ess/ess.h" @@ -396,6 +398,14 @@ void ompi_info_open_components(void) */ orte_process_info.proc_type = ORTE_PROC_HNP; + if (ORTE_SUCCESS != orte_state_base_open()) { + goto error; + } + map = OBJ_NEW(ompi_info_component_map_t); + map->type = strdup("state"); + map->components = &orte_state_base_components_available; + opal_pointer_array_add(&component_map, map); + if (ORTE_SUCCESS != orte_errmgr_base_open()) { goto error; } @@ -789,7 +799,8 @@ void ompi_info_close_components() #endif (void) orte_errmgr_base_close(); - + (void) orte_state_base_close(); + (void) opal_backtrace_base_close(); (void) opal_memory_base_close(); (void) 
opal_memchecker_base_close(); diff --git a/ompi/tools/ompi_info/ompi_info.c b/ompi/tools/ompi_info/ompi_info.c index 9fac292498..2111d61a2a 100644 --- a/ompi/tools/ompi_info/ompi_info.c +++ b/ompi/tools/ompi_info/ompi_info.c @@ -268,6 +268,7 @@ int main(int argc, char *argv[]) opal_pointer_array_add(&mca_types, "filem"); #endif /* these are always included */ + opal_pointer_array_add(&mca_types, "state"); opal_pointer_array_add(&mca_types, "errmgr"); opal_pointer_array_add(&mca_types, "ess"); opal_pointer_array_add(&mca_types, "grpcomm"); diff --git a/opal/mca/base/mca_base_components_open.c b/opal/mca/base/mca_base_components_open.c index e5e5a1f7f8..b6743a7e61 100644 --- a/opal/mca/base/mca_base_components_open.c +++ b/opal/mca/base/mca_base_components_open.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -27,7 +29,6 @@ #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -392,7 +393,7 @@ static int open_components(const char *type_name, int output_id, "mca: base: components_open: " "component %s register function successful", component->mca_component_name); - } else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OPAL_ERR_NOT_AVAILABLE != ret) { /* If the component returns OPAL_ERR_NOT_AVAILABLE, it's a cue to "silently ignore me" -- it's not a failure, it's just a way for the component to say @@ -432,7 +433,7 @@ static int open_components(const char *type_name, int output_id, "mca: base: components_open: " "component %s open function successful", component->mca_component_name); - } else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OPAL_ERR_NOT_AVAILABLE != ret) { /* If the component returns OPAL_ERR_NOT_AVAILABLE, it's a cue to "silently ignore me" -- it's not a failure, it's just a way for the component to say diff --git a/opal/mca/compress/base/compress_base_open.c b/opal/mca/compress/base/compress_base_open.c index 6324ab5887..53f0f69f6d 100644 --- a/opal/mca/compress/base/compress_base_open.c +++ b/opal/mca/compress/base/compress_base_open.c @@ -2,6 +2,8 @@ * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/compress/compress.h" #include "opal/mca/compress/base/base.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/mca/compress/base/static-components.h" @@ -84,7 +85,7 @@ int opal_compress_base_open(void) mca_compress_base_static_components, &opal_compress_base_components_available, true)) ) { - if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) && + if( OPAL_ERR_NOT_FOUND == ret && NULL != str_value && 0 == strncmp(str_value, "none", strlen("none")) ) { exit_status = OPAL_SUCCESS; diff --git a/opal/mca/crs/base/crs_base_open.c b/opal/mca/crs/base/crs_base_open.c index b172fea33a..40bca363f3 100644 --- a/opal/mca/crs/base/crs_base_open.c +++ b/opal/mca/crs/base/crs_base_open.c @@ -8,6 +8,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Evergrid, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -28,7 +30,6 @@ #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/mca/crs/base/static-components.h" @@ -95,7 +96,7 @@ int opal_crs_base_open(void) mca_crs_base_static_components, &opal_crs_base_components_available, true)) ) { - if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) && + if( OPAL_ERR_NOT_FOUND == ret && NULL != str_value && 0 == strncmp(str_value, "none", strlen("none")) ) { exit_status = OPAL_SUCCESS; diff --git a/opal/mca/event/base/base.h b/opal/mca/event/base/base.h index 567f2e7a1c..1b87c95149 100644 --- a/opal/mca/event/base/base.h +++ b/opal/mca/event/base/base.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -12,6 +14,8 @@ #include "opal_config.h" +#include "opal/class/opal_pointer_array.h" + #include "opal/mca/event/event.h" /* @@ -95,7 +99,6 @@ OPAL_DECLSPEC int opal_event_base_close(void); OPAL_DECLSPEC extern int opal_event_base_output; OPAL_DECLSPEC extern opal_list_t opal_event_components; - END_C_DECLS #endif /* OPAL_BASE_EVENT_H */ diff --git a/opal/mca/event/base/event_base_close.c b/opal/mca/event/base/event_base_close.c index 1b8ae2f131..6ecb89f426 100644 --- a/opal/mca/event/base/event_base_close.c +++ b/opal/mca/event/base/event_base_close.c @@ -21,9 +21,6 @@ int opal_event_base_close(void) opal_event_base_inited--; - /* release the event base */ - opal_event_base_finalize(opal_event_base); - /* no need to close the component as it was statically opened */ /* for support of tools such as ompi_info */ diff --git a/opal/mca/event/base/event_base_open.c b/opal/mca/event/base/event_base_open.c index 2b3ea8683c..22d1c5ba03 100644 --- a/opal/mca/event/base/event_base_open.c +++ b/opal/mca/event/base/event_base_open.c @@ -79,7 +79,12 @@ int opal_event_base_open(void) /* get our event base */ if (NULL == (opal_event_base = opal_event_base_create())) { - rc = OPAL_ERROR; + return OPAL_ERROR; + } + + /* set the number of priorities */ + if (0 < OPAL_EVENT_NUM_PRI) { + opal_event_base_priority_init(opal_event_base, OPAL_EVENT_NUM_PRI); } return rc; diff --git a/opal/mca/event/event.h b/opal/mca/event/event.h index c251601940..bc12b482fd 100644 --- a/opal/mca/event/event.h +++ b/opal/mca/event/event.h @@ -25,6 +25,8 @@ #include #endif +#include "opal/class/opal_pointer_array.h" + #include "opal/mca/mca.h" #include "opal/mca/base/base.h" @@ -38,6 +40,17 @@ typedef unsigned char u_char; typedef unsigned short u_short; #endif +/* set the number of event priority levels */ +#define OPAL_EVENT_NUM_PRI 8 + +#define OPAL_EV_ERROR_PRI 0 +#define OPAL_EV_MSG_HI_PRI 1 +#define OPAL_EV_SYS_HI_PRI 2 +#define OPAL_EV_MSG_LO_PRI 
3 +#define OPAL_EV_SYS_LO_PRI 4 +#define OPAL_EV_INFO_HI_PRI 5 +#define OPAL_EV_INFO_LO_PRI 6 +#define OPAL_EV_LOWEST_PRI 7 #define OPAL_EVENT_SIGNAL(ev) opal_event_get_signal(ev) diff --git a/opal/mca/event/libevent2013/configure.m4 b/opal/mca/event/libevent2013/configure.m4 index d955805f6a..063cf943d4 100644 --- a/opal/mca/event/libevent2013/configure.m4 +++ b/opal/mca/event/libevent2013/configure.m4 @@ -87,8 +87,8 @@ AC_DEFUN([MCA_opal_event_libevent2013_CONFIG],[ AC_ARG_ENABLE(event-debug, AC_HELP_STRING([--enable-event-debug], [enable event library debug output])) - if test "$enable_event_debug" = "no"; then - event_args="$event_args --disable-debug-mode" + if test "$enable_event_debug" = "yes"; then + event_args="$event_args --enable-debug-mode" fi AC_ARG_ENABLE(event-thread-support, diff --git a/opal/mca/event/libevent2013/libevent/event.c b/opal/mca/event/libevent2013/libevent/event.c index c0d36a9c5b..4ae06531d7 100644 --- a/opal/mca/event/libevent2013/libevent/event.c +++ b/opal/mca/event/libevent2013/libevent/event.c @@ -1519,9 +1519,6 @@ event_base_loop(struct event_base *base, int flags) * as we invoke user callbacks. */ EVBASE_ACQUIRE_LOCK(base, th_base_lock); - /**** OMPI CHANGE ****/ - /* Disable reentrant check */ -#if 0 if (base->running_loop) { event_warnx("%s: reentrant invocation. 
Only one event_base_loop" " can run on each event_base at once.", __func__); @@ -1530,8 +1527,6 @@ event_base_loop(struct event_base *base, int flags) } base->running_loop = 1; -#endif - /**** END OMPI CHANGE ****/ clear_time_cache(base); @@ -2148,14 +2143,8 @@ event_del(struct event *ev) int res; if (EVUTIL_FAILURE_CHECK(!ev->ev_base)) { - /**** OMPI CHANGE ****/ - /* Disable warning and return 0 */ - return 0; -#if 0 event_warnx("%s: event has no event_base set.", __func__); return -1; -#endif - /**** END OMPI CHANGE ****/ } EVBASE_ACQUIRE_LOCK(ev->ev_base, th_base_lock); diff --git a/opal/mca/event/libevent2013/libevent2013.h b/opal/mca/event/libevent2013/libevent2013.h index 43a0f5ab89..9f5a190ef4 100644 --- a/opal/mca/event/libevent2013/libevent2013.h +++ b/opal/mca/event/libevent2013/libevent2013.h @@ -1,6 +1,8 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -57,49 +59,14 @@ #include "opal/mca/event/event.h" -typedef struct event opal_event_t; -/*** Overload the event_base_t struct ***/ -/* This may (hopefully) be a temporary change - * to deal with cross-base sync. Specifically, - * when an event in one base needs to release - * a condition_wait in another base, we need - * to "wakeup" the event base in the second base - * so the condition_wait can be checked - * - * On a more permanent level, use this to update - * the event base when it is being progressed in - * a separate thread. 
- */ -typedef struct { - struct event_base *base; - opal_event_t update_event; - int update_pipe[2]; -} opal_event_base_t; +typedef event_callback_fn opal_event_cbfunc_t; -typedef struct { - opal_event_t *ev; - uint8_t op; -} opal_event_update_t; - -#define OPAL_EVENT_NOOP 0x00 -#define OPAL_EVENT_ADD 0x01 -#define OPAL_EVENT_DEL 0x02 - -#if OPAL_EVENT_HAVE_THREAD_SUPPORT -#define OPAL_UPDATE_EVBASE(b, evt, ad) -#else -#define OPAL_UPDATE_EVBASE(b, evt, ad) \ - do { \ - opal_event_update_t up; \ - up.ev = (evt); \ - up.op = (ad); \ - opal_fd_write((b)->update_pipe[1], sizeof(opal_event_update_t), &up); \ - } while(0); -#endif BEGIN_C_DECLS -/* Temporary global - will be replaced by layer-specific event bases */ +typedef struct event_base opal_event_base_t; +typedef struct event opal_event_t; + OPAL_DECLSPEC extern opal_event_base_t *opal_event_base; #define OPAL_EV_TIMEOUT EV_TIMEOUT @@ -114,14 +81,19 @@ OPAL_DECLSPEC extern opal_event_base_t *opal_event_base; /* Global function to create and release an event base */ OPAL_DECLSPEC opal_event_base_t* opal_event_base_create(void); -OPAL_DECLSPEC void opal_event_base_finalize(opal_event_base_t *base); + +#define opal_event_base_free(x) event_base_free(x) OPAL_DECLSPEC int opal_event_init(void); -OPAL_DECLSPEC int opal_event_reinit(opal_event_base_t *base); +#define opal_event_reinit(b) event_reinit((b)) -OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase, - struct timeval *tv_in); +#define opal_event_base_init_common_timeout (b, t) event_base_init_common_timeout((b), (t)) + +/* Event priority APIs */ +#define opal_event_base_priority_init(b, n) event_base_priority_init((b), (n)) + +#define opal_event_set_priority(x, n) event_priority_set((x), (n)) /* thread support APIs */ #if OPAL_EVENT_HAVE_THREAD_SUPPORT @@ -135,9 +107,11 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba #endif /* Basic event APIs */ +#define opal_event_enable_debug_mode() 
event_enable_debug_mode() + #define opal_event_set_debug_output(x) event_set_debug_output((x)) -#define opal_event_set(b, ev, fd, fg, cb, arg) event_assign((ev), (b)->base, (fd), (fg), (event_callback_fn) (cb), (arg)) +#define opal_event_set(b, x, fd, fg, cb, arg) event_assign((x), (b), (fd), (fg), (event_callback_fn) (cb), (arg)) #define opal_event_add(ev, tv) event_add((ev), (tv)) @@ -145,39 +119,39 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba #define opal_event_active(x, y, z) event_active((x), (y), (z)) -#define opal_event_new(b, fd, fg, cb, arg) event_new((b)->base, (fd), (fg), (event_callback_fn) (cb), (arg)) +#define opal_event_new(b, fd, fg, cb, arg) event_new((b), (fd), (fg), (event_callback_fn) (cb), (arg)) + +OPAL_DECLSPEC opal_event_t* opal_event_alloc(void); #define opal_event_free(x) event_free((x)) /* Timer APIs */ -#define opal_event_evtimer_new(b, cb, arg) event_new((b)->base, -1, 0, (event_callback_fn) (cb), (arg)) +#define opal_event_evtimer_new(b, cb, arg) opal_event_new((b), -1, 0, (cb), (arg)) -#define opal_event_evtimer_add(ev, tv) event_add((ev), (tv)) +#define opal_event_evtimer_add(x, tv) opal_event_add((x), (tv)) -#define opal_event_evtimer_set(b, ev, cb, arg) event_assign((ev), (b)->base, -1, 0, (event_callback_fn) (cb), (arg)) +#define opal_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg)) -#define opal_event_evtimer_del(ev) event_del((ev)) +#define opal_event_evtimer_del(x) opal_event_del((x)) -#define opal_event_evtimer_pending(ev, tv) event_pending((ev), EV_TIMEOUT, (tv)) +#define opal_event_evtimer_pending(x, tv) event_pending((x), EV_TIMEOUT, (tv)) -#define opal_event_evtimer_initialized(ev) event_initialized((ev)) +#define opal_event_evtimer_initialized(x) event_initialized((x)) /* Signal APIs */ -#define opal_event_signal_add(ev, tv) event_add((ev), (tv)) +#define opal_event_signal_add(x, tv) event_add((x), (tv)) -#define opal_event_signal_set(b, 
ev, fd, cb, arg) event_assign((ev), (b)->base, (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg)) +#define opal_event_signal_set(b, x, fd, cb, arg) event_assign((x), (b), (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg)) -#define opal_event_signal_del(ev) event_del((ev)) +#define opal_event_signal_del(x) event_del((x)) -#define opal_event_signal_pending(ev, tv) event_pending((ev), EV_SIGNAL, (tv)) +#define opal_event_signal_pending(x, tv) event_pending((x), EV_SIGNAL, (tv)) -#define opal_event_signal_initalized(ev) event_initialized((ev)) +#define opal_event_signal_initalized(x) event_initialized((x)) -#define opal_event_get_signal(ev) event_get_signal((ev)) +#define opal_event_get_signal(x) event_get_signal((x)) -#define opal_event_loop(b, fg) event_base_loop((b->base), (fg)) - -#define opal_event_dispatch(b) event_base_loop((b)->base, 0) +#define opal_event_loop(b, fg) event_base_loop((b), (fg)) END_C_DECLS diff --git a/opal/mca/event/libevent2013/libevent2013_module.c b/opal/mca/event/libevent2013/libevent2013_module.c index fbd8b707eb..305ff40f55 100644 --- a/opal/mca/event/libevent2013/libevent2013_module.c +++ b/opal/mca/event/libevent2013/libevent2013_module.c @@ -109,83 +109,16 @@ static const struct eventop *eventops[] = { static struct event_config *config=NULL; -static void update_event(int fd, short flags, void* arg) -{ - opal_event_update_t up; - - /* read the event */ - opal_fd_read(fd, sizeof(opal_event_update_t), &up); - if (NULL == up.ev) { - return; - } - if (OPAL_EVENT_ADD == up.op) { - event_add(up.ev, 0); - } else if (OPAL_EVENT_DEL == up.op) { - event_del(up.ev); - } - return; -} - -/* Public function -- not part of the module */ -/* This includes (hopefully) a temporary change - * to deal with cross-base sync. 
Specifically, - * when an event in one base needs to release - * a condition_wait in another base, we need - * to "wakeup" the event base in the second base - * so the condition_wait can be checked - */ opal_event_base_t* opal_event_base_create(void) { - struct event_base *base; - opal_event_base_t *evbase; + opal_event_base_t *base; base = event_base_new_with_config(config); if (NULL == base) { /* there is no backend method that does what we want */ opal_output(0, "No event method available"); - return NULL; } - evbase = (opal_event_base_t*)malloc(sizeof(opal_event_base_t)); - evbase->base = base; -#ifndef __WINDOWS__ - if (pipe(evbase->update_pipe) < 0) { - opal_output(0, "Unable to open update pipe"); - free(evbase); - event_base_free(base); - return NULL; - } -#else - if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, evbase->update_pipe) == -1) { - opal_output(0, "Unable to open update socket"); - free(evbase); - event_base_free(base); - return NULL; - } -#endif - event_assign(&evbase->update_event, base, - evbase->update_pipe[0], EV_READ | EV_PERSIST, - update_event, NULL); - event_add(&evbase->update_event, 0); - return evbase; -} - -void opal_event_base_finalize(opal_event_base_t *evbase) -{ - /* delete the wakeup event */ - event_del(&evbase->update_event); -#ifndef __WINDOWS__ - /* close the pipe */ - close(evbase->update_pipe[0]); - close(evbase->update_pipe[1]); -#else - /* close the socket */ - closesocket(evbase->update_pipe[0]); - closesocket(evbase->update_pipe[1]); -#endif - /* release the base */ - event_base_free(evbase->base); - /* free the storage */ - free(evbase); + return base; } int opal_event_init(void) @@ -304,14 +237,10 @@ int opal_event_init(void) return OPAL_SUCCESS; } -int opal_event_reinit(opal_event_base_t *evbase) +opal_event_t* opal_event_alloc(void) { - return event_reinit(evbase->base); -} + opal_event_t *ev; -struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase, - struct timeval *tv_in) -{ - return (struct 
timeval*)event_base_init_common_timeout (evbase->base, tv_in); + ev = (opal_event_t*)malloc(sizeof(opal_event_t)); + return ev; } - diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index f828146b6d..bef3d48f3f 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -1456,5 +1456,6 @@ char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality) ptr->buffers[ptr->cntr][idx++] = 'K'; ptr->buffers[ptr->cntr][idx++] = '\0'; } + return ptr->buffers[ptr->cntr]; } diff --git a/opal/runtime/opal_finalize.c b/opal/runtime/opal_finalize.c index e3dad9ff71..5004c41202 100644 --- a/opal/runtime/opal_finalize.c +++ b/opal/runtime/opal_finalize.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -31,7 +31,6 @@ #include "opal/util/net.h" #include "opal/util/keyval_parse.h" #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/memoryhooks/memory.h" #include "opal/mca/base/base.h" #include "opal/runtime/opal.h" @@ -87,9 +86,6 @@ opal_finalize_util(void) /* finalize the trace system */ opal_trace_finalize(); - /* finalize the OPAL SOS system */ - opal_sos_finalize(); - /* finalize the show_help system */ opal_show_help_finalize(); diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index af9e3e67a7..427b794ba9 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -12,7 +12,7 @@ * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. 
+ * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -60,7 +60,6 @@ #include "opal/util/stacktrace.h" #include "opal/util/keyval_parse.h" #include "opal/util/sys_limits.h" -#include "opal/util/opal_sos.h" #if OPAL_CC_USE_PRAGMA_IDENT #pragma ident OPAL_IDENT_STRING @@ -78,7 +77,7 @@ opal_err2str(int errnum, const char **errmsg) { const char *retval; - switch (OPAL_SOS_GET_ERROR_CODE(errnum)) { + switch (errnum) { case OPAL_SUCCESS: retval = "Success"; break; @@ -255,9 +254,6 @@ opal_init_util(int* pargc, char*** pargv) /* initialize the help system */ opal_show_help_init(); - /* initialize the OPAL SOS system */ - opal_sos_init(); - /* register handler for errnum -> string converstion */ if (OPAL_SUCCESS != (ret = opal_error_register("OPAL", diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 1ab987e853..3323b5813a 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -19,7 +19,7 @@ SUBDIRS = keyval -dist_pkgdata_DATA = help-opal-util.txt opal_sos_reporter.txt +dist_pkgdata_DATA = help-opal-util.txt AM_LFLAGS = -Popal_show_help_yy LEX_OUTPUT_ROOT = lex.opal_show_help_yy @@ -49,7 +49,6 @@ headers = \ opal_environ.h \ opal_getcwd.h \ opal_pty.h \ - opal_sos.h \ os_dirpath.h \ os_path.h \ output.h \ @@ -82,7 +81,6 @@ libopalutil_la_SOURCES = \ opal_environ.c \ opal_getcwd.c \ opal_pty.c \ - opal_sos.c \ os_dirpath.c \ os_path.c \ output.c \ diff --git a/opal/util/error.c b/opal/util/error.c index 50f76fa07b..fb948bfd74 100644 --- a/opal/util/error.c +++ b/opal/util/error.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. * All rights reserved. 
* $COPYRIGHT$ * @@ -30,7 +30,6 @@ #endif #include "opal/util/error.h" -#include "opal/util/opal_sos.h" #include "opal/constants.h" #define MAX_CONVERTERS 5 @@ -99,12 +98,12 @@ opal_perror(int errnum, const char *msg) const char* errmsg; ret = opal_strerror_int(errnum, &errmsg); - if (NULL != msg && OPAL_SOS_GET_ERROR_CODE(errnum) != OPAL_ERR_IN_ERRNO) { + if (NULL != msg && errnum != OPAL_ERR_IN_ERRNO) { fprintf(stderr, "%s: ", msg); } if (OPAL_SUCCESS != ret) { - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { perror(msg); } else { char *ue_msg; @@ -129,7 +128,7 @@ opal_strerror(int errnum) int ret; const char* errmsg; - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { return strerror(errno); } @@ -156,7 +155,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen) ret = opal_strerror_int(errnum, &errmsg); if (OPAL_SUCCESS != ret) { - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { char *tmp = strerror(errno); strncpy(strerrbuf, tmp, buflen); return OPAL_SUCCESS; diff --git a/opal/util/opal_sos.c b/opal/util/opal_sos.c deleted file mode 100644 index deadcf968d..0000000000 --- a/opal/util/opal_sos.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#ifdef HAVE_STRING_H -#include -#endif -#include -#include -#ifdef HAVE_STDARG_H -#include -#endif -#ifdef HAVE_STDLIB_H -#include -#endif - -#include "opal/util/opal_sos.h" -#include "opal/constants.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/class/opal_hash_table.h" -#include "opal/util/stacktrace.h" -#include "opal/util/show_help.h" - -/** Global variables */ -opal_hash_table_t opal_sos_table; -opal_mutex_t opal_sos_table_lock; -bool opal_sos_print_low; - -/* Local variables */ -static bool opal_sos_initialized = false; -static const char *dash_line = "--------------------------------------------------------------------------"; -static const char *stackhdr = "[STACK TRACE]:\n"; - -/* Local functions */ -static void opal_sos_error_construct(opal_sos_error_t *obj); -static void opal_sos_error_destruct(opal_sos_error_t *obj); - -/** OPAL SOS callback function pointers */ -static opal_sos_print_callback_fn_t cur_print_callback; -static opal_sos_reporter_callback_fn_t cur_reporter_callback; -/* static opal_sos_print_callback_fn_t prev_print_callback; */ -static opal_sos_reporter_callback_fn_t prev_reporter_callback; - -OBJ_CLASS_INSTANCE(opal_sos_error_t, - opal_object_t, - opal_sos_error_construct, - opal_sos_error_destruct); - -/** - * Constructor - */ -static void opal_sos_error_construct(opal_sos_error_t *obj) -{ - obj->errnum = 0; - obj->file = NULL; - obj->line = 0; - obj->func = NULL; - obj->msg = NULL; - obj->prev = obj->next = OPAL_SOS_ERR_BASE; -} - -/** - * Destructor - */ -static void opal_sos_error_destruct(opal_sos_error_t *obj) -{ - if (NULL != obj->file) { - free(obj->file); - } - - if (NULL != obj->func) { - free(obj->func); - } - - if (NULL != obj->msg) { - free(obj->msg); - } -} - -/** - * Initialize the OPAL SOS interface - * - */ -void opal_sos_init(void) -{ - int value; - - if (opal_sos_initialized) { - return; - } - - 
mca_base_param_reg_int_name("opal", "sos_print_low", - "Set to non-zero to enable the print-at-bottom" - " preference for OPAL SOS. Enabling this option prints" - " out the errors, warnings or info messages as" - " soon as they are encountered.", - false, false, (int)false, &value); - - opal_sos_print_low = OPAL_INT_TO_BOOL(value); - - OBJ_CONSTRUCT(&opal_sos_table, opal_hash_table_t); - opal_hash_table_init(&opal_sos_table, OPAL_SOS_ERR_TABLE_SIZE); - OBJ_CONSTRUCT(&opal_sos_table_lock, opal_mutex_t); - - opal_sos_reg_reporter_callback(opal_sos_print_error, &prev_reporter_callback); - opal_sos_initialized = true; - return; -} - -/** - * Finalize the OPAL SOS interface - * - */ -void opal_sos_finalize(void) -{ - OBJ_DESTRUCT(&opal_sos_table); - OBJ_DESTRUCT(&opal_sos_table_lock); - opal_sos_initialized = false; - return; -} - -/** - * Free all the SOS errors represented by the error code pointed to by \c errnum - * - */ -void opal_sos_free(int *errnum) -{ - opal_sos_error_t *opal_error, *attached_error; - int err, attached_errnum; - - if (NULL == errnum) { - return; - } else if (true == OPAL_SOS_IS_NATIVE(*errnum)) { - return; - } else { - err = *errnum; - } - - *errnum = OPAL_SOS_GET_ERROR_CODE(err); - - do { - /* Look for attached errors */ - if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(err))) { - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - attached_errnum, - (void **)&attached_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - /* If there's an attached error trace, free it! 
*/ - if (NULL != attached_error) { - attached_errnum = attached_error->errnum; - opal_sos_free(&attached_errnum); - } - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(err), - (void **)&opal_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - if (NULL == opal_error) { - goto cleanup; - } - - opal_sos_error_destruct(opal_error); - /* Remove the entry from the SOS table */ - OPAL_THREAD_LOCK(&opal_sos_table_lock); - opal_hash_table_remove_value_uint32(&opal_sos_table, OPAL_SOS_GET_INDEX(err)); - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - err = opal_error->prev; - } while (OPAL_SOS_ERR_BASE != err); - -cleanup: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); -} - -opal_sos_error_t * -opal_sos_build_error(int errnum, bool show_stack, const char *errmsg, ...) -{ - opal_sos_error_t *opal_error; - char *stackframe, msg[OPAL_SOS_MAX_ERR_LEN]; - va_list arglist; - int ret_errno = 0, len; - - if (!opal_sos_initialized) { - opal_sos_init(); - } - - opal_error = OBJ_NEW(opal_sos_error_t); - if (NULL == opal_error) { - return NULL; /* OPAL_ERR_OUT_OF_RESOURCE */ - } - - va_start(arglist, errmsg); - len = vsnprintf(msg, OPAL_SOS_MAX_ERR_LEN, errmsg, arglist); - va_end(arglist); -#if OPAL_WANT_PRETTY_PRINT_STACKTRACE - if ((true == show_stack) && - (NULL != (stackframe = opal_stackframe_output_string()))) { - len += strlen(stackhdr) + strlen(stackframe) + 2; - if (len > OPAL_SOS_MAX_ERR_LEN) - len = OPAL_SOS_MAX_ERR_LEN; - - opal_error->msg = (char *) malloc(len); - if (NULL == opal_error->msg) { - return NULL; - } - snprintf(opal_error->msg, len, "%s\n%s%s", msg, stackhdr, stackframe); - } else { - opal_error->msg = strdup(msg); - } -#else - opal_error->msg = strdup ("OPAL_WANT_PRETTY_PRINT_STACKTRACE disabled"); -#endif - - /* Check if errnum is a native error code and encode it into - the encoded error code if it is native */ - if (OPAL_SOS_IS_NATIVE(errnum)) { - 
OPAL_SOS_SET_ERROR_CODE(ret_errno, errnum); - } else { - /* Extract the native error code from the encoded error and - encode it back again into the newly encoded error code */ - OPAL_SOS_SET_ERROR_CODE(ret_errno, OPAL_SOS_GET_ERROR_CODE(errnum)); - opal_error->prev = errnum; - } - - opal_error->errnum = ret_errno; - return opal_error; -} - -int opal_sos_reporter(const char *file, int line, const char *func, - opal_sos_severity_t severity, opal_sos_error_t *opal_error) -{ - opal_sos_error_t *prev_error; - int ret_errno = 0, hash; - - if (NULL == opal_error) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Doing more strict validation here since if either of the file, - * func or msg are not known we replace it by to avoid any issues - * during dss pack/unpack - */ - opal_error->file = (NULL != file)?strdup(file):strdup(""); - opal_error->func = (NULL != func)?strdup(func):strdup(""); - opal_error->line = line; - - ret_errno = opal_error->errnum; - /* Encode the severity level into the return error code */ - OPAL_SOS_SET_SEVERITY(ret_errno, severity); - hash = opal_sos_hash_error(opal_error); - OPAL_SOS_SET_INDEX(ret_errno, hash); - opal_error->errnum = ret_errno; - - if (opal_sos_print_low) { - opal_sos_report_error(opal_error); - } - - /* Add the error object to the error table */ - OPAL_THREAD_LOCK(&opal_sos_table_lock); - - if (OPAL_SUCCESS != - opal_hash_table_set_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(ret_errno), - (void *)opal_error)) { - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - OBJ_DESTRUCT(opal_error); - return OPAL_ERROR; - } - - /* Get the previous error in the error call stack and update - its next error pointer */ - prev_error = NULL; - opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(opal_error->prev), - (void **)&prev_error); - if (NULL != prev_error) { - prev_error->next = opal_error->errnum; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - return ret_errno; -} - -void -opal_sos_report_error(opal_sos_error_t 
*error) -{ - opal_sos_severity_t severity; - char *pretty_error; - int errnum, ret; - - if (NULL == error) - return; - - severity = (opal_sos_severity_t)OPAL_SOS_GET_SEVERITY(error->errnum); - - /* An OPAL SOS encoded error number holds no meaning outside - * the context of Open MPI. We convert it back to the native - * error code before reporting it. */ - if (true == OPAL_SOS_IS_NATIVE(error->errnum)) { - errnum = error->errnum; - } else { - errnum = OPAL_SOS_GET_ERROR_CODE(error->errnum); - } - - /* Prettify the error for printing it locally */ - ret = opal_sos_prettify_error(error->msg, &pretty_error); - - (*cur_reporter_callback)(severity, errnum, "<%s> at %s:%d:%s():\n%s", - opal_sos_severity2str(severity), error->file, - error->line, error->func, - ((0 > ret) ? error->msg : pretty_error)); - - if (ret > 0) { - free(pretty_error); - } - - /* Call the previous reporter callback which should be the selected - * ORTE notifier components */ - if (NULL != prev_reporter_callback) { - prev_reporter_callback(severity, errnum, "<%s> at %s:%d:%s():\n%s", - opal_sos_severity2str(severity), error->file, - error->line, error->func, error->msg); - } -} - -void opal_sos_print(int errnum, bool show_history) -{ - opal_sos_error_t *opal_error, *prev_opal_error, *attached_error; - int tmp, attached_errnum, prev_severity, severity; - - opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); - tmp = errnum; - prev_opal_error = NULL; - do { - /* If there is an error attached to this error, print it out. 
*/ - if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(errnum))) { - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - attached_errnum, - (void **)&attached_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - if (NULL != attached_error) { - opal_sos_print(attached_error->errnum, show_history); - } - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != - opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(errnum), - (void **)&opal_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - if (NULL == opal_error) { - return; - } - - if (NULL != prev_opal_error) { - prev_severity = OPAL_SOS_GET_SEVERITY(prev_opal_error->errnum); - severity = OPAL_SOS_GET_SEVERITY(errnum); - - /* If show_history is enabled, or if the preceeding error - was of higher severity, then report the error */ - if (show_history || (prev_severity <= severity)) - /* Print the error denoted by errnum. */ - opal_sos_report_error(prev_opal_error); - } - - prev_opal_error = opal_error; - /* Get the previous error */ - errnum = opal_error->prev; - /* Terminating condition */ - if (OPAL_SOS_ERR_BASE == errnum) { - opal_sos_report_error(opal_error); - } - } while (errnum != OPAL_SOS_ERR_BASE); - opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); - errnum = tmp; - return; - -cleanup: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); -} - -void opal_sos_print_error(opal_sos_severity_t severity, int errnum, const char *errmsg, ...) 
-{ - va_list arglist; - va_start(arglist, errmsg); - opal_show_vhelp("opal_sos_reporter.txt", "general message", false, arglist); - va_end(arglist); -} - -void opal_sos_log(int errnum) -{ - opal_sos_print(errnum, false); - opal_sos_free(&errnum); -} - -int opal_sos_prettify_error(const char *error, char **pretty_error) -{ - char *str, *token, *saveptr, *errdup; - const char *prefix = "\n| | "; - int len = 0, plen, left; - - if (NULL == error) { - return OPAL_ERROR; - } - - *pretty_error = (char *) malloc(OPAL_SOS_MAX_ERR_LEN); - if (NULL == *pretty_error) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - *(*pretty_error) = '\0'; - - plen = strlen(prefix); - - if (NULL != (errdup = strdup(error))) { - for (str = errdup, len = 0; len < OPAL_SOS_MAX_ERR_LEN; str = NULL) { - if (NULL == (token = strtok_r(str, "\n", &saveptr))) { - break; - } - - left = strlen(token); - if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { - left = OPAL_SOS_MAX_ERR_LEN - len; - } - strncat(*pretty_error, token, left); - len += left; - - left = plen; - if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { - left = OPAL_SOS_MAX_ERR_LEN - len; - } - strncat(*pretty_error, prefix, left); - len += left; - } - free(errdup); - errdup = NULL; - } - - return len; -} - -const char *opal_sos_severity2str(opal_sos_severity_t severity) -{ - switch(severity) { - case OPAL_SOS_SEVERITY_EMERG: return "EMERGENCY"; - case OPAL_SOS_SEVERITY_ALERT: return "ALERT MESSAGE"; - case OPAL_SOS_SEVERITY_CRIT: return "CRITICAL MESSAGE"; - case OPAL_SOS_SEVERITY_ERROR: return "ERROR"; - case OPAL_SOS_SEVERITY_WARN: return "WARNING"; - case OPAL_SOS_SEVERITY_NOTICE: return "NOTICE"; - case OPAL_SOS_SEVERITY_INFO: return "INFO MESSAGE"; - case OPAL_SOS_SEVERITY_DEBUG: return "DEBUG MESSAGE"; - default: return "UNKNOWN ERROR"; - } -} - -int opal_sos_hash_error(opal_sos_error_t *error) -{ - int hash, c; - char *msg; - - /* Naive string hash function to create a key based on the error - details, namely length of the file name, length of the 
function - name and the sum of the characters in the error message */ - - hash = error->errnum; - if (NULL != error->file) { - hash += strlen(error->file); - } - if (NULL != error->func) { - hash += strlen(error->func); - } - if (NULL != error->msg) { - msg = error->msg; - while ('\0' != (c = *msg++)) { - hash += c; - } - } - - return (hash & (OPAL_SOS_ERR_TABLE_SIZE - 1)); -} - -int opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, - opal_sos_print_callback_fn_t *prev_func) -{ - /* Preserve the previous print callback */ - *prev_func = cur_print_callback; - - /* Update the current print callback */ - cur_print_callback = new_func; - return OPAL_SUCCESS; -} - -int opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, - opal_sos_reporter_callback_fn_t *prev_func) -{ - /* Preserve the previous reporter callback */ - *prev_func = cur_reporter_callback; - - /* Update the current reporter callback */ - cur_reporter_callback = new_func; - return OPAL_SUCCESS; -} diff --git a/opal/util/opal_sos.h b/opal/util/opal_sos.h deleted file mode 100644 index 22df2395c4..0000000000 --- a/opal/util/opal_sos.h +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_SOS_H -#define OPAL_SOS_H - -#ifdef HAVE_LIMITS_H -#include -#endif -#ifdef HAVE_SYSLOG_H -#include -#endif - -#include "opal/class/opal_object.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/mutex.h" -#include "opal/util/output.h" - -#ifdef __STDC_VERSION__ -# if __STDC_VERSION__ < 199901L -# if defined(__GNUC__) && __GNUC__ >= 2 -# define OPAL_SOS_FUNCTION __FUNCTION__ -# else -# define OPAL_SOS_FUNCTION "" -# endif -# else -# define OPAL_SOS_FUNCTION __func__ -# endif -#else -# define OPAL_SOS_FUNCTION __func__ -#endif - -/* Internal use only */ -#define OPAL_SOS_ERR_BASE OPAL_SUCCESS - -/** - * Size of the OPAL SOS error table. - * - * Since the index into the error table that is encoded in the error - * code is 9-bit long, setting a higher value than (1 << 9) would make - * no difference at all. - */ -#define OPAL_SOS_ERR_TABLE_SIZE 512 - -/** - * Maximum length for the error string stored per error code in the - * OPAL SOS error table. - */ -#define OPAL_SOS_MAX_ERR_LEN 1024 - -/** - * Reports an error to OPAL SOS reporter. - * - * Encodes an informational message with severity \c severity and - * other passed arguments like errnum, errmsg etc. It also remembers - * the line number, file name and the function name where the error - * has occurred. - * If the MCA parameter \c opal_sos_print_low is set, the error message - * is displayed on stderr using the "show help" subsystem. By default, - * informational messages are not printed out on stderr. - * If \c show_stack is set, the stacktrace is saved and/or printed - * along with the corresponding \c errmsg. - */ -#define OPAL_SOS_REPORT(severity, arg) opal_sos_reporter(__FILE__, __LINE__, \ - OPAL_SOS_FUNCTION, \ - severity, \ - opal_sos_build_error arg) - -/** - * Print or store an event with the maximum severity (EMERG). 
- */ -#define OPAL_SOS_EMERG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_EMERG, arg) - -/** - * Report an event of severity "ALERT". - */ -#define OPAL_SOS_ALERT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ALERT, arg) - -/** - * Report events with severity marked as "CRITICAL". - */ -#define OPAL_SOS_CRIT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_CRIT, arg) - -/** - * Prints and/or logs an error. - * This function can be used to log or print error events. - */ -#define OPAL_SOS_ERROR(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ERROR, arg) - -/** - * Prints and/or logs a warning. - * - * This function is similar to OPAL_SOS_INFO but with a higher - * severity. These events are printed out on the output stream - * by default. - */ -#define OPAL_SOS_WARN(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_WARN, arg) - -/** - * Report an error event with severity "NOTICE". - */ -#define OPAL_SOS_NOTICE(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_NOTICE,arg) - -/** - * Prints or logs an informational message in the OPAL SOS framework. - * Events with this severity are not printed, by default. However, - * they are still stored in the SOS table. - */ -#define OPAL_SOS_INFO(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_INFO, arg) - -/** - * Log debug events in the SOS framework. - */ -#define OPAL_SOS_DEBUG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_DEBUG, arg) - -/** - * Frees all the (entire stack of) OPAL SOS error objects associated - * with the encoded error code obtained after dereferencing the - * pointer \c errnum. - */ -#define OPAL_SOS_FREE(perrnum) opal_sos_free(perrnum) - -/** - * Print the warnings/errors/informational messages previously logged - * in to the SOS framework. - * - * This function prints the error details encoded by \c errnum. - * If \c show_history is true, the entire history for the error - * represented by \c errnum is printed on the output stream. 
- */ -#define OPAL_SOS_PRINT(errnum, show_history) \ - opal_sos_print(errnum, show_history) - -/** - * Attach the history from one error code to another error code - * Returns the target encoded error \c errtgt with history of \c - * errnum associated to it. - */ -#define OPAL_SOS_ATTACH(errtgt, errnum) \ - (errtgt = -((-errtgt & ~0xFF80000L) | \ - ((OPAL_SOS_GET_INDEX(errnum) & 0x1FFL) * 0x80000L))) - -/** - * Returns the index of the error attached to errnum using OPAL_SOS_ATTACH(). - */ -#define OPAL_SOS_GET_ATTACHED_INDEX(errnum) ((int) ((-errnum & 0xFF80000L) >> 19)) - -/** - * Returns the native error code for the given encoded error code \c - * errnum. \c errnum can be a native error code itself. - */ -#define OPAL_SOS_GET_ERROR_CODE(errnum) \ - ((errnum >= 0) ? errnum : (int) -(-errnum & 0x3FFL)) - -/** - * Sets the native error code for the potentially encoded error code. - * - * The lower 10 bits are reserved for the native error code. This - * macro sets the lower 10 bits of errnum to nativeerr. - */ -#define OPAL_SOS_SET_ERROR_CODE(errnum, nativeerr) \ - (errnum = -((-errnum & ~0x3FFL) | (-nativeerr & 0x3FFL))) - -/** - * Macro to check if the error encoded by \c errnum is a native error - * or an OPAL SOS encoded error. - */ -#define OPAL_SOS_IS_NATIVE(errnum) ((-errnum & ~0x3FFL) == 0) - -/** - * Returns the severity level for the potentially encoded error code. - * - * The severity is encoded in the last three bits of the first nibble. - */ -#define OPAL_SOS_GET_SEVERITY(errnum) ((int)((-errnum >> 28) & 0x7L)) - -/** - * Sets the severity level for the given error code \c errnum. - * - * This macros do not do strict error checking of the specified - * severity levels. - */ -#define OPAL_SOS_SET_SEVERITY(errnum, severity) \ - (errnum = -((-errnum & ~0x70000000L) | ((severity & 0x7L) * 0x10000000L))) - -/** - * Macro to get the encoded error severity level as a string. 
- * - * This macro accepts the argument \c severity and calls the corresponding - * function opal_sos_severity2str to convert it to a string. The result - * is returned in a static buffer that should not be freed with free(). - */ -#define OPAL_SOS_SEVERITY2STR(severity) opal_sos_severity2str(severity) - -/** - * Log an encoded error \c errnum. - * - * This macro prints out and consequently frees the entire stack of - * errors associated with the \c errnum. - */ -#define OPAL_SOS_LOG(errnum) opal_sos_log(errnum) - -/** - * \internal - * Returns the index into the error table of the error encoded by \c errnum. - * - * The index is 9-bit long stored from bit 11 to bit 20 in the encoded - * error code. - */ -#define OPAL_SOS_GET_INDEX(errnum) ((int)((-errnum & 0x7FC00L) >> 10)) - -/** - * \internal - * Sets the index into the error table for the error encoded by \c errnum. - */ -#define OPAL_SOS_SET_INDEX(errnum, index) \ - (errnum = -((-errnum & ~0x7FC00L) | ((index & 0x1FFL) * 0x400L))) - -BEGIN_C_DECLS - -/** This MCA parameter sos_print_low can be set to non-zero to enable - * the print-at-bottom preference for OPAL SOS. */ -OPAL_DECLSPEC extern bool opal_sos_print_low; - -/* Severity levels for OPAL SOS */ -typedef enum { - OPAL_SOS_SEVERITY_EMERG = LOG_EMERG, - OPAL_SOS_SEVERITY_ALERT = LOG_ALERT, - OPAL_SOS_SEVERITY_CRIT = LOG_CRIT, - OPAL_SOS_SEVERITY_ERROR = LOG_ERR, - OPAL_SOS_SEVERITY_WARN = LOG_WARNING, - OPAL_SOS_SEVERITY_NOTICE = LOG_NOTICE, - OPAL_SOS_SEVERITY_INFO = LOG_INFO, - OPAL_SOS_SEVERITY_DEBUG = LOG_DEBUG -} opal_sos_severity_t; - -typedef struct opal_sos_error_t { - /** Class parent */ - opal_object_t super; - - /** - * The encoded error code for a given type of error. - * - * errnum encodes a native error code (lower 10 bits) with the - * current severity (higher 2 bits) and an index into the error - * table along with the associated error, if there is one. 
- */ - int errnum; - - /** File in which the error occured */ - char *file; - - /** Line number on which the error was encountered */ - int line; - - /** This is an optional parameter that indicates the function in - which the error occured */ - char *func; - - /** The actual error message or string for the error indicated by - \c errnum */ - char *msg; - - /** Encoded error numbers of the previous and the next error. - These are used are used to maintain the history of an error. - The complete history of an error can be printed later using - OPAL_SOS_PRINT() */ - int prev; - int next; -} opal_sos_error_t; - -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_sos_error_t); - -/** - * Signature for OPAL SOS print function callback type. - */ -typedef void (*opal_sos_print_callback_fn_t) (int errcode); - -/** - * Signature for OPAL SOS reporter function callback type. - */ -typedef void (*opal_sos_reporter_callback_fn_t) (opal_sos_severity_t severity, int errcode, - const char *msg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * A global handle that points to the local OPAL SOS table. - * This is used by the notifier components to reference the local OPAL - * SOS table, especially for packing/unpacking and sending it over to - * the HNP. - */ -OPAL_DECLSPEC extern opal_hash_table_t opal_sos_table; - -/** - * A global handle that points to the OPAL SOS table lock. - * - */ -OPAL_DECLSPEC extern opal_mutex_t opal_sos_table_lock; - -/** - * \internal - * - * Initialize OPAL SOS. - * - * This function initializes and sets up the structures required to - * track the data handled by OPAL SOS. It is invoked by - * opal_util(). - */ -void opal_sos_init(void); - -/** - * \internal - * - * Shut down OPAL SOS. - * - * Invoked by opal_finalize() to deallocate the structures needed by - * OPAL SOS. - */ -void opal_sos_finalize(void); - -/** - * Prints or relays the error locally or using the selected notifier - * components. 
- */ -void -opal_sos_report_error(opal_sos_error_t *error); - -/** - * Builds an OPAL SOS error object given the parameters errnum, - * show_stack and errmsg. - * NOTE: This function only partially populates the SOS error object - * structure, setting the error message details but nothing about where - * the error occurred. Filling up the rest of the error object is left - * to OPAL SOS reporter which then handles the error appropriately. - * - * @param errnum - * @param show_stack - * @param errmsg - * - * @return - */ -OPAL_DECLSPEC opal_sos_error_t * -opal_sos_build_error(int errnum, bool show_stack, - const char *errmsg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * OPAL SOS reporter logs the error in the OPAL SOS error table or - * prints it out depending on the associated reporter callback. It can - * also relay the error messages to the selected notifier components - * using the OPAL SOS reporter callback interface. - * - * @param file - * @param line - * @param func - * @param opal_error - * - * @return encoded error code - */ -OPAL_DECLSPEC int opal_sos_reporter(const char *file, int line, const char *func, - opal_sos_severity_t severity, - opal_sos_error_t *opal_error); - -/** - * Prints the error encoded by the error number \c errnum - * - * @param errnum - * @param show_history - * - */ -OPAL_DECLSPEC void opal_sos_print(int errnum, bool show_history); - -OPAL_DECLSPEC int opal_sos_prettify_error(const char *error, char **pretty_error); - -/** - * Prints a single error represented by the OPAL SOS error object - * opal_sos_error_t. - */ -OPAL_DECLSPEC void opal_sos_print_error(opal_sos_severity_t severity, - int errnum, const char *errmsg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * Frees the error object represented by the error code \c errnum. - */ -OPAL_DECLSPEC void opal_sos_free(int *errnum); - -/** - * Logs (prints and frees) the error object represented by \c errnum. 
- */ -OPAL_DECLSPEC void opal_sos_log(int errnum); - -/** - * Returns the OPAL SOS severity level as a string. - * - */ -const char *opal_sos_severity2str(opal_sos_severity_t severity); - -/** - * \internal - * Return a unique key into the hash table (opal_sos_error_table) - * depending on the type and location of the error. - * - */ -int opal_sos_hash_error(opal_sos_error_t *error); - -/** - * Registers a print callback function for OPAL_SOS_PRINT() - */ -OPAL_DECLSPEC int -opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, - opal_sos_print_callback_fn_t *prev_func); - -/** - * Registers a reporter callback function for OPAL_SOS_INFO(), - * OPAL_SOS_WARN() and OPAL_SOS_ERROR() - */ -OPAL_DECLSPEC int -opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, - opal_sos_reporter_callback_fn_t *prev_func); - -END_C_DECLS - -#endif /* OPAL_SOS_H */ diff --git a/opal/util/stacktrace.c b/opal/util/stacktrace.c index 502e200933..25f899f649 100644 --- a/opal/util/stacktrace.c +++ b/opal/util/stacktrace.c @@ -519,9 +519,7 @@ int opal_util_register_stackhandlers (void) if (!showed_help && complain) { /* JMS This is icky; there is no error message aggregation here so this message may be repeated for - every single MPI process... This should be replaced - with OPAL_SOS when that is done so that it can be - properly aggregated. */ + every single MPI process... 
*/ opal_show_help("help-opal-util.txt", "stacktrace signal override", true, sig, sig, sig, string_value); diff --git a/orte/Makefile.am b/orte/Makefile.am index cf4decea57..13027fc9e0 100644 --- a/orte/Makefile.am +++ b/orte/Makefile.am @@ -63,7 +63,6 @@ include tools/Makefile.am include orted/Makefile.am include test/mpi/Makefile.include include test/system/Makefile.include -include threads/Makefile.am # Set the convenience library to be the same as the non-convenience # library, but a) it's marked as "noinst", so LT knows it's a diff --git a/orte/config/orte_configure_options.m4 b/orte/config/orte_configure_options.m4 index eb889353a8..72ebe798a9 100644 --- a/orte/config/orte_configure_options.m4 +++ b/orte/config/orte_configure_options.m4 @@ -13,7 +13,7 @@ dnl All rights reserved. dnl Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2009 IBM Corporation. All rights reserved. -dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights +dnl Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. dnl @@ -114,25 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT], [Whether we want daemon heartbeat monitoring enabled]) # -# Compile in resilient runtime code -# -AC_MSG_CHECKING([if want resilient runtime code enabled]) -AC_ARG_ENABLE(resilient-orte, - [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime code.])]) -if test "$enable_resilient_orte" = "yes"; then +# Do we want a separate orte progress thread? +AC_MSG_CHECKING([if want orte progress thread]) +AC_ARG_ENABLE([orte-progress-thread], + [AC_HELP_STRING([--enable-orte-progress-thread], + [Enable orte progress thread - for experiment by developers only! 
(default: disabled)])]) +if test "$enable_orte_progress_thread" = "yes"; then AC_MSG_RESULT([yes]) - orte_enable_resilient_code=1 + orte_enable_progress_thread=1 + AC_DEFINE_UNQUOTED(OPAL_EVENT_HAVE_THREAD_SUPPORT, 1, + [Thread support must be configured into the event library]) else AC_MSG_RESULT([no]) - orte_enable_resilient_code=0 + orte_enable_progress_thread=0 fi -AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) -AC_DEFINE_UNQUOTED([ORTE_RESIL_ORTE], [$orte_enable_resilient_code], - [Compile a resilient version of Open MPI]) - -AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) -AC_DEFINE_UNQUOTED([ORTE_ENABLE_EPOCH], [$orte_enable_resilient_code], - [Support for epoch in the ORTE process name enabled or not]) - +AC_DEFINE_UNQUOTED([ORTE_ENABLE_PROGRESS_THREAD], + [$orte_enable_progress_thread], + [Whether we want an orte progress thread enabled]) ])dnl diff --git a/orte/include/orte/types.h b/orte/include/orte/types.h index 6338696c5a..385ebff23c 100644 --- a/orte/include/orte/types.h +++ b/orte/include/orte/types.h @@ -82,54 +82,27 @@ typedef uint32_t orte_vpid_t; #define ORTE_VPID_MAX UINT32_MAX-2 #define ORTE_VPID_MIN 0 -#if ORTE_ENABLE_EPOCH -typedef uint32_t orte_epoch_t; -#define ORTE_EPOCH_T OPAL_UINT32 -#define ORTE_EPOCH_MAX UINT32_MAX-2 -#define ORTE_EPOCH_MIN 0 -#endif - -#if ORTE_ENABLE_EPOCH -#define ORTE_PROCESS_NAME_HTON(n) \ -do { \ - n.jobid = htonl(n.jobid); \ - n.vpid = htonl(n.vpid); \ - n.epoch = htonl(n.epoch); \ -} while (0) -#else #define ORTE_PROCESS_NAME_HTON(n) \ do { \ n.jobid = htonl(n.jobid); \ n.vpid = htonl(n.vpid); \ } while (0) -#endif -#if ORTE_ENABLE_EPOCH -#define ORTE_PROCESS_NAME_NTOH(n) \ -do { \ - n.jobid = ntohl(n.jobid); \ - n.vpid = ntohl(n.vpid); \ - n.epoch = ntohl(n.epoch); \ -} while (0) -#else #define ORTE_PROCESS_NAME_NTOH(n) \ do { \ n.jobid = ntohl(n.jobid); \ n.vpid = ntohl(n.vpid); \ } while (0) -#endif #define ORTE_NAME_ARGS(n) \ (unsigned long) ((NULL == n) 
? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \ (unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid) \ - (unsigned long) ((NULL == n) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch) /* * define invalid values */ #define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2) #define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2) -#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2) #define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF) /* @@ -137,7 +110,6 @@ do { \ */ #define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1) #define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1) -#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1) #define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF) /* @@ -146,16 +118,6 @@ do { \ struct orte_process_name_t { orte_jobid_t jobid; /**< Job number */ orte_vpid_t vpid; /**< Process id - equivalent to rank */ -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process. - * The epoch will start at ORTE_EPOCH_MIN and - * increment every time the process is detected as - * having stopped (including normal shutdown). The - * HNP will be responsible for informing all - * processes that did not directly detect the - * failure to increment their epochs. 
- */ -#endif }; typedef struct orte_process_name_t orte_process_name_t; @@ -179,10 +141,6 @@ typedef void* orte_iov_base_ptr_t; #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */ #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */ -#if ORTE_ENABLE_EPOCH -#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */ -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* State-related types */ #define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */ @@ -205,11 +163,8 @@ typedef void* orte_iov_base_ptr_t; /* DAEMON command type */ #define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */ -/* GRPCOMM types */ -#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20) - /* IOF types */ -#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21) +#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20) /* provide a boundary for others to use */ diff --git a/orte/mca/errmgr/app/Makefile.am b/orte/mca/errmgr/app/Makefile.am deleted file mode 100644 index e164765296..0000000000 --- a/orte/mca/errmgr/app/Makefile.am +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -sources = \ - errmgr_app.h \ - errmgr_app_component.c \ - errmgr_app.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_errmgr_app_DSO -component_noinst = -component_install = mca_errmgr_app.la -else -component_noinst = libmca_errmgr_app.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_app_la_SOURCES = $(sources) -mca_errmgr_app_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_app_la_SOURCES =$(sources) -libmca_errmgr_app_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/app/errmgr_app.c b/orte/mca/errmgr/app/errmgr_app.c deleted file mode 100644 index 02fb1785b5..0000000000 --- a/orte/mca/errmgr/app/errmgr_app.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/odls/odls_types.h" - -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "errmgr_app.h" - -/* - * Module functions: Global - */ -static int init(void); -static int finalize(void); - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - 
orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, - orte_std_cntr_t num_procs); - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata); -void epoch_change(int fd, - short event, - void *data); - -/****************** - * HNP module - ******************/ -orte_errmgr_base_module_t orte_errmgr_app_module = { - init, - finalize, - orte_errmgr_base_log, - orte_errmgr_base_abort, - orte_errmgr_app_abort_peers, - update_state, - NULL, - NULL, - NULL, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - ,orte_errmgr_base_set_fault_callback -#endif -}; - -/************************ - * API Definitions - ************************/ -static int init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_EPOCH_CHANGE, - ORTE_RML_PERSISTENT, - epoch_change_recv, - NULL); -#endif - - return ret; -} - -static int finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_EPOCH_CHANGE); -#endif - - return ORTE_SUCCESS; -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_ns_cmp_bitmask_t mask; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app: job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), exit_code)); - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; - } - - /* delete the route */ - orte_routed.delete_route(proc); - /* see is this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - return ORTE_ERR_UNRECOVERABLE; - } - } - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata) { - - ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); -} - -void epoch_change(int fd, - short event, - void *data) { - orte_message_event_t *mev = (orte_message_event_t *) data; - opal_buffer_t *buffer = mev->buffer; - orte_process_name_t *proc; - int n = 1, ret, num_dead, i; - opal_pointer_array_t *procs; - - if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) { - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Received epoch change notification", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - procs = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); - for (i = 0; i < num_dead; i++) { - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - proc[i].epoch++; - orte_util_set_epoch(&proc[i], proc[i].epoch); - - 
opal_pointer_array_add(procs, &proc[i]); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Epoch for %s updated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc[i]))); - } - - if (NULL != fault_cbfunc && 0 < num_dead) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - (*fault_cbfunc)(procs); - } else if (NULL == fault_cbfunc) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed (NULL pointer)!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed (num_dead <= 0)!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - - free(proc); - OBJ_RELEASE(procs); -} -#endif - -static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t buffer; - orte_std_cntr_t i; - orte_daemon_cmd_flag_t command = ORTE_DAEMON_ABORT_PROCS_CALLED; - - /* - * Pack up the list of processes and send them to the HNP - */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* pack number of processes */ - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(num_procs), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* Pack the list of names */ - for( i = 0; i < num_procs; ++i ) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(procs[i]), 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - - /* Send to HNP for termination */ - if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_DAEMON, 0))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - -cleanup: - OBJ_DESTRUCT(&buffer); - - return 
exit_status; -} diff --git a/orte/mca/errmgr/app/errmgr_app.h b/orte/mca/errmgr/app/errmgr_app.h deleted file mode 100644 index 4674b5bf24..0000000000 --- a/orte/mca/errmgr/app/errmgr_app.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_app_EXPORT_H -#define MCA_ERRMGR_app_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ - -ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component; - -ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module; - -END_C_DECLS - -#endif /* MCA_ERRMGR_app_EXPORT_H */ diff --git a/orte/mca/errmgr/app/errmgr_app_component.c b/orte/mca/errmgr/app/errmgr_app_component.c deleted file mode 100644 index dda89e52b8..0000000000 --- a/orte/mca/errmgr/app/errmgr_app_component.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "errmgr_app.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_app_component_version_string = - "ORTE ERRMGR app MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int errmgr_app_open(void); -static int errmgr_app_close(void); -static int errmgr_app_component_query(mca_base_module_t **module, int *priority); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_base_component_t mca_errmgr_app_component = -{ - /* Handle the general mca_component_t struct containing - * meta information about the component itapp - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "app", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - errmgr_app_open, - errmgr_app_close, - errmgr_app_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - /* Verbosity level */ - 0, - /* opal_output handler */ - -1, - /* Default priority */ - 5 -}; - -static int errmgr_app_open(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_app_close(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_app_component_query(mca_base_module_t **module, int *priority) -{ - if (ORTE_PROC_IS_APP) { - /* keep our priority low so that other modules are higher - * and will run before us - */ - *priority = 5; - *module = (mca_base_module_t *)&orte_errmgr_app_module; - return ORTE_SUCCESS; - } - - *priority = -1; - *module = NULL; - return ORTE_ERROR; -} diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index b7d5aa4f1a..d102253e86 100644 --- 
a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -100,13 +102,11 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; item->proc_name.jobid = ORTE_JOBID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); } void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); item->proc_name.jobid = ORTE_JOBID_INVALID; } @@ -142,13 +142,11 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t, void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); item->proc_name.jobid = ORTE_JOBID_INVALID; item->node_name = NULL; item->map_proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); item->map_proc_name.jobid = ORTE_JOBID_INVALID; item->map_node_name = NULL; @@ -159,7 +157,6 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); item->proc_name.jobid = ORTE_JOBID_INVALID; if( NULL != item->node_name ) { @@ -168,7 +165,6 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) } item->map_proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); item->map_proc_name.jobid = ORTE_JOBID_INVALID; if( NULL != 
item->map_node_name ) { @@ -200,17 +196,9 @@ void orte_errmgr_base_log(int error_code, char *filename, int line) return; } - if (NULL != orte_process_info.job_name) { - opal_output(0, "[[%s][%s][%s][%d]] ORTE_ERROR_LOG: %s in file %s at line %d", - orte_process_info.job_name, - (NULL == orte_process_info.job_instance) ? "NULL" : orte_process_info.job_instance, - (NULL == orte_process_info.executable) ? "NULL" : orte_process_info.executable, - orte_process_info.app_rank, errstring, filename, line); - } else { - opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errstring, filename, line); - } + opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + errstring, filename, line); } #if WANT_PMI_SUPPORT @@ -290,19 +278,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...) /* No way to reach here */ } -int orte_errmgr_base_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - /* - * This is a stub function that is only meant to be called by tools, - * so it will always return success. - */ - return ORTE_SUCCESS; -} void orte_errmgr_base_register_migration_warning(struct timeval *tv) { /* stub function - ignore */ diff --git a/orte/mca/errmgr/base/errmgr_base_open.c b/orte/mca/errmgr/base/errmgr_base_open.c index de979a7ae1..535f1edce8 100644 --- a/orte/mca/errmgr/base/errmgr_base_open.c +++ b/orte/mca/errmgr/base/errmgr_base_open.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -64,7 +66,6 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - orte_errmgr_base_update_state, NULL, /* predicted_fault */ NULL, /* suggest_map_targets */ NULL, /* ft_event */ @@ -83,8 +84,6 @@ orte_errmgr_base_module_t orte_errmgr = { NULL, NULL, NULL, - NULL, - NULL, NULL }; diff --git a/orte/mca/errmgr/base/errmgr_base_tool.c b/orte/mca/errmgr/base/errmgr_base_tool.c index b86ca04b7b..6e8dd64bae 100644 --- a/orte/mca/errmgr/base/errmgr_base_tool.c +++ b/orte/mca/errmgr/base/errmgr_base_tool.c @@ -267,7 +267,6 @@ static int errmgr_base_tool_start_cmdline_listener(void) */ errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID; errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE, 0, @@ -379,14 +378,12 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { swap_dest.jobid = errmgr_cmdline_sender.jobid; swap_dest.vpid = errmgr_cmdline_sender.vpid; - ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); errmgr_cmdline_sender = *sender; orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); errmgr_cmdline_sender.jobid = swap_dest.jobid; errmgr_cmdline_sender.vpid = swap_dest.vpid; - ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); goto cleanup; } diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 433a47ea12..beb5c2687d 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -69,18 +71,10 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns; ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line); ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...) - __opal_attribute_format__(__printf__, 2, 3) - __opal_attribute_noreturn__; + __opal_attribute_format__(__printf__, 2, 3); ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); -ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv); END_C_DECLS diff --git a/orte/mca/errmgr/default_app/configure.m4 b/orte/mca/errmgr/default_app/configure.m4 index 0306771fb8..2c242912fa 100644 --- a/orte/mca/errmgr/default_app/configure.m4 +++ b/orte/mca/errmgr/default_app/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_app_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_app/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_app/errmgr_default_app.c b/orte/mca/errmgr/default_app/errmgr_default_app.c index d5f88f0a94..9c96853edb 100644 --- a/orte/mca/errmgr/default_app/errmgr_default_app.c +++ b/orte/mca/errmgr/default_app/errmgr_default_app.c @@ -7,7 +7,8 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,7 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/routed/routed.h" #include "orte/mca/odls/odls_types.h" +#include "orte/mca/state/state.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -45,13 +47,6 @@ static int init(void); static int finalize(void); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); @@ -64,7 +59,6 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = { orte_errmgr_base_log, orte_errmgr_base_abort, abort_peers, - update_state, NULL, NULL, NULL, @@ -72,11 +66,16 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = { NULL }; +static void proc_errors(int fd, short args, void *cbdata); + /************************ * API Definitions ************************/ static int init(void) { + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -85,43 +84,43 @@ static int finalize(void) return ORTE_SUCCESS; } -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) +static void proc_errors(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_ns_cmp_bitmask_t mask; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_app: job %s reported state %s" - " for proc %s state %s exit_code %d", + "%s errmgr:default_app: proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), exit_code)); + ORTE_NAME_PRINT(&caddy->name), + orte_proc_state_to_str(caddy->proc_state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + OBJ_RELEASE(caddy); + return; } - if (ORTE_PROC_STATE_COMM_FAILED == state) { + if (ORTE_PROC_STATE_COMM_FAILED == caddy->proc_state) { mask = ORTE_NS_CMP_ALL; /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &caddy->name)) { + OBJ_RELEASE(caddy); + return; } /* see is this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - return ORTE_ERR_UNRECOVERABLE; + if (ORTE_SUCCESS != orte_routed.route_lost(&caddy->name)) { + /* order an exit */ + ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE); + OBJ_RELEASE(caddy); + exit(1); } } - return ORTE_SUCCESS; + + /* cleanup */ + OBJ_RELEASE(caddy); } static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) diff --git a/orte/mca/errmgr/default_hnp/configure.m4 b/orte/mca/errmgr/default_hnp/configure.m4 index 6dced14eb4..61a954c4ac 100644 --- a/orte/mca/errmgr/default_hnp/configure.m4 +++ b/orte/mca/errmgr/default_hnp/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_hnp_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_hnp/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index 232d3e250c..b58c06673e 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -7,7 +7,7 @@ * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -44,6 +44,7 @@ #include "orte/mca/notifier/notifier.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/ess/ess.h" +#include "orte/mca/state/state.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" @@ -69,13 +70,6 @@ static int predicted_fault(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); @@ -92,7 +86,6 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - update_state, predicted_fault, suggest_map_targets, ft_event, @@ -104,24 +97,21 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = { /* * Local functions */ -static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); -static void failed_start(orte_job_t *jdata); -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code); -static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -static void update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - +static void default_hnp_abort(orte_job_t *jdata); +static void job_errors(int fd, short args, void *cbdata); +static void proc_errors(int fd, short args, void *cbdata); /********************** * From DEFAULT_HNP **********************/ static int init(void) { + /* setup state machine to 
trap job errors */ + orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -130,359 +120,431 @@ static int finalize(void) return ORTE_SUCCESS; } -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) +static void job_errors(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; - orte_proc_t *pptr; + orte_job_state_t jobstate; orte_exit_code_t sts; + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return; + } + + /* if the jdata is NULL, then we abort as this + * is reporting an unrecoverable error + */ + if (NULL == caddy->jdata) { + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); + return; + } + + /* update the state */ + jdata = caddy->jdata; + jobstate = caddy->job_state; + jdata->state = jobstate; + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported state %s" - " for proc %s state %s pid %d exit_code %d", + "%s errmgr:default_hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), pid, exit_code)); + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + /* set global flags */ + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + } + if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { + orte_never_launched = true; + jdata->num_terminated = jdata->num_procs; + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); + OBJ_RELEASE(caddy); + return; + } + + if (ORTE_JOB_STATE_FAILED_TO_START == jobstate || + ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) { + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != jdata->aborted_proc) { + sts = jdata->aborted_proc->exit_code; + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + if (WIFSIGNALED(sts)) { /* died on signal */ +#ifdef WCOREDUMP + if (WCOREDUMP(sts)) { + orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); + } else { + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); + } +#else + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); +#endif /* WCOREDUMP */ + } else { + orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, + WEXITSTATUS(sts)); + sts = WEXITSTATUS(sts); + } + } + } + } + + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); +} + 
+static void cleanup_local_proc(orte_job_t *jdata, + orte_process_name_t *proc) +{ + orte_proc_t *pptr; + int i; + + /* see if this is a local proc to me */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, proc, &pptr->name)) { + opal_pointer_array_set_item(orte_local_children, i, NULL); + OBJ_RELEASE(pptr); + jdata->num_local_procs--; + return; + } + } +} + +static void proc_errors(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_proc_t *pptr, *proct; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + int i; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_hnp: for proc %s state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + goto cleanup; } - if (NULL == proc) { - /* this is an update for an entire local job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { - orte_never_launched = true; - } - orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - default_hnp_abort(job, exit_code); - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - /* update the state */ - jdata->state = jobstate; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported state %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(jobstate))); - - switch 
(jobstate) { - case ORTE_JOB_STATE_TERMINATED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_ABORTED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - sts = exit_code; - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - if (WIFSIGNALED(exit_code)) { /* died on signal */ -#ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } else { - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } -#else - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); -#endif /* WCOREDUMP */ - } else { - orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, - WEXITSTATUS(exit_code)); - sts = WEXITSTATUS(exit_code); - } - } - default_hnp_abort(jdata->jobid, sts); - } - break; - - case ORTE_JOB_STATE_SILENT_ABORT: - failed_start(jdata); - check_job_complete(jdata); /* 
set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - } - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_JOB_STATE_RUNNING: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); - /* record that we reported */ - jdata->num_daemons_reported++; - /* report if requested */ - if (orte_report_launch_progress) { - if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { - opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", - (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, - (int)jdata->num_launched, (int)jdata->num_procs); - } - } - break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: - orte_never_launched = true; - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, - ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_COMM_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - default: - break; - } - return ORTE_SUCCESS; - } - /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* if the orteds are terminating, check job complete */ if (orte_orteds_term_ordered) { opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); - check_job_complete(NULL); - return ORTE_SUCCESS; + goto cleanup; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + goto cleanup; } } + pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); - /* update is for a specific proc */ - switch (state) { - case ORTE_PROC_STATE_ABORTED: - case ORTE_PROC_STATE_ABORTED_BY_SIG: - case ORTE_PROC_STATE_TERM_WO_SYNC: - update_proc(jdata, proc, state, pid, exit_code); - /* kill all local procs */ - killprocs(proc->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_LAUNCHED: - /* record the pid for this child */ - update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - /* kill all jobs */ - update_proc(jdata, proc, state, pid, exit_code); - /* kill all local procs */ - killprocs(proc->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: + /* we MUST handle a communication failure before doing anything else + * as it requires some special care to avoid normal termination issues + * for local application procs + */ + if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ - break; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Comm failure to non-daemon proc - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", + "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; + goto cleanup; } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { + /* if we have ordered orteds to terminate or abort + * is in progress, record it */ + if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", + "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); - /* update daemon job */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - pptr->state = state; - jdata->num_terminated++; + /* if all my routes and local children are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) { + /* at least one is still alive */ + goto cleanup; } - } - /* check if complete */ - check_job_complete(jdata); - break; + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnp: all routes and children gone - ordering exit", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + 
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } + goto cleanup; } - /* if abort is in progress, see if this one failed to tell - * us it had terminated + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Comm failure: daemon %s - aborting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* record the first one to fail */ + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abort the system */ + default_hnp_abort(jdata); + goto cleanup; + } + + /* update the proc state - can get multiple reports on a proc + * depending on circumstances, so ensure we only do this once + */ + if (pptr->state < ORTE_PROC_STATE_TERMINATED) { + pptr->state = state; + jdata->num_terminated++; + } + /* since we only come here if the proc terminated, + * cleanup the local proc, if required + */ + cleanup_local_proc(jdata, proc); + + /* ensure we record the failed proc properly so we can report + * the error once we terminate + */ + switch (state) { + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - 
pptr->state = state; - jdata->num_terminated++; - } + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } + /* don't abort the job as this isn't an abnormal termination */ + break; + + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s aborted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + /* now treat a special case - if the proc exit'd without a required + * sync, it may have done so 
with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_FAILED_TO_LAUNCH: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + if (!jdata->abort) { + if (ORTE_PROC_STATE_FAILED_TO_START) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + } else { + jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; + } + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_CALLED_ABORT: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s called abort", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s exceeded sensor boundary", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = 
pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_TERM_NON_ZERO: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s exited with non-zero status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + pptr->exit_code)); + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + /* track the number of non-zero exits */ + jdata->num_non_zero_exit++; + if (orte_abort_non_zero_exit) { + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + } + /* user requested we abort in this scenario */ + default_hnp_abort(jdata); + } else { + /* user requested we consider this normal termination */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } - /* check if complete */ - check_job_complete(jdata); } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only for daemons */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - pptr->state = state; - jdata->num_terminated++; - } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s heartbeat failed", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* remove from dependent routes, if it is one */ 
orte_routed.route_lost(proc); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ - default_hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; + default_hnp_abort(jdata); + break; default: + /* shouldn't get this, but terminate job if required */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s default error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + if (jdata->num_terminated == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } break; } - return ORTE_SUCCESS; + cleanup: + OBJ_RELEASE(caddy); } static int predicted_fault(opal_list_t *proc_list, @@ -507,35 +569,56 @@ static int ft_event(int state) /***************** * Local Functions *****************/ -static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) +static void default_hnp_abort(orte_job_t *jdata) { int rc; /* if we are already in progress, then ignore this call */ if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s with status %d", + "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); + ORTE_JOBID_PRINT(jdata->jobid))); return; } OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: abort called on job %s with status %d", + "%s errmgr:default_hnp: abort called on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); + ORTE_JOBID_PRINT(jdata->jobid))); + + /* the job aborted - turn off any sensors on this job */ + orte_sensor.stop(jdata->jobid); /* set control params to indicate we are terminating */ orte_job_term_ordered = true; - orte_abnormal_term_ordered = true; orte_enable_recovery = false; - 
/* set the exit status, just in case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it + /* if it is the daemon job that aborted, then we need + * to flag an abnormal term - otherwise, just abort + * the job cleanly */ - ORTE_UPDATE_EXIT_STATUS(exit_code); + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) { + orte_abnormal_term_ordered = true; + } + if (0 < jdata->num_non_zero_exit) { + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + jdata->num_non_zero_exit, + (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." : + "processes returned\nnon-zero exit codes."); + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_hnp: ordering orted termination", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* tell the plm to terminate the orteds - they will automatically * kill their local procs */ @@ -543,691 +626,3 @@ static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) ORTE_ERROR_LOG(rc); } } - -static void failed_start(orte_job_t *jdata) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* get the master proc object */ - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = child->state; - proc->exit_code = child->exit_code; - /* update the counter so we can terminate */ - jdata->num_terminated++; - /* remove the child from our list */ - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jobdat->num_local_procs--; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported incomplete start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); -} - -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = jobstate; - jdata->state = jobstate; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (jdata->jobid == child->name->jobid) { - child->state = state; - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = state; - if (proc->exit_code < exit_code) { - proc->exit_code = exit_code; - } - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jdata->num_terminated++; - jobdat->num_local_procs--; - } else if (ORTE_PROC_STATE_RUNNING) { - jdata->num_launched++; - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - -} - -static void update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_proc_t *proct; - orte_odls_job_t *jobdat, *jdat; - int i; - - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == jdata->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /*** UPDATE 
LOCAL CHILD ***/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (child->name->vpid == proc->vpid) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - if (NULL != jobdat) { - jobdat->num_local_procs--; - } - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - return; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /*** UPDATE REMOTE CHILD ***/ - for (i=0; i < jdata->procs->size; i++) { - if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - if (proct->name.jobid != proc->jobid || - proct->name.vpid != proc->vpid) { - continue; - } - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } else if 
(ORTE_PROC_STATE_UNTERMINATED < state) { - /* update the counter so we can terminate */ - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } - return; - } -} - -static void check_job_complete(orte_job_t *jdata) -{ - orte_proc_t *proc; - int i; - orte_std_cntr_t j; - orte_job_t *job; - orte_node_t *node; - orte_job_map_t *map; - orte_std_cntr_t index; - bool one_still_alive; - orte_vpid_t non_zero=0, lowest=0; - char *msg; - - if (NULL == jdata) { - /* just check to see if the daemons are complete */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_complete - received NULL job, checking daemons", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto CHECK_DAEMONS; - } - - for (i=0; i < jdata->procs->size && !jdata->abort; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - /* the proc array may no longer be left justified, so - * we need to check everything - */ - continue; - } - - if (0 != proc->exit_code) { - non_zero++; - if (0 == lowest) { - lowest = proc->exit_code; - } - } - - switch (proc->state) { - case ORTE_PROC_STATE_KILLED_BY_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive - */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - } - } - goto CHECK_ALIVE; - break; - case ORTE_PROC_STATE_ABORTED: - OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s aborted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_FAILED_TO_START: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_default_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_WO_SYNC: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - 
OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - break; - case ORTE_PROC_STATE_COMM_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_COMM_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_CALLED_ABORT: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_NON_ZERO: - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - if (orte_abort_non_zero_exit) { - if (!jdata->abort) { - jdata->state = 
ORTE_JOB_STATE_NON_ZERO_TERM; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - } - } - break; - - default: - if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - } - break; - } - } - - if (jdata->abort) { - /* the job aborted - turn off any sensors on this job */ - orte_sensor.stop(jdata->jobid); - } - - if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && - jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated */ - jdata->state = ORTE_JOB_STATE_TERMINATED; - - /* turn off any sensor monitors on this job */ - orte_sensor.stop(jdata->jobid); - - if (0 < non_zero) { - if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { - /* update the exit code */ - ORTE_UPDATE_EXIT_STATUS(lowest); - } - - /* warn user */ - opal_output(orte_clean_output, - "-------------------------------------------------------\n" - "While %s job %s terminated normally, %s %s. Further examination may be required.\n" - "-------------------------------------------------------", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), - ORTE_VPID_PRINT(non_zero), - (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed declared job %s normally terminated - checking all jobs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - } - - /* if this job is a continuously operating one, then don't do - * anything further - just return here - */ - if (NULL != jdata && - (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || - ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { - goto CHECK_ALIVE; - } - - /* if the job that is being checked is the HNP, then we are - * trying to terminate the orteds. In that situation, we - * do -not- check all jobs - we simply notify the DEFAULT_HNP - * that the orteds are complete. Also check special case - * if jdata is NULL - we want - * to definitely declare the job done if the orteds - * have completed, no matter what else may be happening. - * This can happen if a ctrl-c hits in the "wrong" place - * while launching - */ -CHECK_DAEMONS: - if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if (0 == orte_routed.num_routes()) { - /* orteds are done! */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == jdata) { - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - } - jdata->state = ORTE_JOB_STATE_TERMINATED; - orte_quit(); - return; - } - return; - } - - /* Release the resources used by this job. Since some errmgrs may want - * to continue using resources allocated to the job as part of their - * fault recovery procedure, we only do this once the job is "complete". - * Note that an aborted/killed job -is- flagged as complete and will - * therefore have its resources released. 
We need to do this after - * we call the errmgr so that any attempt to restart the job will - * avoid doing so in the exact same place as the current job - */ - if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { - map = jdata->map; - for (index = 0; index < map->nodes->size; index++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing procs from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); - for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (proc->name.jobid != jdata->jobid) { - /* skip procs from another job */ - continue; - } - node->slots_inuse--; - node->num_procs--; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing proc %s from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), node->name)); - /* set the entry in the node array to NULL */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* release the proc once for the map entry */ - OBJ_RELEASE(proc); - } - } - OBJ_RELEASE(map); - jdata->map = NULL; - } - -CHECK_ALIVE: - /* now check to see if all jobs are done - release this jdata - * object when we find it - */ - one_still_alive = false; - for (j=1; j < orte_job_data->size; j++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { - /* since we are releasing jdata objects as we - * go, we can no longer assume that the job_data - * array is left justified - */ - continue; - } - /* if this is the job we are checking AND it normally terminated, - * then go ahead and release it. 
We cannot release it if it - * abnormally terminated as mpirun needs the info so it can - * report appropriately to the user - * - * NOTE: do not release the primary job (j=1) so we - * can pretty-print completion message - */ - if (NULL != jdata && job->jobid == jdata->jobid && - (jdata->state == ORTE_JOB_STATE_TERMINATED || - jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { - /* release this object, ensuring that the - * pointer array internal accounting - * is maintained! - */ - if (1 < j) { - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); - } - continue; - } - /* if the job is flagged to not be monitored, skip it */ - if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { - continue; - } - /* when checking for job termination, we must be sure to NOT check - * our own job as it - rather obviously - has NOT terminated! - */ - if (job->num_terminated < job->num_procs) { - /* we have at least one job that is not done yet - we cannot - * just return, though, as we need to ensure we cleanout the - * job data for the job that just completed - */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed job %s is not terminated (%d:%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs)); - one_still_alive = true; - } - else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed job %s is terminated (%d vs %d [%s])", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs, - (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); - } - } - /* if a job is still alive, we just return */ - if (one_still_alive) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed at least one job is not terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - /* if we get here, then all jobs are done, so terminate */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed all jobs terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* set the exit status to 0 - this will only happen if it - * wasn't already set by an error condition - */ - ORTE_UPDATE_EXIT_STATUS(0); - /* provide a notifier message if that framework is active - ignored otherwise */ - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { - if (NULL == job->name) { - job->name = strdup(orte_process_info.nodename); - } - if (NULL == job->instance) { - asprintf(&job->instance, "%d", orte_process_info.pid); - } - if (0 == orte_exit_status) { - asprintf(&msg, "Job %s:%s complete", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); - } else { - asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); - } - free(msg); - /* this job object will be release during finalize */ - } - - orte_jobs_complete(); - /* if I am the only daemon alive, then I can exit now */ - if (0 == orte_routed.num_routes()) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if (ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - - if (ORTE_SUCCESS != (rc = 
orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} diff --git a/orte/mca/errmgr/default_orted/configure.m4 b/orte/mca/errmgr/default_orted/configure.m4 index cfc2eb6348..7ae3aef7cc 100644 --- a/orte/mca/errmgr/default_orted/configure.m4 +++ b/orte/mca/errmgr/default_orted/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_orted_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_orted/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index 810728476e..2ed26e8b73 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -6,6 +6,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,8 +34,7 @@ #include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" + #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -42,8 +43,11 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/sensor/sensor.h" #include "orte/mca/ess/ess.h" +#include "orte/mca/state/state.h" + #include "orte/runtime/orte_quit.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" @@ -51,18 +55,6 @@ #include "errmgr_default_orted.h" -/* Local functions */ -static bool any_live_children(orte_jobid_t job); -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); -static bool all_children_registered(orte_jobid_t job); -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); -static void update_local_children(orte_odls_job_t *jobdat, - orte_job_state_t jobstate, - orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); - /* * Module functions: Global */ @@ -73,13 +65,6 @@ static int predicted_fault(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); @@ -96,7 +81,6 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - 
update_state, predicted_fault, suggest_map_targets, ft_event, @@ -104,11 +88,32 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = { NULL }; +/* Local functions */ +static bool any_live_children(orte_jobid_t job); +static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat); +static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child); +static bool all_children_registered(orte_jobid_t job); +static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); +static void failed_start(orte_job_t *jobdat); +static void update_local_children(orte_job_t *jobdat, + orte_job_state_t jobstate, + orte_proc_state_t state); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); + +static void job_errors(int fd, short args, void *cbdata); +static void proc_errors(int fd, short args, void *cbdata); + /************************ * API Definitions ************************/ static int init(void) { + /* setup state machine to trap job errors */ + orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -117,138 +122,125 @@ static int finalize(void) return ORTE_SUCCESS; } -static void cbfunc(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) +static void job_errors(int fd, short args, void *cbdata) { - OBJ_RELEASE(buffer); -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat = NULL; - orte_odls_child_t *child; - opal_buffer_t *alert; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_job_state_t jobstate; + int rc; orte_plm_cmd_flag_t cmd; - int rc=ORTE_SUCCESS; - orte_vpid_t 
null=ORTE_VPID_INVALID; - orte_ns_cmp_bitmask_t mask; + opal_buffer_t *alert; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + return; + } + + /* if the jdata is NULL, then we abort as this + * is reporting an unrecoverable error + */ + if (NULL == caddy->jdata) { + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); + return; + } + + /* update the state */ + jdata = caddy->jdata; + jobstate = caddy->job_state; + jdata->state = jobstate; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_orted: job %s reported error state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + switch (jobstate) { + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jdata); + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_children(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); + break; + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* order termination */ + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto cleanup; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + goto cleanup; + break; + + default: + break; + } + alert = OBJ_NEW(opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + 
ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + } + + cleanup: + OBJ_RELEASE(caddy); +} + +static void proc_errors(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_proc_t *pptr; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + + orte_proc_t *child, *ptr; + opal_buffer_t *alert; + orte_plm_cmd_flag_t cmd; + int rc=ORTE_SUCCESS; + orte_vpid_t null=ORTE_VPID_INVALID; + orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL; + int i, nchildren; + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default_orted:update_state process %s to %s", + "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* if this is a heartbeat failure, let the HNP handle it */ - if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || - ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { - return ORTE_SUCCESS; - } - - /*** UPDATE COMMAND FOR A JOB ***/ - if (NULL == proc) { - /* this is an update for an entire job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the "invalid" jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = 
ORTE_SUCCESS; - } - return rc; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { - return ORTE_ERR_NOT_FOUND; - } - - switch (jobstate) { - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jobdat, exit_code); - break; - case ORTE_JOB_STATE_RUNNING: - /* update all local child states */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); - case ORTE_JOB_STATE_COMM_FAILED: - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* let the HNP handle this */ - return ORTE_SUCCESS; - break; - - default: - break; - } - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the job info */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; + if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { + goto cleanup; } /* if this was a failed comm, then see if it was to our @@ -257,12 +249,12 @@ static int update_state(orte_jobid_t job, if 
(ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; + goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - ignore */ - return ORTE_SUCCESS; + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted daemon %s exited", @@ -279,79 +271,76 @@ static int update_state(orte_jobid_t job, /* terminate - our routed children will see * us leave and automatically die */ - orte_quit(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto cleanup; } - /* was it a daemon that failed? */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { - /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default:orted all routes gone - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } else { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default:orted not exiting, num_routes() == %d, num children == %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)orte_routed.num_routes(), - (int)opal_list_get_size(&orte_local_children))); + /* are any of my children still alive */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) { + goto cleanup; + } } } - /* if not, then indicate we can continue */ - return ORTE_SUCCESS; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; + /* if all my routes and children are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s errmgr:default:orted all routes gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } else { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s errmgr:default:orted not exiting, num_routes() == %d, num_children == %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)orte_routed.num_routes(), nchildren)); } + /* if not, then we can continue */ + goto cleanup; } - if (NULL == jobdat) { + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ - return ORTE_SUCCESS; + goto cleanup; } + pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* if there are no local procs for this job, we can * ignore this call */ - if (0 == jobdat->num_local_procs) { - return ORTE_SUCCESS; + if (0 == jdata->num_local_procs) { + goto cleanup; + } + + /* find this proc in the local children */ + child = NULL; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) { + child = ptr; + break; + } + } + if (NULL == child) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default_orted got state %s for proc %s pid %d", + "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), - ORTE_NAME_PRINT(proc), pid)); + ORTE_NAME_PRINT(proc))); - /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { - /* find this proc in the local children */ - for (item = 
opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - /* kill this proc */ - killprocs(proc->jobid, proc->vpid); - } - return ORTE_SUCCESS; - } - } + child->state = state; + /* Decrement the number of local procs */ + jdata->num_local_procs--; + /* kill this proc */ + killprocs(proc->jobid, proc->vpid); + goto cleanup; } if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { @@ -361,6 +350,21 @@ static int update_state(orte_jobid_t job, } } + if (ORTE_PROC_STATE_FAILED_TO_START == state || + ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { + /* update the proc state */ + child->state = state; + /* count the proc as having "terminated" */ + jdata->num_terminated++; + /* leave the error report in this case to the + * state machine, which will receive notice + * when all local procs have attempted to start + * so that we send a consolidated error report + * back to the HNP + */ + goto cleanup; + } + if (ORTE_PROC_STATE_TERMINATED < state) { /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away @@ -370,79 +374,46 @@ static int update_state(orte_jobid_t job, cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - return rc; + return; } - /* find this proc in the local children */ - for (item = 
opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - } - /* now pack the child's info */ - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* remove the child from our local list as it is no longer alive */ - opal_list_remove_item(&orte_local_children, &child->super); - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - jobdat->num_local_procs)); - - /* release the child object */ - OBJ_RELEASE(child); - /* done with loop */ - break; - } + child->state = state; + /* now pack the child's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return; } + /* remove the child from our local array as it is no longer alive */ + opal_pointer_array_set_item(orte_local_children, i, NULL); + /* Decrement the number of local procs */ + jdata->num_local_procs--; + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child->name), + jdata->num_local_procs)); + + /* release the child object */ + OBJ_RELEASE(child); /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; 
} - return rc; + return; } REPORT_STATE: - /* find this proc in the local children so we can update its state */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - } - /* done with loop */ - break; - } - } - if (ORTE_PROC_STATE_REGISTERED == state) { /* see if everyone in this job has registered */ if (all_children_registered(proc->jobid)) { @@ -460,116 +431,97 @@ static int update_state(orte_jobid_t job, cmd = ORTE_PLM_INIT_ROUTES_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } - /* pack all the local child vpids and epochs */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) { + /* pack all the local child vpids */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (ptr->name.jobid == proc->jobid) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } } } /* pack an invalid marker */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + 
return; } /* add in contact info for all procs in the job */ if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&alert); - return rc; + return; } /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; } } - return rc; + return; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return ORTE_SUCCESS; - } - alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack the data for the job */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); + return; } - FINAL_CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); + ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list - do not lock * the thread as we are already locked */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = 
(orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (jobdat->jobid == child->name->jobid) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (jdata->jobid == ptr->name.jobid) { + opal_pointer_array_set_item(orte_local_children, i, NULL); + OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ - orte_session_dir_cleanup(jobdat->jobid); + orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ - opal_list_remove_item(&orte_local_jobdata, &jobdat->super); - OBJ_RELEASE(jobdat); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); + OBJ_RELEASE(jdata); /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; } - - /* indicate that the job is complete */ - return rc; + return; } - return ORTE_SUCCESS; + + cleanup: + OBJ_RELEASE(caddy); } static int predicted_fault(opal_list_t *proc_list, @@ -597,18 +549,15 @@ static int ft_event(int state) *****************/ static bool any_live_children(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child 
part of the specified job? */ - if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && + if ((job == child->name.jobid || ORTE_JOBID_WILDCARD == job) && child->alive) { return true; } @@ -619,12 +568,12 @@ static bool any_live_children(orte_jobid_t job) } -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) +static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child) { int rc; /* pack the child's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } @@ -633,20 +582,6 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) ORTE_ERROR_LOG(rc); return rc; } - /* if we are timing things, pack the time the proc was launched */ - if (orte_timing) { - int64_t tmp; - tmp = child->starttime.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = child->starttime.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } /* pack its state */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { ORTE_ERROR_LOG(rc); @@ -661,11 +596,10 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) return ORTE_SUCCESS; } -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) +static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat) { - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; + int rc, i; + orte_proc_t *child; orte_vpid_t null=ORTE_VPID_INVALID; /* pack the jobid */ @@ -673,27 +607,12 @@ static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) ORTE_ERROR_LOG(rc); return rc; } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if 
(orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); /* if this child is part of the job... */ - if (child->name->jobid == jobdat->jobid) { + if (child->name.jobid == jobdat->jobid) { if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return rc; @@ -711,18 +630,15 @@ static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) static bool all_children_registered(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child part of the specified job? */ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + if (job == child->name.jobid || ORTE_JOBID_WILDCARD == job) { /* if this child has terminated, we consider it as having * registered for the purposes of this function. 
If it never * did register, then we will send a NULL rml_uri back to @@ -736,39 +652,31 @@ static bool all_children_registered(orte_jobid_t job) */ continue; } - /* if this child is *not* registered yet, return false */ - if (!child->init_recvd) { - return false; - } - /* if this child has registered a finalize, return false */ - if (child->fini_recvd) { + /* if this child has *not* registered yet, return false */ + if (!child->registered) { return false; } } } - /* if we get here, then everyone in the job is currently registered */ + /* if we get here, then everyone in the job has registered */ return true; } static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) { - opal_list_item_t *item; - orte_odls_child_t *child; - int rc; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + orte_proc_t *child; + int rc, i; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child part of the specified job? 
*/ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + if (job == child->name.jobid || ORTE_JOBID_WILDCARD == job) { /* pack the child's vpid - must be done in case rml_uri is NULL */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name.vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } @@ -784,21 +692,21 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) } -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) +static void failed_start(orte_job_t *jobdat) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; /* set the state */ jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_FAILED_TO_START == child->state) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* is this child part of the specified job? 
*/ + if (child->name.jobid == jobdat->jobid) { + if (ORTE_PROC_STATE_FAILED_TO_START == child->state) { /* this proc never launched - flag that the iof * is complete or else we will hang waiting for * pipes to close that were never opened @@ -816,19 +724,20 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) return; } -static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) +static void update_local_children(orte_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; /* update job state */ jobdat->state = jobstate; /* update children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (jobdat->jobid == child->name->jobid) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* is this child part of the specified job? */ + if (jobdat->jobid == child->name.jobid) { child->state = state; } } @@ -857,7 +766,6 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/errmgr/errmgr.h b/orte/mca/errmgr/errmgr.h index 24a6de292a..2d7348c6f0 100644 --- a/orte/mca/errmgr/errmgr.h +++ b/orte/mca/errmgr/errmgr.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. 
+ * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -204,27 +206,6 @@ __opal_attribute_format_funcptr__(__printf__, 2, 3); typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs); -/** - * Alert - process aborted - * This function is called by the PLM when a remote process aborts during execution. Actions taken - * in response to the abnormal termination of a remote application process will vary across - * the various errmgr components. - * - * NOTE: Local process errors should always be reported through the error_detected interface and - * NOT here. - * - * @param *name Pointer to the name of the proc that aborted - * - * @retval ORTE_SUCCESS Whatever action that was taken was successful - * @retval ORTE_ERROR Appropriate error code - */ -typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - /** * Predicted process/node failure notification * @@ -294,8 +275,6 @@ struct orte_errmgr_base_module_2_3_0_t { orte_errmgr_base_module_abort_fn_t abort; orte_errmgr_base_module_abort_peers_fn_t abort_peers; - /** Actual process failure notification */ - orte_errmgr_base_module_update_state_fn_t update_state; /** Predicted process/node failure notification */ orte_errmgr_base_module_predicted_fault_fn_t predicted_fault; /** Suggest a node to map a restarting process onto */ diff --git a/orte/mca/errmgr/hnp/Makefile.am b/orte/mca/errmgr/hnp/Makefile.am deleted file mode 100644 index db6b1a6a0f..0000000000 --- a/orte/mca/errmgr/hnp/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_pkgdata_DATA = help-orte-errmgr-hnp.txt - -sources = \ - errmgr_hnp.h \ - errmgr_hnp_component.c \ - errmgr_hnp.c \ - errmgr_hnp_autor.c \ - errmgr_hnp_crmig.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_errmgr_hnp_DSO -component_noinst = -component_install = mca_errmgr_hnp.la -else -component_noinst = libmca_errmgr_hnp.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_hnp_la_SOURCES = $(sources) -mca_errmgr_hnp_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_hnp_la_SOURCES =$(sources) -libmca_errmgr_hnp_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c deleted file mode 100644 index 5f6e766cd5..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ /dev/null @@ -1,2182 +0,0 @@ -/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif -#ifdef HAVE_SYS_WAIT_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" - -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/sensor/sensor.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/debugger/base/base.h" -#include "orte/mca/notifier/notifier.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/ess/ess.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" - -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_locks.h" -#include "orte/runtime/orte_quit.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -/********************** - * C/R Mgr Components - * Global: HNP - **********************/ -static orte_errmgr_base_module_t global_module = { - /** Initialization Function */ - orte_errmgr_hnp_global_module_init, - /** Finalization Function */ - orte_errmgr_hnp_global_module_finalize, - /** Error Log */ - orte_errmgr_base_log, - /** Forced Abort */ - orte_errmgr_base_abort, - /** Peer Force Abort */ - orte_errmgr_base_abort_peers, - /** Update State */ - orte_errmgr_hnp_global_update_state, - /* Predicted Fault */ - orte_errmgr_hnp_global_predicted_fault, - /* Suggest proc to node mapping */ - orte_errmgr_hnp_global_suggest_map_targets, - /* FT Event hook */ - 
orte_errmgr_hnp_global_ft_event, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - /* Set the callback */ - ,orte_errmgr_base_set_fault_callback -#endif -}; - - -/* - * Local functions - */ -static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); -static void failed_start(orte_job_t *jdata); -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code); -static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, - orte_proc_state_t state, orte_exit_code_t exit_code); -static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); -#if ORTE_RESIL_ORTE -static int send_to_local_applications(opal_pointer_array_t *dead_names); -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata); -#endif - -/************************ - * API Definitions - ************************/ -int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp:component_query()"); - - if( ORTE_PROC_IS_HNP ) { - *priority = mca_errmgr_hnp_component.super.priority; - *module = (mca_base_module_t *)&global_module; - } - /* Daemons and Apps have their own components */ - else { - *module = NULL; - *priority = -1; - } - - return ORTE_SUCCESS; -} - -/******************* - * Global Functions - ********************/ -int orte_errmgr_hnp_global_module_init(void) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_init()) ) { - exit_status = ret; - goto cleanup; - } - } - else { - /* Still need the tool listener so we can tell it that we cannot do - * anything if they 
ask. - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - if( mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_init()) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_init()) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_module_finalize(void) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_finalize()) ) { - exit_status = ret; - goto cleanup; - } - } - else { - /* Still need the tool listener so we can tell it that we cannot do - * anything if they ask. - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - if( mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_finalize()) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_finalize()) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - int ret, exit_status = ORTE_SUCCESS; - - mca_errmgr_hnp_component.ignore_current_update = false; - - if (orte_finalizing || - orte_job_term_ordered || - ORTE_PROC_STATE_TERMINATED == state ) { - mca_errmgr_hnp_component.term_in_progress = true; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:hnp:update_state() %s) " - "------- %s state updated for process %s to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc_name) 
? "App. Process" : - (proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state))); - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled && - !mca_errmgr_hnp_component.autor_in_progress) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( mca_errmgr_hnp_component.autor_enabled && - !mca_errmgr_hnp_component.crmig_in_progress) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( !mca_errmgr_hnp_component.ignore_current_update ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ -#if OPAL_ENABLE_FT_CR - int ret, exit_status = ORTE_SUCCESS; - - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_predicted_fault(proc_list, - node_list, - suggested_map)) ) { - exit_status = ret; - goto cleanup; - } - } - /* - * If Process migration is not enabled, then return an error the tool - * which will print an appropriate message for the user. 
- */ - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp:predicted_fault() Command line asked for a migration, but it is not enabled\n")); - orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERROR); - exit_status = ORTE_ERR_NOT_IMPLEMENTED; - goto cleanup; - } - -cleanup: - return exit_status; -#else - return ORTE_ERR_NOT_IMPLEMENTED; -#endif /* OPAL_ENABLE_FT_CR */ -} - -int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ -#if OPAL_ENABLE_FT_CR - int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED; - - if( mca_errmgr_hnp_component.crmig_enabled && - !mca_errmgr_hnp_component.autor_in_progress ) { - exit_status = ORTE_SUCCESS; - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_suggest_map_targets(proc, - oldnode, - node_list)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( mca_errmgr_hnp_component.autor_enabled && - !mca_errmgr_hnp_component.crmig_in_progress ) { - exit_status = ORTE_SUCCESS; - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_suggest_map_targets(proc, - oldnode, - node_list)) ) { - exit_status = ret; - goto cleanup; - } - } - -cleanup: - return exit_status; -#else - return ORTE_ERR_NOT_IMPLEMENTED; -#endif /* OPAL_ENABLE_FT_CR */ -} - -int orte_errmgr_hnp_global_ft_event(int state) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( !mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( !mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - - -/********************** - * 
From HNP - **********************/ -int orte_errmgr_hnp_base_global_init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE, - ORTE_RML_PERSISTENT, failure_notification, NULL); -#endif - - return ret; -} - -int orte_errmgr_hnp_base_global_finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); -#endif - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_job_t *jdata; - orte_exit_code_t sts; - orte_odls_child_t *child; - int rc; - orte_app_context_t *app; - orte_proc_t *pdat; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported state %s" - " for proc %s state %s pid %d exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), pid, exit_code)); - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - if (NULL == proc) { - /* this is an update for an entire local job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { - orte_never_launched = true; - } - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - hnp_abort(job, exit_code); - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - /* update the state */ - jdata->state = jobstate; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported state %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(jobstate))); - - switch (jobstate) { - case ORTE_JOB_STATE_TERMINATED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_ABORTED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - sts = exit_code; - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - if (WIFSIGNALED(exit_code)) { /* died on signal */ -#ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } else { - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } -#else - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); -#endif /* WCOREDUMP */ - } else { - orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, - WEXITSTATUS(exit_code)); - sts = WEXITSTATUS(exit_code); - } - } - hnp_abort(jdata->jobid, sts); - } - break; - - case ORTE_JOB_STATE_SILENT_ABORT: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - } - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_JOB_STATE_RUNNING: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); - /* record that we reported */ - jdata->num_daemons_reported++; - /* report if requested */ - if (orte_report_launch_progress) { - if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { - opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", - (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, - (int)jdata->num_launched, (int)jdata->num_procs); - } - } - break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: - orte_never_launched = true; - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, - ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_COMM_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - default: - break; - } - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { - /* if the orteds are terminating, check job complete */ - if (orte_orteds_term_ordered) { - opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); - check_job_complete(NULL); - return ORTE_SUCCESS; - } else { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - } - -#if OPAL_ENABLE_FT_CR - /* Notify the process state to the notifier framework if it is - active and selected. 
*/ - orte_errmgr_base_proc_state_notify(state, proc); -#endif - - /* update is for a specific proc */ - switch (state) { - case ORTE_PROC_STATE_ABORTED: - case ORTE_PROC_STATE_ABORTED_BY_SIG: - case ORTE_PROC_STATE_TERM_WO_SYNC: - if( jdata->enable_recovery ) { - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_LAUNCHED: - /* record the pid for this child */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (jdata->enable_recovery) { - killprocs(proc->jobid, proc->vpid); - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - /* kill all jobs */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: - /* is this to a daemon? */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if this is my own connection, ignore it */ - if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; - } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - check_job_complete(jdata); - break; - } - /* if abort is in progress, see if this one failed to tell - * us it had terminated - */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - check_job_complete(jdata); - break; - } - - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); - - if( orte_enable_recovery ) { - /* relocate its 
processes */ - if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { - /* unable to relocate for some reason */ - opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - /* check if all is complete so we can terminate */ - check_job_complete(jdata); - } - } else { -#if !ORTE_RESIL_ORTE - if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, - ORTE_VPID_PRINT(proc->vpid), "Unknown"); - } else { - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, - ORTE_VPID_PRINT(proc->vpid), - (NULL == pdat->node) ? "Unknown" : - ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name)); - } -#endif - if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) { - /* The process is already dead so don't keep trying to do - * this stuff. 
*/ - return ORTE_SUCCESS; - } - -#if !ORTE_RESIL_ORTE - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); -#endif - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process */ - check_job_complete(jdata); - } - } - break; - - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only from daemons */ - if( orte_enable_recovery ) { - /* relocate its processes */ - } else { - orte_errmgr_hnp_record_dead_process(proc); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; - } - break; - - default: - break; - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_base_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - opal_pointer_array_t *dead_names; - int32_t i; - orte_process_name_t *name_item; - orte_epoch_t epoch; - orte_job_t *jdat; - orte_proc_t *pdat, *pdat2; - opal_buffer_t *answer; - - /* If processes have started terminating, don't worry about reported - * failures. The ORTEDs don't know the difference. 
*/ - if (mca_errmgr_hnp_component.term_in_progress) { - return; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender)); - } - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return; - } - - dead_names = OBJ_NEW(opal_pointer_array_t); - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - /* Unpack the buffer to get the dead process' name. */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return; - } - - /* Check to see if the message is telling us about an old epoch. - * If so ignore the message. - */ - epoch = orte_util_lookup_epoch(name_item); - if (name_item->epoch < epoch) { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - free(name_item); - continue; - } else { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - } - - opal_pointer_array_add(dead_names, name_item); - - /* Check to see if the message is telling us about an orted and - * it is from another orted. Orteds don't have the list of all - * the application processes so they don't know if there were - * any child processes on the nodes that they are reporting. 
*/ - if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - continue; - } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { - continue; - } else if (NULL == pdat->node) { - continue; - } - - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - /* ignore this process if it has already terminated */ - if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) { - continue; - } - - /* the proc must have been alive, so notify everyone that it died */ - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - name_item->jobid = pdat2->name.jobid; - name_item->vpid = pdat2->name.vpid; - name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); - - opal_pointer_array_add(dead_names, name_item); - } - } - } - - } - - /* Update the number of failed process so any duplicates don't get - * re-reported. - */ - num_failed = opal_pointer_array_get_size(dead_names); - - if (num_failed > 0) { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - - if (!orte_orteds_term_ordered) { - /* Send a message out to all the orteds to inform them that the - * process is dead. Long live the process (or not if it is so - * decided)! 
- */ - answer = OBJ_NEW(opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - - for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_FAILURE_NOTICE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return; - } - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - } - - OBJ_RELEASE(dead_names); -} -#endif - -/***************** - * Local Functions - *****************/ -static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) -{ - int rc; - - /* if we are already in progress, then ignore this call */ - if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: abort called on job %s with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - - /* if debuggers are running, clean up */ - orte_debugger.finalize(); - - /* set control params to indicate we are terminating */ - orte_job_term_ordered = true; - orte_abnormal_term_ordered = true; - orte_enable_recovery = false; - - /* set the exit status, just in 
case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it - */ - ORTE_UPDATE_EXIT_STATUS(exit_code); - - /* tell the plm to terminate the orteds - they will automatically - * kill their local procs - */ - if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { - ORTE_ERROR_LOG(rc); - } -} - -static void failed_start(orte_job_t *jdata) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* get the master proc object */ - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = child->state; - proc->exit_code = child->exit_code; - /* update the counter so we can terminate */ - jdata->num_terminated++; - /* remove the child from our list */ - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jobdat->num_local_procs--; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported incomplete 
start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); -} - -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = jobstate; - jdata->state = jobstate; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (jdata->jobid == child->name->jobid) { - child->state = state; - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = state; - if (proc->exit_code < exit_code) { - proc->exit_code = exit_code; - } - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jdata->num_terminated++; - jobdat->num_local_procs--; - } else if (ORTE_PROC_STATE_RUNNING) { - jdata->num_launched++; - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - -} - -void orte_errmgr_hnp_update_proc(orte_job_t *jdata, - 
orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_proc_t *proct; - orte_odls_job_t *jobdat, *jdat; - int i; - - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == jdata->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /*** UPDATE LOCAL CHILD ***/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (child->name->vpid == proc->vpid) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_UNTERMINATED < state) { - if (!jdata->enable_recovery) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - if (NULL != jobdat) { - jobdat->num_local_procs--; - } - } - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - return; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - 
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /*** UPDATE REMOTE CHILD ***/ - for (i=0; i < jdata->procs->size; i++) { - if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - if (proct->name.jobid != proc->jobid || - proct->name.vpid != proc->vpid) { - continue; - } - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } else if (ORTE_PROC_STATE_UNTERMINATED < state) { - /* update the counter so we can terminate */ - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } - return; - } -} - -static void check_job_complete(orte_job_t *jdata) -{ - orte_proc_t *proc; - int i; - orte_std_cntr_t j; - orte_job_t *job; - orte_node_t *node; - orte_job_map_t *map; - orte_std_cntr_t index; - bool one_still_alive; - orte_vpid_t non_zero=0, lowest=0; - char *msg; - -#if 0 - /* Check if FileM is active. If so then keep processing. 
*/ - OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); -#endif - if (NULL == jdata) { - /* just check to see if the daemons are complete */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto CHECK_DAEMONS; - } - - for (i=0; i < jdata->procs->size && !jdata->abort; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - /* the proc array may no longer be left justified, so - * we need to check everything - */ - continue; - } - - if (0 != proc->exit_code) { - non_zero++; - if (0 == lowest) { - lowest = proc->exit_code; - } - } - - switch (proc->state) { - case ORTE_PROC_STATE_KILLED_BY_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive - */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - } - } - goto CHECK_ALIVE; - break; - case ORTE_PROC_STATE_ABORTED: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case 
ORTE_PROC_STATE_FAILED_TO_START: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_WO_SYNC: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. 
We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - break; - case ORTE_PROC_STATE_COMM_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_COMM_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_CALLED_ABORT: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_NON_ZERO: - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - if (orte_abort_non_zero_exit) { - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - } - } - break; 
- - default: - if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - } - break; - } - } - - if (jdata->abort) { - /* the job aborted - turn off any sensors on this job */ - orte_sensor.stop(jdata->jobid); - } - - if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && - jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated */ - jdata->state = ORTE_JOB_STATE_TERMINATED; - - /* turn off any sensor monitors on this job */ - orte_sensor.stop(jdata->jobid); - - if (0 < non_zero) { - if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { - /* update the exit code */ - ORTE_UPDATE_EXIT_STATUS(lowest); - } - - /* warn user */ - opal_output(orte_clean_output, - "-------------------------------------------------------\n" - "While %s job %s terminated normally, %s %s. Further examination may be required.\n" - "-------------------------------------------------------", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), - ORTE_VPID_PRINT(non_zero), - (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - } - - /* if this job is a continuously operating one, then don't do - * anything further - just return here - */ - if (NULL != jdata && - (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || - ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { - goto CHECK_ALIVE; - } - - /* if the job that is being checked is the HNP, then we are - * trying to terminate the orteds. In that situation, we - * do -not- check all jobs - we simply notify the HNP - * that the orteds are complete. Also check special case - * if jdata is NULL - we want - * to definitely declare the job done if the orteds - * have completed, no matter what else may be happening. - * This can happen if a ctrl-c hits in the "wrong" place - * while launching - */ -CHECK_DAEMONS: - if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if (0 == orte_routed.num_routes()) { - /* orteds are done! */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == jdata) { - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - } - jdata->state = ORTE_JOB_STATE_TERMINATED; - orte_quit(); - return; - } - return; - } - - /* Release the resources used by this job. Since some errmgrs may want - * to continue using resources allocated to the job as part of their - * fault recovery procedure, we only do this once the job is "complete". - * Note that an aborted/killed job -is- flagged as complete and will - * therefore have its resources released. 
We need to do this after - * we call the errmgr so that any attempt to restart the job will - * avoid doing so in the exact same place as the current job - */ - if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { - map = jdata->map; - for (index = 0; index < map->nodes->size; index++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing procs from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); - for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (proc->name.jobid != jdata->jobid) { - /* skip procs from another job */ - continue; - } - node->slots_inuse--; - node->num_procs--; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing proc %s from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), node->name)); - /* set the entry in the node array to NULL */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* release the proc once for the map entry */ - OBJ_RELEASE(proc); - } - } - OBJ_RELEASE(map); - jdata->map = NULL; - } - -CHECK_ALIVE: - /* now check to see if all jobs are done - release this jdata - * object when we find it - */ - one_still_alive = false; - for (j=1; j < orte_job_data->size; j++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { - /* since we are releasing jdata objects as we - * go, we can no longer assume that the job_data - * array is left justified - */ - continue; - } - /* if this is the job we are checking AND it normally terminated, - * then go ahead and release it. 
We cannot release it if it - * abnormally terminated as mpirun needs the info so it can - * report appropriately to the user - * - * NOTE: do not release the primary job (j=1) so we - * can pretty-print completion message - */ - if (NULL != jdata && job->jobid == jdata->jobid && - (jdata->state == ORTE_JOB_STATE_TERMINATED || - jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { - /* release this object, ensuring that the - * pointer array internal accounting - * is maintained! - */ - if (1 < j) { - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); - } - continue; - } - /* if the job is flagged to not be monitored, skip it */ - if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { - continue; - } - /* when checking for job termination, we must be sure to NOT check - * our own job as it - rather obviously - has NOT terminated! - */ - if (job->num_terminated < job->num_procs) { - /* we have at least one job that is not done yet - we cannot - * just return, though, as we need to ensure we cleanout the - * job data for the job that just completed - */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs)); - one_still_alive = true; - } - else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs, - (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); - } - } - /* if a job is still alive, we just return */ - if (one_still_alive) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed at least one job is not terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - /* if we get here, then all jobs are done, so terminate */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed all jobs terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* set the exit status to 0 - this will only happen if it - * wasn't already set by an error condition - */ - ORTE_UPDATE_EXIT_STATUS(0); - /* provide a notifier message if that framework is active - ignored otherwise */ - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { - if (NULL == job->name) { - job->name = strdup(orte_process_info.nodename); - } - if (NULL == job->instance) { - asprintf(&job->instance, "%d", orte_process_info.pid); - } - if (0 == orte_exit_status) { - asprintf(&msg, "Job %s:%s complete", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); - } else { - asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); - } - free(msg); - /* this job object will be release during finalize */ - } - - orte_jobs_complete(); - /* if I am the only daemon alive, then I can exit now */ - if (0 == orte_routed.num_routes()) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if (ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - - if (ORTE_SUCCESS != (rc = 
orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} - -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - orte_job_t *jdat; - orte_proc_t *pdata, *pdt, *pdt2; - orte_node_t *node, *nd; - orte_app_context_t *app; - char *app_name; - int rc, i, n; - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s CHECKING ON RELOCATE FOR APP %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* get the proc_t object for this process */ - pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); - if (NULL == pdata) { - opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); - return ORTE_ERR_NOT_FOUND; - } - - /* set the state */ - pdata->state = state; - - /* retain the node id */ - node = pdata->node; - - /* if it is a daemon that died, we need to flag all of its procs - * to be relocated - */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* remove this proc from the daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* check to see if any other nodes are "alive" */ - if (!orte_hnp_is_allocated && jdata->num_procs == 1) { - return ORTE_ERR_FATAL; - } - app_name = "orted"; - /* scan the procs looking for each unique jobid on the node */ - for (i=0; i < node->procs->size; i++) { - if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - /* get the job data object for this process */ - if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { - /* major problem */ - 
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - continue; - } - /* since the node was used in this job's map, release - * it so that accounting is maintained - */ - OBJ_RELEASE(node); - /* mark this proc as dead so it will be restarted */ - pdt->state = ORTE_PROC_STATE_ABORTED; - /* remove this proc from the node */ - OBJ_RELEASE(pdt); /* maintains accounting */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* maintain accounting on num procs alive in case this can't restart */ - jdat->num_terminated++; - /* look for all other procs on this node from the same job */ - for (n=0; n < node->procs->size; n++) { - if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { - continue; - } - if (pdt2->name.jobid == pdt->name.jobid) { - /* mark this proc as having aborted */ - pdt2->state = ORTE_PROC_STATE_ABORTED; - /* remove it from the node */ - OBJ_RELEASE(pdt2); - opal_pointer_array_set_item(node->procs, n, NULL); - /* maintain accounting on num procs alive */ - jdat->num_terminated++; - } - } - /* and remove the node from the map */ - for (n=0; n < jdat->map->nodes->size; n++) { - if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { - continue; - } - if (nd->index == node->index) { - opal_pointer_array_set_item(jdat->map->nodes, n, NULL); - OBJ_RELEASE(node); /* maintain accounting */ - break; - } - } - /* reset the job params for this job */ - orte_plm_base_reset_job(jdat); - - /* relaunch the job */ - opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); - if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { - opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); - return rc; - } - } - - return ORTE_SUCCESS; - } - - /* otherwise, we are an app - try to relocate us to another node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); - if (NULL == app) { - /* 
no way to restart this job */ - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, - ORTE_NAME_PRINT(proc)); - return ORTE_ERR_NOT_FOUND; - } - app_name = app->app; - /* track that we are attempting to restart */ - pdata->restarts++; - /* have we exceeded the number of restarts for this proc? */ - if (app->max_restarts < pdata->restarts) { - return ORTE_ERR_RESTART_LIMIT_EXCEEDED; - } - - /* reset the job params for restart */ - orte_plm_base_reset_job(jdata); - - /* flag the current node as not-to-be-used */ - pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; - - /* restart the job - the spawn function will remap and - * launch the replacement proc(s) - */ - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s RELOCATING APP %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { - opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); - return rc; - } - - return ORTE_SUCCESS; -} - -static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) -{ - orte_odls_child_t *child; - opal_list_item_t *item; - - child = NULL; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid && - child->name->vpid == proc->vpid) { - return child; - } - } - return NULL; -} - -#if ORTE_RESIL_ORTE -static void cbfunc(int status, - orte_process_name_t *peer, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void* cbdata) { - OBJ_RELEASE(buffer); -} -#endif - -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) { - orte_job_t *jdat; - orte_proc_t *pdat, *proc_item; - int i; - opal_pointer_array_t *dead_names; - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s RECORDING DEAD PROCESS %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - if (NULL == (jdat 
= orte_get_job_data_object(proc->jobid))) { - opal_output(0, "Can't find job object"); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && - ORTE_PROC_STATE_TERMINATED > pdat->state) { - -#if ORTE_ENABLE_EPOCH - /* Make sure that the epochs match. */ - if (proc->epoch != pdat->name.epoch) { - opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); - return ORTE_SUCCESS; - } -#endif - - dead_names = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - opal_pointer_array_add(dead_names, &(pdat->name)); - - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - opal_pointer_array_add(dead_names, &(proc_item->name)); - } - } - -#if ORTE_RESIL_ORTE - if (!mca_errmgr_hnp_component.term_in_progress) { - /* - * Send a message to the other daemons so they know that a daemon has - * died. - */ - int rc, num_failed = opal_pointer_array_get_size(dead_names); - opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t); - orte_process_name_t *proc_name; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else { - - /* Iterate over the list of dead procs and send them along with - * the rest. The HNP needs this info so it can tell the other - * ORTEDs and they can inform the appropriate applications. 
- */ - for (i = 0; i < num_failed; i++) { - if (NULL != (proc_name = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc_name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } - } - } - - OBJ_RELEASE(dead_names); - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s SENDING DEAD PROCESS MESSAGE TO HNP", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_FAILURE_NOTICE, 0, cbfunc, NULL); - } - } else { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - } -#else - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); -#endif - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - orte_job_t *jdat; - orte_proc_t *pdat; - orte_node_t *node; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* Iterate over the list of processes */ - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(1, "NULL found in dead process list."); - continue; - } - - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s Job data not found.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && - pdat->state < ORTE_PROC_STATE_TERMINATED) { - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pdat->name))); - -#if ORTE_RESIL_ORTE - /* Make sure the epochs match, if not it probably means that we - * already reported this failure. 
*/ - if (name_item->epoch != pdat->name.epoch) { - continue; - } - - orte_util_set_epoch(name_item, name_item->epoch + 1); -#endif - - /* Remove it from the job array */ - opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); - orte_process_info.num_procs--; - jdat->num_procs--; - - /* Check if this is an ORTED */ - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - /* Mark the node as down so it won't be used in mapping anymore. */ - node = pdat->node; - node->state = ORTE_NODE_STATE_DOWN; - node->daemon = NULL; - } - - OBJ_RELEASE(pdat); - -#if ORTE_RESIL_ORTE - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; -#endif - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - } else { -#if ORTE_RESIL_ORTE - opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; -#endif - } - - check_job_complete(jdat); - } - -#if ORTE_RESIL_ORTE - if (!mca_errmgr_hnp_component.term_in_progress) { - /* Need to update the orted routing module. 
*/ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - } -#endif - - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret = ORTE_SUCCESS; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ret; -} -#endif - diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.h b/orte/mca/errmgr/hnp/errmgr_hnp.h deleted file mode 100644 index cd20532141..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_hnp_EXPORT_H -#define MCA_ERRMGR_hnp_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ -struct orte_errmgr_hnp_component_t { - orte_errmgr_base_component_t super; /** Base Errmgr component */ - - bool ignore_current_update; - bool term_in_progress; - -#if OPAL_ENABLE_FT_CR - /* State of the Recovery */ - bool crmig_in_progress; - bool autor_in_progress; - - /* CRMig Options */ - bool crmig_enabled; - bool crmig_timing_enabled; - - /* AutoR Options */ - bool autor_enabled; - bool autor_timing_enabled; - int autor_recovery_delay; - bool autor_skip_oldnode; -#endif -}; -typedef struct orte_errmgr_hnp_component_t orte_errmgr_hnp_component_t; -ORTE_MODULE_DECLSPEC extern orte_errmgr_hnp_component_t mca_errmgr_hnp_component; - -int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority); - -void orte_errmgr_hnp_update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -/*************************** - * Module functions: Global - ***************************/ -int orte_errmgr_hnp_global_module_init(void); -int orte_errmgr_hnp_global_module_finalize(void); - -int orte_errmgr_hnp_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); -int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_global_ft_event(int state); -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); -int 
orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc); - -/* hnp Versions */ -int orte_errmgr_hnp_base_global_init(void); -int orte_errmgr_hnp_base_global_finalize(void); -int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_base_global_ft_event(int state); - -#if OPAL_ENABLE_FT_CR -/* CRMig Versions */ -int orte_errmgr_hnp_crmig_global_module_init(void); -int orte_errmgr_hnp_crmig_global_module_finalize(void); - -int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); -int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_crmig_global_ft_event(int state); - -/* AutoR Versions */ -int orte_errmgr_hnp_autor_global_module_init(void); -int orte_errmgr_hnp_autor_global_module_finalize(void); - -int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_autor_global_ft_event(int state); -#endif - -END_C_DECLS - -#endif /* MCA_ERRMGR_hnp_EXPORT_H */ diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c deleted file mode 100644 index 4ba13ac35f..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ /dev/null @@ -1,1033 +0,0 @@ 
-/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/show_help.h" -#include "opal/util/output.h" -#include "opal/util/opal_environ.h" -#include "opal/util/basename.h" -#include "opal/util/argv.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" -#include "opal/mca/event/event.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/runtime/orte_globals.h" -#include "opal/dss/dss.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/iof/iof.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/filem/filem.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" -#include "orte/mca/sstore/sstore.h" -#include "orte/mca/sstore/base/base.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -#if OPAL_ENABLE_FT_CR -/************************ - * Work Pool structures - ************************/ -struct errmgr_autor_wp_item_t { - /** List super object */ - 
opal_list_item_t super; - - /** ORTE Process name */ - orte_process_name_t name; - - /** State that was passed with it */ - orte_proc_state_t state; -}; -typedef struct errmgr_autor_wp_item_t errmgr_autor_wp_item_t; - -OBJ_CLASS_DECLARATION(errmgr_autor_wp_item_t); - -void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp); -void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp); - -OBJ_CLASS_INSTANCE(errmgr_autor_wp_item_t, - opal_list_item_t, - errmgr_autor_wp_item_construct, - errmgr_autor_wp_item_destruct); - -/************************************ - * Locally Global vars & functions :) - ************************************/ -static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; -static orte_job_t *current_global_jobdata = NULL; - -static bool autor_mask_faults = false; - -static opal_list_t *procs_pending_recovery = NULL; -static bool autor_timer_active = false; -static opal_event_t *autor_timer_event = NULL; - -static void errmgr_autor_recover_processes(int fd, short event, void *cbdata); -static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name); - -static int display_procs(void ); -static int autor_procs_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b); - -static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state); -static void errmgr_autor_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); -static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); - -static int check_if_terminated(opal_pointer_array_t *procs); -static int check_if_restarted(opal_pointer_array_t *procs); - -/* - * Timer stuff - */ -static void errmgr_autor_set_time(int idx); -static void errmgr_autor_display_all_timers(void); -static void errmgr_autor_clear_timers(void); - -static double errmgr_autor_get_time(void); -static void 
errmgr_autor_display_indv_timer_core(double diff, char *str); -static double timer_start[OPAL_CR_TIMER_MAX]; - -#define ERRMGR_AUTOR_TIMER_START 0 -#define ERRMGR_AUTOR_TIMER_SETUP 1 -#define ERRMGR_AUTOR_TIMER_TERM 2 -#define ERRMGR_AUTOR_TIMER_RESETUP 3 -#define ERRMGR_AUTOR_TIMER_RESTART 4 -#define ERRMGR_AUTOR_TIMER_FINISH 5 -#define ERRMGR_AUTOR_TIMER_MAX 6 - -#define ERRMGR_AUTOR_CLEAR_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_clear_timers(); \ - } \ - } - -#define ERRMGR_AUTOR_SET_TIMER(idx) \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_set_time(idx); \ - } \ - } - -#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_display_all_timers(); \ - } \ - } - -/************************ - * Function Definitions: Global - ************************/ -int orte_errmgr_hnp_autor_global_module_init(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):init()"); - - procs_pending_recovery = OBJ_NEW(opal_list_t); - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - if( NULL == autor_timer_event ) { - autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL); - } - - ERRMGR_AUTOR_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_module_finalize(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):finalize()"); - - if( NULL != procs_pending_recovery ) { - OBJ_RELEASE(procs_pending_recovery); - procs_pending_recovery = NULL; - } - if( NULL != autor_timer_event ) { - free(autor_timer_event); - autor_timer_event = NULL; - } - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - ERRMGR_AUTOR_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -static int 
autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name) -{ - orte_job_t *jdata = NULL; - int i; - - /* - * If we already figured it out, then just move ahead - */ - if( NULL != current_global_jobdata ) { - if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid && - given_jdata->jobid != current_global_jobdata->jobid ) { - current_global_jobdata = given_jdata; - current_global_jobid = given_jdata->jobid; - } - return ORTE_SUCCESS; - } - - /* - * If this references the application, and not the daemons - */ - if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid ) { - current_global_jobdata = given_jdata; - current_global_jobid = given_jdata->jobid; - return ORTE_SUCCESS; - } - - /* - * Otherwise iterate through the job structure and find the first job. - */ - for(i = 0; i < orte_job_data->size; ++i ) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - /* Exclude outselves */ - if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { - continue; - } - current_global_jobdata = jdata; - current_global_jobid = jdata->jobid; - break; - } - - if( NULL == current_global_jobdata ) { - opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job."); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_proc_t *loc_proc = NULL; - orte_job_t *jdata = NULL; - int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; - int32_t i; - - /* - * if orte is trying to shutdown, just let it - */ - if( mca_errmgr_hnp_component.term_in_progress ) { - return ORTE_SUCCESS; - } - - if( NULL != proc_name && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): 
Update reported on self (%s), state %s. Skip...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state) )); - return ORTE_SUCCESS; - } - - /* - * Get the job data object for this process - */ - if( NULL != proc_name ) { /* Get job from proc's jobid */ - jdata = orte_get_job_data_object(proc_name->jobid); - } else { /* Get from the general job */ - jdata = orte_get_job_data_object(job); - } - if( NULL == jdata ) { - opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); - ret = ORTE_ERROR; - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * If this is a tool, ignore - */ - if( jdata->num_apps == 0 && - OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state), exit_code)); - - if( ORTE_JOB_STATE_RESTART == jobstate ) { - for(i = 0; i < jdata->procs->size; ++i) { - if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - break; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || - ORTE_PROC_STATE_COMM_FAILED == state ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { - if( autor_mask_faults ) { - mca_errmgr_hnp_component.ignore_current_update = true; - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); - } - } - - cleanup: - return ret; -} - -static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state) -{ - int ret; - - /* - * Recover from the process failure by relaunching. - */ - if( ORTE_SUCCESS != (ret = autor_set_current_job_info(jdata, proc_name)) ) { - ORTE_ERROR_LOG(ret); - return ORTE_SUCCESS; /* JJH: Do this for now. 
Need to fix the flag for normal shutdown */ - /*return ret;*/ - } - - current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; - - if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { - errmgr_autor_process_fault_daemon(jdata, proc_name, state); - } else { - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, 0); - errmgr_autor_process_fault_app(jdata, proc_name, state); - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - orte_node_t *node = NULL; - bool found = false; - int num_removed = 0, num_to_remove; - orte_ns_cmp_bitmask_t mask; - - if( NULL == current_global_jobdata ) { - return ORTE_SUCCESS; - } - - /* JJH Nasty Hack */ - num_to_remove = current_global_jobdata->num_procs / 2; - num_to_remove += 1; - - /* - * Find this process in the known failures list - */ - found = false; - if( mca_errmgr_hnp_component.autor_skip_oldnode ) { - for(item = opal_list_get_first(procs_pending_recovery); - item != opal_list_get_end(procs_pending_recovery); - item = opal_list_get_next(item) ) { - wp_item = (errmgr_autor_wp_item_t*)item; - - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { - found = true; - break; - } - } - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): suggest_map() " - "Process remapping: %s oldnode %s, %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), - oldnode->name, - (found ? "Failed Proc." 
: "Good Proc.") )); - - /* - * If not a failed process, then return it to the oldnode - * If failed process, do not place it back on the same node - */ - num_removed = 0; - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - if( found ) { - if( num_removed >= num_to_remove ) { - break; - } - /* JJH Nasty Hack */ -#if 0 - /* Remove oldnode (if more than one node) */ - if( node == oldnode && 1 < opal_list_get_size(node_list) ) { - opal_output(0, "JJH Remove Node (%s)", node->name); - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } -#else - if( 1 < opal_list_get_size(node_list) ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } -#endif - num_removed++; - } else { - /* Stay on same node */ - if( node != oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - - -/***************** - * Local Functions - *****************/ -static void errmgr_autor_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - errmgr_autor_wp_item_t *wp_item = NULL; - struct timeval soon; - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault() " - "Process fault! proc %s (0x%x)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - state)); - - if( !orte_sstore_base_is_checkpoint_available ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault() " - "No checkpoints are available for this job! 
Cannot Automaticly Recover!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); - opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true, - ORTE_NAME_PRINT(proc), proc->vpid); - return; - } - - mca_errmgr_hnp_component.ignore_current_update = true; - - /* - * If we are already in the shutdown stage of the recovery, then just skip it - */ - if( autor_mask_faults ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):process_fault() " - "Currently recovering the job. Failure masked!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - - /* - * Append this process to the list to process - */ - wp_item = OBJ_NEW(errmgr_autor_wp_item_t); - wp_item->name.jobid = proc->jobid; - wp_item->name.vpid = proc->vpid; - ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); - wp_item->state = state; - - opal_list_append(procs_pending_recovery, &(wp_item->super)); - - /* - * Activate the timer, if it is not already setup - */ - if( !autor_timer_active ) { - autor_timer_active = true; - - opal_event_evtimer_set(opal_event_base, autor_timer_event, errmgr_autor_recover_processes, NULL); - soon.tv_sec = mca_errmgr_hnp_component.autor_recovery_delay; - soon.tv_usec = 0; - opal_event_evtimer_add(autor_timer_event, &soon); - } - - return; -} - -static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - orte_proc_t *loc_proc = NULL, *child_proc = NULL; - orte_std_cntr_t i_proc; - int32_t i; - - OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault_daemon() " - "------- Daemon fault reported! 
proc %s (0x%x)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - state)); - - /* - * Set the process state in the job data structure - */ - for(i = 0; i < jdata->procs->size; ++i) { - if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - - if( loc_proc->name.vpid != proc->vpid) { - continue; - } - - loc_proc->state = state; - - break; - } - - /* - * Remove the route to this process - */ - orte_routed.delete_route(proc); - - /* - * If the aborted daemon had active processes on its node, then we should - * make sure to signal that all the children are gone. - */ - if( loc_proc->node->num_procs > 0 ) { - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s errmgr:base: stabalize_runtime() " - "------- Daemon lost with the following processes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) { - child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc); - if( NULL == child_proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s errmgr:base: stabalize_runtime() " - "\t %s [0x%x]", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&child_proc->name), - child_proc->state)); - - if( child_proc->last_errmgr_state < child_proc->state ) { - child_proc->last_errmgr_state = child_proc->state; - orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED, - &(child_proc->name), ORTE_PROC_STATE_COMM_FAILED, - 0, 1); - } - } - } else { - /* This daemon had no children, so just mask the failure */ - mca_errmgr_hnp_component.ignore_current_update = true; - } - - /* - * Record the dead daemon - */ - orte_errmgr_hnp_record_dead_process(proc); - - return; -} - -void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) -{ - wp->name.jobid = ORTE_JOBID_INVALID; - wp->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); - - 
wp->state = 0; -} - -void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) -{ - wp->name.jobid = ORTE_JOBID_INVALID; - wp->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); - - wp->state = 0; -} - -static int display_procs(void ) -{ - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - char *proc_str = NULL; - char *tmp_str = NULL; - - for(item = opal_list_get_first(procs_pending_recovery); - item != opal_list_get_end(procs_pending_recovery); - item = opal_list_get_next(item) ) { - wp_item = (errmgr_autor_wp_item_t*)item; - - if( NULL == proc_str ) { - asprintf(&proc_str, "\t%s Rank %d\n", - ORTE_NAME_PRINT(&(wp_item->name)), - (int)wp_item->name.vpid); - } else { - tmp_str = strdup(proc_str); - free(proc_str); - proc_str = NULL; - asprintf(&proc_str, "%s\t%s Rank %d\n", - tmp_str, - ORTE_NAME_PRINT(&(wp_item->name)), - (int)wp_item->name.vpid); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true, - proc_str); - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - if( NULL != proc_str ) { - free(proc_str); - proc_str = NULL; - } - - return ORTE_SUCCESS; -} - -static int autor_procs_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b) -{ - errmgr_autor_wp_item_t *wp_a, *wp_b; - - wp_a = (errmgr_autor_wp_item_t*)(*a); - wp_b = (errmgr_autor_wp_item_t*)(*b); - - if( wp_a->name.vpid > wp_b->name.vpid ) { - return 1; - } - else if( wp_a->name.vpid == wp_b->name.vpid ) { - return 0; - } - else { - return -1; - } -} - -static void errmgr_autor_recover_processes(int fd, short event, void *cbdata) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - orte_sstore_base_global_snapshot_info_t *snapshot = NULL; - char * tmp_str = NULL; - - autor_mask_faults = true; - ERRMGR_AUTOR_CLEAR_TIMERS(); - 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START); - - /* - * Display the processes that are to be recovered - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Display known failed processes in the job %s -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - - opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn); - display_procs(); - - /* - * Find the latest checkpoint - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Find the latest checkpoint for the job %s -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - - snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); - if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP); - - /* - * Safely terminate the entire job - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Safely terminate the job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) { - proc->state = ORTE_PROC_STATE_MIGRATING; - } - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - orte_iof.close(&(proc->name), ORTE_IOF_STDIN); - } - } - - orte_plm.terminate_procs(current_global_jobdata->procs); - - /* - * Wait for the job to terminate all processes - */ - while(!check_if_terminated(current_global_jobdata->procs) ) { - opal_progress(); - } - - 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM); - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Done waiting for termination of job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - current_global_jobdata->num_terminated = current_global_jobdata->num_procs; - orte_plm_base_reset_job(current_global_jobdata); - - /* - * Construct the app contexts to restart - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Rebuild job %s app context -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, - proc, - &(snapshot->local_snapshots))) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tAdjusted: \"%s\" [0x%d] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP); - - /* - * Spawn the restarted job - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Respawning the job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - orte_snapc_base_has_recovered = false; - autor_mask_faults = false; /* Failures pass this point are worth noting */ - orte_plm.spawn(current_global_jobdata); - - /* - * Wait for all the processes to restart - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Waiting for restart -------"); - 
while(!check_if_restarted(current_global_jobdata->procs) ) { - opal_progress(); - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART); - - /* - * All done - */ - while( !orte_snapc_base_has_recovered ) { - opal_progress(); - } - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Finished recovering job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true); - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH); - - cleanup: - while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) { - wp_item = (errmgr_autor_wp_item_t*)item; - OBJ_RELEASE(wp_item); - } - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - ERRMGR_AUTOR_DISPLAY_ALL_TIMERS(); - - autor_timer_active = false; - autor_mask_faults = false; - - return; -} - -static int check_if_terminated(opal_pointer_array_t *procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - if( NULL == procs ){ - return true; - } - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->state < ORTE_PROC_STATE_UNTERMINATED || - proc->state == ORTE_PROC_STATE_MIGRATING ) { - is_done = false; - break; - } - } - - if( !is_done ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED)); - } - - return is_done; -} - -static int check_if_restarted(opal_pointer_array_t *procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - if( NULL == procs ){ - return true; - } - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { - proc = 
(orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { - is_done = false; - break; - } - } - - if( !is_done ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); - } - - return is_done; -} - -/************************ - * Timing - ************************/ -static void errmgr_autor_set_time(int idx) -{ - if(idx < ERRMGR_AUTOR_TIMER_MAX ) { - if( timer_start[idx] <= 0.0 ) { - timer_start[idx] = errmgr_autor_get_time(); - } - } -} - -static void errmgr_autor_display_all_timers(void) -{ - double diff = 0.0; - char * label = NULL; - - opal_output(0, "Auto. Recovery Timing: ******************** Summary Begin\n"); - - /********** Structure Setup **********/ - label = strdup("Setup"); - diff = timer_start[ERRMGR_AUTOR_TIMER_SETUP] - timer_start[ERRMGR_AUTOR_TIMER_START]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Termination **********/ - label = strdup("Terminate"); - diff = timer_start[ERRMGR_AUTOR_TIMER_TERM] - timer_start[ERRMGR_AUTOR_TIMER_SETUP]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Setup new job **********/ - label = strdup("Setup Relaunch"); - diff = timer_start[ERRMGR_AUTOR_TIMER_RESETUP] - timer_start[ERRMGR_AUTOR_TIMER_TERM]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Restart **********/ - label = strdup("Restart"); - diff = timer_start[ERRMGR_AUTOR_TIMER_RESTART] - timer_start[ERRMGR_AUTOR_TIMER_RESETUP]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Finish **********/ - label = strdup("Finalize"); - diff = timer_start[ERRMGR_AUTOR_TIMER_FINISH] - timer_start[ERRMGR_AUTOR_TIMER_RESTART]; - errmgr_autor_display_indv_timer_core(diff, label); - 
free(label); - - opal_output(0, "Auto. Recovery Timing: ******************** Summary End\n"); -} - -static void errmgr_autor_clear_timers(void) -{ - int i; - for(i = 0; i < ERRMGR_AUTOR_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - } -} - -static double errmgr_autor_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void errmgr_autor_display_indv_timer_core(double diff, char *str) -{ - double total = 0; - double perc = 0; - - total = timer_start[ERRMGR_AUTOR_TIMER_MAX-1] - timer_start[ERRMGR_AUTOR_TIMER_START]; - perc = (diff/total) * 100; - - opal_output(0, - "errmgr_autor: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", - str, - diff, - total, - perc); - return; -} - -#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_component.c b/orte/mca/errmgr/hnp/errmgr_hnp_component.c deleted file mode 100644 index 3d4ad91ed4..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_component.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "errmgr_hnp.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_hnp_component_version_string = - "ORTE ERRMGR hnp MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int orte_errmgr_hnp_open(void); -static int orte_errmgr_hnp_close(void); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_hnp_component_t mca_errmgr_hnp_component = { - /* First do the base component stuff */ - { - /* Handle the general mca_component_t struct containing - * meta information about the component hnp - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "hnp", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_errmgr_hnp_open, - orte_errmgr_hnp_close, - orte_errmgr_hnp_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - /* Verbosity level */ - 0, - /* opal_output handler */ - -1, - /* Default priority */ - 5 - } -}; - -static int orte_errmgr_hnp_open(void) -{ - int val; - - /* - * This should be the last componet to ever get used since - * it doesn't do anything. 
- */ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "priority", - "Priority of the ERRMGR hnp component", - false, false, - mca_errmgr_hnp_component.super.priority, - &mca_errmgr_hnp_component.super.priority); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "verbose", - "Verbose level for the ERRMGR hnp component", - false, false, - mca_errmgr_hnp_component.super.verbose, - &mca_errmgr_hnp_component.super.verbose); - /* If there is a custom verbose level for this component than use it - * otherwise take our parents level and output channel - */ - if ( 0 != mca_errmgr_hnp_component.super.verbose) { - mca_errmgr_hnp_component.super.output_handle = opal_output_open(NULL); - opal_output_set_verbosity(mca_errmgr_hnp_component.super.output_handle, - mca_errmgr_hnp_component.super.verbose); - } else { - mca_errmgr_hnp_component.super.output_handle = orte_errmgr_base.output; - } - -#if OPAL_ENABLE_FT_CR - /**************************** - * CRMig (C/R Process Migration) MCA Options - ****************************/ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "crmig_timing", - "Enable Process Migration timer", - false, false, - 0, &val); - mca_errmgr_hnp_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "crmig_enable", - "Enable Process Migration (Default: 0/off)", - false, false, - 0, &val); - mca_errmgr_hnp_component.crmig_enabled = OPAL_INT_TO_BOOL(val); - - /**************************** - * AutoR (Automatic Recovery) MCA Options - ****************************/ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_timing", - "Enable Automatic Recovery timer", - false, false, - 0, &val); - mca_errmgr_hnp_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_enable", - "Enable Automatic Recovery (Default: 
0/off)", - false, false, - 0, &val); - mca_errmgr_hnp_component.autor_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_recovery_delay", - "Number of seconds to wait before starting to recover the job after a failure" - " [Default: 1 sec]", - false, false, - 1, &val); - mca_errmgr_hnp_component.autor_recovery_delay = val; - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_skip_oldnode", - "Skip the old node from failed proc, even if it is still available" - " [Default: Enabled]", - false, false, - 1, &val); - mca_errmgr_hnp_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val); -#else - val = 0; /* Silence compiler warning */ -#endif /* OPAL_ENABLE_FT_CR */ - - /* - * Debug Output - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open()"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: priority = %d", - mca_errmgr_hnp_component.super.priority); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: verbosity = %d", - mca_errmgr_hnp_component.super.verbose); -#if OPAL_ENABLE_FT_CR - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: --- CR Migration Options ---"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: Process Migration = %s", - (mca_errmgr_hnp_component.crmig_enabled ? "Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: timing = %s", - (mca_errmgr_hnp_component.crmig_timing_enabled ? "Enabled" : "Disabled")); - - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: --- Auto. Recovery Options ---"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: Auto. Recover = %s", - (mca_errmgr_hnp_component.autor_enabled ? 
"Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: timing = %s", - (mca_errmgr_hnp_component.autor_timing_enabled ? "Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: recover_delay = %d", - mca_errmgr_hnp_component.autor_recovery_delay); - - mca_errmgr_hnp_component.crmig_in_progress = false; - mca_errmgr_hnp_component.autor_in_progress = false; - mca_errmgr_hnp_component.term_in_progress = false; -#endif /* OPAL_ENABLE_FT_CR */ - - return ORTE_SUCCESS; -} - -static int orte_errmgr_hnp_close(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: close()"); - - return ORTE_SUCCESS; -} diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c deleted file mode 100644 index 8698f959d4..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c +++ /dev/null @@ -1,1517 +0,0 @@ -/* - * Copyright (c) 2009-2010 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/show_help.h" -#include "opal/util/output.h" -#include "opal/util/opal_environ.h" -#include "opal/util/basename.h" -#include "opal/util/argv.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/runtime/orte_globals.h" -#include "opal/dss/dss.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/iof/iof.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/filem/filem.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -#if OPAL_ENABLE_FT_CR - -/************************************ - * Locally Global vars & functions :) - ************************************/ -static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; -static orte_job_t *current_global_jobdata = NULL; - -static bool migrating_underway = false; -static bool migrating_terminated = false; -static bool migrating_restarted = false; - -static opal_list_t *current_onto_mapping_general = NULL; -static opal_list_t *current_onto_mapping_exclusive = NULL; - -/*** Command Line Interactions */ -static int current_migration_status = 
ORTE_ERRMGR_MIGRATE_STATE_NONE; - -static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map); - -static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state); -static void errmgr_crmig_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); -static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); - -static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs); -static int check_if_terminated(opal_pointer_array_t *migrating_procs); -static int check_if_restarted(opal_pointer_array_t *migrating_procs); - -static int check_and_pre_map(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum); - -static void display_request(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum); - -/* - * Timer stuff - */ -static void errmgr_crmig_set_time(int idx); -static void errmgr_crmig_display_all_timers(void); -static void errmgr_crmig_clear_timers(void); - -static double errmgr_crmig_get_time(void); -static void errmgr_crmig_display_indv_timer_core(double diff, char *str); -static double timer_start[OPAL_CR_TIMER_MAX]; - -#define ERRMGR_CRMIG_TIMER_START 0 -#define ERRMGR_CRMIG_TIMER_SETUP 1 -#define ERRMGR_CRMIG_TIMER_CKPT 2 -#define ERRMGR_CRMIG_TIMER_TERM 3 -#define ERRMGR_CRMIG_TIMER_RESETUP 4 -#define ERRMGR_CRMIG_TIMER_RESTART 5 -#define ERRMGR_CRMIG_TIMER_FINISH 6 -#define ERRMGR_CRMIG_TIMER_MAX 7 - -#define ERRMGR_CRMIG_CLEAR_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_clear_timers(); \ - } \ - } - -#define ERRMGR_CRMIG_SET_TIMER(idx) \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_set_time(idx); \ - } \ - } - -#define 
ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_display_all_timers(); \ - } \ - } - -/************************ - * Function Definitions: Global - ************************/ -int orte_errmgr_hnp_crmig_global_module_init(void) -{ - int ret; - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig): init()"); - - migrating_underway = false; - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - /* - * Initialize the connection to the orte-migrate tool - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - ERRMGR_CRMIG_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_module_finalize(void) -{ - int ret; - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig): finalize()"); - - /* - * Finalize the connection to the orte-migrate tool - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - migrating_underway = false; - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - ERRMGR_CRMIG_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_job_t *jdata = NULL; - int i; - - /* - * JJH: RETURN HERE - * If we are already migrating, then reject this request - */ - if( migrating_underway ) { - ; - } - - /* - * Determine the jobid for this migration - * JJH: Assumes only one job active at any one time - */ - for(i = 0; i < orte_job_data->size; ++i ) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - /* Exclude outselves */ - if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { - continue; - } - 
current_global_jobdata = jdata; - current_global_jobid = jdata->jobid; - break; - } - if( NULL == current_global_jobdata ) { - opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job."); - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_REQUEST; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /************************* - * Kick off the migration - *************************/ - if( ORTE_SUCCESS != (ret = errmgr_crmig_global_migrate(proc_list, node_list, suggested_map)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /************************ - * Set up the Command Line listener again - *************************/ - if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); - } - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; - - cleanup: - return exit_status; -} - -int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_job_t *jdata = NULL; - int ret = ORTE_SUCCESS; - - /* - * if orte is trying to shutdown, just let it - */ - if( mca_errmgr_hnp_component.term_in_progress ) { - return ORTE_SUCCESS; - } - - /* - * Get the job data object for this process - */ - if( NULL != proc_name ) { /* Get job from proc's jobid */ - jdata = orte_get_job_data_object(proc_name->jobid); - } else { /* Get from the general job */ - jdata = orte_get_job_data_object(job); - 
} - if( NULL == jdata ) { - opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); - ret = ORTE_ERROR; - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * If this is a tool, ignore - */ - if( jdata->num_apps == 0 && - OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(crmig): An external tool disconnected. Ignore...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(crmig): job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state), exit_code)); - - if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || - ORTE_PROC_STATE_COMM_FAILED == state ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_process_fault(jdata, proc_name, state)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { - if( migrating_underway ) { - /* If we are migrating, then we need to mask this to prevent the lower level from terminating us */ - mca_errmgr_hnp_component.ignore_current_update = true; - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - int exit_status = ORTE_SUCCESS; - opal_list_item_t *item = NULL, *m_item = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL, *current_proc_map = NULL; - orte_node_t *node = NULL; - bool found = false; - int num_suggested = 0; - orte_std_cntr_t i_proc; - 
orte_proc_t *peer_proc = NULL; - - /* - * If not migrating, then suggest nothing - */ - if( !migrating_underway ) { - return ORTE_SUCCESS; - } - - /* - * First look for an exclusive mapping for this process - */ - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - if( onto_map->proc_name.vpid == proc->name.vpid ) { - current_proc_map = onto_map; - break; - } - } - - /* - * If there is an exclusive mapping then... - */ - if( NULL != current_proc_map ) { - /* - * If we made an exclusive mapping during the check_and_pre_map() - * then honor it here. - */ - if( NULL != current_proc_map->pre_map_fixed_node ) { - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all other nodes */ - found = false; - - if( 0 == strncmp(node->name, current_proc_map->pre_map_fixed_node, - strlen(current_proc_map->pre_map_fixed_node)) ) { - found = true; - break; - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name, - current_proc_map->pre_map_fixed_node, node->name)); - } - } - - /* All done with mapping */ - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * If 'off_current_node' then exclude current node - */ - if( current_proc_map->off_current_node ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name)); - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); 
- item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude the old node */ - if( node == oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - break; - } - } - } - - /* - * If 'map_proc_name' then map to the node where this process resides - * Note: Only do this if there was no 'other' node suggested. If there - * was an 'other' node suggested then we need to honor that before - * we honor the peer suggestion. - */ - if( ORTE_VPID_INVALID != current_proc_map->map_proc_name.vpid && - current_proc_map->proc_name.vpid != current_proc_map->map_proc_name.vpid && - NULL == current_proc_map->map_node_name ) { - /* - * Find the node containting the target process - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - peer_proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == peer_proc ) { - continue; - } - if( peer_proc->name.vpid == current_proc_map->map_proc_name.vpid ) { - current_proc_map->map_node_name = strdup(peer_proc->node->name); - break; - } - } - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------", - ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name), - oldnode->name, current_proc_map->map_node_name)); - } - - /* - * If 'map_node_name' then use this node exclusively - */ - if( NULL != current_proc_map->map_node_name ) { - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all nodes not in the include list */ - found = false; - - if( 0 == strncmp(node->name, current_proc_map->map_node_name, strlen(current_proc_map->map_node_name)) ) { - found = true; - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } else { - 
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name, - current_proc_map->map_node_name, node->name)); - } - } - - /* All done with mapping */ - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Otherwise then map as if there was no exclusive mapping - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - } - /* - * If no exclusive mapping (or exclusive did not yield any results) then... - */ - else { - /* - * Remove the old node from the list, if there are more than 1 nodes available - */ - if(1 < opal_list_get_size(node_list) ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name)); - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude the old node */ - if( node == oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - break; - } - } - } - } - - /* - * If we do not have any general suggestions, then just return - */ - if( opal_list_get_size(current_onto_mapping_general) <= 0 ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Otherwise look through the general suggestions as an include list - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() 
------- Suggest a target for [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - - num_suggested = 0; - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all nodes not in the include list */ - found = false; - - for(m_item = opal_list_get_first(current_onto_mapping_general); - m_item != opal_list_get_end(current_onto_mapping_general); - m_item = opal_list_get_next(m_item) ) { - onto_map = (orte_errmgr_predicted_map_t*) m_item; - - if( 0 == strncmp(node->name, onto_map->map_node_name, strlen(onto_map->map_node_name)) ) { - found = true; - break; - } - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } - - ++num_suggested; - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------", - num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name)); - } - - cleanup: - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------", - (int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - - return exit_status; -} - -int orte_errmgr_hnp_crmig_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - - -/************************ - * Function Definitions: Static - ************************/ -static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state) -{ - /* - * JJH: Todo - * The expected logic here is: - * if( a daemon with children fails ) { - * abort migration. - * } - * if( a daemon without children fails ) { - * continue. No processes lost - * } - * if( an application process fails ) { - * abort migration. 
Might be a bad checkpoint, or a process that we were - * not migrating that died. - * } - * else { - * continue; - * } - */ - if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { - errmgr_crmig_process_fault_daemon(jdata, proc_name, state); - } else { - errmgr_crmig_process_fault_app(jdata, proc_name, state); - } - - return ORTE_SUCCESS; -} - -static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_std_cntr_t i_node; - orte_std_cntr_t i_proc; - orte_node_t *node = NULL; - orte_proc_t *proc = NULL; - bool found = false; - orte_snapc_base_quiesce_t *cur_datum = NULL; - bool close_iof_stdin = false; - orte_process_name_t iof_name = {ORTE_JOBID_INVALID, 0}; - char * err_str_procs = NULL; - char * err_str_nodes = NULL; - char * tmp_str = NULL; - orte_errmgr_predicted_proc_t *off_proc = NULL; - orte_errmgr_predicted_node_t *off_node = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL; - opal_list_item_t *item = NULL; - - ERRMGR_CRMIG_CLEAR_TIMERS(); - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START); - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------", - (int)opal_list_get_size(off_procs), - (int)opal_list_get_size(off_nodes), - (int)opal_list_get_size(onto_maps))); - - /* - * Modeled after orte_plm_base_reset_job - */ - cur_datum = OBJ_NEW(orte_snapc_base_quiesce_t); - cur_datum->migrating = true; - migrating_underway = true; - mca_errmgr_hnp_component.crmig_in_progress = true; - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUNNING; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Check to make sure that the 'off' and 'onto' nodes exist - * - if 'onto' nodes do not, then add them (JJH XXX) - * - if 'off' nodes do not, then return an error (JJH 
XXX) - * JJH TODO... - */ - - /* - * Copy over the onto_nodes so we can suggest them later - */ - if( NULL != current_onto_mapping_general ) { - OBJ_RELEASE(current_onto_mapping_general); - current_onto_mapping_general = NULL; - } - if( NULL != current_onto_mapping_exclusive ) { - OBJ_RELEASE(current_onto_mapping_exclusive); - current_onto_mapping_exclusive = NULL; - } - current_onto_mapping_general = OBJ_NEW(opal_list_t); - current_onto_mapping_exclusive = OBJ_NEW(opal_list_t); - if( NULL != onto_maps ) { - while( NULL != (item = opal_list_remove_first(onto_maps)) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - /* Determine if process exclude mapping, or general */ - if( onto_map->proc_name.vpid == ORTE_VPID_INVALID ) { - opal_list_append(current_onto_mapping_general, item); - } else { - opal_list_append(current_onto_mapping_exclusive, item); - } - } - } - - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - /* - * Find the node currently containing this process - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == onto_map->proc_name.vpid) { - found = true; - break; - } - } - - /* - * Check to see if this process hsould be skipped - */ - if( !onto_map->off_current_node && - (ORTE_VPID_INVALID == onto_map->map_proc_name.vpid || - onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) && - (NULL == onto_map->map_node_name || - 0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------", - 
ORTE_NAME_PRINT(&proc->name))); - - } else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------", - ORTE_NAME_PRINT(&proc->name))); - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - - migrating_terminated = false; - migrating_restarted = false; - - /* - * Create a list of processes to migrate, if 'off_nodes' specified - */ - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - - /* - * Find the node in the job structure - * - Make sure that 'odin00' doesn't match all 'odin00*' - */ - found = false; - for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { - node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); - if( NULL == node ) { - continue; - } - - if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { - found = true; - break; - } - } - if( !found ) { - ; /* Warn about invalid node */ - } else { - /* - * Add all processes from this node - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); - if( NULL == proc ) { - continue; - } - - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( 
current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - } - - /* - * Create a list of processes to migrate, if 'off_procs' specified - */ - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - - /* - * Find the process in the job structure - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == off_proc->proc_name.vpid) { - found = true; - break; - } - } - /* - * Make sure the process is not listed multiple times - */ - if( found ) { - found = check_if_duplicate_proc(proc, &(cur_datum->migrating_procs)); - if( !found ) { - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - } - - /* - * If we did not find any processes to migrate, then throw a warning, and skip it. 
- */ - if( 0 >= cur_datum->num_migrating ) { - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - if( NULL != err_str_nodes ) { - asprintf(&tmp_str, "%s, %s", err_str_nodes, off_node->node_name); - free(err_str_nodes); - err_str_nodes = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else { - asprintf(&err_str_nodes, "%s", off_node->node_name); - } - } - - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - if( NULL != err_str_procs ) { - asprintf(&tmp_str, "%s, %d", err_str_procs, (int)off_proc->proc_name.vpid); - free(err_str_procs); - err_str_procs = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else { - asprintf(&err_str_procs, "%d", off_proc->proc_name.vpid); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true, - err_str_nodes, - err_str_procs); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - goto cleanup; - } - - /* - * Final pass on the migration list to pre-map processes and remove - * processes that should not be migrated. - */ - if( ORTE_SUCCESS != (ret = check_and_pre_map(off_procs, off_nodes, cur_datum)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Display the request before processing it. 
- */ - display_request(off_procs, off_nodes, cur_datum); - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_SETUP); - - /* - * Checkpoint the job - * - Hold all non-migrating processes - * - Abort the marked processes - * - - */ - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) { - opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint."); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_CKPT); - - /* - * Terminate the migrating processes - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - orte_plm.terminate_procs(&cur_datum->migrating_procs); - - /* - * Clear the IOF stdin target if necessary - */ - if( close_iof_stdin ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid), - ORTE_NAME_PRINT(&iof_name) )); - - orte_iof.close(&iof_name, ORTE_IOF_STDIN); - } - - /* - * Wait for the processes to finish terminating - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Waiting for termination -------"); - - while( !migrating_terminated ) { - opal_progress(); - check_if_terminated(&(cur_datum->migrating_procs)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_TERM); - 
- /* - * Start remapping the processes - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Reset the job parameters for restart - * This will set the state of the job to 'restart' - */ - orte_plm_base_reset_job(current_global_jobdata); - - /* - * Adjust the application context information - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); - if( NULL == proc ) { - continue; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, - proc, - &(cur_datum->ss_snapshot->local_snapshots))) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tAdjusted: \"%s\" [0x%d] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESETUP); - - /* - * Restart the job - * - spawn function will remap and launch the replacement proc(s) - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - orte_plm.spawn(current_global_jobdata); - - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Waiting for restart -------"); - - migrating_restarted = false; - while( !migrating_restarted ) { - opal_progress(); - 
check_if_restarted(&(cur_datum->migrating_procs)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESTART); - - /* - * Finish the checkpoint - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) { - opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint."); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * All done - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - OBJ_RELEASE(cur_datum); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_FINISH); - ERRMGR_CRMIG_DISPLAY_ALL_TIMERS(); - - cleanup: - migrating_underway = false; - migrating_terminated = false; - migrating_restarted = false; - mca_errmgr_hnp_component.crmig_in_progress = false; - - if( NULL != err_str_procs ) { - free(err_str_procs); - err_str_procs = NULL; - } - - if( NULL != err_str_nodes ) { - free(err_str_nodes); - err_str_nodes = NULL; - } - - return exit_status; -} - -static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *loc_proc = NULL; - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - loc_proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == loc_proc ) { - continue; - } - if( loc_proc->name.vpid == proc->name.vpid ) { - return true; - } - } - - return false; -} - -static int 
check_if_terminated(opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( !(ORTE_PROC_STATE_KILLED_BY_CMD & proc->state) ) { - is_done = false; - break; - } - } - - if( is_done ) { - migrating_terminated = true; - } - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD)); - } - - return ORTE_SUCCESS; -} - -static int check_if_restarted(opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == proc ) { - continue; - } - - /* proc->state != ORTE_PROC_STATE_LAUNCHED */ - if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { - is_done = false; - break; - } - } - - if( is_done ) { - migrating_restarted = true; - } - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); - } - - return ORTE_SUCCESS; -} - -static void errmgr_crmig_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_app() " - "------- Application fault reported! proc %s (0x%x) " - "- %s", - ORTE_NAME_PRINT(proc), - state, - (migrating_underway ? 
"Migrating" : "Not Migrating") )); - - return; -} - -static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_daemon() " - "------- Daemon fault reported! proc %s (0x%x) " - "- %s", - ORTE_NAME_PRINT(proc), - state, - (migrating_underway ? "Migrating" : "Not Migrating") )); - - /* - * Failed communication can be ignored for the most part. - * Make sure to remove the route - * JJH: Check to make sure this is not a new daemon loss. - */ - if( ORTE_PROC_STATE_COMM_FAILED == state ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_daemon() " - "------- Daemon fault reported! proc %s (0x%x) " - "- Communication failure, keep going", - ORTE_NAME_PRINT(proc), - state )); - } - - return; -} - -static int check_and_pre_map(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum) -{ - /* - * Check the 'off_procs' list for processes that should not be migrated - */ - - /* - * Check the 'current_onto_mapping_exclusive' for processes that are moving - * 'near/with' other processes that are also moving. Be sure to watch out - * for circular deadlock. - */ - - /* - * Use the 'pre_map_fixed_node' structure to fix this process' mapping. 
- */ - - return ORTE_SUCCESS; -} - -static void display_request(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum) -{ - orte_std_cntr_t i_node; - orte_std_cntr_t i_proc; - orte_node_t *node = NULL; - orte_proc_t *proc = NULL; - bool found = false; - char * status_str = NULL; - char * tmp_str = NULL; - orte_errmgr_predicted_proc_t *off_proc = NULL; - orte_errmgr_predicted_node_t *off_node = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL; - opal_list_item_t *item = NULL; - - /* - * Display all requested processes to migrate - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n", - (int) opal_list_get_size(off_procs) )); - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - - /* - * Find the process in the job structure - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == off_proc->proc_name.vpid) { - found = true; - break; - } - } - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t%s (Rank %3d) on node %s\n", - ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name)); - } - - /* - * Display Off Nodes - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n", - (int)opal_list_get_size(off_nodes) )); - - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - - for(i_node = 0; i_node < 
opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { - node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); - if( NULL == node ) { - continue; - } - - found = false; - if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { - found = true; - break; - } - } - if( found ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\" \t%d\n", - node->name, node->num_procs)); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); - if( NULL == proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\t\"%s\" [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state)); - } - } - } - - /* - * Suggested onto nodes - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n", - (int)opal_list_get_size(current_onto_mapping_general) )); - for(item = opal_list_get_first(current_onto_mapping_general); - item != opal_list_get_end(current_onto_mapping_general); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\"\n", - onto_map->map_node_name)); - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n", - (int)opal_list_get_size(current_onto_mapping_exclusive) )); - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t%d\t(%c)\t\"%s\"\n", - onto_map->proc_name.vpid, 
- (onto_map->off_current_node ? 'T' : 'F'), - onto_map->map_node_name)); - } - - /* - * Display all processes scheduled to migrate - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n", - cur_datum->num_migrating)); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); - if( NULL == proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\" [0x%x] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - - if( NULL == status_str ) { - asprintf(&status_str, "\t%s Rank %d on Node %s\n", - ORTE_NAME_PRINT(&proc->name), - (int)proc->name.vpid, - proc->node->name); - } else { - tmp_str = strdup(status_str); - free(status_str); - status_str = NULL; - asprintf(&status_str, "%s\t%s Rank %d on Node %s\n", - tmp_str, - ORTE_NAME_PRINT(&proc->name), - (int)proc->name.vpid, - proc->node->name); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true, - status_str); - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - if( NULL != status_str ) { - free(status_str); - status_str = NULL; - } - - return; -} - -/************************ - * Timing - ************************/ -static void errmgr_crmig_set_time(int idx) -{ - if(idx < ERRMGR_CRMIG_TIMER_MAX ) { - if( timer_start[idx] <= 0.0 ) { - timer_start[idx] = errmgr_crmig_get_time(); - } - } -} - -static void errmgr_crmig_display_all_timers(void) -{ - double diff = 0.0; - char * label = NULL; - - opal_output(0, "Process Migration Timing: ******************** Summary Begin\n"); - - /********** Structure Setup **********/ - label = strdup("Setup"); - diff = timer_start[ERRMGR_CRMIG_TIMER_SETUP] - timer_start[ERRMGR_CRMIG_TIMER_START]; - errmgr_crmig_display_indv_timer_core(diff, label); - 
free(label); - - /********** Checkpoint **********/ - label = strdup("Checkpoint"); - diff = timer_start[ERRMGR_CRMIG_TIMER_CKPT] - timer_start[ERRMGR_CRMIG_TIMER_SETUP]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Termination **********/ - label = strdup("Terminate"); - diff = timer_start[ERRMGR_CRMIG_TIMER_TERM] - timer_start[ERRMGR_CRMIG_TIMER_CKPT]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Setup new job **********/ - label = strdup("Setup Relaunch"); - diff = timer_start[ERRMGR_CRMIG_TIMER_RESETUP] - timer_start[ERRMGR_CRMIG_TIMER_TERM]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Restart **********/ - label = strdup("Restart"); - diff = timer_start[ERRMGR_CRMIG_TIMER_RESTART] - timer_start[ERRMGR_CRMIG_TIMER_RESETUP]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Finish **********/ - label = strdup("Finalize"); - diff = timer_start[ERRMGR_CRMIG_TIMER_FINISH] - timer_start[ERRMGR_CRMIG_TIMER_RESTART]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - opal_output(0, "Process Migration Timing: ******************** Summary End\n"); -} - -static void errmgr_crmig_clear_timers(void) -{ - int i; - for(i = 0; i < ERRMGR_CRMIG_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - } -} - -static double errmgr_crmig_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void errmgr_crmig_display_indv_timer_core(double diff, char *str) -{ - double total = 0; - double perc = 0; - - total = timer_start[ERRMGR_CRMIG_TIMER_MAX-1] - timer_start[ERRMGR_CRMIG_TIMER_START]; - perc = (diff/total) * 100; - - opal_output(0, - "errmgr_crmig: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", - str, 
- diff, - total, - perc); - return; -} - -#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt b/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt deleted file mode 100644 index 836e46f4b0..0000000000 --- a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt +++ /dev/null @@ -1,71 +0,0 @@ - -*- text -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for ORTE Errmgr HNP module. -# -[errmgr-hnp:unknown-job-error] -An error has occurred in an unknown job. This generally should not happen -except due to an internal ORTE error. - -Job state: %s - -This information should probably be reported to the OMPI developers. -# -[errmgr-hnp:daemon-died] -The system has lost communication with the following daemon: - -Daemon: %s -Node: %s - -The reason for the lost communication channel is unknown. Possible -reasons include failure of the daemon itself, failure of the -connecting fabric/switch, and loss of the host node. Please -check with your system administrator to try and determine the -source of the problem. - -Your job is being terminated as a result. -# -[errmgr-hnp:cannot-relocate] -The system is unable to relocate the specified process: - -Process: %s - -because the application for that process could not be found. This -appears to be a system error. Please report it to the ORTE -developers. - -[autor_recovering_job] -Notice: The processes listed below failed unexpectedly. - Using the last checkpoint to recover the job. - Please standby. -%s -[autor_recovery_complete] -Notice: The job has been successfully recovered from the - last checkpoint. -[autor_failed_to_recover_proc] -Error: The process below has failed. There is no checkpoint available for - this job, so we are terminating the application since automatic - recovery cannot occur. 
-Internal Name: %s -MCW Rank: %d - -[crmig_migrating_job] -Notice: A migration of this job has been requested. - The processes below will be migrated. - Please standby. -%s -[crmig_migrated_job] -Notice: The processes have been successfully migrated to/from the specified - machines. -[crmig_no_migrating_procs] -Warning: Could not find any processes to migrate on the nodes specified. - You provided the following: -Nodes: %s -Procs: %s diff --git a/orte/mca/errmgr/orted/Makefile.am b/orte/mca/errmgr/orted/Makefile.am deleted file mode 100644 index dae952bcf6..0000000000 --- a/orte/mca/errmgr/orted/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -dist_pkgdata_DATA = help-orte-errmgr-orted.txt - -sources = \ - errmgr_orted.h \ - errmgr_orted_component.c \ - errmgr_orted.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_errmgr_orted_DSO -component_noinst = -component_install = mca_errmgr_orted.la -else -component_noinst = libmca_errmgr_orted.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_orted_la_SOURCES = $(sources) -mca_errmgr_orted_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_orted_la_SOURCES =$(sources) -libmca_errmgr_orted_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/orted/configure.m4 b/orte/mca/errmgr/orted/configure.m4 deleted file mode 100644 index 8c10aa375c..0000000000 --- a/orte/mca/errmgr/orted/configure.m4 +++ /dev/null @@ -1,19 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# MCA_errmgr_orted_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_errmgr_orted_CONFIG], [ - AC_CONFIG_FILES([orte/mca/errmgr/orted/Makefile]) - - AS_IF([test "$orte_enable_resilient_code" = 1 -a "$orte_without_full_support" = 0], - [$1], - [$2]) -]) diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c deleted file mode 100644 index 646f31104d..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ /dev/null @@ -1,1157 +0,0 @@ -/* - * Copyright (c) 2009-2010 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/session_dir.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/plm/plm_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/sensor/sensor.h" -#include "orte/mca/ess/ess.h" -#include "orte/runtime/orte_quit.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/errmgr/errmgr.h" -#include 
"orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_orted.h" - -/* Local functions */ -static bool any_live_children(orte_jobid_t job); -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); -static bool all_children_registered(orte_jobid_t job); -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); -static void update_local_children(orte_odls_job_t *jobdat, - orte_job_state_t jobstate, - orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -#if ORTE_RESIL_ORTE -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); -static int record_dead_process(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata); -#endif - -/* - * Module functions: Global - */ -static int init(void); -static int finalize(void); - -static int predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -static int suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); - -static int ft_event(int state); - - -/****************** - * orted module - ******************/ -orte_errmgr_base_module_t orte_errmgr_orted_module = { - init, - finalize, - orte_errmgr_base_log, - orte_errmgr_base_abort, - orte_errmgr_base_abort_peers, - update_state, - predicted_fault, - suggest_map_targets, - ft_event, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - 
,orte_errmgr_base_set_fault_callback /* Set callback function */ -#endif -}; - -/************************ - * API Definitions - ************************/ -static int init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE, - ORTE_RML_PERSISTENT, failure_notification, NULL); -#endif - - return ret; -} - -static int finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); -#endif - - return ORTE_SUCCESS; -} - -static void cbfunc(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OBJ_RELEASE(buffer); -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat = NULL; - orte_odls_child_t *child; - opal_buffer_t *alert; - orte_plm_cmd_flag_t cmd; - int rc=ORTE_SUCCESS; - orte_vpid_t null=ORTE_VPID_INVALID; - orte_app_context_t *app; - orte_ns_cmp_bitmask_t mask; - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:orted:update_state() %s) " - "------- %s state updated for process %s to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc) ? "App. Process" : - (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state))); - - /* if this is a heartbeat failure, let the HNP handle it */ - if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || - ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { - return ORTE_SUCCESS; - } - - /*** UPDATE COMMAND FOR A JOB ***/ - if (NULL == proc) { - /* this is an update for an entire job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - orte_show_help("help-orte-errmgr-orted.txt", "errmgr-orted:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the "invalid" jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { - return ORTE_ERR_NOT_FOUND; - } - - switch (jobstate) { - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jobdat, exit_code); - break; - case ORTE_JOB_STATE_RUNNING: - /* update all local child states */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); - case ORTE_JOB_STATE_COMM_FAILED: - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* let the HNP handle this */ - return ORTE_SUCCESS; - break; - - default: - break; - } - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the job info */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - /* if this was a failed comm, then see if it was to our - * lifeline - */ - if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; - } - /* see if this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - /* kill our children */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* terminate - our 
routed children will see - * us leave and automatically die - */ - orte_quit(); - } - /* purge the oob */ - orte_rml.purge(proc); - /* was it a daemon that failed? */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { - /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { - orte_quit(); - } else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted not exiting, num_routes() == %d, num children == %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)orte_routed.num_routes(), - (int)opal_list_get_size(&orte_local_children))); - } - } - -#if ORTE_RESIL_ORTE - record_dead_process(proc); -#endif - - /* if not, then indicate we can continue */ - return ORTE_SUCCESS; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* must already be complete */ - return ORTE_SUCCESS; - } - - /* if there are no local procs for this job, we can - * ignore this call - */ - if (0 == jobdat->num_local_procs) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted got state %s for proc %s pid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_proc_state_to_str(state), - ORTE_NAME_PRINT(proc), pid)); - - /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ - if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - /* kill this proc */ - killprocs(proc->jobid, proc->vpid); - } - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); - if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), child->restarts)); - rc = orte_odls.restart_proc(child); - } - return rc; - } - } - } - - if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { - if (orte_abort_non_zero_exit) { - /* treat this as an abnormal - * termination - no recovery allowed - */ - goto REPORT_ABORT; - } - /* treat this as normal termination */ - goto REPORT_STATE; - } - - if (ORTE_PROC_STATE_TERMINATED < state) { - if( jobdat->enable_recovery ) { - OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base.output, - "%s RECOVERY ENABLED", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - /* see if this child has reached its local restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s CHECKING RESTARTS %d VS MAX %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - child->restarts, app->max_restarts)); - if (child->restarts < app->max_restarts ) { - /* attempt to restart it locally */ - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), child->restarts)); - if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - ORTE_ERROR_LOG(rc); - goto REPORT_ABORT; - } - return ORTE_SUCCESS; - } - } - } - } - -REPORT_ABORT: - /* if the job hasn't completed and the state is abnormally - * terminated, then we need to alert the HNP right away - */ - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack only the data for this proc - have to start with the jobid - * so the receiver can unpack it correctly - */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); 
- item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - } - /* now pack the child's info */ - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* remove the child from our local list as it is no longer alive */ - opal_list_remove_item(&orte_local_children, &child->super); - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - jobdat->num_local_procs)); - - /* release the child object */ - OBJ_RELEASE(child); - /* done with loop */ - break; - } - } - - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - REPORT_STATE: - /* find this proc in the local children so we can update its state */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - } - /* done with loop */ - break; - } - } - - if (ORTE_PROC_STATE_REGISTERED == state) { - /* see if everyone in this job has registered */ - if (all_children_registered(proc->jobid)) { - /* once everyone registers, send their contact info to - * the HNP so it is 
available to debuggers and anyone - * else that needs it - */ - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted: sending contact info to HNP", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - alert = OBJ_NEW(opal_buffer_t); - /* pack init routes command */ - cmd = ORTE_PLM_INIT_ROUTES_CMD; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack all the local child vpids and epochs */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } -#if ORTE_ENABLE_EPOCH - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->epoch, 1, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } -#endif - } - } - /* pack an invalid marker */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* add in contact info for all procs in the job */ - if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&alert); - return rc; - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - } - return rc; - } - - /* only other state is terminated - see if anyone is left alive */ - if (!any_live_children(proc->jobid)) { - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - 
item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return ORTE_SUCCESS; - } - - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the data for the job */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - -FINAL_CLEANUP: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted reporting all procs in %s terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); - - /* remove all of this job's children from the global list - do not lock - * the thread as we are already locked - */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (jobdat->jobid == child->name->jobid) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - } - } - - /* ensure the job's local session directory tree is removed */ - orte_session_dir_cleanup(jobdat->jobid); - - /* remove this job from our local job data since it is complete */ - opal_list_remove_item(&orte_local_jobdata, &jobdat->super); - OBJ_RELEASE(jobdat); - - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - - /* indicate that the job is complete */ - return rc; - } - return ORTE_SUCCESS; -} - -static int predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} - -static int 
suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} - -static int ft_event(int state) -{ - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - opal_list_item_t *item; - orte_odls_child_t *child; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(0, "NULL found in dead process list."); - continue; - } - - if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { - continue; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item))); - -#if ORTE_ENABLE_EPOCH - /* Increment the epoch */ - orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); - orte_util_set_epoch(name_item, name_item->epoch + 1); -#endif - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* Remove the dead process from my list of children if applicable */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t *) item; - - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID, - child->name, name_item)) { - opal_list_remove_item(&orte_local_children, item); - OBJ_RELEASE(item); - break; - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /* Remove the route from the routing layer */ - orte_routed.delete_route(name_item); - } - - /* Update the routing module */ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if 
(NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - - return ORTE_SUCCESS; -} - -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - opal_pointer_array_t *dead_names; - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - int32_t i; - orte_process_name_t *name_item; - - dead_names = OBJ_NEW(opal_pointer_array_t); - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return; - } - - for (i = 0; i < num_failed; i++) { - /* Unpack the buffer to get the dead process' name. */ - n = 1; - - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_NAME_PRINT(sender)); - } - - /* There shouldn't be an issue of receiving this message multiple - * times but it doesn't hurt to double check. - */ - if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { - opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); - continue; - } - - opal_pointer_array_add(dead_names, name_item); - } - - /* Tell the errmgr so it can handle changing the epoch, routes, etc. */ - mark_processes_as_dead(dead_names); - - /* Tell the applications' ORTE layers that there is a failure. 
*/ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return; - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } -} -#endif - -/***************** - * Local Functions - *****************/ -static bool any_live_children(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && - child->alive) { - return true; - } - } - - /* if we get here, then nobody is left alive from that job */ - return false; - -} - -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) -{ - int rc; - - /* pack the child's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the pid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the proc was launched */ - if (orte_timing) { - int64_t tmp; - tmp = child->starttime.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = child->starttime.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - /* pack its state */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack its exit code */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) { - 
ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) -{ - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_vpid_t null=ORTE_VPID_INVALID; - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if (orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - /* if this child is part of the job... */ - if (child->name->jobid == jobdat->jobid) { - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - /* flag that this job is complete so the receiver can know */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -static bool all_children_registered(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? 
*/ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { - /* if this child has terminated, we consider it as having - * registered for the purposes of this function. If it never - * did register, then we will send a NULL rml_uri back to - * the HNP, which will then know that the proc did not register. - * If other procs did register, then the HNP can declare an - * abnormal termination - */ - if (ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* this proc has terminated somehow - consider it - * as registered for now - */ - continue; - } - /* if this child is *not* registered yet, return false */ - if (!child->init_recvd) { - return false; - } - /* if this child has registered a finalize, return false */ - if (child->fini_recvd) { - return false; - } - } - } - - /* if we get here, then everyone in the job is currently registered */ - return true; - -} - -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - int rc; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { - /* pack the child's vpid - must be done in case rml_uri is NULL */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } -#if ORTE_ENABLE_EPOCH - /* Pack the child's epoch. 
*/ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - return rc; - } -#endif - /* pack the contact info */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - -} - -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* set the state */ - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_FAILED_TO_START == child->state) { - /* this proc never launched - flag that the iof - * is complete or else we will hang waiting for - * pipes to close that were never opened - */ - child->iof_complete = true; - /* ditto for waitpid */ - child->waitpid_recvd = true; - } - } - } - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported incomplete start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); - return; -} - -static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* update job state */ - jobdat->state = jobstate; - /* update children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (jobdat->jobid == child->name->jobid) { - child->state = state; - } - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if 
(ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} - -#if ORTE_RESIL_ORTE -static int record_dead_process(orte_process_name_t *proc) { - opal_pointer_array_t *dead_name; - opal_buffer_t *buffer; - int rc = ORTE_SUCCESS; - int num_failed; - - if (orte_odls_base_default_check_finished(proc)) { - return rc; - } - - dead_name = OBJ_NEW(opal_pointer_array_t); - - opal_pointer_array_add(dead_name, proc); - - /* Mark the process as dead */ - mark_processes_as_dead(dead_name); - - /* Send a message to the HNP */ - buffer = OBJ_NEW(opal_buffer_t); - - num_failed = 1; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - } - - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_FAILURE_NOTICE, 0, - cbfunc, NULL); - - OBJ_RELEASE(dead_name); - - return rc; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret; - orte_process_name_t *name_item; - int size, i; - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending %d failure(s) to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - 
OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ORTE_SUCCESS; -} -#endif - diff --git a/orte/mca/errmgr/orted/errmgr_orted.h b/orte/mca/errmgr/orted/errmgr_orted.h deleted file mode 100644 index 2c3e22f1be..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_orted_EXPORT_H -#define MCA_ERRMGR_orted_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ - -ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orted_component; - -ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orted_module; - -END_C_DECLS - -#endif /* MCA_ERRMGR_orted_EXPORT_H */ diff --git a/orte/mca/errmgr/orted/errmgr_orted_component.c b/orte/mca/errmgr/orted/errmgr_orted_component.c deleted file mode 100644 index d3ecc83021..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted_component.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "errmgr_orted.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_orted_component_version_string = - "ORTE ERRMGR orted MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int errmgr_orted_open(void); -static int errmgr_orted_close(void); -static int errmgr_orted_component_query(mca_base_module_t **module, int *priority); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_base_component_t mca_errmgr_orted_component = -{ - /* Handle the general mca_component_t struct containing - * meta information about the component itorted - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "orted", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - errmgr_orted_open, - errmgr_orted_close, - errmgr_orted_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - -static int errmgr_orted_open(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_orted_close(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_orted_component_query(mca_base_module_t **module, int *priority) -{ - if (ORTE_PROC_IS_DAEMON) { - /* keep our priority low so that other modules are higher - * and will run before us - */ - *priority = 5; - *module = (mca_base_module_t *)&orte_errmgr_orted_module; - return ORTE_SUCCESS; - } - - *priority = -1; - *module = NULL; - return ORTE_ERROR; -} - diff --git a/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt b/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt deleted file mode 100644 index c6d43f1f77..0000000000 --- 
a/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt +++ /dev/null @@ -1,14 +0,0 @@ - -*- text -*- -# -# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for ORTE RecoS IGNORE framework. -# diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 5002ac1c70..6afaab6787 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -58,7 +58,6 @@ orte_ess_base_module_t orte_ess_alps_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -221,9 +220,6 @@ static int alps_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = (orte_vpid_t)cnos_get_rank() + starting_vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch, - orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps set name to %s", diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 4047423d80..b494e530fa 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -64,12 +64,6 @@ ORTE_DECLSPEC extern int orte_ess_base_std_buffering; ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc); -#else -ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* @@ -81,7 +75,7 @@ ORTE_DECLSPEC int orte_ess_base_std_prolog(void); ORTE_DECLSPEC int orte_ess_base_app_setup(void); ORTE_DECLSPEC int orte_ess_base_app_finalize(void); -ORTE_DECLSPEC 
void orte_ess_base_app_abort(int status, bool report) __opal_attribute_noreturn__; +ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report); ORTE_DECLSPEC int orte_ess_base_tool_setup(void); ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c index be7141a636..e5b1f4a5fc 100644 --- a/orte/mca/ess/base/ess_base_fns.c +++ b/orte/mca/ess/base/ess_base_fns.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,6 +45,10 @@ opal_paffinity_locality_t orte_ess_base_proc_get_locality(orte_process_name_t *p if (NULL == (pmap = orte_util_lookup_pmap(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s LOOKING FOR PROC %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); return OPAL_PROC_NON_LOCAL; } @@ -76,6 +82,10 @@ char* orte_ess_base_proc_get_hostname(orte_process_name_t *proc) if (NULL == (nid = orte_util_lookup_nid(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s LOOKING FOR PROC %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); return NULL; } diff --git a/orte/mca/ess/base/ess_base_open.c b/orte/mca/ess/base/ess_base_open.c index 8bf42512bf..dfa3722493 100644 --- a/orte/mca/ess/base/ess_base_open.c +++ b/orte/mca/ess/base/ess_base_open.c @@ -46,7 +46,6 @@ orte_ess_base_module_t orte_ess = { NULL, /* proc_get_hostname */ NULL, /* get_local_rank */ NULL, /* get_node_rank */ - NULL, /* proc_get_epoch */ NULL, /* update_pidmap */ NULL, /* update_nidmap */ NULL /* ft_event */ diff --git a/orte/mca/ess/base/ess_base_select.c b/orte/mca/ess/base/ess_base_select.c index 
5d03f59448..832f9d77c0 100644 --- a/orte/mca/ess/base/ess_base_select.c +++ b/orte/mca/ess/base/ess_base_select.c @@ -33,24 +33,6 @@ extern opal_list_t orte_ess_base_components_available; -/** - * Generic function to retrieve the epoch of a specific process - * from the job data. - */ -#if !ORTE_ENABLE_EPOCH -int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { - return 0; -} -#else -orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { - orte_epoch_t epoch = ORTE_EPOCH_INVALID; - - epoch = orte_util_lookup_epoch(proc); - - return epoch; -} -#endif - int orte_ess_base_select(void) { diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 9063e99e30..cb134acb59 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -35,6 +37,7 @@ #include "opal/util/output.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_cr.h" +#include "opal/runtime/opal_progress.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/routed/base/base.h" @@ -48,6 +51,7 @@ #if OPAL_ENABLE_FT_CR == 1 #include "orte/mca/snapc/base/base.h" #endif +#include "orte/mca/state/base/base.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" @@ -85,6 +89,18 @@ int orte_ess_base_app_setup(void) } } + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; + goto error; + } + /* open the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { ORTE_ERROR_LOG(ret); @@ -248,11 +264,18 @@ int orte_ess_base_app_setup(void) * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + orte_grpcomm_collective_t coll; + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { ORTE_ERROR_LOG(ret); error = "orte barrier"; goto error; } + while (coll.active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_DESTRUCT(&coll); } return ORTE_SUCCESS; @@ -324,12 +347,18 @@ static void report_sync(int status, orte_process_name_t* sender, { /* flag as complete */ sync_recvd = true; + + /* (not really necessary, but good practice) */ + orte_proc_info_finalize(); + + /* Now Exit */ + exit(status); } void orte_ess_base_app_abort(int status, bool report) { orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED; - opal_buffer_t buf; + opal_buffer_t *buf; /* Exit - do 
NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition @@ -345,10 +374,9 @@ void orte_ess_base_app_abort(int status, bool report) /* If we were asked to report this termination, do so */ if (report) { - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD); - orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buf, ORTE_RML_TAG_DAEMON, 0); - OBJ_DESTRUCT(&buf); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD); + orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, 0, orte_rml_send_callback, NULL); OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "%s orte_ess_app_abort: sent abort msg to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -360,7 +388,7 @@ void orte_ess_base_app_abort(int status, bool report) sync_recvd = false; if (ORTE_SUCCESS == orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT, ORTE_RML_NON_PERSISTENT, report_sync, NULL)) { - ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1); + return; } } diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 77e3dd104b..bed6f73d83 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -12,6 +12,8 @@ * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -62,6 +64,8 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/sensor/base/base.h" #include "orte/mca/sensor/sensor.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" @@ -84,6 +88,15 @@ static void shutdown_signal(int fd, short flags, void *arg); static void signal_callback(int fd, short flags, void *arg); static void epipe_signal_callback(int fd, short flags, void *arg); +static void setup_sighandler(int signal, opal_event_t *ev, + opal_event_cbfunc_t cbfunc) +{ + opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev); + opal_event_set_priority(ev, ORTE_ERROR_PRI); + opal_event_signal_add(ev, NULL); +} + + int orte_ess_base_orted_setup(char **hosts) { int ret = ORTE_ERROR; @@ -92,42 +105,32 @@ int orte_ess_base_orted_setup(char **hosts) char *jobidstring; char *error = NULL; char *plm_to_use; + orte_job_t *jdata; + orte_proc_t *proc; + orte_app_context_t *app; #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ - opal_event_signal_set(opal_event_base, &epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_event_signal_add(&epipe_handler, NULL); + setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. 
*/ - opal_event_set(opal_event_base, &term_handler, SIGTERM, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&term_handler, NULL); - opal_event_set(opal_event_base, &int_handler, SIGINT, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&int_handler, NULL); - - /** setup callbacks for signals we should ignore */ - opal_event_signal_set(opal_event_base, &sigusr1_handler, SIGUSR1, - signal_callback, &sigusr1_handler); - opal_event_signal_add(&sigusr1_handler, NULL); - opal_event_signal_set(opal_event_base, &sigusr2_handler, SIGUSR2, - signal_callback, &sigusr2_handler); - opal_event_signal_add(&sigusr2_handler, NULL); -#endif /* __WINDOWS__ */ - - signals_set = true; + setup_sighandler(SIGTERM, &term_handler, shutdown_signal); + setup_sighandler(SIGINT, &int_handler, shutdown_signal); - /* initialize the global list of local children and job data */ - OBJ_CONSTRUCT(&orte_local_children, opal_list_t); - OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); + /** setup callbacks for signals we should ignore */ + setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); + setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); + setup_sighandler(SIGTERM, &term_handler, shutdown_signal); +#endif /* __WINDOWS__ */ + + signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; - + /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { @@ -135,7 +138,7 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - + /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. 
So @@ -160,14 +163,14 @@ int orte_ess_base_orted_setup(char **hosts) break; } } - + if (4 < opal_output_get_verbosity(orte_ess_base_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif - + /* open and setup the opal_pstat framework so we can provide * process stats if requested */ @@ -178,7 +181,19 @@ int orte_ess_base_orted_setup(char **hosts) } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); - error = "orte_pstat_base_select"; + error = "opal_pstat_base_select"; + goto error; + } + + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; goto error; } @@ -188,9 +203,9 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_errmgr_base_open"; goto error; } - + /* some environments allow remote launches - e.g., ssh - so - * open the PLM and select something -only- if we are given + * open and select something -only- if we are given * a specific module to use */ mca_base_param_reg_string_name("plm", NULL, @@ -215,7 +230,7 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - + /* Setup the communication infrastructure */ /* Runtime Messaging Layer - this opens/selects the OOB as well */ @@ -229,14 +244,14 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_rml_base_select"; goto error; } - + /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - + /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); @@ -248,7 +263,7 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_routed_base_select"; goto error; } - + /* * Group communications */ @@ 
-282,9 +297,6 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } - /* set the communication function */ - orte_comm = orte_global_comm; - /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); @@ -316,12 +328,8 @@ int orte_ess_base_orted_setup(char **hosts) * to mpirun goes through the tree if static ports were enabled - still * need to do it anyway just to initialize things */ - if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - error = "failed to update routing tree"; - goto error; - } - + orte_routed.update_routing_plan(); + /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup @@ -354,17 +362,17 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } /* Once the session directory location has been established, set - the opal_output env file location to be in the - proc-specific session directory. */ + the opal_output env file location to be in the + proc-specific session directory. 
*/ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); - + /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ - + /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { @@ -372,7 +380,7 @@ int orte_ess_base_orted_setup(char **hosts) error = "convert_jobid"; goto error; } - + /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); @@ -381,7 +389,7 @@ int orte_ess_base_orted_setup(char **hosts) orte_process_info.top_session_dir, log_file, NULL); - + fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so @@ -398,6 +406,44 @@ int orte_ess_base_orted_setup(char **hosts) } } + /* setup the global job array */ + orte_job_data = OBJ_NEW(opal_pointer_array_t); + if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { + ORTE_ERROR_LOG(ret); + error = "setup job array"; + goto error; + } + + /* Setup the job data object for the daemons */ + /* create and store the job data object */ + jdata = OBJ_NEW(orte_job_t); + jdata->jobid = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_set_item(orte_job_data, 0, jdata); + + /* every job requires at least one app */ + app = OBJ_NEW(orte_app_context_t); + opal_pointer_array_set_item(jdata->apps, 0, app); + jdata->num_apps++; + + /* create and store a proc object for us */ + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = ORTE_PROC_MY_NAME->jobid; + proc->name.vpid = ORTE_PROC_MY_NAME->vpid; + + proc->pid = orte_process_info.pid; + proc->rml_uri = orte_rml.get_contact_info(); + proc->state = ORTE_PROC_STATE_RUNNING; + opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); + + /* record that the daemon job is running */ + 
jdata->num_procs = 1; + jdata->state = ORTE_JOB_STATE_RUNNING; + /* obviously, we have "reported" */ + jdata->num_reported = 1; + /* setup the routed info - the selected routed component * will know what to do. */ @@ -485,15 +531,15 @@ int orte_ess_base_orted_setup(char **hosts) } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); - error = "ortesensor_select"; + error = "orte_sensor_select"; goto error; } /* start the local sensors */ orte_sensor.start(ORTE_PROC_MY_NAME->jobid); - + return ORTE_SUCCESS; - error: +error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); @@ -505,7 +551,7 @@ int orte_ess_base_orted_finalize(void) { /* stop the local sensors */ orte_sensor.stop(ORTE_PROC_MY_NAME->jobid); - + if (signals_set) { /* Release all local signal handlers */ opal_event_del(&epipe_handler); @@ -516,7 +562,7 @@ int orte_ess_base_orted_finalize(void) opal_event_signal_del(&sigusr2_handler); #endif /* __WINDOWS__ */ } - + /* cleanup */ if (NULL != log_path) { unlink(log_path); @@ -525,49 +571,9 @@ int orte_ess_base_orted_finalize(void) /* make sure our local procs are dead */ orte_odls.kill_local_procs(NULL); - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - orte_sensor_base_close(); - orte_notifier_base_close(); - - orte_cr_finalize(); - -#if OPAL_ENABLE_FT_CR == 1 - orte_snapc_base_close(); -#endif - orte_filem_base_close(); - - orte_odls_base_close(); - - orte_wait_finalize(); - orte_iof_base_close(); - - /* finalize selected modules */ - if (plm_in_use) { - orte_plm_base_close(); - } - - orte_errmgr_base_close(); - - /* now can close the rml and its friendly group comm */ - orte_grpcomm_base_close(); - orte_routed_base_close(); - orte_rml_base_close(); - /* cleanup any lingering session directories */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* handle the orted-specific OPAL stuff */ - 
opal_pstat_base_close(); -#if OPAL_HAVE_HWLOC - /* destroy the topology, if required */ - if (NULL != opal_hwloc_topology) { - opal_hwloc_base_free_topology(opal_hwloc_topology); - opal_hwloc_topology = NULL; - } -#endif - + return ORTE_SUCCESS; } @@ -578,7 +584,7 @@ static void shutdown_signal(int fd, short flags, void *arg) * check the one-time lock */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_quit(); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); } /** diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index 5c77d114ba..a00ed440a0 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/orte/mca/ess/cnos/ess_cnos_module.c b/orte/mca/ess/cnos/ess_cnos_module.c index 6f553efc15..dc2d815248 100644 --- a/orte/mca/ess/cnos/ess_cnos_module.c +++ b/orte/mca/ess/cnos/ess_cnos_module.c @@ -58,7 +58,6 @@ orte_ess_base_module_t orte_ess_cnos_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* get_epoch */ NULL, /* add_pidmap is only used in ORTE */ NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index 3c37ef29f7..26c7183713 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -92,7 +92,6 @@ orte_ess_base_module_t orte_ess_env_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, #if OPAL_ENABLE_FT_CR == 1 @@ 
-241,7 +240,6 @@ static int env_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/ess.h b/orte/mca/ess/ess.h index ce7156f775..915d97ee39 100644 --- a/orte/mca/ess/ess.h +++ b/orte/mca/ess/ess.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,8 +61,7 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void); * function should create an appropriate file to alert the local * orted that termination was abnormal. */ -typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report) - __opal_attribute_noreturn_funcptr__; +typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report); /** * Get the locality flag of the specified process @@ -105,19 +106,6 @@ typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_ */ typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc); -/** - * Update the epoch - * - * The epochs of the processes are stored in the process_name struct, but this - * will get the most up to date version stored within the orte_proc_t struct. - * Obviously the epoch of the proc that is passed in will be ignored. 
- */ -#if ORTE_ENABLE_EPOCH -typedef orte_epoch_t (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); -#else -typedef int (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); -#endif - /** * Update the pidmap * @@ -159,7 +147,6 @@ struct orte_ess_base_module_1_0_0_t { orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname; orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank; orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank; - orte_ess_base_module_proc_get_epoch_fn_t proc_get_epoch; orte_ess_base_module_update_pidmap_fn_t update_pidmap; orte_ess_base_module_update_nidmap_fn_t update_nidmap; orte_ess_base_module_ft_event_fn_t ft_event; diff --git a/orte/mca/ess/generic/ess_generic_module.c b/orte/mca/ess/generic/ess_generic_module.c index cd9a0afa83..eb2c3a684a 100644 --- a/orte/mca/ess/generic/ess_generic_module.c +++ b/orte/mca/ess/generic/ess_generic_module.c @@ -86,7 +86,6 @@ orte_ess_base_module_t orte_ess_generic_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL @@ -145,7 +144,6 @@ static int rte_init(void) goto error; } ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "%s completed name definition", @@ -263,7 +261,6 @@ static int rte_init(void) if (vpid == ORTE_PROC_MY_NAME->vpid) { ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = i; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); } OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "%s node %d name %s rank %s", @@ -294,7 +291,6 @@ static int rte_init(void) if (vpid == ORTE_PROC_MY_NAME->vpid) { ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = i; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); } OPAL_OUTPUT_VERBOSE((1, 
orte_ess_base_output, "%s node %d name %s rank %d", diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 5e6b79c5c6..17c4bd9272 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,6 +65,8 @@ #include "orte/mca/snapc/base/base.h" #endif #include "orte/mca/filem/base/base.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/state.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" @@ -105,7 +109,6 @@ orte_ess_base_module_t orte_ess_hnp_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ update_pidmap, update_nidmap, NULL /* ft_event */ @@ -113,6 +116,7 @@ orte_ess_base_module_t orte_ess_hnp_module = { /* local globals */ static bool signals_set=false; +static bool forcibly_die=false; static opal_event_t term_handler; static opal_event_t int_handler; static opal_event_t epipe_handler; @@ -124,10 +128,17 @@ static opal_event_t sigcont_handler; #endif /* __WINDOWS__ */ static void abort_signal_callback(int fd, short flags, void *arg); -static void abort_exit_callback(int fd, short event, void *arg); static void epipe_signal_callback(int fd, short flags, void *arg); static void signal_forward_callback(int fd, short event, void *arg); +static void setup_sighandler(int signal, opal_event_t *ev, + opal_event_cbfunc_t cbfunc) +{ + opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev); + opal_event_set_priority(ev, ORTE_ERROR_PRI); + opal_event_signal_add(ev, NULL); +} + static int rte_init(void) { int ret; @@ -138,47 +149,31 @@ static int rte_init(void) orte_proc_t *proc; 
orte_app_context_t *app; + /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } - + #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ - opal_event_signal_set(opal_event_base, &epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_event_signal_add(&epipe_handler, NULL); + setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /** setup callbacks for abort signals - from this point * forward, we need to abort in a manner that allows us * to cleanup */ - opal_event_signal_set(opal_event_base, &term_handler, SIGTERM, - abort_signal_callback, &term_handler); - opal_event_signal_add(&term_handler, NULL); - opal_event_signal_set(opal_event_base, &int_handler, SIGINT, - abort_signal_callback, &int_handler); - opal_event_signal_add(&int_handler, NULL); + setup_sighandler(SIGTERM, &term_handler, abort_signal_callback); + setup_sighandler(SIGINT, &int_handler, abort_signal_callback); /** setup callbacks for signals we should foward */ - opal_event_signal_set(opal_event_base, &sigusr1_handler, SIGUSR1, - signal_forward_callback, &sigusr1_handler); - opal_event_signal_add(&sigusr1_handler, NULL); - opal_event_signal_set(opal_event_base, &sigusr2_handler, SIGUSR2, - signal_forward_callback, &sigusr2_handler); - opal_event_signal_add(&sigusr2_handler, NULL); - if (orte_forward_job_control) { - opal_event_signal_set(opal_event_base, &sigtstp_handler, SIGTSTP, - signal_forward_callback, &sigtstp_handler); - opal_event_signal_add(&sigtstp_handler, NULL); - opal_event_signal_set(opal_event_base, &sigcont_handler, SIGCONT, - signal_forward_callback, &sigcont_handler); - opal_event_signal_add(&sigcont_handler, NULL); - } + setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback); + setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback); + setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback); + setup_sighandler(SIGCONT, &sigcont_handler, 
signal_forward_callback); #endif /* __WINDOWS__ */ - signals_set = true; - + #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; @@ -240,10 +235,22 @@ static int rte_init(void) } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); - error = "orte_pstat_base_select"; + error = "opal_pstat_base_select"; goto error; } - + + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { error = "orte_errmgr_base_open"; goto error; @@ -270,7 +277,7 @@ static int rte_init(void) error = "orte_plm_set_hnp_name"; goto error; } - + /* Setup the communication infrastructure */ /* @@ -381,9 +388,6 @@ static int rte_init(void) goto error; } - /* set the communication function */ - orte_comm = orte_global_comm; - /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); @@ -393,7 +397,7 @@ static int rte_init(void) #if !ORTE_DISABLE_FULL_SUPPORT /* setup the orte_show_help system to recv remote output */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, - ORTE_RML_NON_PERSISTENT, orte_show_help_recv, NULL); + ORTE_RML_PERSISTENT, orte_show_help_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); error = "setup receive for orte_show_help"; @@ -476,6 +480,7 @@ static int rte_init(void) error = "setup node topologies array"; goto error; } + /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); @@ -502,7 +507,6 @@ static int rte_init(void) proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; - 
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); @@ -526,7 +530,9 @@ static int rte_init(void) /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; - + /* obviously, we have "reported" */ + jdata->num_reported = 1; + /* setup the routed info - the selected routed component * will know what to do. */ @@ -649,7 +655,7 @@ static int rte_init(void) problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); - + return ORTE_SUCCESS; error: @@ -665,9 +671,6 @@ error: static int rte_finalize(void) { char *contact_path; - orte_node_t *node; - orte_job_t *job; - int i; if (signals_set) { /* Remove the epipe handler */ @@ -696,69 +699,12 @@ static int rte_finalize(void) unlink(contact_path); free(contact_path); - orte_sensor_base_close(); - orte_notifier_base_close(); - - orte_cr_finalize(); - -#if OPAL_ENABLE_FT_CR == 1 - orte_snapc_base_close(); -#endif - orte_filem_base_close(); - - orte_odls_base_close(); - - orte_wait_finalize(); + /* output any lingering stdout/err data */ orte_iof_base_close(); - - /* finalize selected modules so they can de-register - * any receives - */ - orte_ras_base_close(); - orte_rmaps_base_close(); - orte_plm_base_close(); - orte_errmgr_base_close(); - orte_grpcomm_base_close(); - - /* now can close the rml */ - orte_routed_base_close(); - orte_rml_base_close(); - - /* if we were doing timing studies, close the timing file */ - if (orte_timing) { - if (stdout != orte_timing_output && - stderr != orte_timing_output) { - fclose(orte_timing_output); - } - } - - /* cleanup the job and node info arrays */ - if (NULL != orte_node_pool) { - for (i=0; i < orte_node_pool->size; i++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) { - OBJ_RELEASE(node); - } - } - OBJ_RELEASE(orte_node_pool); - } - if 
(NULL != orte_job_data) { - for (i=0; i < orte_job_data->size; i++) { - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) { - OBJ_RELEASE(job); - } - } - OBJ_RELEASE(orte_job_data); - } /* finalize the session directory tree */ orte_session_dir_finalize(ORTE_PROC_MY_NAME); - /* clean out the global structures */ - orte_proc_info_finalize(); - if (NULL != orte_job_ident) { - free(orte_job_ident); - } - /* close the xml output file, if open */ if (orte_xml_output) { fprintf(orte_xml_fp, "\n"); @@ -767,16 +713,6 @@ static int rte_finalize(void) fclose(orte_xml_fp); } } - - /* handle the orted-specific OPAL stuff */ - opal_pstat_base_close(); -#if OPAL_HAVE_HWLOC - /* destroy the topology, if required */ - if (NULL != opal_hwloc_topology) { - opal_hwloc_base_free_topology(opal_hwloc_topology); - opal_hwloc_topology = NULL; - } -#endif return ORTE_SUCCESS; } @@ -962,51 +898,6 @@ static int update_nidmap(opal_byte_object_t *bo) return ORTE_SUCCESS; } -static bool forcibly_die=false; - -static void abort_exit_callback(int fd, short ign, void *arg) -{ - int ret; - - fprintf(stderr, "%s: killing job...\n\n", orte_basename); - - /* since we are being terminated by a user's signal, be - * sure to exit with a non-zero exit code - but don't - * overwrite any error code from a proc that might have - * failed, in case that is why the user ordered us - * to terminate - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - - /* terminate the job - this will also wakeup orterun so - * it can report to the user and kill all the orteds. - * Check the jobid, though, just in case the user - * hit ctrl-c before we had a chance to setup the - * job in the system - in which case there is nothing - * to terminate! - */ - if (!orte_never_launched) { - /* - * Turn off the process recovery functionality, if it was enabled. - * This keeps the errmgr from trying to recover from the shutdown - * procedure. 
- */ - orte_enable_recovery = false; - - /* terminate the orteds - they will automatically kill - * their local procs - */ - ret = orte_plm.terminate_orteds(); - - } else { - /* if the jobid is invalid or we never launched, - * there is nothing to do but just clean ourselves - * up and exit - */ - orte_quit(); - } -} - /* * Attempt to terminate the job and wait for callback indicating * the job has been aborted. @@ -1047,12 +938,17 @@ static void abort_signal_callback(int fd, short flags, void *arg) */ orte_execute_quiet = true; + if (!orte_never_launched) { + /* cleanup our data server */ + orte_data_server_finalize(); + } + /* We are in an event handler; the job completed procedure will delete the signal handler that is currently running (which is a Bad Thing), so we can't call it directly. Instead, we have to exit this handler and setup to call job_completed() after this. */ - ORTE_TIMER_EVENT(0, 0, abort_exit_callback); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /** @@ -1066,7 +962,7 @@ static void epipe_signal_callback(int fd, short flags, void *arg) if (10 < sigpipe_error_count) { /* time to abort */ opal_output(0, "%s: SIGPIPE detected on fd %d - aborting", orte_basename, fd); - abort_exit_callback(0, 0, 0); + abort_signal_callback(0, 0, NULL); } return; diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index f7f7a9a52b..7d96f125b4 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -60,7 +60,6 @@ orte_ess_base_module_t orte_ess_lsf_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -215,8 +214,6 @@ static int lsf_set_name(void) lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); ORTE_PROC_MY_NAME->vpid = vpid + lsf_nodeid; - 
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); - /* get the non-name common environmental variables */ if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/ess/pmi/ess_pmi_component.c b/orte/mca/ess/pmi/ess_pmi_component.c index 2c66fef654..0ae93953d7 100644 --- a/orte/mca/ess/pmi/ess_pmi_component.c +++ b/orte/mca/ess/pmi/ess_pmi_component.c @@ -106,7 +106,7 @@ static int pmi_component_query(mca_base_module_t **module, int *priority) /* we are available anywhere PMI is available, but not for HNP itself */ if (!ORTE_PROC_IS_HNP && pmi_startup()) { /* if PMI is available, use it */ - *priority = 40; + *priority = 35; *module = (mca_base_module_t *)&orte_ess_pmi_module; return ORTE_SUCCESS; } diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 3934b04cec..8e5dea8f00 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -63,7 +63,7 @@ static int rte_init(void); static int rte_finalize(void); -static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__; +static void rte_abort(int error_code, bool report); orte_ess_base_module_t orte_ess_pmi_module = { rte_init, @@ -74,7 +74,6 @@ orte_ess_base_module_t orte_ess_pmi_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -223,13 +222,10 @@ static int rte_init(void) free(cs_env); free(string_key); - /* get our app_context number */ - if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) { - ORTE_PMI_ERROR(ret, "PMI_Get_appnum"); - error = "could not get PMI appnum"; - goto error; - } - orte_process_info.app_num = i; + /* our app_context number can only be 0 as we don't support + * dynamic spawns + */ + orte_process_info.app_num = 0; /* setup the nidmap arrays - they will be 
filled by the modex */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { @@ -307,9 +303,6 @@ static int rte_init(void) } } - /* complete definition of process name */ - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); - /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; diff --git a/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c b/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c index 4245fe1a02..6dd52360ce 100644 --- a/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c +++ b/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c @@ -56,7 +56,6 @@ orte_ess_base_module_t orte_ess_portals4_shmem_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ NULL, /* add_pidmap is only used in ORTE */ NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ diff --git a/orte/mca/ess/singleton/ess_singleton_module.c b/orte/mca/ess/singleton/ess_singleton_module.c index 437036d26a..605bded8b1 100644 --- a/orte/mca/ess/singleton/ess_singleton_module.c +++ b/orte/mca/ess/singleton/ess_singleton_module.c @@ -79,7 +79,6 @@ orte_ess_base_module_t orte_ess_singleton_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -178,7 +177,6 @@ static int rte_init(void) /* set the name */ ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->vpid = 0; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); } else { /* diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 587d0cf713..74c242a71c 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -63,7 +63,6 @@ 
orte_ess_base_module_t orte_ess_slurm_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -193,7 +192,6 @@ static int slurm_set_name(void) /* fix up the vpid and make it the "real" vpid */ slurm_nodeid = atoi(getenv("SLURM_NODEID")); ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/slurmd/ess_slurmd_module.c b/orte/mca/ess/slurmd/ess_slurmd_module.c index 84ed39175e..90f59e0dba 100644 --- a/orte/mca/ess/slurmd/ess_slurmd_module.c +++ b/orte/mca/ess/slurmd/ess_slurmd_module.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -59,7 +61,7 @@ static int rte_init(void); static int rte_finalize(void); -static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__; +static void rte_abort(int error_code, bool report); orte_ess_base_module_t orte_ess_slurmd_module = { rte_init, @@ -70,7 +72,6 @@ orte_ess_base_module_t orte_ess_slurmd_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -185,7 +186,6 @@ static int rte_init(void) nodeid = strtol(envar, NULL, 10); ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = nodeid; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); /* get the node list */ if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) { @@ -370,9 +370,6 @@ static int rte_init(void) putenv("OMPI_MCA_grpcomm=hier"); putenv("OMPI_MCA_routed=direct"); - /* complete definition of process name */ - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); - /* get our local rank */ if (NULL == (envar = getenv("SLURM_LOCALID"))) { error = "could not get SLURM_LOCALID"; diff --git a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c index 6d149f6962..21d14a81b8 100644 --- a/orte/mca/ess/tm/ess_tm_module.c +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -62,7 +62,6 @@ orte_ess_base_module_t orte_ess_tm_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -217,7 +216,6 @@ static int tm_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:tm set 
name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index 3884990e7f..f9ec730521 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -55,7 +55,6 @@ orte_ess_base_module_t orte_ess_tool_module = { NULL, /* don't need a proc_get_hostname fn */ NULL, /* don't need a proc_get_local_rank fn */ NULL, /* don't need a proc_get_node_rank fn */ - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ NULL, /* don't need to update_pidmap */ NULL, /* don't need to update_nidmap */ NULL /* ft_event */ diff --git a/orte/mca/filem/base/filem_base_receive.c b/orte/mca/filem/base/filem_base_receive.c index 93e4b126b3..658ce07095 100644 --- a/orte/mca/filem/base/filem_base_receive.c +++ b/orte/mca/filem/base/filem_base_receive.c @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -49,6 +51,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/state/state.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_quit.h" @@ -184,6 +187,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &name, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -193,16 +197,14 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende /* get the job data object for this proc */ if (NULL == (jdata = orte_get_job_data_object(name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } /* get the proc object for it */ procs = (orte_proc_t**)jdata->procs->addr; if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -211,13 +213,13 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } CLEANUP: @@ -251,6 +253,7 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender, count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &filename, &count, OPAL_STRING))) { ORTE_ERROR_LOG(rc); + 
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -297,19 +300,18 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender, */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } CLEANUP: diff --git a/orte/mca/filem/rsh/filem_rsh_module.c b/orte/mca/filem/rsh/filem_rsh_module.c index c801e4b7b5..fa7fcadc7a 100644 --- a/orte/mca/filem/rsh/filem_rsh_module.c +++ b/orte/mca/filem/rsh/filem_rsh_module.c @@ -1096,11 +1096,9 @@ static int orte_filem_rsh_start_command(orte_filem_base_process_set_t *proc_set if( NULL != proc_set ) { wp_item->proc_set.source.jobid = proc_set->source.jobid; wp_item->proc_set.source.vpid = proc_set->source.vpid; - ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,proc_set->source.epoch); wp_item->proc_set.sink.jobid = proc_set->sink.jobid; wp_item->proc_set.sink.vpid = proc_set->sink.vpid; - ORTE_EPOCH_SET(wp_item->proc_set.sink.epoch,proc_set->sink.epoch); } /* Copy the File Set */ if( NULL != file_set ) { @@ -1395,7 +1393,6 @@ static void orte_filem_rsh_permission_callback(int status, wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); wp_item->proc_set.source.jobid = sender->jobid; wp_item->proc_set.source.vpid = sender->vpid; - ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,sender->epoch); opal_list_append(&work_pool_waiting, &(wp_item->super)); } diff --git a/orte/mca/grpcomm/bad/grpcomm_bad_module.c b/orte/mca/grpcomm/bad/grpcomm_bad_module.c index c6661539b5..9d3ea66c62 
100644 --- a/orte/mca/grpcomm/bad/grpcomm_bad_module.c +++ b/orte/mca/grpcomm/bad/grpcomm_bad_module.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,9 +52,8 @@ static void finalize(void); static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int bad_barrier(void); -static int modex(opal_list_t *procs); +static int bad_allgather(orte_grpcomm_collective_t *coll); +static int bad_barrier(orte_grpcomm_collective_t *coll); /* Module def */ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { @@ -60,17 +61,13 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { finalize, xcast, bad_allgather, - orte_grpcomm_base_allgather_list, bad_barrier, orte_grpcomm_base_set_proc_attr, orte_grpcomm_base_get_proc_attr, - modex, + orte_grpcomm_base_modex, orte_grpcomm_base_purge_proc_attrs }; -/* Local variables */ -static orte_grpcomm_collective_t barrier, allgather; - /** * Initialize the module */ @@ -83,21 +80,9 @@ static int init(void) return rc; } - /* setup global variables */ - OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t); - OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t); - - /* if we are a daemon or the hnp, we need to post a - * recv to catch any collective operations - */ - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_daemon_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } + /* setup recvs */ + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_comm_start())) { + ORTE_ERROR_LOG(rc); } return rc; @@ -110,16 +95,8 @@ static void 
finalize(void) { orte_grpcomm_base_modex_finalize(); - /* destruct the globals */ - OBJ_DESTRUCT(&barrier); - OBJ_DESTRUCT(&allgather); - - /* if we are a daemon or the hnp, we need to cancel the - * recv we posted - */ - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON_COLLECTIVE); - } + /* cancel recv */ + orte_grpcomm_base_comm_stop(); } /** @@ -133,7 +110,7 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag) { int rc = ORTE_SUCCESS; - opal_buffer_t buf; + opal_buffer_t *buf; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad:xcast sent to job %s tag %ld", @@ -146,204 +123,129 @@ static int xcast(orte_jobid_t job, } /* prep the output buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); + buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_pack_xcast(ORTE_DAEMON_PROCESS_AND_RELAY_CMD, - job, &buf, buffer, tag))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_xcast(job, buf, buffer, tag))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } - /* if I am the HNP, just set things up so the cmd processor gets called. - * We don't want to message ourselves as this can create circular logic - * in the RML. 
Instead, this macro will set a zero-time event which will - * cause the buffer to be processed by the cmd processor - probably will - * fire right away, but that's okay - * The macro makes a copy of the buffer, so it's okay to release it here - */ - if (ORTE_PROC_IS_HNP) { - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - } else { - /* otherwise, send it to the HNP for relay */ - if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_DAEMON, 0))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - rc = ORTE_SUCCESS; + /* send it to the HNP (could be myself) for relay */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_XCAST, + 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + goto CLEANUP; } + rc = ORTE_SUCCESS; CLEANUP: - OBJ_DESTRUCT(&buf); return rc; } -static void barrier_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; - - OPAL_THREAD_LOCK(&coll->lock); - /* flag as recvd */ - coll->recvd = 1; - opal_condition_broadcast(&coll->cond); - OPAL_THREAD_UNLOCK(&coll->lock); -} - -static int bad_barrier(void) +static int bad_barrier(orte_grpcomm_collective_t *coll) { int rc; - + opal_buffer_t *buf; + orte_namelist_t *nm; + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad entering barrier", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* if I am alone, just return */ + /* if I am alone, just execute the callback */ if (1 == orte_process_info.num_procs) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } - /* setup the recv to get the response */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BARRIER, - ORTE_RML_NON_PERSISTENT, barrier_recv, &barrier); - if (rc != ORTE_SUCCESS) { + /* mark the collective as active */ + 
coll->active = true; + + /* setup the collective */ + opal_list_append(&orte_grpcomm_base.active_colls, &coll->super); + + if (0 == opal_list_get_size(&coll->participants)) { + /* add a wildcard name to the participants so the daemon knows + * that everyone in my job must participate + */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + } + + /* pack the collective - no data should be involved, but we need + * to ensure we get the header info correct so it can be + * unpacked without error + */ + buf = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(buf, coll, ORTE_GRPCOMM_INTERNAL_STG_APP); + + /* send the buffer to my daemon */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_COLLECTIVE, + 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super); return rc; } - - /* send it and wait for the response */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_barrier(ORTE_PROC_MY_DAEMON, &barrier))) { - ORTE_ERROR_LOG(rc); - } - - /* don't need to cancel the recv as it only fires once */ OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:bad received barrier release", + "%s grpcomm:bad barrier underway", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return rc; } -static void allgather_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) +static int bad_allgather(orte_grpcomm_collective_t *gather) { - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; int rc; - - OPAL_THREAD_LOCK(&coll->lock); - /* xfer the data */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll->results, buffer))) { - ORTE_ERROR_LOG(rc); - } - /* the daemon returns ALL of our recipients in a single message */ - coll->recvd = orte_process_info.num_procs; - 
opal_condition_broadcast(&coll->cond); - OPAL_THREAD_UNLOCK(&coll->lock); -} + opal_buffer_t *buf; -static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad entering allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup to receive results */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, - ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - return rc; + /* if I am alone, just fire callback */ + if (1 == orte_process_info.num_procs) { + gather->active = false; + if (NULL != gather->cbfunc) { + gather->cbfunc(&gather->buffer, gather->cbdata); + } + return ORTE_SUCCESS; } - /* everyone sends data to their local daemon */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_allgather(ORTE_PROC_MY_DAEMON, - &allgather, sbuf, rbuf))) { + /* mark the collective as active */ + gather->active = true; + + /* if this is an original request, then record the collective */ + if (NULL == gather->next_cb) { + opal_list_append(&orte_grpcomm_base.active_colls, &gather->super); + } + + /* start the allgather op by sending the data to our daemon - the + * user will have put the data in the "buffer" field + */ + buf = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(buf, gather, ORTE_GRPCOMM_INTERNAL_STG_APP); + /* send to our daemon */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, + ORTE_RML_TAG_COLLECTIVE, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + opal_list_remove_item(&orte_grpcomm_base.active_colls, &gather->super); return rc; } - /* don't need to cancel the recv as it only fires once */ - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad allgather completed", + "%s grpcomm:bad allgather underway", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; } - -/*** MODEX SECTION ***/ -static int modex(opal_list_t *procs) 
-{ - int rc; - opal_buffer_t buf, rbuf; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex entered", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (NULL == procs) { - /* This is a modex across our peers at startup. The modex will be realized in the - * background by the daemons. The processes will - * only be informed when all data has been collected from all processes. The get_attr - * will realize the blocking, it will not return until the data has been received. - */ - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad:peer:modex: performing modex", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* setup the buffers */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - OBJ_CONSTRUCT(&rbuf, opal_buffer_t); - - /* put our process name in the buffer so it can be unpacked later */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack the entries we have received */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* perform the allgather */ - if (ORTE_SUCCESS != (rc = bad_allgather(&buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* store the results */ - if( ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_unpack(&rbuf)) ) { - ORTE_ERROR_LOG(rc); - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex posted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - cleanup: - OBJ_DESTRUCT(&buf); - OBJ_DESTRUCT(&rbuf); - - return rc; - } else { - /* this is a modex across a specified list of procs, usually during - * a connect/accept. 
- */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) { - ORTE_ERROR_LOG(rc); - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return rc; -} diff --git a/orte/mca/grpcomm/base/Makefile.am b/orte/mca/grpcomm/base/Makefile.am index 34644845fb..185558af83 100644 --- a/orte/mca/grpcomm/base/Makefile.am +++ b/orte/mca/grpcomm/base/Makefile.am @@ -9,6 +9,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -27,9 +29,7 @@ libmca_grpcomm_la_SOURCES += \ if !ORTE_DISABLE_FULL_SUPPORT libmca_grpcomm_la_SOURCES += \ - base/grpcomm_base_allgather.c \ base/grpcomm_base_modex.c \ - base/grpcomm_base_coll.c \ - base/grpcomm_base_app_fns.c - + base/grpcomm_base_receive.c \ + base/grpcomm_base_xcast.c endif diff --git a/orte/mca/grpcomm/base/base.h b/orte/mca/grpcomm/base/base.h index d4d2daf3b4..eba414b639 100644 --- a/orte/mca/grpcomm/base/base.h +++ b/orte/mca/grpcomm/base/base.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -49,9 +51,6 @@ ORTE_DECLSPEC int orte_grpcomm_base_open(void); ORTE_DECLSPEC int orte_grpcomm_base_select(void); ORTE_DECLSPEC int orte_grpcomm_base_close(void); -/* daemon collective function */ -typedef void (*orte_grpcomm_daemon_collective_fn_t)(orte_process_name_t *sender, - opal_buffer_t *data); /* * globals that might be needed */ @@ -60,7 +59,8 @@ typedef struct { bool selected; opal_list_t components_available; orte_grpcomm_base_component_t selected_component; - orte_grpcomm_daemon_collective_fn_t daemon_coll; + orte_grpcomm_coll_id_t coll_id; + opal_list_t active_colls; #if OPAL_HAVE_HWLOC hwloc_cpuset_t working_cpuset; #endif @@ -68,30 +68,23 @@ typedef struct { ORTE_DECLSPEC extern orte_grpcomm_base_t orte_grpcomm_base; -/* structure for tracking collective operations */ -typedef struct { - opal_object_t super; - opal_mutex_t lock; - opal_condition_t cond; - orte_vpid_t recvd; - opal_buffer_t results; -} orte_grpcomm_collective_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_grpcomm_collective_t); +ORTE_DECLSPEC orte_grpcomm_collective_t* orte_grpcomm_base_setup_collective(orte_grpcomm_coll_id_t id); +ORTE_DECLSPEC void orte_grpcomm_base_progress_collectives(void); +ORTE_DECLSPEC orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void); +ORTE_DECLSPEC void orte_grpcomm_base_pack_collective(opal_buffer_t *relay, + orte_grpcomm_collective_t *coll, + orte_grpcomm_internal_stage_t stg); -/* - * Base functions - */ -ORTE_DECLSPEC int orte_grpcomm_base_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf); +/* modex support */ ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char *attr_name, const void *data, size_t size); ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -ORTE_DECLSPEC int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf); -ORTE_DECLSPEC int 
orte_grpcomm_base_full_modex(opal_list_t *procs); +ORTE_DECLSPEC void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata); +ORTE_DECLSPEC void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata); +ORTE_DECLSPEC int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex); ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void); ORTE_DECLSPEC int orte_grpcomm_base_modex_init(void); ORTE_DECLSPEC void orte_grpcomm_base_modex_finalize(void); @@ -101,30 +94,16 @@ ORTE_DECLSPEC int orte_grpcomm_base_update_modex_entries(orte_process_name_t * ORTE_DECLSPEC int orte_grpcomm_base_load_modex_data(orte_process_name_t *proc, char *attribute_name, void *data, int num_bytes); -/* app functions */ -ORTE_DECLSPEC int orte_grpcomm_base_app_barrier(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll); -ORTE_DECLSPEC int orte_grpcomm_base_app_allgather(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf); -ORTE_DECLSPEC int orte_grpcomm_base_app_pack_xcast(orte_daemon_cmd_flag_t cmd, - orte_jobid_t job, - opal_buffer_t *buffer, - opal_buffer_t *message, - orte_rml_tag_t tag); - -/* Tuned collectives */ -ORTE_DECLSPEC void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata); -ORTE_DECLSPEC int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); -ORTE_DECLSPEC void orte_grpcomm_base_daemon_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata); -ORTE_DECLSPEC void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, - opal_buffer_t *data); +/* comm support */ +ORTE_DECLSPEC int orte_grpcomm_base_comm_start(void); +ORTE_DECLSPEC void orte_grpcomm_base_comm_stop(void); +ORTE_DECLSPEC void orte_grpcomm_base_xcast_recv(int status, 
orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +ORTE_DECLSPEC int orte_grpcomm_base_pack_xcast(orte_jobid_t job, + opal_buffer_t *buffer, + opal_buffer_t *message, + orte_rml_tag_t tag); END_C_DECLS #endif diff --git a/orte/mca/grpcomm/base/grpcomm_base_allgather.c b/orte/mca/grpcomm/base/grpcomm_base_allgather.c deleted file mode 100644 index 9477eadd9a..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_allgather.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" - -#include "opal/dss/dss.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/orted/orted.h" -#include "orte/runtime/orte_wait.h" - -#include "orte/mca/grpcomm/base/base.h" - -static bool allgather_failed; -static orte_std_cntr_t allgather_num_recvd; -static opal_buffer_t *allgather_buf; - -static void allgather_server_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather buffer received from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* append this data to the allgather_buf */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(allgather_buf, buffer))) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - return; - } - - /* bump the counter */ - ++allgather_num_recvd; - - /* reissue the recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - } -} - -static void allgather_client_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:base: allgather buffer received", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* transfer the buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(allgather_buf, buffer))) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - } - - 
/* bump the counter */ - ++allgather_num_recvd; -} - -static orte_std_cntr_t allgather_num_sent; -static void allgather_send_cb(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - /* increment the count */ - ++allgather_num_sent; -} - - -int orte_grpcomm_base_allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - opal_list_item_t *item; - orte_namelist_t *peer, *root; - int32_t num_peers; - int rc; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm: entering allgather_list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* the first entry on the list is the "root" that collects - * all the data - everyone else just sends and gets back - * the results - */ - root = (orte_namelist_t*)opal_list_get_first(names); - - /*** NON-ROOT ***/ - if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) { - /* everyone but root sends data */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: sending my data to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&root->name))); - - if (0 > orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - return ORTE_ERR_COMM_FAILURE; - } - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: buffer sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* setup the buffer that will recv the results */ - allgather_buf = OBJ_NEW(opal_buffer_t); - - /* now receive the final result from rank=0. Be sure to do this in - * a manner that allows us to return without being in a recv! 
- */ - allgather_num_recvd = 0; - allgather_failed = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_client_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, 1); - - /* cancel the lingering recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST); - - /* if the allgather failed, return an error */ - if (allgather_failed) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(allgather_buf); - return ORTE_ERR_COMM_FAILURE; - } - - /* copy payload to the caller's buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(allgather_buf); - return rc; - } - OBJ_RELEASE(allgather_buf); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: buffer received", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - /*** ROOT ***/ - /* count how many peers are participating, including myself */ - num_peers = (int32_t)opal_list_get_size(names); - - /* seed the outgoing buffer with the num_procs so it can be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &num_peers, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* put my own information into the outgoing buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, sbuf))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* setup the recv conditions */ - allgather_failed = false; - allgather_num_recvd = 0; - - /* setup the buffer that will recv the results */ - allgather_buf = OBJ_NEW(opal_buffer_t); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: waiting to recv %ld inputs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)num_peers-1)); - - /* post the non-blocking recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, 
ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, num_peers-1); - - /* cancel the lingering recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: received all data", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* copy the received info to the caller's buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(allgather_buf); - return rc; - } - OBJ_RELEASE(allgather_buf); - - /* broadcast the results */ - allgather_num_sent = 0; - for (item = opal_list_get_first(names); - item != opal_list_get_end(names); - item = opal_list_get_next(item)) { - peer = (orte_namelist_t*)item; - - /* skip myself */ - if (OPAL_EQUAL == opal_dss.compare(&root->name, &peer->name, ORTE_NAME)) { - continue; - } - - /* transmit the buffer to this process */ - if (0 > orte_rml.send_buffer_nb(&peer->name, rbuf, ORTE_RML_TAG_ALLGATHER_LIST, - 0, allgather_send_cb, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - return ORTE_ERR_COMM_FAILURE; - } - } - - ORTE_PROGRESSED_WAIT(false, allgather_num_sent, num_peers-1); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm: allgather_list completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_app_fns.c b/orte/mca/grpcomm/base/grpcomm_base_app_fns.c deleted file mode 100644 index e3b8d2a070..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_app_fns.c +++ /dev/null @@ -1,220 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. 
- * Copyright (c) 2004-2009 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" -#include "opal/class/opal_hash_table.h" -#include "opal/dss/dss.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" - -#include "orte/util/proc_info.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/util/name_fns.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls_types.h" - -#include "orte/mca/grpcomm/base/base.h" - -int orte_grpcomm_base_app_pack_xcast(orte_daemon_cmd_flag_t cmd, - orte_jobid_t job, - opal_buffer_t *buffer, - opal_buffer_t *message, - orte_rml_tag_t tag) -{ - orte_daemon_cmd_flag_t command; - int rc; - - /* pack the base cmd for the daemon/HNP */ - command = cmd; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - /* pack the target jobid and tag for use in relay */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* if this isn't intended for the daemon command tag, then we better - * tell the daemon to deliver it to the procs, and what job is supposed - * to get it - this occurs when a caller just 
wants to send something - * to all the procs in a job. In that use-case, the caller doesn't know - * anything about inserting daemon commands or what routing algo might - * be used, so we have to help them out a little. Functions that are - * sending commands to the daemons themselves are smart enough to know - * what they need to do. - */ - if (ORTE_RML_TAG_DAEMON != tag) { - command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - } - - /* copy the payload into the new buffer - this is non-destructive, so our - * caller is still responsible for releasing any memory in the buffer they - * gave to us - */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buffer, message))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - -CLEANUP: - return ORTE_SUCCESS; -} - -int orte_grpcomm_base_app_barrier(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll) -{ - int rc; - opal_buffer_t buf; - orte_rml_tag_t tag=ORTE_RML_TAG_BARRIER; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app entering barrier", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* add the barrier tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - - /* send the buffer to recipient */ - if (0 > (rc = orte_rml.send_buffer(recipient, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* wait to complete */ - OPAL_THREAD_LOCK(&coll->lock); - while (0 == coll->recvd) { - opal_condition_wait(&coll->cond, &coll->lock); - } - 
coll->recvd = 0; /* reset for next time */ - OPAL_THREAD_UNLOCK(&coll->lock); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:app received barrier release", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; -} - -int orte_grpcomm_base_app_allgather(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf) -{ - int rc; - opal_buffer_t buf; - orte_rml_tag_t tag=ORTE_RML_TAG_ALLGATHER; - int32_t nc; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app entering allgather", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - - /* if I am alone, just copy data across and return */ - if (1 == orte_process_info.num_procs) { - /* since we won't be going through the daemon collective, - * we have to pack num_contributors=1 so that - * things will unpack correctly - */ - nc = 1; - opal_dss.pack(rbuf, &nc, 1, OPAL_INT32); - opal_dss.copy_payload(rbuf, sbuf); - return ORTE_SUCCESS; - } - - /* everyone sends data to their local daemon */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* add the allgather tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* add our data to it */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, sbuf))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* send to recipient */ - if (0 > (rc = orte_rml.send_buffer(recipient, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:app allgather buffer sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* wait to complete */ - OPAL_THREAD_LOCK(&coll->lock); - while (coll->recvd < orte_process_info.num_procs) { - opal_condition_wait(&coll->cond, &coll->lock); - } - /* xfer the collected data */ - opal_dss.copy_payload(rbuf, &coll->results); - /* reset for 
next time */ - OBJ_DESTRUCT(&coll->results); - OBJ_CONSTRUCT(&coll->results, opal_buffer_t); - coll->recvd = 0; - OPAL_THREAD_UNLOCK(&coll->lock); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app allgather completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_SUCCESS; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_coll.c b/orte/mca/grpcomm/base/grpcomm_base_coll.c deleted file mode 100644 index 0275ac01e1..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_coll.c +++ /dev/null @@ -1,923 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2008 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" - - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/orted/orted.h" -#include "orte/runtime/orte_wait.h" - -#include "orte/mca/grpcomm/base/base.h" - -/*************** TUNED COLLECTIVES FOR GRPCOMM MODULES **************/ - -/**** AVAILABLE ALGORITHMS ****/ -static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t *vpids); -static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); -static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); - -/**** LOCAL VARIABLES USED IN COLLECTIVES ****/ -static int num_recvd; -static opal_buffer_t bucket; - -/* Receive and process collective messages */ -static void process_coll_msg(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - - /* transfer the data to the collecting bucket */ - opal_dss.copy_payload(&bucket, mev->buffer); - - /* cleanup */ - OBJ_RELEASE(mev); - - /* increment the number recvd */ - num_recvd++; -} - -void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - 
"%s grpcomm:coll:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_coll_msg); - - return; -} - -/* - * Switchyard for selecting the collective algorithm to use - */ -int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - bool has_one; - orte_vpid_t n; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:allgather called with %d entries np %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - num_entries, (int)np)); - - /* if we only have one proc participating, just copy the data across and return */ - if (1 == np) { - opal_dss.pack(recvbuf, &num_entries, 1, OPAL_INT32); - return opal_dss.copy_payload(recvbuf, sendbuf); - } - - if (2 == np) { - /* only two procs in collective */ - return twoproc(sendbuf, recvbuf, num_entries, jobid, vpids); - } - - /* if we have power of 2 participants, use recursive doubling - otherwise, - * use bruck algorithm - */ - has_one = false; - n = np; - for ( ; n > 0; n >>= 1) { - if (n & 0x1) { - if (has_one) { - return bruck(sendbuf, recvbuf, num_entries, jobid, np, vpids); - } - has_one = true; - } - } - - /* must be power of two! 
*/ - return recursivedoubling(sendbuf, recvbuf, num_entries, jobid, np, vpids); -} - - -/* - * The Two-Proc Algorithm - * - * One sends to zero, zero waits to recv from one - * Zero adds its data to message, sends result back to one - */ -static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t *vpids) -{ - orte_process_name_t peer; - int32_t num_remote, cnt; - int rc; - opal_buffer_t buf; - - peer.jobid = jobid; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { - /* I send first */ - peer.vpid = vpids[1]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - /* setup a temp buffer so I can inform the other side as to the - * number of entries in my buffer - */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, sendbuf); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* wait for reply */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } - - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc got my return message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - } else { - /* if I am not the start, then I recv first */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - 
ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } - - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc got my starting message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* send my data back */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, sendbuf); - peer.vpid = vpids[0]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - } - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* output of a collective begins with the total number of entries */ - num_remote += num_entries; - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* xfer my data */ - opal_dss.copy_payload(recvbuf, sendbuf); - /* xfer the recvd data */ - opal_dss.copy_payload(recvbuf, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - - return ORTE_SUCCESS; -} - - -/* For a complete description of this algorithm, please look at - * ompi/mca/coll/tuned/coll_tuned_allgather.c - */ -static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - orte_vpid_t rank, distance, nv; - orte_process_name_t peer; - int32_t num_remote, total_entries, cnt; - opal_buffer_t collection, buf; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s 
grpcomm:coll:bruck algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* initialize */ - total_entries = num_entries; - - /* start by seeding the collection with our own data */ - OBJ_CONSTRUCT(&collection, opal_buffer_t); - opal_dss.copy_payload(&collection, sendbuf); - - /* collective is constrained to take place within the specified jobid */ - peer.jobid = jobid; - - /* Communication step: - At every step i, rank r: - - doubles the distance - - sends message containing all data collected so far to rank r - distance - - receives message containing all data collected so far from rank (r + distance) - */ - /* find my position in the group of participants. This - * value is the "rank" we will use in the algo - */ - rank = ORTE_VPID_INVALID; - for (nv=0; nv < np; nv++) { - if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) { - rank = nv; - break; - } - } - - /* check for bozo case */ - if (ORTE_VPID_INVALID == rank) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - for (distance = 1; distance < np; distance <<= 1) { - - /* first send my current contents */ - nv = (rank - distance + np) % np; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, &collection); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:bruck sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* now setup to recv from my other partner */ - num_recvd = 0; - nv = (rank + distance) % np; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - 
ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* and wait for it to get here */ - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* add it to our running total */ - total_entries += num_remote; - - /* transfer the data to our collection */ - opal_dss.copy_payload(&collection, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - } - - /* output of a collective begins with the total number of entries */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the collected data */ - opal_dss.copy_payload(recvbuf, &collection); - - /* cleanup */ - OBJ_DESTRUCT(&collection); - - return ORTE_SUCCESS; -} - -/* For a complete description of this algorithm, please look at - * ompi/mca/coll/tuned/coll_tuned_allgather.c - */ -static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - orte_vpid_t rank, distance, nv; - int32_t num_remote, total_entries, cnt; - opal_buffer_t collection, buf; - orte_process_name_t peer; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:recdub algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* initialize */ - total_entries = num_entries; - - /* start by seeding the collection with our own data */ - OBJ_CONSTRUCT(&collection, opal_buffer_t); - opal_dss.copy_payload(&collection, sendbuf); - - /* collective is constrained to take place within the specified jobid */ - peer.jobid = jobid; - - /* Communication step: - At every step i, rank r: - - exchanges message containing all data collected so far with rank peer = (r ^ 2^i). 
- */ - /* find my position in the group of participants. This - * value is the "rank" we will use in the algo - */ - rank = ORTE_VPID_INVALID; - for (nv=0; nv < np; nv++) { - if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) { - rank = nv; - break; - } - } - - /* check for bozo case */ - if (ORTE_VPID_INVALID == rank) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - for (distance = 0x1; distance < np; distance<<=1) { - - /* first send my current contents */ - nv = rank ^ distance; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, &collection); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:recdub sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* now setup to recv from my other partner */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* and wait for it to get here */ - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* add it to our running total */ - total_entries += num_remote; - - /* transfer the data to our collection */ - opal_dss.copy_payload(&collection, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - } - - /* output of a collective begins with the total number of entries */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) { - 
ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the collected data */ - opal_dss.copy_payload(recvbuf, &collection); - - /* cleanup */ - OBJ_DESTRUCT(&collection); - - return ORTE_SUCCESS; -} - -/**** DAEMON COLLECTIVE SUPPORT ****/ - -static void reset_child_participation(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if (child->name->jobid == job) { - /* clear flag */ - child->coll_recvd = false; - } - } -} - -static bool all_children_participated(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? 
*/ - if (child->name->jobid == job && !child->coll_recvd) { - /* if this child has *not* participated yet, return false */ - return false; - } - } - - /* if we get here, then everyone in the job has participated */ - return true; - -} - -void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, - opal_buffer_t *data) -{ - orte_jobid_t jobid; - orte_odls_job_t *jobdat; - orte_routed_tree_t *child; - orte_std_cntr_t n; - opal_list_t daemon_tree; - opal_list_item_t *item, *next; - int32_t num_contributors; - opal_buffer_t buf; - orte_process_name_t my_parent, proc; - orte_vpid_t daemonvpid; - int rc; - int32_t numc; - orte_rml_tag_t rmltag; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* unpack the jobid using this collective */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - - /* lookup the job record for it */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - someone sent us a collective before we could - * parse the add_local_procs cmd. Just add the jobdat object - * and continue - */ - jobdat = OBJ_NEW(orte_odls_job_t); - jobdat->jobid = jobid; - opal_list_append(&orte_local_jobdata, &jobdat->super); - } - - /* it may be possible to get here prior to having actually finished processing our - * local launch msg due to the race condition between different nodes and when - * they start their individual procs. 
Hence, we have to first ensure that we - * -have- finished processing the launch msg, or else we won't know whether - * or not to wait before sending this on - */ - OPAL_THREAD_LOCK(&jobdat->lock); - while (!jobdat->launch_msg_processed) { - opal_condition_wait(&jobdat->cond, &jobdat->lock); - } - OPAL_THREAD_UNLOCK(&jobdat->lock); - - /* unpack the tag for this collective */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - - /* unpack the number of contributors in this data bucket */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - jobdat->num_contributors += num_contributors; - - /* xfer the data */ - opal_dss.copy_payload(&jobdat->collection_bucket, data); - - /* count the number of participants collected */ - jobdat->num_collected++; - - /* if we haven't already done so, figure out how many participants we - * should be expecting - */ - if (jobdat->num_participating < 0) { - if (0 < jobdat->num_local_procs) { - /* we have children, so account for our own participation */ - jobdat->num_participating = 1; - } else { - jobdat->num_participating = 0; - } - /* now see if anyone else will be sending us something */ - OBJ_CONSTRUCT(&daemon_tree, opal_list_t); - orte_routed.get_routing_tree(&daemon_tree); - /* unfortunately, there is no simple way to determine which of our "child" - * daemons in the routing tree will be sending us something. 
All we can do - * is brute force a search, though we attempt to keep it as short as possible - */ - proc.jobid = jobid; - proc.vpid = 0; - while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) { - ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); - - /* get the daemon that hosts this proc */ - daemonvpid = orte_ess.proc_get_daemon(&proc); - /* is this daemon one of our children, or at least its contribution - * will pass through one of our children - */ - item = opal_list_get_first(&daemon_tree); - while (item != opal_list_get_end(&daemon_tree)) { - next = opal_list_get_next(item); - child = (orte_routed_tree_t*)item; - if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) { - /* it does - add to num_participating */ - jobdat->num_participating++; - /* remove this from the list so we don't double count it */ - opal_list_remove_item(&daemon_tree, item); - /* done with search */ - break; - } - item = next; - } - proc.vpid++; - } - } - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld" - " num_collected %d num_participating %d num_contributors %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid), - ORTE_NAME_PRINT(sender), - (long)jobdat->collective_type, jobdat->num_collected, - jobdat->num_participating, jobdat->num_contributors)); - - if (jobdat->num_collected == jobdat->num_participating) { - /* if I am the HNP, go process the results */ - if (ORTE_PROC_IS_HNP) { - goto hnp_process; - } - - /* if I am not the HNP, send to my parent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the target tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the number of contributors */ - if (ORTE_SUCCESS != 
(rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - /* xfer the payload*/ - opal_dss.copy_payload(&buf, &jobdat->collection_bucket); - /* reset everything for next collective */ - jobdat->num_contributors = 0; - jobdat->num_collected = 0; - OBJ_DESTRUCT(&jobdat->collection_bucket); - OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); - /* send it */ - my_parent.jobid = ORTE_PROC_MY_NAME->jobid; - my_parent.vpid = orte_routed.get_routing_tree(NULL); - ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&my_parent))); - if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return; - } - OBJ_DESTRUCT(&buf); - } - return; - -hnp_process: - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - /* setup a buffer to send the results back to the job members */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* add any collected data */ - numc = jobdat->num_contributors; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* reset everything for next collective */ - jobdat->num_contributors = 0; - jobdat->num_collected = 0; - OBJ_DESTRUCT(&jobdat->collection_bucket); - OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); - /* send the buffer */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) { - ORTE_ERROR_LOG(rc); - } - -cleanup: - OBJ_DESTRUCT(&buf); - - return; -} - -static void 
process_msg(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - orte_process_name_t *proc; - opal_buffer_t *buf, relay; - int32_t rc, n; - opal_list_item_t *item; - orte_odls_child_t *child; - bool found = false; - orte_odls_job_t *jobdat; - orte_rml_tag_t rmltag; - - proc = &mev->sender; - buf = mev->buffer; - - /* is the sender a local proc, or a daemon relaying the collective? */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* this is a relay - call that code */ - orte_grpcomm_base.daemon_coll(proc, buf); - goto CLEANUP; - } - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* find this child */ - if (OPAL_EQUAL == opal_dss.compare(proc, child->name, ORTE_NAME)) { - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: collecting data from child %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - - found = true; - break; - } - } - - /* if it wasn't found on the list, then we need to add it - must have - * come from a singleton - */ - if (!found) { - child = OBJ_NEW(orte_odls_child_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, proc, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - return; - } - opal_list_append(&orte_local_children, &child->super); - /* we don't know any other info about the child, so just indicate it's - * alive - */ - child->alive = true; - /* setup a jobdat for it */ - orte_odls_base_setup_singleton_jobdat(proc->jobid); - } - - /* this was one of our local procs - find the jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto CLEANUP; - } - - /* unpack the target tag */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &rmltag, &n, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* collect the provided data */ - opal_dss.copy_payload(&jobdat->local_collection, buf); - - /* flag this proc as having participated */ - child->coll_recvd = true; - - /* now check to see if all local procs in this job have participated */ - if (all_children_participated(proc->jobid)) { - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: executing collective", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* prep a buffer to pass it all along */ - OBJ_CONSTRUCT(&relay, opal_buffer_t); - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the target tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &rmltag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the number of contributors */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &jobdat->num_local_procs, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - /* xfer the payload*/ - opal_dss.copy_payload(&relay, &jobdat->local_collection); - /* refresh the collection bucket for reuse */ - OBJ_DESTRUCT(&jobdat->local_collection); - OBJ_CONSTRUCT(&jobdat->local_collection, opal_buffer_t); - reset_child_participation(proc->jobid); - /* pass this to the daemon collective operation */ - orte_grpcomm_base.daemon_coll(ORTE_PROC_MY_NAME, &relay); - /* done with the relay */ - OBJ_DESTRUCT(&relay); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: collective completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - -CLEANUP: - /* release the message */ - OBJ_RELEASE(mev); -} - -void 
orte_grpcomm_base_daemon_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_daemon_coll_recv, - cbdata))) { - ORTE_ERROR_LOG(rc); - } - return; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_modex.c b/orte/mca/grpcomm/base/grpcomm_base_modex.c index 5b663fe3d5..987ddeb8ff 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_modex.c +++ b/orte/mca/grpcomm/base/grpcomm_base_modex.c @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -46,76 +48,98 @@ #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" -/*************** MODEX SECTION **************/ -int orte_grpcomm_base_full_modex(opal_list_t *procs) +orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void) { - opal_buffer_t buf, rbuf; - int32_t i, n, num_procs; - orte_std_cntr_t cnt; - orte_process_name_t proc_name; - int rc=ORTE_SUCCESS; - orte_nid_t *nid; + orte_grpcomm_coll_id_t id; + + /* assign the next collective id */ + id = orte_grpcomm_base.coll_id; + /* rotate to the next value */ + orte_grpcomm_base.coll_id++; + return id; +} + + +/*************** MODEX SECTION **************/ +int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex) +{ + int rc; orte_local_rank_t local_rank; orte_node_rank_t node_rank; - orte_jmap_t *jmap; - orte_pmap_t *pmap; - orte_vpid_t daemon; - char *hostname; + orte_namelist_t *nm; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: performing modex", + "%s grpcomm:base:modex: performing modex", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup the buffer that will actually be sent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - OBJ_CONSTRUCT(&rbuf, opal_buffer_t); - + /* record the collective */ + modex->active = true; + modex->next_cbdata = modex; + opal_list_append(&orte_grpcomm_base.active_colls, &modex->super); + /* put our process name in the buffer so it can be unpacked later */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* pack our hostname */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + if (0 == opal_list_get_size(&modex->participants)) { + /* add a wildcard name to the participants so the daemon knows + * that everyone in 
my job must participate + */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&modex->participants, &nm->super); + modex->next_cb = orte_grpcomm_base_store_modex; + } else { + /* this is not amongst our peers, but rather between a select + * group of processes - e.g., during a connect/accept operation. + * Thus, this requires we send additional info + */ + modex->next_cb = orte_grpcomm_base_store_peer_modex; + + /* pack our hostname */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.nodename, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our daemon's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our daemon's vpid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our node rank */ - node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node_rank, 1, ORTE_NODE_RANK))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our node rank */ + node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &node_rank, 1, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our local rank */ - local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &local_rank, 1, ORTE_LOCAL_RANK))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our local rank */ + local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &local_rank, 1, ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } #if OPAL_HAVE_HWLOC - /* pack our binding info so other procs can determine our locality */ - if (ORTE_SUCCESS != 
(rc = opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_idx, 1, OPAL_UINT))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our binding info so other procs can determine our locality */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.bind_idx, 1, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } #endif + } /* pack the entries we have received */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -124,76 +148,67 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) "%s grpcomm:base:full:modex: executing allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == procs) { - /* exchange the buffer with my peers */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* exchange the buffer with the list of peers */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: processing modex info", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - - - /* extract the number of procs that put data in the buffer */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, OPAL_INT32))) { + /* execute the allgather */ + if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) { ORTE_ERROR_LOG(rc); goto cleanup; } + + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s grpcomm:base:modex: modex posted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - 
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: received %ld data bytes from %d procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)(rbuf.pack_ptr - rbuf.unpack_ptr), num_procs)); - - /* if the buffer doesn't have any more data, ignore it */ - if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) { - goto cleanup; - } - - /* otherwise, process it */ - for (i=0; i < num_procs; i++) { - /* unpack the process name */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - + return ORTE_SUCCESS; + + cleanup: + OBJ_RELEASE(modex); + return rc; +} + +void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) +{ + int rc, n, cnt; + orte_process_name_t proc_name; + char *hostname; + orte_vpid_t daemon; + orte_node_rank_t node_rank; + orte_local_rank_t local_rank; + orte_nid_t *nid; + orte_jmap_t *jmap; + orte_pmap_t *pmap; + orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; + + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s STORING PEER MODEX DATA", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* unpack the process name */ + cnt=1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { /* unpack the hostname */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &hostname, &cnt, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack the daemon vpid */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &daemon, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack the node rank */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; 
} /* unpack the local rank */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -272,12 +287,12 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) /* unpack the locality info */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_idx, &cnt, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -348,72 +363,43 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) ORTE_NAME_PRINT(&proc_name))); /* update the modex database */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, rbuf))) { ORTE_ERROR_LOG(rc); goto cleanup; } - } - + } + cleanup: - OBJ_DESTRUCT(&buf); - OBJ_DESTRUCT(&rbuf); - return rc; + /* flag the collective as complete */ + modex->active = false; + /* cleanup */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); + /* notify that the modex is complete */ + if (NULL != modex->cbfunc) { + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s CALLING MODEX RELEASE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + modex->cbfunc(NULL, modex->cbdata); + } } -int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf) +void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata) { - int32_t i, num_procs; orte_std_cntr_t cnt; orte_process_name_t proc_name; int rc=ORTE_SUCCESS; - orte_vpid_t daemon; - orte_pmap_t *pmap; + orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; - /* process the results 
*/ - /* extract the number of procs that put data in the buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_procs, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:modex:unpack: received %ld data bytes from %d procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)(rbuf->pack_ptr - rbuf->unpack_ptr), num_procs)); - - /* if the buffer doesn't have any more data, ignore it */ - if (0 >= (rbuf->pack_ptr - rbuf->unpack_ptr)) { - goto cleanup; - } - - /* otherwise, process it */ - for (i = 0; i < num_procs; i++) { - /* unpack the process name */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s STORING MODEX DATA", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* unpack the process name */ + cnt=1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { - /* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP */ - - if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) { - /* clear problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - - if (NULL == (pmap = orte_util_lookup_pmap(&proc_name))) { - /* clear problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:modex:unpack: adding modex entry for proc %s", + "%s grpcomm:base:store_modex adding modex entry for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); @@ -423,9 +409,22 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf) goto cleanup; } } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + } cleanup: - return rc; + /* flag the modex as complete */ + modex->active = false; + /* cleanup 
*/ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); + /* execute user callback, if requested */ + if (NULL != modex->cbfunc) { + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s CALLING MODEX RELEASE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + modex->cbfunc(NULL, modex->cbdata); + } } /** diff --git a/orte/mca/grpcomm/base/grpcomm_base_open.c b/orte/mca/grpcomm/base/grpcomm_base_open.c index 688b223c5d..e1f4de594c 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_open.c +++ b/orte/mca/grpcomm/base/grpcomm_base_open.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,12 +55,9 @@ int orte_grpcomm_base_open(void) verbose set by the mca open system... */ orte_grpcomm_base.output = opal_output_open(NULL); - /* define the default daemon collective fn */ -#if ORTE_DISABLE_FULL_SUPPORT - orte_grpcomm_base.daemon_coll = NULL; -#else - orte_grpcomm_base.daemon_coll = orte_grpcomm_base_daemon_collective; -#endif + /* init globals */ + OBJ_CONSTRUCT(&orte_grpcomm_base.active_colls, opal_list_t); + orte_grpcomm_base.coll_id = 0; #if OPAL_HAVE_HWLOC orte_grpcomm_base.working_cpuset = NULL; @@ -78,21 +77,64 @@ int orte_grpcomm_base_open(void) return ORTE_SUCCESS; } +orte_grpcomm_collective_t* orte_grpcomm_base_setup_collective(orte_grpcomm_coll_id_t id) +{ + opal_list_item_t *item; + orte_grpcomm_collective_t *cptr, *coll; + + coll = NULL; + for (item = opal_list_get_first(&orte_grpcomm_base.active_colls); + item != opal_list_get_end(&orte_grpcomm_base.active_colls); + item = opal_list_get_next(item)) { + cptr = (orte_grpcomm_collective_t*)item; + if (id == cptr->id) { + coll = cptr; + break; + } + } + if (NULL == coll) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = id; + 
opal_list_append(&orte_grpcomm_base.active_colls, &coll->super); + } + + return coll; +} + /* local objects */ static void collective_constructor(orte_grpcomm_collective_t *ptr) { - OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t); - OBJ_CONSTRUCT(&ptr->cond, opal_condition_t); - OBJ_CONSTRUCT(&ptr->results, opal_buffer_t); - ptr->recvd = 0; + ptr->id = -1; + ptr->active = false; + ptr->num_local_recvd = 0; + OBJ_CONSTRUCT(&ptr->local_bucket, opal_buffer_t); + ptr->num_peer_buckets = 0; + ptr->num_global_recvd = 0; + ptr->locally_complete = false; + OBJ_CONSTRUCT(&ptr->participants, opal_list_t); + ptr->cbfunc = NULL; + ptr->cbdata = NULL; + OBJ_CONSTRUCT(&ptr->buffer, opal_buffer_t); + OBJ_CONSTRUCT(&ptr->targets, opal_list_t); + ptr->next_cb = NULL; + ptr->next_cbdata = NULL; } static void collective_destructor(orte_grpcomm_collective_t *ptr) { - OBJ_DESTRUCT(&ptr->lock); - OBJ_DESTRUCT(&ptr->cond); - OBJ_DESTRUCT(&ptr->results); + opal_list_item_t *item; + + OBJ_DESTRUCT(&ptr->local_bucket); + while (NULL != (item = opal_list_remove_first(&ptr->participants))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->participants); + OBJ_DESTRUCT(&ptr->buffer); + while (NULL != (item = opal_list_remove_first(&ptr->targets))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->targets); } OBJ_CLASS_INSTANCE(orte_grpcomm_collective_t, - opal_object_t, + opal_list_item_t, collective_constructor, collective_destructor); diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c new file mode 100644 index 0000000000..ac80c2d2a4 --- /dev/null +++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c @@ -0,0 +1,686 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + + +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/error_strings.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/state/state.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/grpcomm/grpcomm_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/grpcomm/base/base.h" + +static bool recv_issued=false; +static void daemon_local_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void daemon_coll_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void app_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void coll_id_req(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); + +int orte_grpcomm_base_comm_start(void) +{ + int rc; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive start comm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (!recv_issued) { + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLLECTIVE, + ORTE_RML_PERSISTENT, + daemon_local_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; 
+ } + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_XCAST, + ORTE_RML_PERSISTENT, + orte_grpcomm_base_xcast_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_DAEMON_COLL, + ORTE_RML_PERSISTENT, + daemon_coll_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + if (ORTE_PROC_IS_HNP) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLL_ID_REQ, + ORTE_RML_PERSISTENT, + coll_id_req, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + } + recv_issued = true; + } else if (ORTE_PROC_IS_APP) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLLECTIVE, + ORTE_RML_PERSISTENT, + app_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + recv_issued = true; + } + } + + return ORTE_SUCCESS; +} + + +void orte_grpcomm_base_comm_stop(void) +{ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive stop comm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (recv_issued) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE); + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_XCAST); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON_COLL); + } + if (ORTE_PROC_IS_HNP) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID_REQ); + } + recv_issued = false; + } +} + +static void coll_id_req(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_grpcomm_coll_id_t id; + opal_buffer_t *relay; + int rc; + /* collective - only the HNP ever gets this message, but check + * in case a developer makes a mistake! 
+ */ + id = orte_grpcomm_base_get_coll_id(); + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive proc %s requested coll id - returned id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender), id)); + relay = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(relay, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + return; + } + if (0 > (rc = orte_rml.send_buffer_nb(sender, relay, ORTE_RML_TAG_COLL_ID, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + return; + } +} + + +/* process incoming coll returns */ +static void app_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_grpcomm_collective_t *coll; + opal_list_item_t *item; + int n, rc; + orte_grpcomm_coll_id_t id; + + /* get the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive processing collective return for id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* search my list of active collectives */ + for (item = opal_list_get_first(&orte_grpcomm_base.active_colls); + item != opal_list_get_end(&orte_grpcomm_base.active_colls); + item = opal_list_get_next(item)) { + coll = (orte_grpcomm_collective_t*)item; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s CHECKING COLL id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + + if (id == coll->id) { + /* see if the collective needs another step */ + if (NULL != coll->next_cb) { + /* have to go here next */ + coll->next_cb(buffer, coll->next_cbdata); + break; + } + /* flag the collective as complete */ + coll->active = false; + /* cleanup */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, item); + /* callback the specified function */ + if (NULL != coll->cbfunc) { + 
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive executing callback", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + coll->cbfunc(buffer, coll->cbdata); + } + break; + } + } +} + +/**** DAEMON COLLECTIVE SUPPORT ****/ +/* recv for collective messages sent from a daemon's local procs */ +static void daemon_local_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + int32_t rc, n; + orte_vpid_t nprocs; + orte_job_t *jdata; + orte_grpcomm_collective_t *coll; + orte_process_name_t proc; + orte_namelist_t *nm; + bool keep; + orte_vpid_t i; + orte_grpcomm_coll_id_t id; + bool do_progress=true; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLLECTIVE RECVD FROM %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* unpack the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s WORKING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* setup the collective for this id - if it's already present, + * then this will just return the existing structure + */ + coll = orte_grpcomm_base_setup_collective(id); + + /* record this proc's participation and its data */ + coll->num_local_recvd++; + + /* unpack the number of participants */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nprocs, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* do we already have the names of all participants in this collective */ + keep = true; + if (0 < opal_list_get_size(&coll->participants)) { + /* we already have it, so don't bother saving the data */ + keep = false; + } + + /* even if we don't need the names, we still have to + * unpack them to get to the data + */ + for (i=0; i < nprocs; i++) { + /* unpack the name of this participant */ + n = 1; + if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return; + } + if (keep) { + /* add the name to the list */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc.jobid; + nm->name.vpid = proc.vpid; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s ADDING %s TO PARTICIPANTS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc))); + opal_list_append(&coll->participants, &nm->super); + } + /* find this job */ + if (NULL == (jdata = orte_get_job_data_object(proc.jobid))) { + /* if we can't find it, then we haven't processed the + * launch msg for this job yet - can't happen with + * our own local procs, but this could involve a proc + * running remotely that we don't know about yet + */ + do_progress = false; + } + } + + /* what remains in the buffer is solely the data payload, so + * add it to the collective + */ + opal_dss.copy_payload(&coll->local_bucket, buffer); + + /* if all involved jobs are known, then progress collectives */ + if (do_progress) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s PROGRESSING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + orte_grpcomm_base_progress_collectives(); + } +} + +void orte_grpcomm_base_pack_collective(opal_buffer_t *relay, + orte_grpcomm_collective_t *coll, + orte_grpcomm_internal_stage_t stg) +{ + orte_vpid_t nprocs; + orte_namelist_t *nm; + opal_list_item_t *itm; + + opal_dss.pack(relay, &coll->id, 1, ORTE_GRPCOMM_COLL_ID_T); + nprocs = opal_list_get_size(&coll->participants); + opal_dss.pack(relay, &nprocs, 1, ORTE_VPID); + if (0 < nprocs) { + for (itm = opal_list_get_first(&coll->participants); + itm != opal_list_get_end(&coll->participants); + itm = opal_list_get_next(itm)) { + nm = (orte_namelist_t*)itm; + opal_dss.pack(relay, &nm->name, 1, ORTE_NAME); + } + } + if (ORTE_GRPCOMM_INTERNAL_STG_LOCAL == stg) { + opal_dss.pack(relay, &coll->num_local_recvd, 1, ORTE_VPID); + opal_dss.copy_payload(relay, &coll->local_bucket); + } else if 
(ORTE_GRPCOMM_INTERNAL_STG_APP == stg) { + opal_dss.copy_payload(relay, &coll->buffer); + } else if (ORTE_GRPCOMM_INTERNAL_STG_GLOBAL == stg) { + opal_dss.pack(relay, &coll->num_global_recvd, 1, ORTE_VPID); + opal_dss.copy_payload(relay, &coll->buffer); + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + } +} + + +void orte_grpcomm_base_progress_collectives(void) +{ + opal_list_item_t *item, *itm; + orte_grpcomm_collective_t *coll; + orte_namelist_t *nm; + orte_job_t *jdata; + orte_vpid_t nlp, vpid; + opal_buffer_t *relay; + int rc; + + /* cycle thru all known collectives - any collective on the list + * must have come from either a local proc or receiving a global + * collective. Either way, the number of required recipients + * should have been set + */ + item = opal_list_get_first(&orte_grpcomm_base.active_colls); + while (item != opal_list_get_end(&orte_grpcomm_base.active_colls)) { + coll = (orte_grpcomm_collective_t*)item; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s PROGRESSING COLL id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + /* if this collective is already locally complete, then ignore it */ + if (coll->locally_complete) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d IS LOCALLY COMPLETE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + goto next_coll; + } + /* setup to count number of local participants */ + nlp = 0; + /* check all participants */ + for (itm = opal_list_get_first(&coll->participants); + itm != opal_list_get_end(&coll->participants); + itm = opal_list_get_next(itm)) { + nm = (orte_namelist_t*)itm; + /* get the job object for this participant */ + if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid))) { + /* if the job object isn't found, then we can't progress + * this collective + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d JOBID %s NOT FOUND", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_JOBID_PRINT(nm->name.jobid))); + goto next_coll; + } 
+ /* if the job object is found, then we know about this + * job - count its local participants + */ + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all local procs from this job are required to participate */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s ALL LOCAL PROCS CONTRIBUTE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)jdata->num_local_procs)); + nlp += jdata->num_local_procs; + } else { + /* see if this is a local proc */ + if (ORTE_VPID_INVALID == (vpid = orte_ess.proc_get_daemon(&nm->name))) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d VPID %s NONLOCAL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_VPID_PRINT(nm->name.vpid))); + continue; + } + if (vpid == ORTE_PROC_MY_NAME->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:prog:collectives Counting %s as local participant", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + nlp++; + } + } + } + /* see if all reqd participants are done */ + if (nlp == coll->num_local_recvd) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLLECTIVE %d LOCALLY COMPLETE - SENDING TO GLOBAL COLLECTIVE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id)); + /* mark it as locally complete */ + coll->locally_complete = true; + /* pack the collective */ + relay = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(relay, coll, ORTE_GRPCOMM_INTERNAL_STG_LOCAL); + /* send it to our global collective handler */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, + ORTE_RML_TAG_DAEMON_COLL, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + } + } + + next_coll: + item = opal_list_get_next(item); + } +} + +static void daemon_coll_recv(int status, orte_process_name_t* sender, + opal_buffer_t* data, orte_rml_tag_t tag, + void* cbdata) +{ + orte_job_t *jdata; + orte_std_cntr_t n; + opal_list_item_t *item; + orte_vpid_t np, nprocs, total_local_np; + int rc; + 
orte_grpcomm_collective_t *coll; + orte_namelist_t *nm; + orte_grpcomm_coll_id_t id; + bool keep, do_progress; + orte_process_name_t proc; + opal_buffer_t *relay; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: daemon collective recvd from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* get the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: WORKING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* setup the collective for this id - if it's already present, + * then this will just return the existing structure + */ + coll = orte_grpcomm_base_setup_collective(id); + + /* record that we received a bucket */ + coll->num_peer_buckets++; + + /* unpack the number of procs involved */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &nprocs, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* do we need to keep the participants? 
*/ + keep = true; + if (0 < opal_list_get_size(&coll->participants)) { + /* already have it */ + keep = false; + } + + do_progress = true; + total_local_np = 0; + for (np=0; np < nprocs; np++) { + /* unpack the name of this participant */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return; + } + if (keep) { + /* add the name to the list */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc.jobid; + nm->name.vpid = proc.vpid; + opal_list_append(&coll->participants, &nm->super); + } + /* find this job */ + if (NULL == (jdata = orte_get_job_data_object(proc.jobid))) { + /* if we can't find it, then we haven't processed the + * launch msg for this job yet - can't happen with + * our own local procs, but this could involve a proc + * running remotely that we don't know about yet + */ + do_progress = false; + } + total_local_np += jdata->num_local_procs; + } + if (do_progress && 0 == total_local_np) { + coll->locally_complete = true; + } + + /* unpack the number of contributors involved in the incoming data */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &np, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: NUM CONTRIBS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(np))); + /* add it to the number of global recvd */ + coll->num_global_recvd += np; + + /* transfer the data */ + opal_dss.copy_payload(&coll->buffer, data); + + /* are we done? 
*/ + if (!do_progress || !coll->locally_complete) { + /* can't continue - missing at least one launch msg + * or not locally complete + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: CANNOT PROGRESS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + + /* determine how many buckets we should receive from others + * involved in this collective - need to know the number + * of total contributors from all buckets being relayed + * thru us + */ + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll); + np = 1; /* account for our own bucket */ + while (NULL != (item = opal_list_remove_first(&coll->targets))) { + nm = (orte_namelist_t*)item; + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* wait for input from all daemons */ + np = orte_process_info.num_procs; + break; + } else { + np++; + } + } + /* clear the list for reuse */ + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OBJ_RELEASE(nm); + } + + /* relay the data, if required */ + if (np == coll->num_peer_buckets) { + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_RELAY, coll); + + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: RELAYING COLLECTIVE TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + relay = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(relay, coll, ORTE_GRPCOMM_INTERNAL_STG_GLOBAL); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* this is going to everyone in this job, so use xcast */ + orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_DAEMON_COLL); + OBJ_RELEASE(relay); + } else + /* otherwise, send to each member, but don't send it back to the + * sender as that can create an infinite loop + */ + if (nm->name.vpid == sender->vpid) { + OBJ_RELEASE(relay); + } else { + if (0 > orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_DAEMON_COLL, 0, + 
orte_rml_send_callback, NULL)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(relay); + } + } + OBJ_RELEASE(nm); + } + } + /* clear the list for reuse */ + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OBJ_RELEASE(nm); + } + + /* determine how many contributors we need to recv - we know + * that all job objects were found, so we can skip that test + * while counting + */ + np = 0; + for (item = opal_list_get_first(&coll->participants); + item != opal_list_get_end(&coll->participants); + item = opal_list_get_next(item)) { + nm = (orte_namelist_t*)item; + /* get the job object for this participant */ + jdata = orte_get_job_data_object(nm->name.jobid); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all procs from this job are required to participate */ + np += jdata->num_procs; + } else { + np++; + } + } + + /* are we done? */ + if (np != coll->num_global_recvd) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: MISSING CONTRIBUTORS: np %s ngr %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(np), + ORTE_VPID_PRINT(coll->num_global_recvd))); + return; + } + + /* find out where, if anywhere, to send the results */ + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_COMPLETE, coll); + + /* pass the result */ + while (NULL != (item = opal_list_remove_first(&coll->targets))) { + nm = (orte_namelist_t*)item; + relay = OBJ_NEW(opal_buffer_t); + opal_dss.pack(relay, &coll->id, 1, ORTE_GRPCOMM_COLL_ID_T); + opal_dss.copy_payload(relay, &coll->buffer); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all procs from this job get it */ + orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_COLLECTIVE); + OBJ_RELEASE(relay); + } else { + /* send it to this proc */ + if (0 > orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_COLLECTIVE, 0, + orte_rml_send_callback, NULL)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(relay); + } + } + OBJ_RELEASE(nm); + } + + /* remove 
this collective */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super); + OBJ_RELEASE(coll); +} diff --git a/orte/mca/grpcomm/base/grpcomm_base_xcast.c b/orte/mca/grpcomm/base/grpcomm_base_xcast.c new file mode 100644 index 0000000000..5c1f1624e7 --- /dev/null +++ b/orte/mca/grpcomm/base/grpcomm_base_xcast.c @@ -0,0 +1,221 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + + +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/error_strings.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/grpcomm/grpcomm_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/grpcomm/base/base.h" + +void orte_grpcomm_base_xcast_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + opal_list_item_t *item; + orte_namelist_t *nm; + int ret, cnt; + opal_buffer_t *relay; + orte_daemon_cmd_flag_t command; + opal_buffer_t wireup; + opal_byte_object_t *bo; + int8_t flag; + orte_grpcomm_collective_t coll; + + OPAL_OUTPUT_VERBOSE((1, 
orte_grpcomm_base.output, + "%s grpcomm:xcast:recv:send_relay", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* setup the relay message */ + relay = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(relay, buffer); + + /* peek at the command */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + + /* if it is add_procs, then... */ + if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) { + /* extract the byte object holding the daemonmap */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + + /* update our local nidmap, if required - the decode function + * knows what to do - it will also free the bytes in the bo + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:xcast updating nidmap", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + /* update the routing plan */ + orte_routed.update_routing_plan(); + + /* see if we have wiring info as well */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + if (0 == flag) { + /* no - just return */ + goto relay; + } + + /* unpack the byte object */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + if (0 < bo->size) { + /* load it into a buffer */ + OBJ_CONSTRUCT(&wireup, opal_buffer_t); + opal_dss.load(&wireup, bo->bytes, bo->size); + /* pass it for processing */ + if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&wireup); + goto relay; + } + /* done with the wireup buffer - dump it */ + OBJ_DESTRUCT(&wireup); + } + } + + relay: + /* setup the relay list */ + OBJ_CONSTRUCT(&coll, 
orte_grpcomm_collective_t); + + /* get the list of next recipients from the routed module */ + orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll); + + /* if list is empty, no relay is required */ + if (opal_list_is_empty(&coll.targets)) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s orte:daemon:send_relay - recipient list is empty!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CLEANUP; + } + + /* send the message to each recipient on list, deconstructing it as we go */ + while (NULL != (item = opal_list_remove_first(&coll.targets))) { + nm = (orte_namelist_t*)item; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s orte:daemon:send_relay sending relay msg to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + OBJ_RETAIN(relay); + if (0 > (ret = orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_XCAST, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(relay); + continue; + } + } + + CLEANUP: + /* cleanup */ + OBJ_DESTRUCT(&coll); + + /* now send it to myself for processing */ + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, + ORTE_RML_TAG_DAEMON, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(relay); + } +} + +int orte_grpcomm_base_pack_xcast(orte_jobid_t job, + opal_buffer_t *buffer, + opal_buffer_t *message, + orte_rml_tag_t tag) +{ + orte_daemon_cmd_flag_t command; + int rc; + + /* if this isn't intended for the daemon command tag, then we better + * tell the daemon to deliver it to the procs, and what job is supposed + * to get it - this occurs when a caller just wants to send something + * to all the procs in a job. In that use-case, the caller doesn't know + * anything about inserting daemon commands or what routing algo might + * be used, so we have to help them out a little. Functions that are + * sending commands to the daemons themselves are smart enough to know + * what they need to do. 
+ */ + if (ORTE_RML_TAG_DAEMON != tag) { + command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + } + + /* copy the payload into the new buffer - this is non-destructive, so our + * caller is still responsible for releasing any memory in the buffer they + * gave to us + */ + if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buffer, message))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + +CLEANUP: + return rc; +} + diff --git a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c index 0a73964799..710245599b 100644 --- a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c +++ b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c @@ -32,6 +32,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/grpcomm/grpcomm_types.h" #include "grpcomm_cnos.h" #if OMPI_GRPCOMM_CNOS_HAVE_BARRIER @@ -50,11 +51,9 @@ static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int orte_grpcomm_cnos_barrier(void); +static int orte_grpcomm_cnos_barrier(orte_grpcomm_collective_t *coll); -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf); +static int allgather(orte_grpcomm_collective_t *coll); static int set_proc_attr(const char *attr_name, const void *data, @@ -64,7 +63,7 @@ static int get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); @@ -73,7 +72,6 @@ 
orte_grpcomm_base_module_t orte_grpcomm_cnos_module = { finalize, xcast, allgather, - allgather_list, orte_grpcomm_cnos_barrier, set_proc_attr, get_proc_attr, @@ -113,37 +111,35 @@ static int xcast(orte_jobid_t job, } static int -orte_grpcomm_cnos_barrier(void) +orte_grpcomm_cnos_barrier(orte_grpcomm_collective_t *coll) { #if OMPI_GRPCOMM_CNOS_HAVE_BARRIER cnos_barrier(); #endif - + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int allgather(orte_grpcomm_collective_t *coll) { int rc; orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return rc; -} + opal_buffer_t rbuf; -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc + coll->active = false; + if (NULL != coll->cbfunc) { + /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ + OBJ_CONSTRUCT(&rbuf, opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&rbuf, &zero, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&rbuf); + return rc; + } + coll->cbfunc(&rbuf, coll->cbdata); + OBJ_DESTRUCT(&rbuf); } return rc; } @@ -164,8 +160,12 @@ static int get_proc_attr(const orte_process_name_t proc, return ORTE_ERR_NOT_IMPLEMENTED; } -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } diff --git a/orte/mca/grpcomm/grpcomm.h b/orte/mca/grpcomm/grpcomm.h index 
9d6b4ac9f1..ab261057d1 100644 --- a/orte/mca/grpcomm/grpcomm.h +++ b/orte/mca/grpcomm/grpcomm.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,13 +65,10 @@ typedef int (*orte_grpcomm_base_module_xcast_fn_t)(orte_jobid_t job, orte_rml_tag_t tag); /* allgather - gather data from all procs */ -typedef int (*orte_grpcomm_base_module_allgather_fn_t)(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -typedef int (*orte_grpcomm_base_module_allgather_list_fn_t)(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf); +typedef int (*orte_grpcomm_base_module_allgather_fn_t)(orte_grpcomm_collective_t *coll); /* barrier function */ -typedef int (*orte_grpcomm_base_module_barrier_fn_t)(void); +typedef int (*orte_grpcomm_base_module_barrier_fn_t)(orte_grpcomm_collective_t *coll); /** DATA EXCHANGE FUNCTIONS - SEE ompi/runtime/ompi_module_exchange.h FOR A DESCRIPTION @@ -86,12 +85,11 @@ typedef int (*orte_grpcomm_base_module_modex_get_proc_attr_fn_t)(const orte_proc void **buffer, size_t *size); /* perform a modex operation */ -typedef int (*orte_grpcomm_base_module_modex_fn_t)(opal_list_t *procs); +typedef int (*orte_grpcomm_base_module_modex_fn_t)(orte_grpcomm_collective_t *coll); /* purge the internal attr table */ typedef int (*orte_grpcomm_base_module_purge_proc_attrs_fn_t)(void); - /* * Ver 2.0 */ @@ -101,7 +99,6 @@ struct orte_grpcomm_base_module_2_0_0_t { /* collective operations */ orte_grpcomm_base_module_xcast_fn_t xcast; orte_grpcomm_base_module_allgather_fn_t allgather; - orte_grpcomm_base_module_allgather_list_fn_t allgather_list; orte_grpcomm_base_module_barrier_fn_t barrier; /* modex functions */ orte_grpcomm_base_module_modex_set_proc_attr_fn_t set_proc_attr; diff --git a/orte/mca/grpcomm/grpcomm_types.h 
b/orte/mca/grpcomm/grpcomm_types.h index bdfd5230b9..ddb8877665 100644 --- a/orte/mca/grpcomm/grpcomm_types.h +++ b/orte/mca/grpcomm/grpcomm_types.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,28 +43,79 @@ BEGIN_C_DECLS -/* - * Define routing modes +/* Define a collective callback function - this will + * be called upon completion of collective ops such + * as modex and barrier. */ -typedef uint8_t orte_grpcomm_mode_t; -#define ORTE_GRPCOMM_MODE_T OPAL_UINT8 +typedef void (*orte_grpcomm_collective_cbfunc_t)(opal_buffer_t *data, void *cbdata); -/* daemon N relays message to daemon N+1 */ -#define ORTE_GRPCOMM_CHAIN (orte_grpcomm_mode_t) 1 -/* binomial tree */ -#define ORTE_GRPCOMM_BINOMIAL (orte_grpcomm_mode_t) 2 -/* linear - HNP sends direct to all daemons */ -#define ORTE_GRPCOMM_LINEAR (orte_grpcomm_mode_t) 3 +/* forward define the struct */ +struct orte_grpcomm_collective_t; -/* - * Define collective types - */ -typedef uint8_t orte_grpcomm_coll_t; -#define ORTE_GRPCOMM_COLL_T OPAL_UINT8 +typedef int32_t orte_grpcomm_coll_id_t; +#define ORTE_GRPCOMM_COLL_ID_T OPAL_INT32 +#define ORTE_GRPCOMM_COLL_ID_REQ -1 -#define ORTE_GRPCOMM_COLL_NONE 0x00 -#define ORTE_GRPCOMM_BARRIER 0x01 -#define ORTE_GRPCOMM_ALLGATHER 0x02 +typedef int8_t orte_grpcomm_coll_t; +#define ORTE_GRPCOMM_XCAST 1 +#define ORTE_GRPCOMM_COLL_RELAY 2 +#define ORTE_GRPCOMM_COLL_COMPLETE 3 +#define ORTE_GRPCOMM_COLL_PEERS 4 + +typedef enum { + ORTE_GRPCOMM_INTERNAL_STG_LOCAL, + ORTE_GRPCOMM_INTERNAL_STG_GLOBAL, + ORTE_GRPCOMM_INTERNAL_STG_APP +} orte_grpcomm_internal_stage_t; + +/* structure for tracking collective operations */ +struct orte_grpcomm_collective_t { + opal_list_item_t super; + orte_grpcomm_coll_id_t id; + /* flag that user can poll 
on to know when collective + * has completed - set to false just prior to + * calling user callback function, if non-NULL + */ + bool active; + /* number of local contributors */ + orte_vpid_t num_local_recvd; + /* bucket to collect local contributions */ + opal_buffer_t local_bucket; + /* number of buckets collected from peers */ + orte_vpid_t num_peer_buckets; + /* total number of contributors */ + orte_vpid_t num_global_recvd; + /* flag to mark that the collective is locally complete - i.e., + * all local contributions have been recvd and the local + * data has been entered into the global collective + */ + bool locally_complete; + /* list of names of those participating in the collective - an + * entry with vpid=WILDCARD implies that all members of that + * job must participate in the collective + */ + opal_list_t participants; + /* user callback function to be executed when collective + * is completed + */ + orte_grpcomm_collective_cbfunc_t cbfunc; + void *cbdata; + /* buffer collecting data to be delivered to user */ + opal_buffer_t buffer; + /* list of names of procs to receive the next step + * in executing the collective - this is obtained from + * the routed framework to minimize hops + */ + opal_list_t targets; + /* some collectives wrap around and call internal + * steps before completing - e.g., modex. 
This + * points the collective to the next step in the procedure + */ + orte_grpcomm_collective_cbfunc_t next_cb; + void *next_cbdata; +}; +typedef struct orte_grpcomm_collective_t orte_grpcomm_collective_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_grpcomm_collective_t); END_C_DECLS diff --git a/orte/mca/sensor/heartbeat/.ompi_ignore b/orte/mca/grpcomm/hier/.ompi_ignore similarity index 100% rename from orte/mca/sensor/heartbeat/.ompi_ignore rename to orte/mca/grpcomm/hier/.ompi_ignore diff --git a/orte/mca/grpcomm/hier/grpcomm_hier_module.c b/orte/mca/grpcomm/hier/grpcomm_hier_module.c index 331f4e6da9..dd54a6173a 100644 --- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c +++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c @@ -93,7 +93,6 @@ static int init(void) my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { ORTE_ERROR_LOG(rc); @@ -268,7 +267,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) proc.jobid = ORTE_PROC_MY_NAME->jobid; for (v=0; v < orte_process_info.num_procs; v++) { proc.vpid = v; - ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); /* is this proc local_rank=0 on its node? */ if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { @@ -283,7 +281,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = proc.jobid; nm->name.vpid = proc.vpid; - ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); opal_list_append(&my_local_peers, &nm->item); /* if I am not local_rank=0, is this one? 
*/ @@ -291,7 +288,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) 0 == orte_ess.get_local_rank(&proc)) { my_local_rank_zero_proc.jobid = proc.jobid; my_local_rank_zero_proc.vpid = proc.vpid; - ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); } } diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index 1bdc574069..2fbc444ee1 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2007 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All * rights reserved. * $COPYRIGHT$ * @@ -40,16 +40,14 @@ static void finalize(void); static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int pmi_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int pmi_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int pmi_barrier(void); +static int pmi_allgather(orte_grpcomm_collective_t *coll); +static int pmi_barrier(orte_grpcomm_collective_t *coll); static int pmi_set_proc_attr(const char* attr_name, const void *buffer, size_t size); static int pmi_get_proc_attr(const orte_process_name_t name, const char* attr_name, void **buffer, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); /* Module def */ @@ -58,7 +56,6 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = { finalize, xcast, pmi_allgather, - pmi_allgather_list, pmi_barrier, pmi_set_proc_attr, pmi_get_proc_attr, @@ -165,7 +162,7 @@ static int xcast(orte_jobid_t job, return ORTE_ERR_NOT_SUPPORTED; } -static int pmi_barrier(void) +static int pmi_barrier(orte_grpcomm_collective_t *coll) { int rc; @@ -173,11 +170,15 
@@ static int pmi_barrier(void) "%s grpcomm:pmi entering barrier", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* if I am alone, just return */ + /* if I am alone, just execute the callback */ if (1 == orte_process_info.num_procs) { OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi:barrier only one proc", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } @@ -198,25 +199,21 @@ static int pmi_barrier(void) OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi barrier complete", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* execute the callback */ + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int pmi_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int pmi_allgather(orte_grpcomm_collective_t *coll) { /* not used in this implementation */ return ORTE_ERR_NOT_SUPPORTED; } -static int pmi_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - /* no idea how to do this - only occurs for comm_spawn, - * which this module doesn't support - */ - return ORTE_ERR_NOT_SUPPORTED; -} - static int pmi_set_proc_attr(const char* attr_name, const void *buffer, size_t size) { @@ -285,7 +282,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name, } /*** MODEX SECTION ***/ -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { int rc, i; size_t len; @@ -520,28 +517,17 @@ static int modex(opal_list_t *procs) rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); /* don't error out here - if not found, that's okay */ if (PMI_SUCCESS == rc) { - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) { + if (name.jobid == ORTE_PROC_MY_NAME->jobid && + name.vpid == ORTE_PROC_MY_NAME->vpid) { /* if this data is from myself, then set locality to all */ - 
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale ALL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - pmap->locality = OPAL_PROC_ALL_LOCAL; + pmap->locality = OPAL_PROC_ALL_LOCAL; } else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NONLOCAL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); pmap->locality = OPAL_PROC_NON_LOCAL; } else if (0 == strlen(pmi_attr_val)){ /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NODE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); pmap->locality = OPAL_PROC_ON_NODE; } else { bind_level = strtol(pmi_attr_val, NULL, 10); @@ -560,13 +546,13 @@ static int modex(opal_list_t *procs) orte_process_info.bind_level, orte_process_info.bind_idx, bind_level, bind_idx); - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcommpmi setting proc %s locale %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), - opal_hwloc_base_print_locality(pmap->locality))); } } + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, + "%s grpcomm:pmi setting proc %s locale %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name), + opal_hwloc_base_print_locality(pmap->locality))); } } #endif @@ -575,7 +561,12 @@ static int modex(opal_list_t *procs) OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi: modex completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - + + /* execute the callback */ + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return rc; } diff --git a/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c 
b/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c index 6fc4521341..0132036bc8 100644 --- a/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c +++ b/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All + * rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,11 +48,9 @@ static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int orte_grpcomm_portals4_shmem_barrier(void); +static int orte_grpcomm_portals4_shmem_barrier(orte_grpcomm_collective_t *coll); -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf); +static int allgather(orte_grpcomm_collective_t *coll); static int set_proc_attr(const char *attr_name, const void *data, @@ -60,7 +60,7 @@ static int get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); @@ -69,7 +69,6 @@ orte_grpcomm_base_module_t orte_grpcomm_portals4_shmem_module = { finalize, xcast, allgather, - allgather_list, orte_grpcomm_portals4_shmem_barrier, set_proc_attr, get_proc_attr, @@ -113,35 +112,33 @@ static int xcast(orte_jobid_t job, } static int -orte_grpcomm_portals4_shmem_barrier(void) +orte_grpcomm_portals4_shmem_barrier(orte_grpcomm_collective_t *coll) { runtime_barrier(); - + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int allgather(orte_grpcomm_collective_t *coll) { int rc; orte_std_cntr_t zero=0; - 
- /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return rc; -} + opal_buffer_t rbuf; -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; + coll->active = false; + if (NULL != coll->cbfunc) { + /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ + OBJ_CONSTRUCT(&rbuf, opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&rbuf, &zero, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&rbuf); + return rc; + } + coll->cbfunc(&rbuf, coll->cbdata); + OBJ_DESTRUCT(&rbuf); } return rc; } @@ -186,8 +183,12 @@ static int get_proc_attr(const orte_process_name_t proc, return ORTE_ERR_NOT_IMPLEMENTED; } -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } diff --git a/orte/mca/iof/base/base.h b/orte/mca/iof/base/base.h index 248def66fc..5b92eb3a29 100644 --- a/orte/mca/iof/base/base.h +++ b/orte/mca/iof/base/base.h @@ -64,7 +64,7 @@ ORTE_DECLSPEC int orte_iof_base_open(void); typedef struct { opal_list_item_t super; bool pending; - opal_event_t ev; + opal_event_t *ev; int fd; opal_list_t outputs; } orte_iof_write_event_t; @@ -86,7 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_sink_t); typedef struct { opal_object_t super; orte_process_name_t name; - opal_event_t ev; + opal_event_t *ev; int fd; orte_iof_tag_t tag; bool active; @@ -135,12 +135,11 @@ typedef struct orte_iof_base_t orte_iof_base_t; ep = OBJ_NEW(orte_iof_sink_t); \ ep->name.jobid = (nm)->jobid; \ ep->name.vpid = (nm)->vpid; 
\ - ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ ep->tag = (tg); \ if (0 <= (fid)) { \ ep->wev->fd = (fid); \ - opal_event_set(opal_event_base, \ - &(ep->wev->ev), ep->wev->fd, \ + opal_event_set(orte_event_base, \ + ep->wev->ev, ep->wev->fd, \ OPAL_EV_WRITE, \ wrthndlr, ep); \ } \ @@ -169,19 +168,18 @@ typedef struct orte_iof_base_t orte_iof_base_t; rev = OBJ_NEW(orte_iof_read_event_t); \ rev->name.jobid = (nm)->jobid; \ rev->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ rev->tag = (tg); \ rev->fd = (fid); \ *(rv) = rev; \ rev->file = strdup(__FILE__); \ rev->line = __LINE__; \ - opal_event_set(opal_event_base, \ - &rev->ev, (fid), \ + opal_event_set(orte_event_base, \ + rev->ev, (fid), \ OPAL_EV_READ, \ (cbfunc), rev); \ if ((actv)) { \ rev->active = true; \ - opal_event_add(&rev->ev, 0); \ + opal_event_add(rev->ev, 0); \ } \ } while(0); @@ -194,12 +192,11 @@ typedef struct orte_iof_base_t orte_iof_base_t; ep = OBJ_NEW(orte_iof_sink_t); \ ep->name.jobid = (nm)->jobid; \ ep->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ ep->tag = (tg); \ if (0 <= (fid)) { \ ep->wev->fd = (fid); \ - opal_event_set(opal_event_base, \ - &(ep->wev->ev), ep->wev->fd, \ + opal_event_set(orte_event_base, \ + ep->wev->ev, ep->wev->fd, \ OPAL_EV_WRITE, \ wrthndlr, ep); \ } \ @@ -215,17 +212,16 @@ typedef struct orte_iof_base_t orte_iof_base_t; rev = OBJ_NEW(orte_iof_read_event_t); \ rev->name.jobid = (nm)->jobid; \ rev->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ rev->tag = (tg); \ rev->fd = (fid); \ *(rv) = rev; \ - opal_event_set(opal_event_base, \ - &rev->ev, (fid), \ + opal_event_set(orte_event_base, \ + rev->ev, (fid), \ OPAL_EV_READ, \ (cbfunc), rev); \ if ((actv)) { \ rev->active = true; \ - opal_event_add(&rev->ev, 0); \ + opal_event_add(rev->ev, 0); \ } \ } while(0); diff --git a/orte/mca/iof/base/iof_base_close.c b/orte/mca/iof/base/iof_base_close.c index a67abe68e9..4ff07f8075 100644 --- 
a/orte/mca/iof/base/iof_base_close.c +++ b/orte/mca/iof/base/iof_base_close.c @@ -42,9 +42,6 @@ int orte_iof_base_close(void) } OBJ_DESTRUCT(&orte_iof_base.iof_components_opened); - OBJ_DESTRUCT(&orte_iof_base.iof_write_output_lock); - - return ORTE_SUCCESS; } diff --git a/orte/mca/iof/base/iof_base_open.c b/orte/mca/iof/base/iof_base_open.c index b3a20fd03f..1e929f56b1 100644 --- a/orte/mca/iof/base/iof_base_open.c +++ b/orte/mca/iof/base/iof_base_open.c @@ -91,7 +91,6 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr) { ptr->daemon.jobid = ORTE_JOBID_INVALID; ptr->daemon.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); ptr->wev = OBJ_NEW(orte_iof_write_event_t); } static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) @@ -114,10 +113,11 @@ static void orte_iof_base_read_event_construct(orte_iof_read_event_t* rev) { rev->fd = -1; rev->active = false; + rev->ev = opal_event_alloc(); } static void orte_iof_base_read_event_destruct(orte_iof_read_event_t* rev) { - opal_event_del(&rev->ev); + opal_event_free(rev->ev); if (0 <= rev->fd) { OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, "%s iof: closing fd %d for process %s", @@ -137,12 +137,11 @@ static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev) wev->pending = false; wev->fd = -1; OBJ_CONSTRUCT(&wev->outputs, opal_list_t); + wev->ev = opal_event_alloc(); } static void orte_iof_base_write_event_destruct(orte_iof_write_event_t* wev) { - if (wev->pending) { - opal_event_del(&wev->ev); - } + opal_event_free(wev->ev); if (ORTE_PROC_IS_HNP) { int xmlfd = fileno(orte_xml_fp); if (xmlfd == wev->fd) { diff --git a/orte/mca/iof/base/iof_base_output.c b/orte/mca/iof/base/iof_base_output.c index 554d32683a..f18ec7d0a1 100644 --- a/orte/mca/iof/base/iof_base_output.c +++ b/orte/mca/iof/base/iof_base_output.c @@ -266,7 +266,7 @@ process: OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s write:output adding write event", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - opal_event_add(&channel->ev, 0); + opal_event_add(channel->ev, 0); channel->pending = true; } @@ -322,7 +322,7 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) OBJ_RELEASE(output); } ABORT: - opal_event_del(&wev->ev); + opal_event_del(wev->ev); wev->pending = false; DEPART: diff --git a/orte/mca/iof/hnp/iof_hnp.c b/orte/mca/iof/hnp/iof_hnp.c index d07a4d1b08..9e34d1959c 100644 --- a/orte/mca/iof/hnp/iof_hnp.c +++ b/orte/mca/iof/hnp/iof_hnp.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -144,9 +146,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int flags; char *outfile; int fdout; - orte_odls_job_t *jobdat=NULL; int np, numdigs; - int rc; orte_ns_cmp_bitmask_t mask; /* don't do this if the dst vpid is invalid or the fd is negative! 
*/ @@ -185,24 +185,15 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, proct = OBJ_NEW(orte_iof_proc_t); proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; - ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); opal_list_append(&mca_iof_hnp_component.procs, &proct->super); /* see if we are to output to a file */ if (NULL != orte_output_filename) { - /* get the local jobdata for this proc */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == proct->name.jobid) { - break; - } - } - if (NULL == jobdat) { + /* get the jobdata for this proc */ + if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - np = jobdat->num_procs / 10; + np = jdata->num_procs / 10; /* determine the number of digits required for max vpid */ numdigs = 1; while (np > 0) { @@ -246,11 +237,11 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, */ if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { proct->revstdout->active = true; - opal_event_add(&(proct->revstdout->ev), 0); + opal_event_add(proct->revstdout->ev, 0); proct->revstderr->active = true; - opal_event_add(&(proct->revstderr->ev), 0); + opal_event_add(proct->revstderr->ev, 0); proct->revstddiag->active = true; - opal_event_add(&(proct->revstddiag->ev), 0); + opal_event_add(proct->revstddiag->ev, 0); } return ORTE_SUCCESS; } @@ -282,7 +273,6 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, &mca_iof_hnp_component.sinks); sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; sink->daemon.vpid = proc->node->daemon->name.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); } } @@ -315,7 +305,7 @@ static int hnp_push(const orte_process_name_t* 
dst_name, orte_iof_tag_t src_tag, * filedescriptor is not a tty, don't worry about it * and always stay connected. */ - opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig, + opal_event_signal_set(orte_event_base, &mca_iof_hnp_component.stdinsig, SIGCONT, orte_iof_hnp_stdin_cb, NULL); @@ -334,9 +324,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, */ if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) { mca_iof_hnp_component.stdinev->active = true; - if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) { - ORTE_ERROR_LOG(rc); - } + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } } else { /* if we are not looking at a tty, just setup a read event @@ -389,7 +377,6 @@ static int hnp_pull(const orte_process_name_t* dst_name, &mca_iof_hnp_component.sinks); sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); return ORTE_SUCCESS; } @@ -436,9 +423,6 @@ static int finalize(void) int num_written; bool dump; - OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock); - - OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); /* check if anything is still trying to be written out */ wev = orte_iof_base.iof_write_stdout->wev; if (!opal_list_is_empty(&wev->outputs)) { @@ -456,7 +440,6 @@ static int finalize(void) OBJ_RELEASE(output); } } - OBJ_RELEASE(orte_iof_base.iof_write_stdout); if (!orte_xml_output) { /* we only opened stderr channel if we are NOT doing xml output */ wev = orte_iof_base.iof_write_stderr->wev; @@ -475,31 +458,10 @@ static int finalize(void) OBJ_RELEASE(output); } } - OBJ_RELEASE(orte_iof_base.iof_write_stderr); } - OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock); - /* if the stdin event is active, delete it */ - if (NULL != mca_iof_hnp_component.stdinev) { - OBJ_RELEASE(mca_iof_hnp_component.stdinev); - 
opal_event_signal_del(&mca_iof_hnp_component.stdinsig); - } - /* cleanout all registered sinks */ - while ((item = opal_list_remove_first(&mca_iof_hnp_component.sinks)) != NULL) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&mca_iof_hnp_component.sinks); - /* cleanout all pending proc objects holding receive events */ - while ((item = opal_list_remove_first(&mca_iof_hnp_component.procs)) != NULL) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&mca_iof_hnp_component.procs); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP); - /* release and cleanup the lock */ - OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); - OBJ_DESTRUCT(&mca_iof_hnp_component.lock); - return ORTE_SUCCESS; } @@ -564,7 +526,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } /* otherwise, something bad happened so all we can do is declare an @@ -589,7 +551,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } OBJ_RELEASE(output); @@ -616,7 +578,7 @@ CHECK: OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "restarting read event")); mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } } diff --git a/orte/mca/iof/hnp/iof_hnp_read.c b/orte/mca/iof/hnp/iof_hnp_read.c index 2e94e38725..c98833ed3d 100644 --- a/orte/mca/iof/hnp/iof_hnp_read.c +++ b/orte/mca/iof/hnp/iof_hnp_read.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,12 +32,12 @@ #include "opal/dss/dss.h" -#include "orte/mca/rml/rml_types.h" +#include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" -#include "orte/orted/orted.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/base.h" @@ -44,10 +46,18 @@ static void restart_stdin(int fd, short event, void *cbdata) { + orte_timer_t *tm = (orte_timer_t*)cbdata; + if (NULL != mca_iof_hnp_component.stdinev && - !orte_job_term_ordered) { + !orte_job_term_ordered && + !mca_iof_hnp_component.stdinev->active) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); + } + + /* if this was a timer callback, then release the timer */ + if (NULL != tm) { + OBJ_RELEASE(tm); } } @@ -70,9 +80,9 @@ void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata) if (should_process) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } else { - opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); + opal_event_del(mca_iof_hnp_component.stdinev->ev); mca_iof_hnp_component.stdinev->active = false; } } @@ -109,7 +119,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) /* non-blocking, retry */ if (EAGAIN == errno || EINTR == errno) { - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; } @@ -207,7 +217,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) restart_stdin(fd, 0, NULL); } else { /* delay for awhile and then restart */ - ORTE_TIMER_EVENT(0, 10000, restart_stdin); + ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI); } } /* nothing more to do 
*/ @@ -275,24 +285,9 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) if (NULL == proct->revstdout && NULL == proct->revstderr && NULL == proct->revstddiag) { - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; /* this proc's iof is complete */ opal_list_remove_item(&mca_iof_hnp_component.procs, item); - /* setup a cmd to notify that the iof is complete */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_IOF_COMPLETE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - CLEANUP: - OBJ_DESTRUCT(&cmdbuf); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); OBJ_RELEASE(proct); } break; @@ -337,8 +332,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) } /* re-add the event */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); - OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); + OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; } diff --git a/orte/mca/iof/hnp/iof_hnp_receive.c b/orte/mca/iof/hnp/iof_hnp_receive.c index 355c7ca2b5..d2e84ea4af 100644 --- a/orte/mca/iof/hnp/iof_hnp_receive.c +++ b/orte/mca/iof/hnp/iof_hnp_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -36,7 +38,6 @@ #endif #include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -47,9 +48,10 @@ #include "iof_hnp.h" -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_hnp_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; orte_process_name_t origin; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; @@ -61,7 +63,7 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the stream first as this may be flow control info */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -72,14 +74,14 @@ static void process_msg(int fd, short event, void *cbdata) !orte_job_term_ordered && !mca_iof_hnp_component.stdinev->active) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } goto CLEAN_RETURN; } else if (ORTE_IOF_XOFF & stream) { /* stop the stdin read event */ if (NULL != mca_iof_hnp_component.stdinev && !mca_iof_hnp_component.stdinev->active) { - opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); + opal_event_del(mca_iof_hnp_component.stdinev->ev); mca_iof_hnp_component.stdinev->active = false; } goto CLEAN_RETURN; @@ -87,7 +89,7 @@ static void process_msg(int fd, short event, void *cbdata) /* get name of the process whose io we are discussing */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &origin, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { 
ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -97,7 +99,7 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received pull cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender), + ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we send it a copy of the specified stream(s) * from the specified process(es), so create a sink for it @@ -105,23 +107,20 @@ static void process_msg(int fd, short event, void *cbdata) if (ORTE_IOF_STDOUT & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDOUT, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDERR & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDDIAG & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } goto CLEAN_RETURN; } @@ -130,7 +129,7 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received close cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender), + ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we no longer forward a copy of the * specified 
stream(s) from the specified process(es) - remove the sink @@ -163,7 +162,7 @@ static void process_msg(int fd, short event, void *cbdata) /* this must have come from a daemon forwarding output - unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -201,30 +200,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - -void orte_iof_hnp_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((5, orte_iof_base.iof_output, - "%s iof:hnp:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - return; } diff --git a/orte/mca/iof/orted/iof_orted.c b/orte/mca/iof/orted/iof_orted.c index 4c7d865c9f..dae7fef958 100644 --- a/orte/mca/iof/orted/iof_orted.c +++ b/orte/mca/iof/orted/iof_orted.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -124,7 +126,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta orte_iof_sink_t *sink; char *outfile; int fdout; - orte_odls_job_t *jobdat=NULL; + orte_job_t *jobdat=NULL; int np, numdigs; orte_ns_cmp_bitmask_t mask; @@ -161,20 +163,11 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta proct = OBJ_NEW(orte_iof_proc_t); proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; - ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); opal_list_append(&mca_iof_orted_component.procs, &proct->super); /* see if we are to output to a file */ if (NULL != orte_output_filename) { /* get the local jobdata for this proc */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == proct->name.jobid) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(proct->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } @@ -222,11 +215,11 @@ SETUP: */ if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { proct->revstdout->active = true; - opal_event_add(&(proct->revstdout->ev), 0); + opal_event_add(proct->revstdout->ev, 0); proct->revstderr->active = true; - opal_event_add(&(proct->revstderr->ev), 0); + opal_event_add(proct->revstderr->ev, 0); proct->revstddiag->active = true; - opal_event_add(&(proct->revstddiag->ev), 0); + opal_event_add(proct->revstddiag->ev, 0); } return ORTE_SUCCESS; } @@ -389,7 +382,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. 
*/ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } /* otherwise, something bad happened so all we can do is declare an @@ -419,7 +412,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } OBJ_RELEASE(output); diff --git a/orte/mca/iof/orted/iof_orted_read.c b/orte/mca/iof/orted/iof_orted_read.c index 5edeeafcf5..fd95766d69 100644 --- a/orte/mca/iof/orted/iof_orted_read.c +++ b/orte/mca/iof/orted/iof_orted_read.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,12 +33,11 @@ #include "opal/dss/dss.h" #include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" -#include "orte/orted/orted.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/base.h" @@ -65,7 +66,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) opal_list_item_t *item; orte_iof_proc_t *proct; orte_ns_cmp_bitmask_t mask; - + OPAL_THREAD_LOCK(&mca_iof_orted_component.lock); /* read up to the fragment size */ @@ -90,7 +91,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) /* either we have a connection error or it was a non-blocking read */ if (EAGAIN == errno || EINTR == errno) { /* non-blocking, retry */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock); return; } @@ -164,14 +165,14 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) 
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP, 0, send_cb, NULL); -RESTART: + RESTART: /* re-add the event */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock); return; -CLEAN_RETURN: + CLEAN_RETURN: /* must be an error, or zero bytes were read indicating that the * proc terminated this IOF channel - either way, find this proc * on our list and clean up @@ -202,24 +203,9 @@ CLEAN_RETURN: if (NULL == proct->revstdout && NULL == proct->revstderr && NULL == proct->revstddiag) { - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; /* this proc's iof is complete */ opal_list_remove_item(&mca_iof_orted_component.procs, item); - /* setup a cmd to notify that the iof is complete */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_IOF_COMPLETE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - CLEANUP: - OBJ_DESTRUCT(&cmdbuf); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); OBJ_RELEASE(proct); } break; diff --git a/orte/mca/iof/orted/iof_orted_receive.c b/orte/mca/iof/orted/iof_orted_receive.c index 7585b951a0..7ecae6e16d 100644 --- a/orte/mca/iof/orted/iof_orted_receive.c +++ b/orte/mca/iof/orted/iof_orted_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -83,9 +85,10 @@ void orte_iof_orted_send_xonxoff(orte_iof_tag_t tag) * * (b) flow control messages */ -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_orted_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; int32_t count, numbytes; @@ -95,7 +98,7 @@ static void process_msg(int fd, short event, void *cbdata) /* see what stream generated this data */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -108,14 +111,14 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the intended target */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &target, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &target, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -163,31 +166,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - - -void orte_iof_orted_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, - "%s iof:orted:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * 
we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - return; } diff --git a/orte/mca/iof/tool/iof_tool.c b/orte/mca/iof/tool/iof_tool.c index ea62d6e679..b170d13ac4 100644 --- a/orte/mca/iof/tool/iof_tool.c +++ b/orte/mca/iof/tool/iof_tool.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -73,7 +75,7 @@ static int init(void) from the HNP IOF component */ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_PROXY, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_iof_tool_recv, NULL))) { ORTE_ERROR_LOG(rc); @@ -217,9 +219,6 @@ static int tool_close(const orte_process_name_t* src_name, orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP, 0, send_cb, NULL); - /* wait right here until the close is confirmed */ - ORTE_PROGRESSED_WAIT(mca_iof_tool_component.closed, 0, 1); - return ORTE_SUCCESS; } diff --git a/orte/mca/iof/tool/iof_tool_receive.c b/orte/mca/iof/tool/iof_tool_receive.c index a83d4b8586..95019e3703 100644 --- a/orte/mca/iof/tool/iof_tool_receive.c +++ b/orte/mca/iof/tool/iof_tool_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,9 +42,10 @@ #include "iof_tool.h" -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_tool_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; orte_process_name_t origin; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; @@ -52,7 +55,7 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the stream first as this may be flow control info */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -64,21 +67,21 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received CLOSE handshake from remote hnp %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender))); + ORTE_NAME_PRINT(sender))); mca_iof_tool_component.closed = true; goto CLEAN_RETURN; } /* get name of the process whose io we are receiving */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &origin, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -102,40 +105,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - -void orte_iof_tool_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; 
- - OPAL_OUTPUT_VERBOSE((5, orte_iof_base.iof_output, - "%s iof:tool:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_IOF_PROXY, - ORTE_RML_NON_PERSISTENT, - orte_iof_tool_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } return; } diff --git a/orte/mca/notifier/base/notifier_base_select.c b/orte/mca/notifier/base/notifier_base_select.c index 3d3f725e80..8f0286e484 100644 --- a/orte/mca/notifier/base/notifier_base_select.c +++ b/orte/mca/notifier/base/notifier_base_select.c @@ -26,7 +26,6 @@ #include "opal/mca/mca.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/base.h" #include "opal/util/output.h" @@ -44,7 +43,6 @@ bool orte_notifier_base_help_selected = false; bool orte_notifier_base_log_peer_selected = false; bool orte_notifier_base_log_event_selected = false; -static opal_sos_reporter_callback_fn_t prev_reporter_callback; static inline char **orte_notifier_get_include_list(const char *, const char *, char **); @@ -207,8 +205,8 @@ int orte_notifier_base_select(void) if (NULL != nmodule->init) { /* If the module doesn't want to be used, skip it */ if (ORTE_SUCCESS != (ret = nmodule->init()) ) { - if (ORTE_ERR_NOT_SUPPORTED != OPAL_SOS_GET_ERROR_CODE(ret) && - ORTE_ERR_NOT_IMPLEMENTED != OPAL_SOS_GET_ERROR_CODE(ret)) { + if 
(ORTE_ERR_NOT_SUPPORTED != ret && + ORTE_ERR_NOT_IMPLEMENTED != ret) { exit_status = ret; goto cleanup; } @@ -293,11 +291,6 @@ int orte_notifier_base_select(void) orte_notifier_base_events_init(); } - /* Register a callback with OPAL SOS so that we can intercept - * error messages */ - opal_sos_reg_reporter_callback((opal_sos_reporter_callback_fn_t) orte_notifier_log, - &prev_reporter_callback); - cleanup: return exit_status; } diff --git a/orte/mca/notifier/hnp/Makefile.am b/orte/mca/notifier/hnp/Makefile.am index 91fba4047b..5f6fa2b22b 100644 --- a/orte/mca/notifier/hnp/Makefile.am +++ b/orte/mca/notifier/hnp/Makefile.am @@ -10,6 +10,8 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -17,6 +19,8 @@ # $HEADER$ # +EXTRA_DIST = orte_notifier_hnp.txt + sources = \ notifier_hnp.h \ notifier_hnp_module.c \ diff --git a/orte/mca/notifier/hnp/notifier_hnp.h b/orte/mca/notifier/hnp/notifier_hnp.h index fa683919d9..30c5879910 100644 --- a/orte/mca/notifier/hnp/notifier_hnp.h +++ b/orte/mca/notifier/hnp/notifier_hnp.h @@ -33,10 +33,6 @@ BEGIN_C_DECLS void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); -#if OPAL_ENABLE_DEBUG -void orte_notifier_hnp_exception_cb(const orte_process_name_t* peer, - orte_rml_exception_t reason); -#endif /* extern opal_pointer_array_t orte_notifier_hnp_tables; diff --git a/orte/mca/notifier/hnp/notifier_hnp_module.c b/orte/mca/notifier/hnp/notifier_hnp_module.c index 6b3cc6e834..8c4e9e026e 100644 --- a/orte/mca/notifier/hnp/notifier_hnp_module.c +++ b/orte/mca/notifier/hnp/notifier_hnp_module.c @@ -11,7 +11,9 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ * * Additional copyrights may follow * @@ -33,7 +35,6 @@ #endif #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "opal/dss/dss_types.h" @@ -111,134 +112,6 @@ static int send_command(orte_notifier_base_severity_t severity, int errcode, return ORTE_SUCCESS; } -#if 0 -/** - * Function to pack a single SOS error entry. - * - * @return OPAL_SUCCESS Upon success - */ -static int opal_dss_pack_sos_error(opal_buffer_t *buf, opal_sos_error_t *error) -{ - int rc; - if (NULL == error) { - return ORTE_ERROR; - } - - /* Pack errnum */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->errnum, 1, OPAL_INT))) { - return rc; - } - - /* Pack the file name in which the error occurred */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->file, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the line number on which the error was encountered */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->line, 1, OPAL_INT))) { - return rc; - } - - /* Pack the function name (if any) */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->func, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the error message */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->msg, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the pointer to the previous opal sos error object in the - opal sos table */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->prev, 1, OPAL_INT))) { - return rc; - } - - /* Pack the pointer to the next error */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->next, 1, OPAL_INT))) { - return rc; - } - - return ORTE_SUCCESS; -} - -/** - * Function to pack all the entries in the SOS table and send it - * over to the HNP. 
- * - * @return OPAL_SUCCESS Upon success - * @return OPAL_FAILURE Upon failure - * - * ADK: Presently, we simply rely on orte_show_help to do the aggregation on - * a per-error basis. - */ -static int opal_sos_send_table(void) -{ - opal_sos_error_t *opal_error; - opal_buffer_t *buf; - uint32_t key; - int rc; - size_t table_size; - void *prev_error, *next_error; - next_error = NULL; - - buf = OBJ_NEW(opal_buffer_t); - if (NULL == buf) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - table_size = opal_hash_table_get_size(&opal_sos_table); - - /* Pack the size of the SOS error table */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &table_size, 1, OPAL_SIZE))) { - ORTE_ERROR_LOG(rc); - goto error; - } - - if (OPAL_SUCCESS != opal_hash_table_get_first_key_uint32(&opal_sos_table, - &key, (void**)&opal_error, - &prev_error)) { - rc = ORTE_ERROR; - goto error; - } - - /* Pack the sos error object */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) { - ORTE_ERROR_LOG(rc); - goto error; - } - - while (OPAL_SUCCESS == opal_hash_table_get_next_key_uint32(&opal_sos_table, - &key, (void**)&opal_error, - &prev_error, &next_error)) - { - if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) { - ORTE_ERROR_LOG(rc); - goto error; - } - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - /* Now send the buffer (rc = number of bytes sent) */ - rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buf, - ORTE_RML_TAG_NOTIFIER_HNP, 0); - if (rc <= 0) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return rc; - } - - return ORTE_SUCCESS; - -error: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - OBJ_RELEASE(buf); - return rc; -} -#endif - static int init(void) { int rc; @@ -248,23 +121,12 @@ static int init(void) if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFIER_HNP, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_notifier_hnp_recv_cb, NULL))) { ORTE_ERROR_LOG(rc); return rc; } - 
-#if OPAL_ENABLE_DEBUG - /* If we're debugging, also add an exception handler -- just to - watch for problems in the RML */ - if (ORTE_SUCCESS != - (rc = orte_rml.add_exception_handler(orte_notifier_hnp_exception_cb))) { - ORTE_ERROR_LOG(rc); - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFIER_HNP); - return rc; - } -#endif } return ORTE_SUCCESS; @@ -289,7 +151,7 @@ static void mylog(orte_notifier_base_severity_t severity, int errcode, if (NULL != output) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, output); } else { send_command(severity, errcode, output); } @@ -307,7 +169,7 @@ static void myhelplog(orte_notifier_base_severity_t severity, int errcode, if (NULL != output) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, output); } else { send_command(severity, errcode, output); } @@ -324,7 +186,7 @@ static void mypeerlog(orte_notifier_base_severity_t severity, int errcode, if (NULL != buf) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, buf); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, buf); } else { send_command(severity, errcode, buf); } @@ -336,7 +198,7 @@ static void myeventlog(const char *msg) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, (char*)msg); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, (char*)msg); } else { send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg); } diff --git a/orte/mca/notifier/hnp/notifier_hnp_recv.c b/orte/mca/notifier/hnp/notifier_hnp_recv.c index 6a770f84cc..c4f640c08a 100644 --- a/orte/mca/notifier/hnp/notifier_hnp_recv.c +++ 
b/orte/mca/notifier/hnp/notifier_hnp_recv.c @@ -11,7 +11,9 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ * * Additional copyrights may follow * @@ -25,18 +27,14 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/class/opal_hash_table.h" #include "notifier_hnp.h" -/* - * This function is called back *after* the RML receive callback to - * avoid the RRD ("receive recursion of death"). - */ -static void process_msg(int fd, short event, void *cbdata) +void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; uint8_t u8; uint32_t u32; int rc, count; @@ -47,7 +45,7 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the severity */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &u8, &count, OPAL_UINT8))) { + (rc = opal_dss.unpack(buffer, &u8, &count, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -56,7 +54,7 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the errcode */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &u32, &count, OPAL_UINT32))) { + (rc = opal_dss.unpack(buffer, &u32, &count, OPAL_UINT32))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -65,207 +63,14 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the string */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &msg, &count, OPAL_STRING))) { + (rc = opal_dss.unpack(buffer, &msg, &count, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } - orte_show_help("opal_sos_reporter.txt", "notifier 
message", false, msg); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, msg); CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); return; } -#if 0 -/** - * Function to unpack a single SOS error entry. - * - * @return OPAL_SUCCESS Upon success - */ -static int opal_dss_unpack_sos_error(opal_buffer_t *buf, opal_sos_error_t *error) -{ - int count, rc; - if (NULL == error) { - return ORTE_ERROR; - } - - /* Unpack the errcode */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->errnum, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the filename */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->file, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the line number */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->line, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the function name */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->func, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the error message */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->msg, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the pointer to the previous error */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->prev, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the pointer to the next error */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->next, &count, OPAL_INT))) { - return rc; - } - - return ORTE_SUCCESS; -} - -/* - * Function to unpack the entire SOS table on the HNP. 
- */ -static void process_sos_table_msg(int fd, short event, void *cbdata) -{ - orte_message_event_t *mev = (orte_message_event_t*)cbdata; - size_t table_size; - int i, rc = ORTE_SUCCESS, count, numerrors; - opal_sos_error_t *opal_error; - opal_hash_table_t *sos_table, *old_sos_table; - - /* Allocate a new SOS table */ - sos_table = OBJ_NEW(opal_hash_table_t); - if (NULL == sos_table) { - ORTE_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(mev); - return; - } - - /* Unpack the size of the SOS table */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &table_size, &count, OPAL_SIZE))) { - goto error; - } - numerrors = (int) table_size; - - /* Initialize the SOS table */ - opal_hash_table_init(sos_table, table_size); - - for (i = 0; i < numerrors; i++) { - - opal_error = OBJ_NEW(opal_sos_error_t); - if (NULL == opal_error) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto error; - } - - if (ORTE_SUCCESS != - (rc = opal_dss_unpack_sos_error(mev->buffer, opal_error))) { - goto error; - } - - opal_hash_table_set_value_uint32(sos_table, - opal_error->errnum, - (void *)opal_error); - } - - /* Add this SOS table to the list of SOS tables. - If it already exists, we destroy the old table - and set the new one as the current SOS table. 
*/ - OPAL_THREAD_LOCK(&orte_notifier_hnp_tables_lock); - if (false == - opal_pointer_array_test_and_set_item(&orte_notifier_hnp_tables, - mev->sender.vpid, - (void *)sos_table)) { - old_sos_table = opal_pointer_array_get_item(&orte_notifier_hnp_tables, - mev->sender.vpid); - OBJ_DESTRUCT(old_sos_table); - old_sos_table = NULL; - opal_pointer_array_set_item(&orte_notifier_hnp_tables, - mev->sender.vpid, - (void *)sos_table); - } - OPAL_THREAD_UNLOCK(&orte_notifier_hnp_tables_lock); - OBJ_RELEASE(mev); - return; - -error: - ORTE_ERROR_LOG(rc); - /* release the message event */ - OBJ_RELEASE(mev); - - /* destroy the sos table */ - OBJ_DESTRUCT(sos_table); - return; -} -#endif - -void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_notifier_base_output, - "%s notifier:hnp:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* Don't process the message right away - remember that we're in a - * callback during the actual RML receive! We need to get out of - * the receive before we process the message to avoid performing - * the rest of the job while still inside this receive. Instead, - * setup an event so that the message gets processed as soon as we - * leave the receive. This avoids the "receive recursion of - * death" scenarios. - * - * The ORTE_MESSAGE_EVENT macro makes a copy of the buffer, which - * we release in the process_msg() callback - the incoming buffer, - * however, is NOT released here, although its payload IS - * transferred to the message buffer for later processing. 
- */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the receive, since it is non-persistent */ - if (ORTE_SUCCESS != - (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_NOTIFIER_HNP, - ORTE_RML_NON_PERSISTENT, - orte_notifier_hnp_recv_cb, - NULL))) { - ORTE_ERROR_LOG(rc); - } -} - - -#if OPAL_ENABLE_DEBUG -void orte_notifier_hnp_exception_cb(const orte_process_name_t* peer, - orte_rml_exception_t reason) -{ - opal_output(orte_notifier_base_output, - "Notifier HNP RML receive exception from %s", - ORTE_NAME_PRINT((orte_process_name_t*)peer)); -} -#endif diff --git a/opal/util/opal_sos_reporter.txt b/orte/mca/notifier/hnp/orte_notifier_hnp.txt similarity index 94% rename from opal/util/opal_sos_reporter.txt rename to orte/mca/notifier/hnp/orte_notifier_hnp.txt index 8b68da7602..4377e9d7fe 100644 --- a/opal/util/opal_sos_reporter.txt +++ b/orte/mca/notifier/hnp/orte_notifier_hnp.txt @@ -17,7 +17,7 @@ # # $HEADER$ # -# This is the US/English help file for OPAL SOS error messages. +# This is the US/English help file for HNP notifier messages. # # FORMAT: # filename:linenum:functionname diff --git a/orte/mca/notifier/notifier.h b/orte/mca/notifier/notifier.h index 8c08581592..feccbde9ee 100644 --- a/orte/mca/notifier/notifier.h +++ b/orte/mca/notifier/notifier.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,9 +40,14 @@ #ifdef HAVE_STDARG_H #include #endif +#ifdef HAVE_LIMITS_H +#include +#endif +#ifdef HAVE_SYSLOG_H +#include +#endif #include "opal/mca/mca.h" -#include "opal/util/opal_sos.h" #include "orte/constants.h" #include "orte/types.h" @@ -56,16 +63,16 @@ BEGIN_C_DECLS */ #define ORTE_NOTIFIER_MAX_BUF 512 -/* Severities, based on OPAL SOS */ +/* Severities */ typedef enum { - ORTE_NOTIFIER_EMERG = OPAL_SOS_SEVERITY_EMERG, - ORTE_NOTIFIER_ALERT = OPAL_SOS_SEVERITY_ALERT, - ORTE_NOTIFIER_CRIT = OPAL_SOS_SEVERITY_CRIT, - ORTE_NOTIFIER_ERROR = OPAL_SOS_SEVERITY_ERROR, - ORTE_NOTIFIER_WARN = OPAL_SOS_SEVERITY_WARN, - ORTE_NOTIFIER_NOTICE = OPAL_SOS_SEVERITY_NOTICE, - ORTE_NOTIFIER_INFO = OPAL_SOS_SEVERITY_INFO, - ORTE_NOTIFIER_DEBUG = OPAL_SOS_SEVERITY_DEBUG + ORTE_NOTIFIER_EMERG = LOG_EMERG, + ORTE_NOTIFIER_ALERT = LOG_ALERT, + ORTE_NOTIFIER_CRIT = LOG_CRIT, + ORTE_NOTIFIER_ERROR = LOG_ERR, + ORTE_NOTIFIER_WARN = LOG_WARNING, + ORTE_NOTIFIER_NOTICE = LOG_NOTICE, + ORTE_NOTIFIER_INFO = LOG_INFO, + ORTE_NOTIFIER_DEBUG = LOG_DEBUG } orte_notifier_base_severity_t; /* diff --git a/orte/mca/odls/base/base.h b/orte/mca/odls/base/base.h index a3fa7a05ea..13d5000e6b 100644 --- a/orte/mca/odls/base/base.h +++ b/orte/mca/odls/base/base.h @@ -76,16 +76,9 @@ ORTE_DECLSPEC int orte_odls_base_select(void); ORTE_DECLSPEC int orte_odls_base_finalize(void); ORTE_DECLSPEC int orte_odls_base_close(void); -/* proc termination entry points */ -ORTE_DECLSPEC void orte_odls_base_notify_iof_complete(orte_process_name_t *proc); -ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status); - /* setup singleton job data */ ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid); -/* Lookup function to see if the child process has already finished. 
*/ -ORTE_DECLSPEC bool orte_odls_base_default_check_finished(orte_process_name_t *proc); - #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/mca/odls/base/odls_base_close.c b/orte/mca/odls/base/odls_base_close.c index 81bac5e442..3f12632050 100644 --- a/orte/mca/odls/base/odls_base_close.c +++ b/orte/mca/odls/base/odls_base_close.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/class/opal_list.h" -#include "opal/threads/threads.h" +#include "opal/class/opal_pointer_array.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -33,11 +35,11 @@ int orte_odls_base_close(void) { + int i; + orte_proc_t *proc; opal_list_item_t *item; - + /* cleanup ODLS globals */ - OBJ_DESTRUCT(&orte_odls_globals.mutex); - OBJ_DESTRUCT(&orte_odls_globals.cond); while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) { OBJ_RELEASE(item); } @@ -48,19 +50,13 @@ int orte_odls_base_close(void) } /* cleanup the global list of local children and job data */ - while (NULL != (item = opal_list_remove_first(&orte_local_children))) { - OBJ_RELEASE(item); + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + OBJ_RELEASE(proc); + } } - OBJ_DESTRUCT(&orte_local_children); - OBJ_DESTRUCT(&orte_local_children_lock); - OBJ_DESTRUCT(&orte_local_children_cond); - while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&orte_local_jobdata); - OBJ_DESTRUCT(&orte_local_jobdata_lock); - OBJ_DESTRUCT(&orte_local_jobdata_cond); - + OBJ_RELEASE(orte_local_children); + /* if no 
components are available, then punt */ if (!orte_odls_base.components_available) { return ORTE_SUCCESS; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 2bc7eccd48..848667f70c 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -11,9 +11,9 @@ * All rights reserved. * Copyright (c) 2007-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -58,10 +58,12 @@ #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/iof_base_setup.h" #include "orte/mca/ess/base/base.h" +#include "orte/mca/grpcomm/base/base.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/sensor/sensor.h" +#include "orte/mca/state/state.h" #include "orte/util/context_fns.h" #include "orte/util/name_fns.h" @@ -101,7 +103,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, int32_t numbytes; int8_t flag; int j; - orte_daemon_cmd_flag_t command; orte_app_context_t *app; if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) { @@ -153,12 +154,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, /* if anything was inserted, put it in a byte object for xmission */ if (0 < wireup->bytes_used) { opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes); - /* pack the number of bytes required by payload */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(wireup); - return rc; - } /* pack the byte object */ bo.size = 
numbytes; boptr = &bo; @@ -185,15 +180,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, opal_dss.pack(data, &flag, 1, OPAL_INT8); } - /* insert an "add-procs" command here so we can cleanly process it on the - * other end - */ - command = ORTE_DAEMON_ADD_LOCAL_PROCS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* are we co-locating debugger daemons? */ if (NULL != orte_debugger_daemon) { /* flag that we are */ @@ -315,6 +301,10 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, ORTE_ERROR_LOG(rc); return rc; } + /* save it on the job data object as we won't be unpacking the buffer + * on our end + */ + opal_dss.copy((void**)&jdata->pmap, &bo, OPAL_BYTE_OBJECT); /* release the data since it has now been copied into our buffer */ free(bo.bytes); @@ -332,84 +322,55 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return ORTE_SUCCESS; } -int orte_odls_base_default_update_daemon_info(opal_buffer_t *data) +static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr) { - opal_buffer_t wireup; - opal_byte_object_t *bo; - int rc; - orte_std_cntr_t cnt; - int32_t numbytes; - int8_t flag; + orte_vpid_t host_daemon; + orte_app_context_t *app; - /* extract the byte object holding the daemonmap */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); - return rc; + /* get the vpid of the daemon that is to host this proc */ + OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, + "%s odls:constructing child list - looking for daemon for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); + if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; } - /* retain a copy for downloading to child processes */ - if (NULL != orte_odls_globals.dmap) { - 
free(orte_odls_globals.dmap->bytes); - free(orte_odls_globals.dmap); - orte_odls_globals.dmap = NULL; + + OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, + "%s odls:constructing child list - checking proc %s on daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), + ORTE_VPID_PRINT(host_daemon))); + + /* does this proc belong to us? */ + if (ORTE_PROC_MY_NAME->vpid != host_daemon) { + return ORTE_SUCCESS; } - opal_dss.copy((void**)&orte_odls_globals.dmap, bo, OPAL_BYTE_OBJECT); - - /* update our local nidmap, if required - the decode function - * knows what to do - it will also free the bytes in the bo - */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:update:daemon:info updating nidmap", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* update the routing tree */ - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* see if we have wiring info as well */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 == flag) { - /* no - just return */ - return rc; + + OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, + "%s odls:constructing child list - found proc %s for me!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); + + /* is this child on our current list of children */ + if (!pptr->local_proc) { + /* not on the local list */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "adding proc %s to my local list", + ORTE_NAME_PRINT(&pptr->name))); + /* keep tabs of the number of local procs */ + jdata->num_local_procs++; + /* add this proc to our child list */ + OBJ_RETAIN(pptr); + pptr->local_proc = true; + opal_pointer_array_add(orte_local_children, pptr); } - /* unpack the #bytes of daemon wireup info in the message */ - cnt=1; - if 
(ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; + /* if the job is in restart mode, the child must not barrier when launched */ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + pptr->do_not_barrier = true; } - /* any bytes there? */ - if (0 < numbytes) { - /* unpack the byte object */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* load it into a buffer */ - OBJ_CONSTRUCT(&wireup, opal_buffer_t); - opal_dss.load(&wireup, bo->bytes, bo->size); - /* pass it for processing */ - if (ORTE_SUCCESS != (rc = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&wireup); - return rc; - } - /* done with the buffer - dump it */ - OBJ_DESTRUCT(&wireup); - } - + /* mark that this app_context is being used on this node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); + app->used_on_node = true; return ORTE_SUCCESS; } @@ -417,18 +378,15 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_jobid_t *job) { int rc; - orte_vpid_t j, host_daemon; - orte_odls_child_t *child; + orte_vpid_t j; orte_std_cntr_t cnt; - orte_odls_job_t *jobdat=NULL; + orte_job_t *jdata=NULL; opal_byte_object_t *bo; - opal_list_item_t *item; int8_t flag; orte_jobid_t debugger; - bool add_child; - orte_ns_cmp_bitmask_t mask; + int32_t n; orte_app_context_t *app; - orte_proc_t *pptr; + orte_proc_t *pptr, *p2; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:constructing child list", @@ -436,6 +394,41 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, *job = ORTE_JOBID_INVALID; + /* extract the byte object holding the daemon map */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* retain a copy for downloading to child 
processes */ + if (NULL != orte_odls_globals.dmap) { + free(orte_odls_globals.dmap->bytes); + free(orte_odls_globals.dmap); + orte_odls_globals.dmap = NULL; + } + orte_odls_globals.dmap = bo; + bo = NULL; + + /* unpack the wireup info flag */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* if it was given, unpack and discard it */ + if (0 != flag) { + /* unpack the byte object */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + if (0 < bo->size) { + free(bo->bytes); + } + free(bo); + } + /* unpack the flag - are we co-locating debugger daemons? */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { @@ -450,7 +443,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * worry about race conditions as the debugger daemons do not use * the daemon collective system */ - orte_odls_globals.debugger = OBJ_NEW(orte_odls_job_t); + orte_odls_globals.debugger = OBJ_NEW(orte_job_t); /* get the debugger daemon jobid */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &debugger, &cnt, ORTE_JOBID))) { @@ -460,7 +453,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_odls_globals.debugger->jobid = debugger; orte_odls_globals.debugger->num_apps = 1; orte_odls_globals.debugger->num_local_procs = 1; - opal_list_append(&orte_local_jobdata, &(orte_odls_globals.debugger)->super); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(debugger), orte_odls_globals.debugger); /* retrieve the info */ cnt = 1; app = NULL; @@ -468,7 +461,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } - opal_pointer_array_add(&orte_odls_globals.debugger->apps, app); + opal_pointer_array_add(orte_odls_globals.debugger->apps, app); cnt=1; if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(data, &(orte_odls_globals.debugger->controls), &cnt, ORTE_JOB_CONTROL))) { ORTE_ERROR_LOG(rc); @@ -483,7 +476,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * launching debugger daemons */ if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) { - goto done; + goto COMPLETE; } *job = ORTE_JOBID_INVALID; ORTE_ERROR_LOG(rc); @@ -500,106 +493,113 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * to our unpacking add_local_procs. So lookup the job record for this jobid * and see if it already exists */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - orte_odls_job_t *jdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jdat->jobid == *job) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:construct_child_list found existing jobdat for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job))); - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - /* setup jobdat object for this job */ + if (NULL == (jdata = orte_get_job_data_object(*job))) { + /* setup job object for this job */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:construct_child_list adding new jobdat for job %s", + "%s odls:construct_child_list adding new object for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job))); - jobdat = OBJ_NEW(orte_odls_job_t); - jobdat->jobid = *job; - opal_list_append(&orte_local_jobdata, &jobdat->super); + jdata = OBJ_NEW(orte_job_t); + jdata->jobid = *job; + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata); } + + /* if we are the HNP, we don't need to unpack this buffer - we already + * have all the required info in our local job array. 
So just build the + * array of local children + */ + if (ORTE_PROC_IS_HNP) { + for (n=0; n < jdata->procs->size; n++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { + continue; + } + if (ORTE_SUCCESS != (rc = check_local_proc(jdata, pptr))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + goto COMPLETE; + } + /* if we are doing a timing test, store the time the msg was recvd */ if (orte_timing) { - jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; - jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec; + jdata->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; + jdata->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec; } /* UNPACK JOB-SPECIFIC DATA */ /* unpack the job state so we can know if this is a restart vs initial launch */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->state, &cnt, ORTE_JOB_STATE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->state, &cnt, ORTE_JOB_STATE))) { *job = ORTE_JOBID_INVALID; ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of nodes involved in this job */ + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_nodes, &cnt, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->num_nodes, &cnt, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of procs in this launch */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_procs, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_procs, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the total slots allocated to us */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->total_slots_alloc, &cnt, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->total_slots_alloc, &cnt, ORTE_STD_CNTR))) { 
ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } #if OPAL_HAVE_HWLOC /* unpack the binding policy */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->binding, &cnt, OPAL_BINDING_POLICY))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->binding, &cnt, OPAL_BINDING_POLICY))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } #endif /* unpack the control flags for the job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->controls, &cnt, ORTE_JOB_CONTROL))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the stdin target for the job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stdin_target, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->stdin_target, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack whether or not process recovery is allowed for this job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->enable_recovery, &cnt, OPAL_BOOL))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->enable_recovery, &cnt, OPAL_BOOL))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of app_contexts for this job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_apps, &cnt, ORTE_APP_IDX))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:construct_child_list unpacking %ld app_contexts", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)jobdat->num_apps)); - for (j=0; j < jobdat->num_apps; j++) { + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)jdata->num_apps)); + for (j=0; j < jdata->num_apps; j++) { cnt = 1; app = NULL; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &app, &cnt, ORTE_APP_CONTEXT))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } - 
opal_pointer_array_set_item(&jobdat->apps, app->idx, app); + opal_pointer_array_set_item(jdata->apps, app->idx, app); } /* unpack the pidmap byte object */ @@ -609,11 +609,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, goto REPORT_ERROR; } /* retain a copy for downloading to child processes */ - if (NULL != jobdat->pmap && NULL != jobdat->pmap->bytes) { - free(jobdat->pmap->bytes); - free(jobdat->pmap); + if (NULL != jdata->pmap) { + if (NULL != jdata->pmap->bytes) { + free(jdata->pmap->bytes); + } + free(jdata->pmap); } - opal_dss.copy((void**)&jobdat->pmap, bo, OPAL_BYTE_OBJECT); + opal_dss.copy((void**)&jdata->pmap, bo, OPAL_BYTE_OBJECT); /* decode the pidmap - this will also free the bytes in bo */ if (ORTE_SUCCESS != (rc = orte_ess.update_pidmap(bo))) { ORTE_ERROR_LOG(rc); @@ -621,115 +623,32 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } /* unpack the procs */ - for (j=0; j < jobdat->num_procs; j++) { + for (j=0; j < jdata->num_procs; j++) { cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &pptr, &cnt, ORTE_PROC))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } + /* add it to our global jdata object since + * many parts of the system will look for it there + */ + if (NULL != (p2 = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, pptr->name.vpid))) { + OBJ_RELEASE(p2); + } + opal_pointer_array_set_item(jdata->procs, pptr->name.vpid, pptr); - /* see if it is one of mine */ - ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&pptr->name)); - /* get the vpid of the daemon that is to host this proc */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, - "%s odls:constructing child list - looking for daemon for proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); - if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; + /* see if it belongs to us */ + if (ORTE_SUCCESS != (rc = 
check_local_proc(jdata, pptr))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(pptr); goto REPORT_ERROR; } - - OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, - "%s odls:constructing child list - checking proc %s on daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), - ORTE_VPID_PRINT(host_daemon))); - - /* does this proc belong to us? */ - if (ORTE_PROC_MY_NAME->vpid == host_daemon) { - - OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s odls:constructing child list - found proc %s for me!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); - - add_child = true; - /* if this job is restarting procs, then we need to treat things - * a little differently. We may be adding a proc to our local - * children (if the proc moved here from somewhere else), or we - * may simply be restarting someone already here. - */ - if (ORTE_JOB_STATE_RESTART == jobdat->state) { - /* look for this job on our current list of children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, child->name, &pptr->name)) { - /* do not duplicate this child on the list! */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "proc %s is on list and is %s", - ORTE_NAME_PRINT(&pptr->name), - (child->alive) ? 
"ALIVE" : "DEAD")); - add_child = false; - child->restarts = pptr->restarts; - child->do_not_barrier = true; - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx); - app->used_on_node = true; - break; - } - } - } - - /* if we need to add the child, do so */ - if (add_child) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "adding proc %s to my local list", - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jobdat->num_local_procs++; - /* add this proc to our child list */ - child = OBJ_NEW(orte_odls_child_t); - /* copy the name to preserve it */ - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &pptr->name, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - child->app_idx = pptr->app_idx; /* save the index into the app_context objects */ - child->restarts = pptr->restarts; - /* if the job is in restart mode, the child must not barrier when launched */ - if (ORTE_JOB_STATE_RESTART == jobdat->state) { - child->do_not_barrier = true; - } -#if OPAL_HAVE_HWLOC - if (NULL != pptr->cpu_bitmap) { - child->cpu_bitmap = strdup(pptr->cpu_bitmap); - } -#endif - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx); - app->used_on_node = true; - /* protect operation on the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - opal_list_append(&orte_local_children, &child->super); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - } - } - OBJ_RELEASE(pptr); } - /* flag that the launch msg has been processed so daemon collectives can proceed */ - OPAL_THREAD_LOCK(&jobdat->lock); - jobdat->launch_msg_processed = true; - opal_condition_broadcast(&jobdat->cond); - OPAL_THREAD_UNLOCK(&jobdat->lock); + COMPLETE: + /* progress any pending collectives */ + 
orte_grpcomm_base_progress_collectives(); - done: return ORTE_SUCCESS; REPORT_ERROR: @@ -739,8 +658,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * for it to happen - especially so developers don't have to * deal with the hang! */ - orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED, - NULL, ORTE_PROC_STATE_UNDEF, 0, rc); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED); return rc; } @@ -912,7 +830,7 @@ static int odls_base_default_setup_fork(orte_app_context_t *context, return ORTE_SUCCESS; } -static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env) +static int setup_child(orte_proc_t *child, orte_job_t *jobdat, char ***env) { char *param, *value; orte_node_rank_t node_rank; @@ -920,7 +838,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * int rc; /* setup the jobid */ - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) { ORTE_ERROR_LOG(rc); return rc; } @@ -933,24 +851,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * free(param); free(value); -#if ORTE_ENABLE_EPOCH - /* setup the epoch */ - if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, child->name->epoch))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (NULL == (param = mca_base_param_environ_variable("orte","ess","epoch"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - return rc; - } - opal_setenv(param, value, true, env); - free(param); - free(value); -#endif - /* setup the vpid */ - if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) { ORTE_ERROR_LOG(rc); return rc; } @@ -980,7 +882,7 @@ static int setup_child(orte_odls_child_t *child, 
orte_odls_job_t *jobdat, char * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) { + if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(&child->name))) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; @@ -996,7 +898,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) { + if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(&child->name))) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; @@ -1123,34 +1025,38 @@ static int setup_path(orte_app_context_t *app) /* define a timer release point so that we can wait for * file descriptors to come available, if necessary */ -static bool time_is_up; - static void timer_cb(int fd, short event, void *cbdata) { - opal_event_t *ev = (opal_event_t*)cbdata; - - /* free event */ - if (NULL != ev) { - free(ev); - } - /* declare time is up */ - time_is_up = true; + orte_timer_t *tm = (orte_timer_t*)cbdata; + orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload; + + /* increment the number of retries */ + ll->retries++; + + /* re-attempt the launch */ + opal_event_active(ll->ev, OPAL_EV_WRITE, 1); + + /* release the timer event */ + OBJ_RELEASE(tm); } static int compute_num_procs_alive(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; int num_procs_alive = 0; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end (&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == 
(child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive) { continue; } - if (job == child->name->jobid) { + /* do not include members of the specified job as they + * will be added later, if required + */ + if (job == child->name.jobid) { continue; } num_procs_alive++; @@ -1158,28 +1064,25 @@ static int compute_num_procs_alive(orte_jobid_t job) return num_procs_alive; } -int orte_odls_base_default_launch_local(orte_jobid_t job, - orte_odls_base_fork_local_proc_fn_t fork_local) + +void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata) { - opal_list_item_t *item; orte_app_context_t *app, *dbg; - orte_odls_child_t *child=NULL; + orte_proc_t *child=NULL; bool oversubscribed; int rc=ORTE_SUCCESS; - bool launch_failed=true; - opal_buffer_t alert; + opal_buffer_t *alert; orte_std_cntr_t proc_rank; - orte_odls_job_t *jobdat; char basedir[MAXPATHLEN]; char **argvsav=NULL; - int inm, j; - opal_event_t *delay; + int inm, j, idx; int total_num_local_procs = 0; orte_nid_t *nid; orte_node_t *node; - - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata; + orte_job_t *jobdat; + orte_jobid_t job = caddy->job; + orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local; /* establish our baseline working directory - we will be potentially * bouncing around as we execute various apps, but we will always return @@ -1188,29 +1091,19 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, getcwd(basedir, sizeof(basedir)); /* find the jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto GETOUT; + /* not much we can do here - we are just hosed, so + * report that to the error manager + */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } /* do we have any local procs to launch? */ if (0 == jobdat->num_local_procs) { - /* no - just return */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return ORTE_SUCCESS; + /* indicate that we are done trying to launch them */ + goto GETOUT; } /* see if the mapper thinks we are oversubscribed */ @@ -1219,8 +1112,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* just fake it - we don't keep a local nidmap */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto CLEANUP; + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } if (node->oversubscribed) { oversubscribed = true; @@ -1249,8 +1142,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* Now we preload any files that are needed. 
This is done on a per * app context basis */ - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } if(app->used_on_node && @@ -1263,8 +1156,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } #if OPAL_ENABLE_FT_CR == 1 - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } orte_sstore.fetch_app_deps(app); @@ -1273,13 +1166,13 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, #endif /* setup to report the proc state to the HNP */ - OBJ_CONSTRUCT(&alert, opal_buffer_t); + alert = OBJ_NEW(opal_buffer_t); /* compute the total number of local procs currently alive and about to be launched */ total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } @@ -1299,24 +1192,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_sys_limits.num_procs, total_num_local_procs)); if (opal_sys_limits.num_procs < total_num_local_procs) { - /* don't have enough - wait a little time */ - time_is_up = false; - ORTE_DETECT_TIMEOUT(&delay, 1000, 1000, -1, timer_cb); - /* wait */ - ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); - /* recompute the num local procs */ - total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - /* see if we still have a problem */ - 
OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s rechecking limit on num procs %d #children needed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - opal_sys_limits.num_procs, total_num_local_procs)); - if (opal_sys_limits.num_procs < total_num_local_procs) { - /* at the system limit - abort */ - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - rc = ORTE_ERR_SYS_LIMITS_CHILDREN; - goto CLEANUP; + if (2 < caddy->retries) { + /* if we have already tried too many times, then just give up */ + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } + /* set a timer event so we can retry later - this + * gives the system a chance to let other procs + * terminate, thus creating room for new ones + */ + ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy); + return; } } @@ -1326,7 +1212,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, jobdat->num_local_procs, jobdat->num_procs, jobdat->total_slots_alloc, - jobdat->num_nodes, + jobdat->map->num_nodes, oversubscribed, &app->env))) { @@ -1343,17 +1229,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * so we can report things out correctly */ /* cycle through children to find those for this jobid */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } + if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) && j == (int)child->app_idx) { child->exit_code = rc; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } } - /* okay, now tell the HNP we couldn't do it */ - goto CLEANUP; + goto GETOUT; } /* setup the working directory for this app - will jump us 
@@ -1372,24 +1258,24 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * so we can report things out correctly */ /* cycle through children to find those for this jobid */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } + if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) && j == (int)child->app_idx) { child->exit_code = rc; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } } - /* okay, now tell the HNP we couldn't do it */ - goto CLEANUP; + goto GETOUT; } /* okay, now let's launch all the local procs for this app using the provided fork_local fn */ - for (proc_rank = 0, item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (proc_rank = 0, idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } /* does this child belong to this app? 
*/ if (j != (int)child->app_idx) { @@ -1403,9 +1289,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, if (child->alive) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch child %s is already alive", + "%s odls:launch child %s has already been launched", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); continue; } @@ -1414,12 +1300,12 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ - if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { + if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch child %s is not in job %s being launched", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_JOBID_PRINT(job))); continue; @@ -1428,12 +1314,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch working child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* ensure we clear any prior info regarding state or exit status in * case this is a restart */ - child->state = ORTE_PROC_STATE_FAILED_TO_START; child->exit_code = 0; child->waitpid_recvd = false; /* if we are not forwarding output for this job, then @@ -1444,7 +1329,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } else { child->iof_complete = true; } - child->coll_recvd = false; child->pid = 0; if (NULL != child->rml_uri) { free(child->rml_uri); @@ -1464,25 +1348,14 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_sys_limits.num_files, limit)); if (opal_sys_limits.num_files < limit) { - /* don't have enough - wait a little time */ - time_is_up = 
false; - ORTE_DETECT_TIMEOUT(&delay, 1000, 1000, -1, timer_cb); - /* wait */ - ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); - /* recompute the num procs alive */ - total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - /* see if we still have a problem */ - limit = 4*total_num_local_procs + 6; - OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s rechecking limit on file descriptors %d need %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - opal_sys_limits.num_files, limit)); - if (opal_sys_limits.num_files < limit) { - /* nope - abort */ - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); - child->exit_code = rc; - goto CLEANUP; + if (2 < caddy->retries) { + /* tried enough - give up */ + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } + /* don't have enough - wait a little time */ + ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy); + return; } } @@ -1496,7 +1369,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, nmitem = opal_list_get_next(nmitem)) { nm = (orte_namelist_t*)nmitem; if (ORTE_VPID_WILDCARD == nm->name.vpid || - child->name->vpid == nm->name.vpid) { + child->name.vpid == nm->name.vpid) { /* we want this one - modify the app's command to include * the orte xterm cmd. Need to be careful, though, that we * don't modify the app for ALL ranks that use it! 
So we @@ -1512,7 +1385,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } /* insert the rank into the correct place as a window title */ free(app->argv[2]); - asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name->vpid)); + asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid)); /* add back the original argv */ for (inm=0; inm < opal_argv_count(argvsav); inm++) { opal_argv_append_nosize(&app->argv, argvsav[inm]); @@ -1528,9 +1401,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_show_help("help-orte-odls-base.txt", "orte-odls-base:xterm-rank-out-of-bounds", true, nm->name.vpid, jobdat->num_procs); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; child->exit_code = ORTE_ERR_SILENT; - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } @@ -1555,9 +1428,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_show_help("help-orte-odls-base.txt", "orte-odls-base:fork-agent-not-found", true, orte_process_info.nodename, orte_fork_agent[0]); - rc = ORTE_ERR_SILENT; child->exit_code = ORTE_ERR_SILENT; - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } @@ -1566,20 +1439,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, */ if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } - /* if we are timing things, record when we are going to launch this proc */ - if (orte_timing) { - gettimeofday(&child->starttime, NULL); - } - - /* must unlock prior to fork to keep things clean in the - * event library - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - #if OPAL_ENABLE_FT_CR == 1 /* * OPAL CRS components need the opportunity to take action before a process @@ -1590,21 +1453,22 @@ int 
orte_odls_base_default_launch_local(orte_jobid_t job, * - Binary to exec */ if( NULL != opal_crs.crs_prelaunch ) { - if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name->vpid, + if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name.vpid, orte_sstore_base_prelaunch_location, &(app->app), &(app->cwd), &(app->argv), &(app->env) ) ) ) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } #endif if (5 < opal_output_get_verbosity(orte_odls_globals.output)) { opal_output(orte_odls_globals.output, "%s odls:launch: spawning child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name)); + ORTE_NAME_PRINT(&child->name)); /* dump what is going to be exec'd */ if (7 < opal_output_get_verbosity(orte_odls_globals.output)) { @@ -1613,24 +1477,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } rc = fork_local(app, child, app->env, jobdat); - /* reaquire lock so we don't double unlock... */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); if (ORTE_SUCCESS != rc) { /* do NOT ERROR_LOG this error - it generates * a message/node as most errors will be common * across the entire cluster. 
Instead, we let orterun * output a consolidated error message for us */ - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START); + continue; } else { child->alive = true; - child->state = ORTE_PROC_STATE_LAUNCHED; - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_LAUNCHED, - child->name, child->state, - child->pid, child->exit_code))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING); } /* move to next processor */ proc_rank++; @@ -1655,39 +1512,25 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, */ chdir(basedir); } - launch_failed = false; - CLEANUP: - /* ensure we reset our working directory back to our default location */ - chdir(basedir); - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch reporting job %s launch status", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - /* if the launch failed, we need to flag all the procs from this job - * that didn't launch as having failed, or else we will hang + /* check to see if we need to + * co-locate any debugger daemons so that they get launched + * before we report anything to the HNP. This ensures that + * the debugger daemons are ready-to-go before mpirun returns + * from the plm.spawn command. 
Only spawn the debugger, though, + * if we have local children - otherwise, the HNP could spawn + * a debugger when it doesn't have any local procs */ - if (launch_failed) { - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, 0, - child->exit_code))) { - ORTE_ERROR_LOG(rc); + if (NULL != orte_odls_globals.debugger && + !orte_odls_globals.debugger_launched) { + child = NULL; + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + break; + } } - } else { - /* if the launch succeeded, check to see if we need to - * co-locate any debugger daemons so that they get launched - * before we report anything to the HNP. This ensures that - * the debugger daemons are ready-to-go before mpirun returns - * from the plm.spawn command. Only spawn the debugger, though, - * if we have local children - otherwise, the HNP could spawn - * a debugger when it doesn't have any local procs - */ - if (NULL != orte_odls_globals.debugger && - !orte_odls_globals.debugger_launched && - 0 < opal_list_get_size(&orte_local_children)) { - dbg = (orte_app_context_t*)opal_pointer_array_get_item(&orte_odls_globals.debugger->apps, 0); + if (NULL != child) { + dbg = (orte_app_context_t*)opal_pointer_array_get_item(orte_odls_globals.debugger->apps, 0); OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch forking debugger %s with %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dbg->app, @@ -1698,77 +1541,71 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_process_info.num_procs, false, &dbg->env); fork_local(dbg, NULL, dbg->env, orte_odls_globals.debugger); orte_odls_globals.debugger_launched = true; - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(orte_odls_globals.debugger->jobid, - ORTE_JOB_STATE_RUNNING, - NULL, ORTE_PROC_STATE_UNDEF, 0, - ORTE_ERROR_DEFAULT_EXIT_CODE))) { - ORTE_ERROR_LOG(rc); - } + 
orte_odls_globals.debugger->state = ORTE_JOB_STATE_RUNNING; } + } + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:launch setting waitpids", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_RUNNING, - NULL, ORTE_PROC_STATE_UNDEF, 0, - ORTE_ERROR_DEFAULT_EXIT_CODE))) { - ORTE_ERROR_LOG(rc); + /* start the sensors for this job (if any) */ + orte_sensor.start(jobdat->jobid); + + /* setup the waitpids on the children that started */ + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; } - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch setting waitpids", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* start the sensors for this job (if any) */ - orte_sensor.start(jobdat->jobid); - - /* if the launch didn't fail, setup the waitpids on the children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - if (child->name->jobid == jobdat->jobid) { - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - } + if (child->name.jobid == jobdat->jobid && child->alive) { + orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); } } GETOUT: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return rc; + /* tell the state machine that all local procs for this job + * were launched so that it can do whatever it needs to do, + * like send a state update message for all procs to the HNP + */ + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE); + + ERROR_OUT: + /* ensure we reset our working directory back to our default location */ + chdir(basedir); + /* 
release the event */ + OBJ_RELEASE(caddy); } int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag) { int rc, exit_status = ORTE_SUCCESS; - opal_list_item_t *item; - orte_odls_child_t *child; - - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + opal_buffer_t *relay; + + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* do we have a child from the specified job. Because the * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ if (!child->alive || - OPAL_EQUAL != opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { + OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) { continue; } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: sending message to tag %lu on child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (unsigned long)tag, ORTE_NAME_PRINT(child->name))); + (unsigned long)tag, ORTE_NAME_PRINT(&child->name))); /* if so, send the message */ - rc = orte_rml.send_buffer(child->name, buffer, tag, 0); + relay = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(relay, buffer); + rc = orte_rml.send_buffer_nb(&child->name, relay, tag, 0, orte_rml_send_callback, NULL); if (rc < 0 && rc != ORTE_ERR_ADDRESSEE_UNKNOWN) { /* ignore if the addressee is unknown as a race condition could * have allowed the child to exit before we send it a barrier @@ -1780,14 +1617,12 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff */ ORTE_ERROR_LOG(rc); exit_status = rc; + OBJ_RELEASE(relay); goto cleanup; } } cleanup: - 
opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return exit_status; } @@ -1799,45 +1634,36 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal, orte_odls_base_signal_local_fn_t signal_local) { - int rc; - opal_list_item_t *item; - orte_odls_child_t *child; + int rc, i; + orte_proc_t *child; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: signaling proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* if procs is NULL, then we want to signal all * of the local procs, so just do that case */ if (NULL == proc) { rc = ORTE_SUCCESS; /* pre-set this as an empty list causes us to drop to bottom */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return rc; } /* we want it sent to some specified process, so find it */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (OPAL_EQUAL == opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) { - /* unlock before 
signaling as this may generate a callback */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } @@ -1849,14 +1675,12 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i * report that as an error and return it */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return ORTE_ERR_NOT_FOUND; } void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) { - orte_odls_job_t *jobdat; + orte_job_t *jobdat; orte_vpid_t vpid1; int32_t one32; orte_local_rank_t lrank; @@ -1870,11 +1694,11 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) #endif /* create a job tracking object for it */ - jobdat = OBJ_NEW(orte_odls_job_t); + jobdat = OBJ_NEW(orte_job_t); jobdat->jobid = jobid; jobdat->num_procs = 1; jobdat->num_local_procs = 1; - opal_list_append(&orte_local_jobdata, &jobdat->super); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jobid), jobdat); /* need to setup a pidmap for it */ OBJ_CONSTRUCT(&buffer, opal_buffer_t); opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */ @@ -1925,90 +1749,64 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) } free(bo); } - /* flag that the "launch msg" has been processed so that daemon - * collectives can proceed - */ - jobdat->launch_msg_processed = true; } int orte_odls_base_default_require_sync(orte_process_name_t *proc, opal_buffer_t *buf, bool drop_nidmap) { - opal_buffer_t buffer; - opal_list_item_t *item; - orte_odls_child_t *child; + opal_buffer_t *buffer; + orte_proc_t *child; orte_std_cntr_t cnt; - int rc=ORTE_SUCCESS; + int rc=ORTE_SUCCESS, i; bool found=false, registering=false; - orte_odls_job_t *jobdat, *jdat; + orte_job_t *jobdat; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync on child %s", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* find this child */ - if (OPAL_EQUAL == opal_dss.compare(proc, child->name, ORTE_NAME)) { + if (OPAL_EQUAL == opal_dss.compare(proc, &child->name, ORTE_NAME)) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: registering sync on child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); found = true; break; } } - /* if it wasn't found on the list, then we need to add it - must have - * come from a singleton - */ + /* if it wasn't found, that's an error */ if (!found) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls: registering sync on singleton %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - child = OBJ_NEW(orte_odls_child_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, proc, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - opal_list_append(&orte_local_children, &child->super); - /* we don't know any other info about the child, so just indicate it's - * alive - */ - child->alive = true; - /* setup jobdat object for its job so daemon collectives work */ - orte_odls_base_setup_singleton_jobdat(proc->jobid); + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; } - - /* if the contact info is already set, then we are "de-registering" the child - * so free the info and set it to NULL - */ - if (child->init_recvd && NULL != child->rml_uri) { - child->fini_recvd = true; + + /* if the child has registered, then we 
are "de-registering" the child */ + if (child->registered) { + child->deregistered = true; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync deregistering child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); } else { - /* if the contact info is not set, then we are registering the child so + /* otherwise, we are registering the child so * unpack the contact info from the buffer and store it */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync registering child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - child->init_recvd = true; + ORTE_NAME_PRINT(&child->name))); + child->registered = true; registering = true; cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) { @@ -2017,29 +1815,20 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, } /* ack the call */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); + buffer = OBJ_NEW(opal_buffer_t); /* do they want the nidmap? */ if (drop_nidmap) { /* get the jobdata object */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == child->name->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto CLEANUP; } - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:sync nidmap requested for job %s", + "%s odls:sync nidmap requested for job %s: dmap %s pmap %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); + ORTE_JOBID_PRINT(jobdat->jobid), + (NULL == orte_odls_globals.dmap) ? "NULL" : "READY", + (NULL == jobdat->pmap) ? 
"NULL" : "READY")); /* the proc needs a copy of both the daemon/node map, and * the process map for its peers */ @@ -2053,25 +1842,25 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, /* send the local topology so the individual apps * don't hammer the system to collect it themselves */ - opal_dss.pack(&buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO); + opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO); #endif - opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT); - opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT); + opal_dss.pack(buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT); + opal_dss.pack(buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT); } } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: sending sync ack to child %s with %ld bytes of data", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), (long)buffer.bytes_used)); + ORTE_NAME_PRINT(proc), (long)buffer->bytes_used)); - if (0 > (rc = orte_rml.send_buffer(proc, &buffer, ORTE_RML_TAG_SYNC, 0))) { + if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer, ORTE_RML_TAG_SYNC, + 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buffer); + OBJ_RELEASE(buffer); goto CLEANUP; } rc = ORTE_SUCCESS; - OBJ_DESTRUCT(&buffer); OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: Finished sending sync ack to child %s (Registering %s)", @@ -2080,7 +1869,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, /* if we are deregistering, then we are done */ if (!registering) { - orte_routed.delete_route(child->name); + orte_routed.delete_route(&child->name); if( NULL != child->rml_uri ) { free(child->rml_uri); child->rml_uri = NULL; @@ -2089,208 +1878,94 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, } /* update the proc state */ - orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, ORTE_PROC_STATE_REGISTERED, 0, 0); + 
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_REGISTERED); CLEANUP: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return rc; } -/* receive external-to-odls notification that a proc has met some completion - * requirements - */ -void orte_odls_base_notify_iof_complete(orte_process_name_t *proc) -{ - orte_odls_child_t *child; - opal_list_item_t *item; - int rc; - orte_ns_cmp_bitmask_t mask; - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:notify_iof_complete for child %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { /* found it */ - goto GOTCHILD; - } - } - /* get here if we didn't find the child, or if the specified child - * is already dead. 
If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:proc_complete did not find child %s in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* it's just a race condition - don't error log it */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - -GOTCHILD: - /* flag the iof as complete */ - child->iof_complete = true; - /* now check to see if the proc is truly done */ - if (child->waitpid_recvd) { - /* CHILD IS COMPLETE */ - child->alive = false; - - /* Release only the stdin IOF file descriptor for this child, if one - * was defined. File descriptors for the other IOF channels - stdout, - * stderr, and stddiag - were released when their associated pipes - * were cleared and closed due to termination of the process - */ - if (NULL != orte_iof.close) { - orte_iof.close(proc, ORTE_IOF_STDIN); - } - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - orte_session_dir_finalize(proc); - /* alert the errmgr */ - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, child->state, child->pid, - child->exit_code))) { - ORTE_ERROR_LOG(rc); - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); -} - void orte_odls_base_default_report_abort(orte_process_name_t *proc) { - orte_odls_child_t *child; - opal_list_item_t *item; - opal_buffer_t buffer; - int rc; + orte_proc_t *child; + opal_buffer_t *buffer; + int rc, i; orte_ns_cmp_bitmask_t mask; - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. 
This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ + orte_util_compare_name_fields(mask, proc, &child->name)) { /* found it */ child->state = ORTE_PROC_STATE_CALLED_ABORT; /* send ack */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - if (0 > (rc = orte_rml.send_buffer(proc, &buffer, ORTE_RML_TAG_ABORT, 0))) { + buffer = OBJ_NEW(opal_buffer_t); + if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer, + ORTE_RML_TAG_ABORT, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); } - OBJ_DESTRUCT(&buffer); break; } } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); } -void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) +/* + * Wait for a callback indicating the child has completed. 
+ */ + +void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata) { - orte_odls_child_t *child, *chd; - orte_odls_job_t *jobdat, *jdat; - opal_list_item_t *item; - int rc; - orte_ns_cmp_bitmask_t mask; + orte_proc_t *proc=NULL, *cptr; + int i; + orte_job_t *jobdat; + orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired on child %s with status %d", + "%s odls:wait_local_proc child process %ld terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), WEXITSTATUS(status))); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + (long)pid)); /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ - goto GOTCHILD; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (pid == cptr->pid) { + proc = cptr; + break; } } - /* get here if we didn't find the child, or if the specified child - * is already dead. 
If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired did not find child %s in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* it's just a race condition - don't error log it */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - - GOTCHILD: + if (NULL == proc) { + /* get here if we didn't find the child, or if the specified child + * is already dead. If the latter, then we have a problem as it + * means we are detecting it exiting multiple times + */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:wait_local_proc did not find pid %ld in table!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (long)pid)); + return; + } + /* if the child was previously flagged as dead, then just * ensure that its exit state gets reported to avoid hanging */ - if (!child->alive) { + if (!proc->alive) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s was already dead", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); goto MOVEON; } /* get the jobdat for this child */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == child->name->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto MOVEON; } @@ -2299,27 +1974,26 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * and return as we aren't monitoring it */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jobdat->controls) { - child->state = ORTE_PROC_STATE_TERMINATED; goto MOVEON; } /* if this child was ordered to die, 
then just pass that along * so we don't hang */ - if (ORTE_PROC_STATE_KILLED_BY_CMD == child->state) { + if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s was ordered to die", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); goto MOVEON; } /* determine the state of this process */ if(WIFEXITED(status)) { /* set the exit status appropriately */ - child->exit_code = WEXITSTATUS(status); + proc->exit_code = WEXITSTATUS(status); - if (ORTE_PROC_STATE_CALLED_ABORT == child->state) { + if (ORTE_PROC_STATE_CALLED_ABORT == proc->state) { /* even though the process exited "normally", it happened * via an orte_abort call, so we need to indicate this was * an "abnormal" termination. @@ -2327,81 +2001,69 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s died by call to abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - child->state = ORTE_PROC_STATE_ABORTED; + ORTE_NAME_PRINT(&proc->name))); + state = ORTE_PROC_STATE_CALLED_ABORT; goto MOVEON; } /* check to see if a sync was required and if it was received */ - if (child->init_recvd) { - if (!child->fini_recvd) { - /* we required a finalizing sync and didn't get it, so this - * is considered an abnormal termination and treated accordingly + if (proc->registered) { + if (proc->deregistered) { + /* if we did recv a finalize sync, then declare it normally terminated + * unless it returned with a non-zero status indicating the code + * felt it was non-normal */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + proc->state = ORTE_PROC_STATE_TERM_NON_ZERO; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but with a non-zero exit 
status - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } else { - child->state = ORTE_PROC_STATE_TERM_WO_SYNC; - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired child process %s terminated normally " - "but did not provide a required finalize sync - it " - "will be treated as an abnormal termination", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + /* indicate the waitpid fired */ + state = ORTE_PROC_STATE_WAITPID_FIRED; } - - goto MOVEON; - } - /* if we did recv a finalize sync, then declare it normally terminated - * unless it returned with a non-zero status indicating the code - * felt it was non-normal - */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + } else { + /* we required a finalizing sync and didn't get it, so this + * is considered an abnormal termination and treated accordingly + */ + state = ORTE_PROC_STATE_TERM_WO_SYNC; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " - "but with a non-zero exit status - it " + "but did not provide a required finalize sync - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - } else { - child->state = ORTE_PROC_STATE_TERMINATED; + ORTE_NAME_PRINT(&proc->name))); } } else { /* has any child in this job already registered? 
*/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - chd = (orte_odls_child_t*)item; - - if (chd->init_recvd) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (cptr->registered) { /* someone has registered, and we didn't before * terminating - this is an abnormal termination */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + state = ORTE_PROC_STATE_TERM_NON_ZERO; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but with a non-zero exit status - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } else { - child->state = ORTE_PROC_STATE_TERM_WO_SYNC; + state = ORTE_PROC_STATE_TERM_WO_SYNC; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but did not provide a required init sync - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } - goto MOVEON; } } @@ -2409,23 +2071,23 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * none of them will. This is considered acceptable. 
Still * flag it as abnormal if the exit code was non-zero */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + state = ORTE_PROC_STATE_TERM_NON_ZERO; } else { - child->state = ORTE_PROC_STATE_TERMINATED; + state = ORTE_PROC_STATE_WAITPID_FIRED; } } - + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - (0 == child->exit_code) ? "normally" : "with non-zero status")); + ORTE_NAME_PRINT(&proc->name), + (0 == proc->exit_code) ? "normally" : "with non-zero status")); } else { /* the process was terminated with a signal! That's definitely * abnormal, so indicate that condition */ - child->state = ORTE_PROC_STATE_ABORTED_BY_SIG; + state = ORTE_PROC_STATE_ABORTED_BY_SIG; /* If a process was killed by a signal, then make the * exit code of orterun be "signo + 128" so that "prog" * and "orterun prog" will both yield the same exit code. @@ -2435,149 +2097,32 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * the termination code to exit status translation the * same way */ - child->exit_code = WTERMSIG(status) + 128; + proc->exit_code = WTERMSIG(status) + 128; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated with signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name) )); + ORTE_NAME_PRINT(&proc->name) )); /* Do not decrement the number of local procs here. That is handled in the errmgr */ } MOVEON: - /* indicate the waitpid fired */ - child->waitpid_recvd = true; - - /* now check to see if the proc is truly done */ - if (child->iof_complete) { - /* CHILD IS COMPLETE */ - child->alive = false; - - /* Release only the stdin IOF file descriptor for this child, if one - * was defined. 
File descriptors for the other IOF channels - stdout, - * stderr, and stddiag - were released when their associated pipes - * were cleared and closed due to termination of the process - */ - if (NULL != orte_iof.close) { - orte_iof.close(proc, ORTE_IOF_STDIN); - } - - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - orte_session_dir_finalize(proc); - /* alert the errmgr */ - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, child->state, child->pid, - child->exit_code))) { - ORTE_ERROR_LOG(rc); - } - } - - /* done */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); -} - -/* - * Wait for a callback indicating the child has completed. - */ - -void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata) -{ - orte_odls_child_t *child; - opal_list_item_t *item, *next; - int rc; - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; - int32_t istatus; - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:wait_local_proc child process %ld terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)pid)); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (pid == child->pid) { /* found it */ - /* this is an independent entry point from the event library. 
To avoid - * race conditions, we need to get back into the progression of messages - * and commands to be processed by the daemon. We do this by re-posting - * the event into the daemon cmd processor - */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_WAITPID_FIRED; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, child->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - istatus = status; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &istatus, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - /* done */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - } - } - /* get here if we didn't find the child, or if the specified child - * is already dead. If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:wait_local_proc did not find pid %ld in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)pid)); - - /* it's just a race condition - don't error log it */ -CLEANUP: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; + ORTE_ACTIVATE_PROC_STATE(&proc->name, state); } int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, orte_odls_base_kill_local_fn_t kill_local, orte_odls_base_child_died_fn_t child_died) { - orte_odls_child_t *child; - opal_list_item_t *item; - int rc = ORTE_SUCCESS; + orte_proc_t *child; opal_list_t procs_killed; orte_proc_t *proc, proctmp; - int i; + int i, j; opal_pointer_array_t procarray, *procptr; bool do_cleanup; OBJ_CONSTRUCT(&procs_killed, opal_list_t); - /* since we are going to be working with the global list of - * 
children, we need to protect that list from modification - * by other threads - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* if the pointer array is NULL, then just kill everything */ if (NULL == procs) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, @@ -2588,7 +2133,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OBJ_CONSTRUCT(&proctmp, orte_proc_t); proctmp.name.jobid = ORTE_JOBID_WILDCARD; proctmp.name.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(proctmp.name.epoch,ORTE_EPOCH_WILDCARD); opal_pointer_array_add(&procarray, &proctmp); procptr = &procarray; do_cleanup = true; @@ -2605,29 +2149,28 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) { continue; } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - + for(j=0; j < orte_local_children->size; j++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) { + continue; + } + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc checking child process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* do we have a child from the specified job? Because the * job could be given as a WILDCARD value, we must * check for that as well as for equality. 
*/ if (ORTE_JOBID_WILDCARD != proc->name.jobid && - proc->name.jobid != child->name->jobid) { + proc->name.jobid != child->name.jobid) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not part of job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_JOBID_PRINT(proc->name.jobid))); - continue; } @@ -2635,14 +2178,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, * appropriately */ if (ORTE_VPID_WILDCARD != proc->name.vpid && - proc->name.vpid != child->name->vpid) { + proc->name.vpid != child->name.vpid) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not covered by rank %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_VPID_PRINT(proc->name.vpid))); - continue; } @@ -2654,14 +2196,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* ensure, though, that the state is terminated so we don't lockup if * the proc never started */ if (ORTE_PROC_STATE_UNDEF == child->state || ORTE_PROC_STATE_INIT == child->state || - ORTE_PROC_STATE_LAUNCHED == child->state || ORTE_PROC_STATE_RUNNING == child->state) { /* we can't be sure what happened, but make sure we * at least have a value that will let us eventually wakeup @@ -2687,7 +2228,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, * channels will automatically close when the proc is killed */ if (NULL != orte_iof.close) { - orte_iof.close(child->name, ORTE_IOF_STDIN); + orte_iof.close(&child->name, ORTE_IOF_STDIN); } /* cancel the waitpid callback as this induces unmanageable race @@ -2702,14 +2243,14 @@ int 
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGCONT TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGCONT); /* Send a sigterm to the process before sigkill to be nice */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGTERM TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGTERM); /* check to see if it died - the child_died function will continue @@ -2726,7 +2267,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGKILL TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGKILL); /* Double check that it actually died this time */ if (!child_died(child)) { @@ -2745,7 +2286,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING FORCE SIGKILL TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGKILL); /* Double check that it actually died this time */ if (!child_died(child)) { @@ -2758,7 +2299,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s killed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* indicate the waitpid fired as this is effectively what * has happened @@ -2770,18 +2311,12 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, CLEANUP: /* ensure the child's session directory is cleaned up */ - orte_session_dir_finalize(child->name); + 
orte_session_dir_finalize(&child->name); /* check for everything complete - this will remove * the child object from our local list */ if (child->iof_complete && child->waitpid_recvd) { - rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - child->name, child->state, child->pid, - child->exit_code); - if (ORTE_ERR_SILENT == rc) { - /* all procs are complete - we are done */ - break; - } + ORTE_ACTIVATE_PROC_STATE(&child->name, child->state); } } } @@ -2792,12 +2327,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OBJ_DESTRUCT(&proctmp); } - /* we are done with the global list, so we can now release - * any waiting threads - this also allows any callbacks to work - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return ORTE_SUCCESS; } @@ -2805,10 +2334,9 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, orte_process_name_t *proc) { int rc; - orte_odls_child_t *child; - opal_list_item_t *item, *next; + orte_proc_t *child; opal_pstats_t stats, *statsptr; - int j; + int i, j; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:get_proc_stats for proc %s", @@ -2816,14 +2344,13 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, ORTE_NAME_PRINT(proc))); /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } - if (proc->jobid == child->name->jobid && - (proc->vpid == child->name->vpid || + if (proc->jobid == child->name.jobid && + (proc->vpid == child->name.vpid || ORTE_VPID_WILDCARD == proc->vpid)) { /* found it */ OBJ_CONSTRUCT(&stats, opal_pstats_t); @@ -2834,7 +2361,7 @@ int orte_odls_base_get_proc_stats(opal_buffer_t 
*answer, stats.node[j] = orte_process_info.nodename[j]; } /* record rank */ - stats.rank = child->name->vpid; + stats.rank = child->name.vpid; /* get stats */ rc = opal_pstat.query(child->pid, &stats, NULL); if (ORTE_SUCCESS != rc) { @@ -2859,22 +2386,18 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, return ORTE_SUCCESS; } -int orte_odls_base_default_restart_proc(orte_odls_child_t *child, +int orte_odls_base_default_restart_proc(orte_proc_t *child, orte_odls_base_fork_local_proc_fn_t fork_local) { int rc; orte_app_context_t *app; - opal_list_item_t *item; - orte_odls_job_t *jobdat; + orte_job_t *jobdat; char basedir[MAXPATHLEN]; - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:restart_proc for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* establish our baseline working directory - we will be potentially * bouncing around as we execute this app, but we will always return @@ -2883,16 +2406,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, getcwd(basedir, sizeof(basedir)); /* find this child's jobdat */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == child->name->jobid) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) { /* not found */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; @@ -2902,29 +2416,22 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, child->exit_code = 0; child->waitpid_recvd = false; child->iof_complete = false; - child->coll_recvd = false; child->pid = 0; - child->init_recvd = false; - child->fini_recvd = false; if (NULL != child->rml_uri) { 
free(child->rml_uri); child->rml_uri = NULL; } - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx); /* reset envars to match this child */ if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); goto CLEANUP; } /* setup the path */ if (ORTE_SUCCESS != (rc = setup_path(app))) { ORTE_ERROR_LOG(rc); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); goto CLEANUP; } @@ -2932,12 +2439,6 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app)); - /* must unlock prior to fork to keep things clean in the - * event library - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - rc = fork_local(app, child, app->env, jobdat); if (ORTE_SUCCESS == rc) { orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); @@ -2947,7 +2448,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:restart of proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), (ORTE_SUCCESS == rc) ? 
"succeeded" : "failed")); /* reset our working directory back to our default location - if we @@ -2961,26 +2462,3 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, return rc; } - -bool orte_odls_base_default_check_finished(orte_process_name_t *proc) { - orte_odls_child_t *child; - opal_list_item_t *item; - orte_ns_cmp_bitmask_t mask; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ - return child->fini_recvd; - } - } - - return false; -} diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index afd81aa264..09d7906e4e 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -35,8 +37,8 @@ #include "opal/util/output.h" #include "opal/util/path.h" #include "opal/util/argv.h" -#include "opal/threads/threads.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/plm/plm_types.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -89,7 +91,7 @@ orte_odls_globals_t orte_odls_globals; int orte_odls_base_open(void) { char **ranks=NULL, *tmp; - int i, rank; + int rc, i, rank; orte_namelist_t *nm; bool xterm_hold; @@ -101,17 +103,17 @@ int orte_odls_base_open(void) "Time to wait for a process to die after issuing a kill signal to it", false, false, 1, &orte_odls_globals.timeout_before_sigkill); - /* initialize the global list of local children and job data */ - OBJ_CONSTRUCT(&orte_local_children, opal_list_t); - OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t); - OBJ_CONSTRUCT(&orte_local_children_cond, opal_condition_t); - OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); - OBJ_CONSTRUCT(&orte_local_jobdata_lock, opal_mutex_t); - OBJ_CONSTRUCT(&orte_local_jobdata_cond, opal_condition_t); + /* initialize the global array of local children */ + orte_local_children = OBJ_NEW(opal_pointer_array_t); + if (OPAL_SUCCESS != (rc = opal_pointer_array_init(orte_local_children, + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* initialize ODLS globals */ - OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t); - OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t); OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t); orte_odls_globals.xtermcmd = NULL; orte_odls_globals.dmap = NULL; @@ -133,7 +135,6 @@ int orte_odls_base_open(void) if (-1 == rank) { /* wildcard */ nm->name.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); } else if (rank < 0) { /* error out on bozo case */ orte_show_help("help-odls-base.txt", @@ -146,9 +147,8 @@ int orte_odls_base_open(void) * will be in the job - we'll check 
later */ nm->name.vpid = rank; - ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); } - opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); + opal_list_append(&orte_odls_globals.xterm_ranks, &nm->super); } opal_argv_free(ranks); /* construct the xtermcmd */ @@ -188,117 +188,20 @@ int orte_odls_base_open(void) return ORTE_SUCCESS; } -/* instance the child list object */ -static void orte_odls_child_constructor(orte_odls_child_t *ptr) +static void launch_local_const(orte_odls_launch_local_t *ptr) { - ptr->name = NULL; - ptr->restarts = 0; - ptr->pid = 0; - ptr->app_idx = 0; - ptr->alive = false; - ptr->coll_recvd = false; - /* set the default state to "failed to start" so - * we can correctly report should something - * go wrong during launch - */ - ptr->state = ORTE_PROC_STATE_FAILED_TO_START; - ptr->exit_code = 0; - ptr->init_recvd = false; - ptr->fini_recvd = false; - ptr->rml_uri = NULL; - ptr->waitpid_recvd = false; - ptr->iof_complete = false; - ptr->do_not_barrier = false; - ptr->notified = false; - OBJ_CONSTRUCT(&ptr->stats, opal_ring_buffer_t); - opal_ring_buffer_init(&ptr->stats, orte_stat_history_size); -#if OPAL_HAVE_HWLOC - ptr->cpu_bitmap = NULL; -#endif + ptr->ev = opal_event_alloc(); + ptr->job = ORTE_JOBID_INVALID; + ptr->fork_local = NULL; + ptr->retries = 0; } -static void orte_odls_child_destructor(orte_odls_child_t *ptr) +static void launch_local_dest(orte_odls_launch_local_t *ptr) { - opal_pstats_t *st; - - if (NULL != ptr->name) free(ptr->name); - if (NULL != ptr->rml_uri) free(ptr->rml_uri); - - while (NULL != (st = (opal_pstats_t*)opal_ring_buffer_pop(&ptr->stats))) { - OBJ_RELEASE(st); - } - OBJ_DESTRUCT(&ptr->stats); -#if OPAL_HAVE_HWLOC - if (NULL != ptr->cpu_bitmap) { - free(ptr->cpu_bitmap); - } -#endif + opal_event_free(ptr->ev); } -OBJ_CLASS_INSTANCE(orte_odls_child_t, - opal_list_item_t, - orte_odls_child_constructor, - orte_odls_child_destructor); - -static void orte_odls_job_constructor(orte_odls_job_t *ptr) 
-{ - OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t); - OBJ_CONSTRUCT(&ptr->cond, opal_condition_t); - ptr->jobid = ORTE_JOBID_INVALID; - ptr->instance = NULL; - ptr->name = NULL; - ptr->state = ORTE_JOB_STATE_UNDEF; - ptr->launch_msg_processed = false; - OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t); - opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2); - ptr->num_apps = 0; -#if OPAL_HAVE_HWLOC - ptr->binding = 0; -#endif - ptr->cpus_per_rank = 1; - ptr->stride = 1; - ptr->controls = 0; - ptr->stdin_target = ORTE_VPID_INVALID; - ptr->total_slots_alloc = 0; - ptr->num_procs = 0; - ptr->num_local_procs = 0; - ptr->pmap = NULL; - OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t); - OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t); - ptr->collective_type = ORTE_GRPCOMM_COLL_NONE; - ptr->num_contributors = 0; - ptr->num_participating = -1; - ptr->num_collected = 0; - ptr->enable_recovery = false; -} -static void orte_odls_job_destructor(orte_odls_job_t *ptr) -{ - int i; - orte_app_context_t *app; - - OBJ_DESTRUCT(&ptr->lock); - OBJ_DESTRUCT(&ptr->cond); - if (NULL != ptr->instance) { - free(ptr->instance); - } - if (NULL != ptr->name) { - free(ptr->name); - } - for (i=0; i < ptr->apps.size; i++) { - if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(&ptr->apps, i))) { - OBJ_RELEASE(app); - } - OBJ_DESTRUCT(&ptr->apps); - } - if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) { - free(ptr->pmap->bytes); - free(ptr->pmap); - } - - OBJ_DESTRUCT(&ptr->collection_bucket); - OBJ_DESTRUCT(&ptr->local_collection); -} -OBJ_CLASS_INSTANCE(orte_odls_job_t, - opal_list_item_t, - orte_odls_job_constructor, - orte_odls_job_destructor); +OBJ_CLASS_INSTANCE(orte_odls_launch_local_t, + opal_object_t, + launch_local_const, + launch_local_dest); #endif diff --git a/orte/mca/odls/base/odls_base_state.c b/orte/mca/odls/base/odls_base_state.c index 2224209fb4..932c017580 100644 --- a/orte/mca/odls/base/odls_base_state.c +++ b/orte/mca/odls/base/odls_base_state.c @@ -77,17 
+77,14 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context) /* if I am the HNP, then use me as the source */ p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); } else { /* otherwise, set the HNP as the source */ p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); } p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); opal_list_append(&(filem_request->process_sets), &(p_set->super) ); diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h index 9733add738..3a1030eef1 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -31,8 +33,6 @@ #include "opal/class/opal_list.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_bitmap.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" #include "opal/dss/dss_types.h" #include "orte/mca/grpcomm/grpcomm_types.h" @@ -52,14 +52,10 @@ typedef struct { int output; /** Time to allow process to forcibly die */ int timeout_before_sigkill; - /* mutex */ - opal_mutex_t mutex; - /* condition variable */ - opal_condition_t cond; /* byte object to store daemon map for later xmit to procs */ opal_byte_object_t *dmap; /* any co-spawned debugger daemon */ - orte_odls_job_t *debugger; + orte_job_t *debugger; /* debugger launched */ bool debugger_launched; /* list of ranks to be displayed on separate xterms */ @@ -80,22 +76,39 @@ ORTE_DECLSPEC int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, orte_jobid_t job); -ORTE_DECLSPEC int -orte_odls_base_default_update_daemon_info(opal_buffer_t *data); - ORTE_DECLSPEC int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_jobid_t *job); /* define a function that will fork a local proc */ typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat); + orte_job_t *jdata); -ORTE_DECLSPEC int -orte_odls_base_default_launch_local(orte_jobid_t job, - orte_odls_base_fork_local_proc_fn_t fork_local); +/* define an object for starting local launch */ +typedef struct { + opal_object_t object; + opal_event_t *ev; + orte_jobid_t job; + orte_odls_base_fork_local_proc_fn_t fork_local; + int retries; +} orte_odls_launch_local_t; +OBJ_CLASS_DECLARATION(orte_odls_launch_local_t); + +#define ORTE_ACTIVATE_LOCAL_LAUNCH(j, f) \ + do { \ + orte_odls_launch_local_t *ll; \ + ll = OBJ_NEW(orte_odls_launch_local_t); \ + ll->job = (j); \ + ll->fork_local = (f); \ + opal_event_set(orte_event_base, 
ll->ev, -1, OPAL_EV_WRITE, \ + orte_odls_base_default_launch_local, ll); \ + opal_event_set_priority(ll->ev, ORTE_SYS_PRI); \ + opal_event_active(ll->ev, OPAL_EV_WRITE, 1); \ + } while(0); + +ORTE_DECLSPEC void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata); ORTE_DECLSPEC int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); @@ -115,7 +128,7 @@ orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32 typedef int (*orte_odls_base_kill_local_fn_t)(pid_t pid, int signum); /* define a function type to detect that a child died */ -typedef bool (*orte_odls_base_child_died_fn_t)(orte_odls_child_t *child); +typedef bool (*orte_odls_base_child_died_fn_t)(orte_proc_t *child); ORTE_DECLSPEC int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, @@ -126,7 +139,7 @@ ORTE_DECLSPEC int orte_odls_base_default_require_sync(orte_process_name_t *proc, opal_buffer_t *buffer, bool drop_nidmap); -ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_odls_child_t *child, +ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_proc_t *child, orte_odls_base_fork_local_proc_fn_t fork_local); /* diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index d2b1b13a83..0a3ca7a202 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -13,6 +13,8 @@ * Copyright (c) 2007 Evergrid, Inc. All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -155,7 +157,7 @@ typedef struct { static int orte_odls_default_launch_local_procs(opal_buffer_t *data); static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs); static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal); -static int orte_odls_default_restart_proc(orte_odls_child_t *child); +static int orte_odls_default_restart_proc(orte_proc_t *child); /* * Explicitly declared functions so that we can get the noreturn @@ -165,9 +167,9 @@ static void send_error_show_help(int fd, int exit_status, const char *file, const char *topic, ...) __opal_attribute_noreturn__; static int do_child(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int write_fd, + orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) __opal_attribute_noreturn__; @@ -186,7 +188,7 @@ orte_odls_base_module_t orte_odls_default_module = { }; -static bool odls_default_child_died(orte_odls_child_t *child) +static bool odls_default_child_died(orte_proc_t *child) { time_t end; pid_t ret; @@ -381,9 +383,9 @@ static void send_error_show_help(int fd, int exit_status, } static int do_child(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int write_fd, + orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) { int i; @@ -443,7 +445,7 @@ static int do_child(orte_app_context_t* context, if (NULL == msg) { msg = "failed to convert bitmap list to hwloc bitmap"; } - if (OPAL_BINDING_REQUIRED(jobdat->binding)) { + if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). 
*/ send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", @@ -463,7 +465,7 @@ static int do_child(orte_app_context_t* context, if (opal_hwloc_report_bindings) { opal_output(0, "%s odls:default binding child %s to cpus %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), child->cpu_bitmap); + ORTE_NAME_PRINT(&child->name), child->cpu_bitmap); } rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0); if (rc < 0) { @@ -476,7 +478,7 @@ static int do_child(orte_app_context_t* context, asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", opal_strerror(rc), child->cpu_bitmap); } - if (OPAL_BINDING_REQUIRED(jobdat->binding)) { + if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", @@ -615,9 +617,9 @@ static int do_child(orte_app_context_t* context, static int do_parent(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int read_fd, + orte_job_t *jobdat, int read_fd, orte_iof_base_io_conf_t opts) { int rc; @@ -626,7 +628,7 @@ static int do_parent(orte_app_context_t* context, if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { /* connect endpoints IOF */ - rc = orte_iof_base_setup_parent(child->name, &opts); + rc = orte_iof_base_setup_parent(&child->name, &opts); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); close(read_fd); @@ -637,7 +639,7 @@ static int do_parent(orte_app_context_t* context, return rc; } } - + /* Block reading a message from the pipe */ while (1) { rc = opal_fd_read(read_fd, sizeof(msg), &msg); @@ -734,7 +736,7 @@ static int do_parent(orte_app_context_t* context, indication of a fatal error, meaning that the child process launched successfully. 
*/ if (NULL != child) { - child->state = ORTE_PROC_STATE_LAUNCHED; + child->state = ORTE_PROC_STATE_RUNNING; child->alive = true; } close(read_fd); @@ -747,9 +749,9 @@ static int do_parent(orte_app_context_t* context, * Fork/exec the specified processes */ static int odls_default_fork_local_proc(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat) + orte_job_t *jobdat) { orte_iof_base_io_conf_t opts; int rc, p[2]; @@ -762,7 +764,8 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* do we want to setup stdin? */ if (NULL != child && - (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) { + (jobdat->stdin_target == ORTE_VPID_WILDCARD || + child->name.vpid == jobdat->stdin_target)) { opts.connect_stdin = true; } else { opts.connect_stdin = false; @@ -777,7 +780,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, return rc; } } - + /* A pipe is used to communicate between the parent and child to indicate whether the exec ultimately succeeded or failed. 
The child sets the pipe to be close-on-exec; the child only ever @@ -829,49 +832,24 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data) { int rc; orte_jobid_t job; - orte_job_t *jdata; /* construct the list of children we are to launch */ if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) { OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, "%s odls:default:launch:local failed to construct child list on error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; + return rc; } /* launch the local procs */ - if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_default_fork_local_proc))) { - OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, - "%s odls:default:launch:local failed to launch on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; - } + ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_default_fork_local_proc); - /* look up job data object */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (jdata->state & ORTE_JOB_STATE_SUSPENDED) { - if (ORTE_PROC_IS_HNP) { - /* Have the plm send the signal to all the nodes. - If the signal arrived before the orteds started, - then they won't know to suspend their procs. - The plm also arranges for any local procs to - be signaled. - */ - orte_plm.signal_job(jdata->jobid, SIGTSTP); - } else { - orte_odls_default_signal_local_procs(NULL, SIGTSTP); - } - } - } - -CLEANUP: - - return rc; + return ORTE_SUCCESS; } /** - * Send a sigal to a pid. Note that if we get an error, we set the + * Send a signal to a pid. Note that if we get an error, we set the * return value and let the upper layer print out the message. 
*/ static int send_signal(pid_t pid, int signal) @@ -922,7 +900,7 @@ static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, return ORTE_SUCCESS; } -static int orte_odls_default_restart_proc(orte_odls_child_t *child) +static int orte_odls_default_restart_proc(orte_proc_t *child) { int rc; diff --git a/orte/mca/odls/odls.h b/orte/mca/odls/odls.h index ea09f89c65..a36ba9e26d 100644 --- a/orte/mca/odls/odls.h +++ b/orte/mca/odls/odls.h @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +36,8 @@ #include "opal/dss/dss_types.h" #include "orte/mca/rml/rml_types.h" +#include "orte/runtime/orte_globals.h" + #include "orte/mca/odls/odls_types.h" BEGIN_C_DECLS @@ -85,7 +89,7 @@ typedef int (*orte_odls_base_module_require_sync_fn_t)(orte_process_name_t *proc /** * Restart a local process */ -typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_odls_child_t *child); +typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_proc_t *child); /** * pls module version @@ -94,7 +98,7 @@ struct orte_odls_base_module_1_3_0_t { orte_odls_base_module_get_add_procs_data_fn_t get_add_procs_data; orte_odls_base_module_launch_local_processes_fn_t launch_local_procs; orte_odls_base_module_kill_local_processes_fn_t kill_local_procs; - orte_odls_base_module_signal_local_process_fn_t signal_local_procs; + orte_odls_base_module_signal_local_process_fn_t signal_local_procs; orte_odls_base_module_deliver_message_fn_t deliver_message; orte_odls_base_module_require_sync_fn_t require_sync; orte_odls_base_module_restart_proc_fn_t restart_proc; diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 5c419d15e2..fa5f9e5ea9 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ 
-9,7 +9,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -32,9 +34,8 @@ #include "opal/class/opal_list.h" #include "opal/class/opal_ring_buffer.h" #include "opal/dss/dss_types.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" #include "opal/mca/hwloc/hwloc.h" +#include "opal/mca/event/event.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/grpcomm/grpcomm_types.h" @@ -72,10 +73,6 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 18 #define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 19 -/* proc termination sync cmds */ -#define ORTE_DAEMON_WAITPID_FIRED (orte_daemon_cmd_flag_t) 20 -#define ORTE_DAEMON_IOF_COMPLETE (orte_daemon_cmd_flag_t) 21 - /* request proc resource usage */ #define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 22 @@ -92,76 +89,6 @@ typedef uint8_t orte_daemon_cmd_flag_t; /* process called "errmgr.abort_procs" */ #define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28 -/* - * List object to locally store the process names and pids of - * our children. This can subsequently be used to order termination - * or pass signals without looking the info up again. - */ -typedef struct { - opal_list_item_t super; /* required to place this on a list */ - orte_process_name_t *name; /* the OmpiRTE name of the proc */ - int32_t restarts; /* number of times this proc has been restarted */ - pid_t pid; /* local pid of the proc */ - orte_app_idx_t app_idx; /* index of the app_context for this proc */ - bool alive; /* is this proc alive? 
*/ - bool coll_recvd; /* collective operation recvd */ - orte_proc_state_t state; /* the state of the process */ - orte_exit_code_t exit_code; /* process exit code */ - bool init_recvd; /* process called orte_init */ - bool fini_recvd; /* process called orte_finalize */ - char *rml_uri; /* contact info for this child */ -#if OPAL_HAVE_HWLOC - char *cpu_bitmap; /* binding pattern for this child */ -#endif - bool waitpid_recvd; /* waitpid has detected proc termination */ - bool iof_complete; /* IOF has noted proc terminating all channels */ - struct timeval starttime; /* when the proc was started - for timing purposes only */ - bool do_not_barrier; /* the proc should not barrier in orte_init */ - bool notified; /* notification of termination has been sent */ - opal_ring_buffer_t stats; -} orte_odls_child_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t); - -#if !ORTE_DISABLE_FULL_SUPPORT - -/* - * List object to locally store job related info - */ -typedef struct orte_odls_job_t { - opal_list_item_t super; /* required to place this on a list */ - opal_mutex_t lock; - opal_condition_t cond; - orte_job_state_t state; /* state of the job */ - orte_jobid_t jobid; /* jobid for this data */ - char *instance; /* keep handy for scheduler restart */ - char *name; /* keep handy for scheduler restart */ - bool launch_msg_processed; /* launch msg has been fully processed */ - opal_pointer_array_t apps; /* app_contexts for this job */ - orte_app_idx_t num_apps; /* number of app_contexts */ -#if OPAL_HAVE_HWLOC - opal_binding_policy_t binding; /* binding policy */ -#endif - int16_t cpus_per_rank; /* number of cpus/rank */ - int16_t stride; /* step size between cores of multi-core/rank procs */ - orte_job_controls_t controls; /* control flags for job */ - orte_vpid_t stdin_target; /* where stdin is to go */ - orte_std_cntr_t total_slots_alloc; - orte_std_cntr_t num_nodes; /* number of nodes involved in the job */ - orte_vpid_t num_procs; - int32_t num_local_procs; - 
opal_byte_object_t *pmap; /* local copy of pidmap byte object */ - opal_buffer_t collection_bucket; - opal_buffer_t local_collection; - orte_grpcomm_coll_t collective_type; - int32_t num_contributors; - int num_participating; - int num_collected; - struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ - bool enable_recovery; /* enable recovery of failed processes */ -} orte_odls_job_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t); - -#endif END_C_DECLS diff --git a/orte/mca/odls/process/odls_process_module.c b/orte/mca/odls/process/odls_process_module.c index b289f1d992..f6b75fc7d3 100644 --- a/orte/mca/odls/process/odls_process_module.c +++ b/orte/mca/odls/process/odls_process_module.c @@ -50,7 +50,7 @@ static void set_handler_default(int sig); -static bool odls_process_child_died( orte_odls_child_t *child ) +static bool odls_process_child_died( orte_proc_t *child ) { int error; HANDLE handle = OpenProcess( PROCESS_TERMINATE | SYNCHRONIZE, FALSE, @@ -92,9 +92,9 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs) */ static int odls_process_fork_local_proc(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat) + orte_job_t *jobdat) { pid_t pid; orte_iof_base_io_conf_t opts; @@ -108,7 +108,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, */ if (opal_sys_limits.initialized) { if (0 < opal_sys_limits.num_procs && - opal_sys_limits.num_procs <= (int)opal_list_get_size(&orte_local_children)) { + opal_sys_limits.num_procs <= *(&orte_local_children->size)) { /* at the system limit - abort */ ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); child->state = ORTE_PROC_STATE_FAILED_TO_START; @@ -122,7 +122,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, opts.usepty = OPAL_ENABLE_PTY_SUPPORT; /* do we want to setup stdin? 
*/ - if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) { + if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name.vpid == jobdat->stdin_target) { opts.connect_stdin = true; } else { opts.connect_stdin = false; @@ -161,16 +161,10 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, } /* set the proc state to LAUNCHED and save the pid */ - child->state = ORTE_PROC_STATE_LAUNCHED; + child->state = ORTE_PROC_STATE_RUNNING; child->pid = pid; child->alive = true; - - /* Windows automatically forwards IO, so we don't need to do so here. However, - * we need to flag that IO termination conditions are met so that the daemon - * knows the proc is done - */ - orte_odls_base_notify_iof_complete(child->name); - + return ORTE_SUCCESS; } @@ -193,12 +187,7 @@ static int odls_process_launch_local_procs(opal_buffer_t *data) } /* launch the local procs */ - if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_process_fork_local_proc))) { - OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, - "%s odls:process:launch:local failed to launch on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; - } + ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_process_fork_local_proc); CLEANUP: @@ -220,7 +209,7 @@ static int odls_process_signal_local_proc(const orte_process_name_t *proc, int32 return rc; } -static int orte_odls_process_restart_proc(orte_odls_child_t *child) +static int orte_odls_process_restart_proc(orte_proc_t *child) { int rc; diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index aa8b6ebd10..e6067197d1 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2010 Los Alamos National Security, LLC. 
+ * Copyright (c) 2006-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. @@ -70,13 +70,16 @@ */ struct mca_oob_tcp_event_t { opal_list_item_t item; - opal_event_t event; + opal_event_t *event; }; typedef struct mca_oob_tcp_event_t mca_oob_tcp_event_t; static void mca_oob_tcp_event_construct(mca_oob_tcp_event_t* event) { OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + /* get an event */ + event->event = opal_event_alloc(); + /* track our events */ opal_list_append(&mca_oob_tcp_component.tcp_events, &event->item); OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } @@ -84,6 +87,9 @@ static void mca_oob_tcp_event_construct(mca_oob_tcp_event_t* event) static void mca_oob_tcp_event_destruct(mca_oob_tcp_event_t* event) { OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + /* release the event for re-use */ + opal_event_free(event->event); + /* remove it from our list */ opal_list_remove_item(&mca_oob_tcp_component.tcp_events, &event->item); OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } @@ -414,6 +420,12 @@ static int mca_oob_tcp_component_open(void) OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_return, opal_list_t); OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_lock, opal_mutex_t); + mca_oob_tcp_component.tcp_recv_event = NULL; +#if OPAL_WANT_IPV6 + mca_oob_tcp_component.tcp6_recv_event = NULL; +#endif + + mca_oob_tcp_component.tcp_listen_thread_event = NULL; mca_oob_tcp_component.tcp_listen_thread_num_sockets = 0; mca_oob_tcp_component.tcp_listen_thread_sds[0] = -1; mca_oob_tcp_component.tcp_listen_thread_sds[1] = -1; @@ -507,8 +519,9 @@ mca_oob_tcp_create_connection(const int accepted_fd, /* wait for receipt of peers process identifier to complete this connection */ event = OBJ_NEW(mca_oob_tcp_event_t); - opal_event_set(opal_event_base, &event->event, accepted_fd, OPAL_EV_READ, 
mca_oob_tcp_recv_handler, event); - opal_event_add(&event->event, 0); + opal_event_set(orte_event_base, event->event, accepted_fd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event); + opal_event_set_priority(event->event, ORTE_MSG_PRI); + opal_event_add(event->event, 0); } @@ -1098,16 +1111,25 @@ mca_oob_tcp_accept_thread_handler(int sd, short flags, void* user) tv.tv_sec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_sec; tv.tv_usec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_usec; #ifdef HAVE_PIPE - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - mca_oob_tcp_component.tcp_connections_pipe[0], - OPAL_EV_READ, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + mca_oob_tcp_component.tcp_connections_pipe[0], + OPAL_EV_READ, + mca_oob_tcp_accept_thread_handler, NULL); + } #else - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - -1, 0, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + -1, 0, + mca_oob_tcp_accept_thread_handler, NULL); + } #endif - opal_event_add(&mca_oob_tcp_component.tcp_listen_thread_event, &tv); + opal_event_set_priority(mca_oob_tcp_component.tcp_listen_thread_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_listen_thread_event, &tv); } @@ -1134,16 +1156,25 @@ mca_oob_tcp_create_listen_thread(void) tv.tv_sec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_sec; tv.tv_usec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_usec; #ifdef HAVE_PIPE - opal_event_set(opal_event_base, 
&mca_oob_tcp_component.tcp_listen_thread_event, - mca_oob_tcp_component.tcp_connections_pipe[0], - OPAL_EV_READ, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + mca_oob_tcp_component.tcp_connections_pipe[0], + OPAL_EV_READ, + mca_oob_tcp_accept_thread_handler, NULL); + } #else - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - -1, 0, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + -1, 0, + mca_oob_tcp_accept_thread_handler, NULL); + } #endif - opal_event_add(&mca_oob_tcp_component.tcp_listen_thread_event, &tv); + opal_event_set_priority(mca_oob_tcp_component.tcp_listen_thread_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_listen_thread_event, &tv); return opal_thread_start(&mca_oob_tcp_component.tcp_listen_thread); } @@ -1570,12 +1601,6 @@ mca_oob_t* mca_oob_tcp_component_init(int* priority) 8); /* increment to grow by */ - /* intialize event library */ - memset(&mca_oob_tcp_component.tcp_recv_event, 0, sizeof(opal_event_t)); - memset(&mca_oob_tcp_component.tcp_listen_thread_event, 0, sizeof(opal_event_t)); -#if OPAL_WANT_IPV6 - memset(&mca_oob_tcp_component.tcp6_recv_event, 0, sizeof(opal_event_t)); -#endif /* OPAL_WANT_IPV6 */ return &mca_oob_tcp; } @@ -1767,12 +1792,16 @@ int mca_oob_tcp_init(void) mca_oob_tcp_component.tcp_listen_thread_sds[idx] = mca_oob_tcp_component.tcp_listen_sd; } else { - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_recv_event, - mca_oob_tcp_component.tcp_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - 
mca_oob_tcp_recv_handler, - 0); - opal_event_add(&mca_oob_tcp_component.tcp_recv_event, 0); + if (NULL == mca_oob_tcp_component.tcp_recv_event) { + mca_oob_tcp_component.tcp_recv_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_recv_event, + mca_oob_tcp_component.tcp_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_oob_tcp_recv_handler, + 0); + opal_event_set_priority(mca_oob_tcp_component.tcp_recv_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_recv_event, 0); + } } } @@ -1799,12 +1828,16 @@ int mca_oob_tcp_init(void) mca_oob_tcp_component.tcp_listen_thread_sds[idx] = mca_oob_tcp_component.tcp6_listen_sd; } else { - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp6_recv_event, - mca_oob_tcp_component.tcp6_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_oob_tcp_recv_handler, - 0); - opal_event_add(&mca_oob_tcp_component.tcp6_recv_event, 0); + if (NULL == mca_oob_tcp_component.tcp6_recv_event) { + mca_oob_tcp_component.tcp6_recv_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp6_recv_event, + mca_oob_tcp_component.tcp6_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_oob_tcp_recv_handler, + 0); + opal_event_set_priority(mca_oob_tcp_component.tcp6_recv_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp6_recv_event, 0); + } } } #endif @@ -1857,14 +1890,21 @@ int mca_oob_tcp_fini(void) if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) { mca_oob_tcp_component.tcp_shutdown = true; opal_thread_join(&mca_oob_tcp_component.tcp_listen_thread, &data); - opal_event_del(&mca_oob_tcp_component.tcp_listen_thread_event); + if (NULL != mca_oob_tcp_component.tcp_listen_thread_event) { + opal_event_free(mca_oob_tcp_component.tcp_listen_thread_event); + mca_oob_tcp_component.tcp_listen_thread_event = NULL; + } } else { - if (mca_oob_tcp_component.tcp_listen_sd >= 0) { - opal_event_del(&mca_oob_tcp_component.tcp_recv_event); + if 
(mca_oob_tcp_component.tcp_listen_sd >= 0 && + NULL != mca_oob_tcp_component.tcp_recv_event) { + opal_event_free(mca_oob_tcp_component.tcp_recv_event); + mca_oob_tcp_component.tcp_recv_event = NULL; } #if OPAL_WANT_IPV6 - if (mca_oob_tcp_component.tcp6_listen_sd >= 0) { - opal_event_del(&mca_oob_tcp_component.tcp6_recv_event); + if (mca_oob_tcp_component.tcp6_listen_sd >= 0 && + NULL != mca_oob_tcp_component.tcp6_recv_event) { + opal_event_free(mca_oob_tcp_component.tcp6_recv_event); + mca_oob_tcp_component.tcp6_recv_event = NULL; } #endif } @@ -1894,7 +1934,6 @@ int mca_oob_tcp_fini(void) item != opal_list_get_end(&mca_oob_tcp_component.tcp_events); item = opal_list_get_first(&mca_oob_tcp_component.tcp_events) ) { mca_oob_tcp_event_t* event = (mca_oob_tcp_event_t*)item; - opal_event_del(&event->event); OBJ_RELEASE(event); } diff --git a/orte/mca/oob/tcp/oob_tcp.h b/orte/mca/oob/tcp/oob_tcp.h index ca0ec8d223..ecb16f2703 100644 --- a/orte/mca/oob/tcp/oob_tcp.h +++ b/orte/mca/oob/tcp/oob_tcp.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -206,14 +206,14 @@ struct mca_oob_tcp_component_t { int tcp_sndbuf; /**< socket send buffer size */ int tcp_rcvbuf; /**< socket recv buffer size */ opal_free_list_t tcp_msgs; /**< free list of messages */ - opal_event_t tcp_recv_event; /**< event structure for IPv4 recvs */ + opal_event_t *tcp_recv_event; /**< event structure for IPv4 recvs */ int tcp_listen_sd; /**< listen socket for incoming IPv4 connection requests */ unsigned short tcp_listen_port; /**< IPv4 listen port */ char** tcp4_static_ports; /**< Static ports - IPV4 */ char** tcp4_dyn_ports; /**< Dynamic ports - IPV4 */ int disable_family; /**< disable AF: 0-nothing, 4-IPv4, 6-IPv6 */ #if OPAL_WANT_IPV6 - opal_event_t tcp6_recv_event; /**< event structure for IPv6 recvs */ + opal_event_t *tcp6_recv_event; /**< event structure for IPv6 recvs */ int tcp6_listen_sd; /**< listen socket for incoming IPv6 connection requests */ unsigned short tcp6_listen_port; /**< IPv6 listen port */ char** tcp6_static_ports; /**< Static ports - IPV6 */ @@ -239,7 +239,7 @@ struct mca_oob_tcp_component_t { opal_list_t tcp_connections_return; /**< List of connection fragments being returned to accept thread */ opal_mutex_t tcp_connections_lock; /**< Lock protecting pending_connections and connections_return */ int tcp_connections_pipe[2]; - opal_event_t tcp_listen_thread_event; + opal_event_t *tcp_listen_thread_event; int tcp_copy_max_size; /**< Max size of the copy list before copying must commence */ int tcp_listen_thread_num_sockets; /**< Number of sockets in tcp_listen_thread_sds */ diff --git a/orte/mca/oob/tcp/oob_tcp_msg.c b/orte/mca/oob/tcp/oob_tcp_msg.c index 2a00fc0b0a..0468404f4e 100644 --- a/orte/mca/oob/tcp/oob_tcp_msg.c +++ b/orte/mca/oob/tcp/oob_tcp_msg.c @@ -261,11 +261,13 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK) { return false; } - opal_output(0, "%s-%s mca_oob_tcp_msg_recv: 
readv failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->peer_name)), - strerror(opal_socket_errno), - opal_socket_errno); + if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) { + opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(peer->peer_name)), + strerror(opal_socket_errno), + opal_socket_errno); + } mca_oob_tcp_peer_close(peer); if (NULL != mca_oob_tcp.oob_exception_callback) { mca_oob_tcp.oob_exception_callback(&peer->peer_name, ORTE_RML_PEER_DISCONNECTED); diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index cf6220c665..22476eb17c 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. + * Copyright (c) 2006-2011 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. 
@@ -60,6 +60,7 @@ #include "opal/mca/event/event.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -102,12 +103,13 @@ static void mca_oob_tcp_peer_construct(mca_oob_tcp_peer_t* peer) { OBJ_CONSTRUCT(&(peer->peer_send_queue), opal_list_t); OBJ_CONSTRUCT(&(peer->peer_lock), opal_mutex_t); - memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event)); - memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event)); peer->peer_sd = -1; peer->peer_current_af = AF_UNSPEC; - memset(&peer->peer_timer_event, 0, sizeof(peer->peer_timer_event)); - opal_event_evtimer_set(opal_event_base, &peer->peer_timer_event, mca_oob_tcp_peer_timer_handler, peer); + /* get events */ + peer->peer_send_event = opal_event_alloc(); + peer->peer_recv_event = opal_event_alloc(); + peer->peer_timer_event = opal_event_alloc(); + opal_event_evtimer_set(orte_event_base, peer->peer_timer_event, mca_oob_tcp_peer_timer_handler, peer); } /* @@ -133,22 +135,21 @@ static void mca_oob_tcp_peer_destruct(mca_oob_tcp_peer_t * peer) */ static int mca_oob_tcp_peer_event_init(mca_oob_tcp_peer_t* peer) { - memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event)); - memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event)); - - if (peer->peer_sd >= 0) { - opal_event_set(opal_event_base, - &peer->peer_recv_event, + if (peer->peer_sd >= 0) { + opal_event_set(orte_event_base, + peer->peer_recv_event, peer->peer_sd, OPAL_EV_READ|OPAL_EV_PERSIST, mca_oob_tcp_peer_recv_handler, peer); - opal_event_set(opal_event_base, - &peer->peer_send_event, + opal_event_set_priority(peer->peer_recv_event, ORTE_MSG_PRI); + opal_event_set(orte_event_base, + peer->peer_send_event, peer->peer_sd, OPAL_EV_WRITE|OPAL_EV_PERSIST, mca_oob_tcp_peer_send_handler, peer); + opal_event_set_priority(peer->peer_send_event, ORTE_MSG_PRI); } return ORTE_SUCCESS; @@ -181,7 +182,7 @@ int 
mca_oob_tcp_peer_send(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg) append to the peer_send_queue. */ OPAL_THREAD_UNLOCK(&peer->peer_lock); rc = mca_oob_tcp_resolve(peer); - if (ORTE_ERR_ADDRESSEE_UNKNOWN != OPAL_SOS_GET_ERROR_CODE(rc)) { + if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) { OPAL_THREAD_LOCK(&peer->peer_lock); opal_list_append(&peer->peer_send_queue, (opal_list_item_t*)msg); @@ -204,7 +205,7 @@ int mca_oob_tcp_peer_send(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg) /*if the send does not complete */ if(!mca_oob_tcp_msg_send_handler(msg, peer)) { peer->peer_send_msg = msg; - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); } else { mca_oob_tcp_msg_complete(msg, &peer->peer_name); } @@ -399,7 +400,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family); if (ORTE_SUCCESS != rc) { struct timeval tv = { 1,0 }; - opal_event_evtimer_add(&peer->peer_timer_event, &tv); + opal_event_evtimer_add(peer->peer_timer_event, &tv); return rc; } @@ -414,7 +415,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) { /* non-blocking so wait for completion */ if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); return ORTE_SUCCESS; } @@ -445,7 +446,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) /* send our globally unique process identifier to the peer */ if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) { peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; - opal_event_add(&peer->peer_recv_event, 0); + opal_event_add(peer->peer_recv_event, 0); return ORTE_SUCCESS; } else { opal_output(0, @@ -505,8 +506,10 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) int so_error = 0; opal_socklen_t 
so_length = sizeof(so_error); - /* unregister from receiving event notifications */ - opal_event_del(&peer->peer_send_event); + /* unregister from receiving event notifications, + * but keep the event in case we need it later + */ + opal_event_del(peer->peer_send_event); /* check connect completion status */ if(getsockopt(sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { @@ -520,7 +523,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) } if(so_error == EINPROGRESS) { - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); return; } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) { struct timeval tv = { 1,0 }; @@ -534,7 +537,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) } mca_oob_tcp_peer_shutdown(peer); if( MCA_OOB_TCP_FAILED != peer->peer_state ) { - opal_event_evtimer_add(&peer->peer_timer_event, &tv); + opal_event_evtimer_add(peer->peer_timer_event, &tv); } return; } else if(so_error != 0) { @@ -554,7 +557,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) { peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; - opal_event_add(&peer->peer_recv_event, 0); + opal_event_add(peer->peer_recv_event, 0); } else { opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: unable to send connect ack.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -569,7 +572,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) */ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd) { - opal_event_del(&peer->peer_timer_event); + opal_event_del(peer->peer_timer_event); peer->peer_state = MCA_OOB_TCP_CONNECTED; peer->peer_retries = 0; @@ -578,7 +581,7 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd) peer->peer_send_msg = (mca_oob_tcp_msg_t*) opal_list_remove_first(&peer->peer_send_queue); } - 
opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); } } @@ -618,20 +621,8 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) /* inform the ERRMGR framework that we have lost a connection so * it can decide if this is important, what to do about it, etc. */ - if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state( - peer->peer_name.jobid, - ORTE_JOB_STATE_COMM_FAILED, - &peer->peer_name, - ORTE_PROC_STATE_COMM_FAILED, - 0, - ORTE_ERROR_DEFAULT_EXIT_CODE)) { - /* Should free the