From 41eb41c3f224abcacec78f4d77c138197c36f172 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 25 Sep 2019 13:27:41 -0700 Subject: [PATCH] Cleanup stale code in ORTE/OOB Remove code for multiple OOB progress threads as it is an optimization nobody uses. Also turns out to have a race condition that can cause segfault on finalize, so maybe good that nobody is using it. Signed-off-by: Ralph Castain --- orte/mca/oob/base/base.h | 1 - orte/mca/oob/base/oob_base_frame.c | 21 +----------------- orte/mca/oob/tcp/oob_tcp.c | 12 +---------- orte/mca/oob/tcp/oob_tcp_component.c | 31 --------------------------- orte/mca/oob/tcp/oob_tcp_component.h | 14 +----------- orte/mca/oob/tcp/oob_tcp_connection.c | 8 ++----- orte/mca/oob/tcp/oob_tcp_connection.h | 8 +++---- orte/mca/oob/tcp/oob_tcp_peer.h | 3 +-- orte/mca/oob/tcp/oob_tcp_sendrecv.h | 8 +++---- 9 files changed, 14 insertions(+), 92 deletions(-) diff --git a/orte/mca/oob/base/base.h b/orte/mca/oob/base/base.h index fb4ed1c0be..1046b879f3 100644 --- a/orte/mca/oob/base/base.h +++ b/orte/mca/oob/base/base.h @@ -63,7 +63,6 @@ typedef struct { opal_list_t actives; int max_uri_length; opal_hash_table_t peers; - int num_threads; #if OPAL_ENABLE_TIMING bool timing; #endif diff --git a/orte/mca/oob/base/oob_base_frame.c b/orte/mca/oob/base/oob_base_frame.c index be5c745e50..deaf851a90 100644 --- a/orte/mca/oob/base/oob_base_frame.c +++ b/orte/mca/oob/base/oob_base_frame.c @@ -15,7 +15,7 @@ * reserved. * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,14 +55,6 @@ orte_oob_base_t orte_oob_base = {0}; static int orte_oob_base_register(mca_base_register_flag_t flags) { - orte_oob_base.num_threads = 0; - (void)mca_base_var_register("orte", "oob", "base", "num_progress_threads", - "Number of independent progress OOB messages for each interface", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &orte_oob_base.num_threads); - #if OPAL_ENABLE_TIMING /* Detailed timing setup */ orte_oob_base.timing = false; @@ -91,10 +83,6 @@ static int orte_oob_base_close(void) OBJ_RELEASE(cli); } - if (!ORTE_PROC_IS_APP && !ORTE_PROC_IS_TOOL) { - opal_progress_thread_finalize("OOB-BASE"); - } - /* destruct our internal lists */ OBJ_DESTRUCT(&orte_oob_base.actives); @@ -122,13 +110,6 @@ static int orte_oob_base_open(mca_base_open_flag_t flags) opal_hash_table_init(&orte_oob_base.peers, 128); OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t); - if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) { - orte_oob_base.ev_base = orte_event_base; - } else { - orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE"); - } - - #if OPAL_ENABLE_FT_CR == 1 /* register the FT events callback */ orte_state.add_job_state(ORTE_JOB_STATE_FT_CHECKPOINT, orte_oob_base_ft_event, ORTE_ERROR_PRI); diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 15f326d277..b3bb8947af 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -141,12 +141,6 @@ static void ping(const orte_process_name_t *proc) return; } - /* has this peer had a progress thread assigned yet? */ - if (NULL == peer->ev_base) { - /* nope - assign one */ - ORTE_OOB_TCP_NEXT_BASE(peer); - } - /* if we are already connected, there is nothing to do */ if (MCA_OOB_TCP_CONNECTED == peer->state) { opal_output_verbose(2, orte_oob_base_framework.framework_output, @@ -204,11 +198,7 @@ static void send_nb(orte_rml_send_t *msg) __FILE__, __LINE__, ORTE_NAME_PRINT(&msg->dst), msg->tag, msg->seq_num, ORTE_NAME_PRINT(&peer->name)); - /* has this peer had a progress thread assigned yet? */ - if (NULL == peer->ev_base) { - /* nope - assign one */ - ORTE_OOB_TCP_NEXT_BASE(peer); - } + /* add the msg to the hop's send queue */ if (MCA_OOB_TCP_CONNECTED == peer->state) { opal_output_verbose(2, orte_oob_base_framework.framework_output, diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index d75216a23c..7623d74035 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -145,12 +145,8 @@ mca_oob_tcp_component_t mca_oob_tcp_component = { */ static int tcp_component_open(void) { - mca_oob_tcp_component.next_base = 0; OBJ_CONSTRUCT(&mca_oob_tcp_component.peers, opal_hash_table_t); opal_hash_table_init(&mca_oob_tcp_component.peers, 32); - OBJ_CONSTRUCT(&mca_oob_tcp_component.ev_bases, opal_pointer_array_t); - opal_pointer_array_init(&mca_oob_tcp_component.ev_bases, - orte_oob_base.num_threads, 256, 8); OBJ_CONSTRUCT(&mca_oob_tcp_component.listeners, opal_list_t); if (ORTE_PROC_IS_HNP) { @@ -204,8 +200,6 @@ static int tcp_component_close(void) } #endif - OBJ_DESTRUCT(&mca_oob_tcp_component.ev_bases); - return ORTE_SUCCESS; } static char *static_port_string; @@ -626,27 +620,11 @@ static int component_available(void) static int component_startup(void) { int rc = ORTE_SUCCESS; - int i; - char *tmp; - opal_event_base_t *evb; opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s TCP STARTUP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* initialize state */ - if (0 == orte_oob_base.num_threads) { - opal_pointer_array_add(&mca_oob_tcp_component.ev_bases, orte_oob_base.ev_base); - } else { - for (i=0; i < orte_oob_base.num_threads; i++) { - opal_asprintf(&tmp, "OOB-TCP-%d", i); - evb = opal_progress_thread_init(tmp); - opal_pointer_array_add(&mca_oob_tcp_component.ev_bases, evb); - opal_argv_append_nosize(&mca_oob_tcp_component.ev_threads, tmp); - free(tmp); - } - } - /* if we are a daemon/HNP, or we are a standalone app, * then it is possible that someone else may initiate a * connection to us. In these cases, we need to start the @@ -674,14 +652,6 @@ static void component_shutdown(void) "%s TCP SHUTDOWN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - if (0 < orte_oob_base.num_threads) { - for (i=0; i < orte_oob_base.num_threads; i++) { - opal_progress_thread_finalize(mca_oob_tcp_component.ev_threads[i]); - opal_pointer_array_set_item(&mca_oob_tcp_component.ev_bases, i, NULL); - } - opal_argv_free(mca_oob_tcp_component.ev_threads); - } - if (ORTE_PROC_IS_HNP && mca_oob_tcp_component.listen_thread_active) { mca_oob_tcp_component.listen_thread_active = false; /* tell the thread to exit */ @@ -1327,7 +1297,6 @@ static char **split_and_resolve(char **orig_str, char *name) static void peer_cons(mca_oob_tcp_peer_t *peer) { - peer->ev_base = NULL; peer->auth_method = NULL; peer->sd = -1; OBJ_CONSTRUCT(&peer->addrs, opal_list_t); diff --git a/orte/mca/oob/tcp/oob_tcp_component.h b/orte/mca/oob/tcp/oob_tcp_component.h index 37f91466f0..7a36ea3c30 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.h +++ b/orte/mca/oob/tcp/oob_tcp_component.h @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,9 +48,6 @@ typedef struct { int max_retries; /**< max number of retries before declaring peer gone */ opal_list_t events; /**< events for monitoring connections */ int peer_limit; /**< max size of tcp peer cache */ - opal_pointer_array_t ev_bases; // event base array for progress threads - char** ev_threads; // event progress thread names - int next_base; // counter to load-level thread use opal_hash_table_t peers; // connection addresses for peers /* Port specifications */ @@ -96,13 +93,4 @@ ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_failed_to_connect(int fd, short ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata); ORTE_MODULE_DECLSPEC void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata); -#define ORTE_OOB_TCP_NEXT_BASE(p) \ - do { \ - ++mca_oob_tcp_component.next_base; \ - if (orte_oob_base.num_threads <= mca_oob_tcp_component.next_base) { \ - mca_oob_tcp_component.next_base = 0; \ - } \ - (p)->ev_base = (opal_event_base_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.ev_bases, mca_oob_tcp_component.next_base); \ - } while(0) - #endif /* _MCA_OOB_TCP_COMPONENT_H_ */ diff --git a/orte/mca/oob/tcp/oob_tcp_connection.c b/orte/mca/oob/tcp/oob_tcp_connection.c index 819d2d77bf..e5f322fb6a 100644 --- a/orte/mca/oob/tcp/oob_tcp_connection.c +++ b/orte/mca/oob/tcp/oob_tcp_connection.c @@ -507,10 +507,7 @@ static void tcp_peer_event_init(mca_oob_tcp_peer_t* peer) { if (peer->sd >= 0) { assert(!peer->send_ev_active && !peer->recv_ev_active); - if (NULL == peer->ev_base) { - ORTE_OOB_TCP_NEXT_BASE(peer); - } - opal_event_set(peer->ev_base, + opal_event_set(orte_event_base, &peer->recv_event, peer->sd, OPAL_EV_READ|OPAL_EV_PERSIST, @@ -522,7 +519,7 @@ static void tcp_peer_event_init(mca_oob_tcp_peer_t* peer) peer->recv_ev_active = false; } - opal_event_set(peer->ev_base, + opal_event_set(orte_event_base, &peer->send_event, peer->sd, OPAL_EV_WRITE|OPAL_EV_PERSIST, @@ -803,7 +800,6 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); peer = OBJ_NEW(mca_oob_tcp_peer_t); peer->name = hdr.origin; - ORTE_OOB_TCP_NEXT_BASE(peer); // assign it an event base peer->state = MCA_OOB_TCP_ACCEPTING; ui64 = (uint64_t*)(&peer->name); if (OPAL_SUCCESS != opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, (*ui64), peer)) { diff --git a/orte/mca/oob/tcp/oob_tcp_connection.h b/orte/mca/oob/tcp/oob_tcp_connection.h index e1392fe781..0cac37d8da 100644 --- a/orte/mca/oob/tcp/oob_tcp_connection.h +++ b/orte/mca/oob/tcp/oob_tcp_connection.h @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -60,14 +60,14 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t); ORTE_NAME_PRINT((&(p)->name))); \ cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \ cop->peer = (p); \ - ORTE_THREADSHIFT(cop, (p)->ev_base, (cbfunc), ORTE_MSG_PRI); \ + ORTE_THREADSHIFT(cop, orte_event_base, (cbfunc), ORTE_MSG_PRI); \ } while(0); #define ORTE_ACTIVATE_TCP_ACCEPT_STATE(s, a, cbfunc) \ do { \ mca_oob_tcp_conn_op_t *cop; \ cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \ - opal_event_set(orte_oob_base.ev_base, &cop->ev, s, \ + opal_event_set(orte_event_base, &cop->ev, s, \ OPAL_EV_READ, (cbfunc), cop); \ opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \ ORTE_POST_OBJECT(cop); \ @@ -84,7 +84,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t); ORTE_NAME_PRINT((&(p)->name))); \ cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \ cop->peer = (p); \ - opal_event_evtimer_set((p)->ev_base, \ + opal_event_evtimer_set(orte_event_base, \ &cop->ev, \ (cbfunc), cop); \ ORTE_POST_OBJECT(cop); \ diff --git a/orte/mca/oob/tcp/oob_tcp_peer.h b/orte/mca/oob/tcp/oob_tcp_peer.h index 9a175e084e..facf170c73 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.h +++ b/orte/mca/oob/tcp/oob_tcp_peer.h @@ -52,7 +52,6 @@ typedef struct { mca_oob_tcp_addr_t *active_addr; mca_oob_tcp_state_t state; int num_retries; - opal_event_base_t *ev_base; // progress thread this peer is assigned to opal_event_t send_event; /**< registration with event thread for send events */ bool send_ev_active; opal_event_t recv_event; /**< registration with event thread for recv events */ @@ -82,7 +81,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t); pop = OBJ_NEW(mca_oob_tcp_peer_op_t); \ pop->peer.jobid = (p)->name.jobid; \ pop->peer.vpid = (p)->name.vpid; \ - ORTE_THREADSHIFT(pop, orte_oob_base.ev_base, \ + ORTE_THREADSHIFT(pop, orte_event_base, \ (cbfunc), ORTE_MSG_PRI); \ } while(0); diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.h b/orte/mca/oob/tcp/oob_tcp_sendrecv.h index 1ac1b570fc..b4d14186f4 100644 --- a/orte/mca/oob/tcp/oob_tcp_sendrecv.h +++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.h @@ -83,7 +83,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t); do { \ (s)->peer = (struct mca_oob_tcp_peer_t*)(p); \ (s)->activate = (f); \ - ORTE_THREADSHIFT((s), (p)->ev_base, \ + ORTE_THREADSHIFT((s), orte_event_base, \ mca_oob_tcp_queue_msg, ORTE_MSG_PRI); \ } while(0) @@ -228,7 +228,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_op_t); ORTE_NAME_PRINT(&((ms)->dst))); \ mop = OBJ_NEW(mca_oob_tcp_msg_op_t); \ mop->msg = (ms); \ - ORTE_THREADSHIFT(mop, (ms)->peer->ev_base, \ + ORTE_THREADSHIFT(mop, orte_event_base, \ (cbfunc), ORTE_MSG_PRI); \ } while(0); @@ -274,7 +274,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t); mop->hop.jobid = (h)->jobid; \ mop->hop.vpid = (h)->vpid; \ /* this goes to the OOB framework, so use that event base */ \ - ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \ + ORTE_THREADSHIFT(mop, orte_event_base, \ (cbfunc), ORTE_MSG_PRI); \ } while(0) @@ -292,7 +292,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t); mop->hop.vpid = (h)->vpid; \ /* this goes to the component, so use the framework \ * event base */ \ - ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \ + ORTE_THREADSHIFT(mop, orte_event_base, \ (c), ORTE_MSG_PRI); \ } while(0)