diff --git a/LICENSE b/LICENSE index 61b0e2bd87..5c3b10295d 100644 --- a/LICENSE +++ b/LICENSE @@ -14,7 +14,7 @@ Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, University of Stuttgart. All rights reserved. Copyright (c) 2004-2006 The Regents of the University of California. All rights reserved. -© Copyright 2006 Los Alamos National Security, LLC. All rights +Copyright (c) 2006 Los Alamos National Security, LLC. All rights reserved. Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. Copyright (c) 2006 Voltaire, Inc. All rights reserved. diff --git a/ompi/communicator/comm_dyn.c b/ompi/communicator/comm_dyn.c index c49d2496d5..7421ddbe7f 100644 --- a/ompi/communicator/comm_dyn.c +++ b/ompi/communicator/comm_dyn.c @@ -10,6 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 University of Houston. All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -83,7 +86,7 @@ int ompi_comm_connect_accept ( ompi_communicator_t *comm, int root, /* tell the progress engine to tick the event library more often, to make sure that the OOB messages get sent */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); if ( rank == root ) { /* The process receiving first does not have yet the contact @@ -243,8 +246,7 @@ int ompi_comm_connect_accept ( ompi_communicator_t *comm, int root, exit: /* done with OOB and such - slow our tick rate again */ opal_progress(); - opal_progress_event_decrement(); - + opal_progress_event_users_decrement(); if ( NULL != rprocs ) { free ( rprocs ); @@ -380,7 +382,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, */ /* make sure the progress engine properly trips the event library */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); /* check to see if we want timing information */ param = mca_base_param_reg_int_name("ompi", "timing", @@ -438,7 +440,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* rollback what was already done */ for (j=0; j < i; j++) OBJ_RELEASE(apps[j]); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return ORTE_ERR_OUT_OF_RESOURCE; } /* copy over the name of the executable */ @@ -447,7 +449,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* rollback what was already done */ for (j=0; j < i; j++) OBJ_RELEASE(apps[j]); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return ORTE_ERR_OUT_OF_RESOURCE; } /* record the number of procs to be generated */ @@ -474,7 +476,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, for (j=0; j < i; j++) { OBJ_RELEASE(apps[j]); } - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return ORTE_ERR_OUT_OF_RESOURCE; 
} apps[i]->argv[0] = strdup(array_of_commands[i]); @@ -497,7 +499,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* rollback what was already done */ for (j=0; j < i; j++) OBJ_RELEASE(apps[j]); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&(apps[i]->env[0]), "OMPI_PARENT_PORT=%s", port_name); @@ -571,7 +573,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_RMGR_ATTR_OVERRIDE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&attributes); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return MPI_ERR_SPAWN; } @@ -583,7 +585,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_RMGR_ATTR_OVERRIDE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&attributes); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return MPI_ERR_SPAWN; } @@ -593,7 +595,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, ORTE_RMGR_ATTR_OVERRIDE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&attributes); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return MPI_ERR_SPAWN; } @@ -616,7 +618,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, /* spawn procs */ if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, 0, NULL, NULL, ORTE_PROC_STATE_NONE, &attributes))) { ORTE_ERROR_LOG(rc); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); return MPI_ERR_SPAWN; } @@ -632,7 +634,7 @@ ompi_comm_start_processes(int count, char **array_of_commands, } /* clean up */ - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attributes); diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index ca65ddb55b..105da43ba9 100644 --- 
a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -506,7 +509,7 @@ static void mca_btl_mvapi_endpoint_connected(mca_btl_mvapi_endpoint_t *endpoint) /** * The connection is correctly setup. Now we can decrease the event trigger. */ - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); while(!opal_list_is_empty(&(endpoint->pending_send_frags))) { frag_item = opal_list_remove_first(&(endpoint->pending_send_frags)); @@ -647,7 +650,7 @@ static void mca_btl_mvapi_endpoint_recv( * let the event engine pool the OOB events. Note: we increment it once peer active * connection. */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); break; case MCA_BTL_IB_CONNECTING : @@ -749,7 +752,7 @@ int mca_btl_mvapi_endpoint_send( * let the event engine pool the OOB events. Note: we increment it once peer active * connection. */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); call_progress = 1; break; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 49a46f2cd5..aa6ddf69c9 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -10,6 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -542,7 +545,7 @@ static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoin /** * The connection is correctly setup. 
Now we can decrease the event trigger. */ - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); /* While there are frags in the list, * process them */ @@ -731,7 +734,7 @@ static void mca_btl_openib_endpoint_recv( * let the event engine pool the OOB events. Note: we increment it once peer active * connection. */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); break; case MCA_BTL_IB_CONNECTING : @@ -831,7 +834,7 @@ int mca_btl_openib_endpoint_send( * let the event engine pool the OOB events. Note: we increment it once peer active * connection. */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); call_progress = true; break; diff --git a/ompi/mca/btl/tcp/btl_tcp.c b/ompi/mca/btl/tcp/btl_tcp.c index ee757fcc9d..535de66e67 100644 --- a/ompi/mca/btl/tcp/btl_tcp.c +++ b/ompi/mca/btl/tcp/btl_tcp.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -130,7 +133,7 @@ int mca_btl_tcp_add_procs( /* we increase the count of MPI users of the event library once per peer, so that we are used until we aren't connected to a peer */ - opal_progress_event_increment(); + opal_progress_event_users_increment(); } return OMPI_SUCCESS; @@ -149,7 +152,7 @@ int mca_btl_tcp_del_procs(struct mca_btl_base_module_t* btl, opal_list_remove_item(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); OBJ_RELEASE(tcp_endpoint); } - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); } return OMPI_SUCCESS; } @@ -490,7 +493,7 @@ int mca_btl_tcp_finalize(struct mca_btl_base_module_t* btl) item = opal_list_remove_first(&tcp_btl->tcp_endpoints)) { mca_btl_tcp_endpoint_t *endpoint = (mca_btl_tcp_endpoint_t*)item; OBJ_RELEASE(endpoint); - opal_progress_event_decrement(); + opal_progress_event_users_decrement(); } free(tcp_btl); return OMPI_SUCCESS; diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index 25e4ffaa3c..557ba24052 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -10,6 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -115,11 +118,12 @@ int ompi_mpi_finalize(void) ompi_mpi_finalized = true; #if OMPI_ENABLE_PROGRESS_THREADS == 0 - opal_progress_events(OPAL_EVLOOP_ONELOOP); + opal_progress_set_event_flag(OPAL_EVLOOP_ONELOOP); #endif - /* Change progress function priority back to RTE level stuff */ - opal_progress_mpi_disable(); + /* Restore the opal_progress_event_users_increment() made by ORTE, which + ompi_mpi_init() undid for the MPI lifetime to get better latency when not using TCP */ + opal_progress_event_users_increment(); /* If maffinity was setup, tear it down */ if (ompi_mpi_maffinity_setup) { diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index c616c45e3a..2d8c93c0f5 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -10,6 +10,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -281,7 +284,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* Setup process affinity */ if (ompi_mpi_paffinity_alone) { - int param, value; bool set = false; param = mca_base_param_find("mpi", NULL, "paffinity_processor"); if (param >= 0) { @@ -328,13 +330,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) goto error; } - /* initialize the progress engine for MPI functionality */ - if (OMPI_SUCCESS != opal_progress_mpi_init()) { - error = "opal_progress_mpi_init() failed"; - goto error; - } - - /* initialize ops. This has to be done *after* ddt_init, but befor mca_coll_base_open, since come collective modules (e.g. 
the hierarchical) need them in the query function @@ -678,17 +673,29 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) #if OMPI_ENABLE_PROGRESS_THREADS == 0 /* switch from letting us sit in the event library for a bit each time through opal_progress() to completely non-blocking */ - opal_progress_events(OPAL_EVLOOP_NONBLOCK); + opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); #endif - /* put the event library in "high performance MPI mode" */ - if (OMPI_SUCCESS != (ret = opal_progress_mpi_enable())) { - error = "opal_progress_mpi_enable() failed"; - /* This will loop back up above, but ret != OMPI_SUCCESS, so - we'll end up returning out of this function before getting - here (and therefore avoiding an infinite loop) */ - goto error; + /* Undo ORTE calling opal_progress_event_users_increment() during + MPI lifetime, to get better latency when not using TCP */ + opal_progress_event_users_decrement(); + + /* override ORTE setting yield_when_idle, if desired */ + param = mca_base_param_find("mpi", NULL, "yield_when_idle"); + mca_base_param_lookup_int(param, &value); + if (value < 0) { + /* if we got a bogus value, do the conservative thing... */ + opal_progress_set_yield_when_idle(true); + } else { + opal_progress_set_yield_when_idle(value == 0 ? false : true); } + param = mca_base_param_find("mpi", NULL, "event_tick_rate"); + mca_base_param_lookup_int(param, &value); + /* negative value means use default - just don't do anything */ + if (value >= 0) { + opal_progress_set_event_poll_rate(value); + } + /* If we want the connection warmup, go do it */ if (ompi_mpi_preconnect_all) { diff --git a/opal/runtime/opal_params.c b/opal/runtime/opal_params.c index a00bce000a..e57eba23ab 100644 --- a/opal/runtime/opal_params.c +++ b/opal/runtime/opal_params.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
+ * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +34,8 @@ int opal_register_params(void) { + + /* * This string is going to be used in opal/util/stacktrace.c */ @@ -68,5 +73,11 @@ int opal_register_params(void) false, false, string, NULL); } +#if OMPI_ENABLE_DEBUG + mca_base_param_reg_int_name("opal", "progress_debug", + "Set to non-zero to debug progress engine features", + false, false, 0, NULL); +#endif + return OPAL_SUCCESS; } diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c index 19b180f6cf..9033e0f251 100644 --- a/opal/runtime/opal_progress.c +++ b/opal/runtime/opal_progress.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,15 +37,8 @@ * default parameters */ static int opal_progress_event_flag = OPAL_EVLOOP_ONELOOP; -#if OPAL_PROGRESS_USE_TIMERS -static const opal_timer_t opal_progress_default_tick_rate = 10000; /* 10ms */ -#else -static const int opal_progress_default_tick_rate = 10000; /* 10k calls to opal_progress */ -#endif - volatile int32_t opal_progress_thread_count = 0; int opal_progress_spin_count = 10000; - /* @@ -71,118 +67,33 @@ static int32_t event_progress_delta = 0; #endif /* users of the event library from MPI cause the tick rate to be every time */ -static int32_t event_num_mpi_users = 0; +static int32_t num_event_users = 0; + +#if OMPI_ENABLE_DEBUG +static int debug_output = -1; +#endif /* init the progress engine - called from orte_init */ int opal_progress_init(void) { + int param, value; + /* reentrant issues */ #if OMPI_HAVE_THREAD_SUPPORT opal_atomic_init(&progress_lock, OPAL_ATOMIC_UNLOCKED); #endif /* OMPI_HAVE_THREAD_SUPPORT */ - /* always call sched yield when 
in the rte only... */ - call_yield = 1; - -#if OPAL_PROGRESS_USE_TIMERS - event_progress_delta = 0; -#if OPAL_TIMER_USEC_NATIVE - event_progress_last_time = opal_timer_base_get_usec(); -#else - event_progress_last_time = opal_timer_base_get_cycles(); -#endif -#else - event_progress_counter = event_progress_delta = 0; -#endif - - return OPAL_SUCCESS; -} - - -int -opal_progress_mpi_init(void) -{ - event_num_mpi_users = 0; - - return OPAL_SUCCESS; -} - -/* turn on MPI optimizations */ -int -opal_progress_mpi_enable(void) -{ - int param, value; - - /* call sched yield when oversubscribed. */ - param = mca_base_param_find("mpi", NULL, "yield_when_idle"); - mca_base_param_lookup_int(param, &value); - - if (value < 0) { - /* this should never happen set to 1 if it somehow does */ - call_yield = 1; - } else { - call_yield = value; - } - /* set the event tick rate */ - param = mca_base_param_find("mpi", NULL, "event_tick_rate"); + opal_progress_set_event_poll_rate(10000); + +#if OMPI_ENABLE_DEBUG + param = mca_base_param_find("opal", NULL, "progress_debug"); mca_base_param_lookup_int(param, &value); - - if (value < 0) { - /* user didn't specify - default tick rate */ - event_progress_delta = opal_progress_default_tick_rate; - } else if (value == 0) { -#if OPAL_PROGRESS_USE_TIMERS - /* user specified as never tick - tick once per minute */ - event_progress_delta = 60 * 1000000; -#else - /* user specified as never tick - don't count often */ - event_progress_delta = INT_MAX; -#endif - } else { -#if OPAL_PROGRESS_USE_TIMERS - event_progress_delta = value; -#else - /* subtract one so that we can do post-fix subtraction - in the inner loop and go faster */ - event_progress_delta = value - 1; -#endif + if (value) { + debug_output = opal_output_open(NULL); } -#if OPAL_PROGRESS_USE_TIMERS && !OPAL_TIMER_USEC_NATIVE - /* going to use cycles for counter. 
Adjust specified usec into cycles */ - event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000; -#endif - -#if OPAL_PROGRESS_USE_TIMERS -#if OPAL_TIMER_USEC_NATIVE - event_progress_last_time = opal_timer_base_get_usec(); -#else - event_progress_last_time = opal_timer_base_get_cycles(); -#endif -#else - /* it's possible that an init function bumped up our tick rate. - * If so, set the event_progress counter to 0. Otherwise, set it to - * the reset value */ - event_progress_counter = (event_num_mpi_users > 0) ? - 0 : event_progress_delta; -#endif - - return OPAL_SUCCESS; -} - - -int -opal_progress_mpi_disable(void) -{ - /* always call sched yield from here on... */ - call_yield = 1; - - /* always tick the event library */ - event_progress_delta = 0; -#if !OPAL_PROGRESS_USE_TIMERS - event_progress_counter = 0; #endif return OPAL_SUCCESS; @@ -192,19 +103,17 @@ opal_progress_mpi_disable(void) int opal_progress_finalize(void) { - /* don't need to free the progess lock */ - /* free memory associated with the callbacks */ #if OMPI_HAVE_THREAD_SUPPORT opal_atomic_lock(&progress_lock); #endif + callbacks_len = 0; + callbacks_size = 0; if (NULL != callbacks) { free(callbacks); callbacks = NULL; } - callbacks_len = 0; - callbacks_size = 0; #if OMPI_HAVE_THREAD_SUPPORT opal_atomic_unlock(&progress_lock); @@ -214,14 +123,6 @@ opal_progress_finalize(void) } - -void -opal_progress_events(int flag) -{ - opal_progress_event_flag = flag; -} - - /* * Progress the event library and any functions that have registered to * be called. We don't propogate errors from the progress functions, @@ -253,7 +154,7 @@ opal_progress(void) #if OMPI_HAVE_THREAD_SUPPORT if (opal_atomic_trylock(&progress_lock)) { #endif /* OMPI_HAVE_THREAD_SUPPORT */ - event_progress_last_time = (event_num_mpi_users > 0) ? + event_progress_last_time = (num_event_users > 0) ? 
now - event_progress_delta : now; events += opal_event_loop(opal_progress_event_flag); @@ -271,7 +172,7 @@ opal_progress(void) if (opal_atomic_trylock(&progress_lock)) { #endif /* OMPI_HAVE_THREAD_SUPPORT */ event_progress_counter = - (event_num_mpi_users > 0) ? 0 : event_progress_delta; + (num_event_users > 0) ? 0 : event_progress_delta; events += opal_event_loop(opal_progress_event_flag); #if OMPI_HAVE_THREAD_SUPPORT opal_atomic_unlock(&progress_lock); @@ -305,6 +206,103 @@ opal_progress(void) } +int +opal_progress_set_event_flag(int flag) +{ + int tmp = opal_progress_event_flag; + opal_progress_event_flag = flag; + return tmp; +} + + +void +opal_progress_event_users_increment(void) +{ + int32_t val; + val = opal_atomic_add_32(&num_event_users, 1); + + OPAL_OUTPUT((debug_output, "event_users_increment setting count to %d", val)); + +#if OPAL_PROGRESS_USE_TIMERS + /* force an update next round (we'll be past the delta) */ + event_progress_last_time -= event_progress_delta; +#else + /* always reset the tick rate - can't hurt */ + event_progress_counter = 0; +#endif +} + + +void +opal_progress_event_users_decrement(void) +{ + int32_t val; + val = opal_atomic_sub_32(&num_event_users, 1); + + OPAL_OUTPUT((debug_output, "event_users_decrement setting count to %d", val)); + +#if !OPAL_PROGRESS_USE_TIMERS + /* start now in delaying if it's easy */ + if (val >= 0) { + event_progress_counter = event_progress_delta; + } +#endif +} + + +bool +opal_progress_set_yield_when_idle(bool yieldopt) +{ + bool tmp = (call_yield == 0) ? false : true; + call_yield = (yieldopt) ? 
1 : 0; + + OPAL_OUTPUT((debug_output, "progress_set_yield_when_idle to %d", call_yield)); + + return tmp; +} + + +void +opal_progress_set_event_poll_rate(int polltime) +{ + OPAL_OUTPUT((debug_output, "progress_set_event_poll_rate(%d)", polltime)); + +#if OPAL_PROGRESS_USE_TIMERS + event_progress_delta = 0; +# if OPAL_TIMER_USEC_NATIVE + event_progress_last_time = opal_timer_base_get_usec(); +# else + event_progress_last_time = opal_timer_base_get_cycles(); +# endif +#else + event_progress_counter = event_progress_delta = 0; +#endif + + if (polltime == 0) { +#if OPAL_PROGRESS_USE_TIMERS + /* user specified as never tick - tick once per minute */ + event_progress_delta = 60 * 1000000; +#else + /* user specified as never tick - don't count often */ + event_progress_delta = INT_MAX; +#endif + } else { +#if OPAL_PROGRESS_USE_TIMERS + event_progress_delta = polltime; +#else + /* subtract one so that we can do post-fix subtraction + in the inner loop and go faster */ + event_progress_delta = polltime - 1; +#endif + } + +#if OPAL_PROGRESS_USE_TIMERS && !OPAL_TIMER_USEC_NATIVE + /* going to use cycles for counter. 
Adjust specified usec into cycles */ + event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000; +#endif +} + + int opal_progress_register(opal_progress_callback_t cb) { @@ -378,38 +376,3 @@ opal_progress_unregister(opal_progress_callback_t cb) return ret; } - - -int -opal_progress_event_increment() -{ - int32_t val; - val = opal_atomic_add_32(&event_num_mpi_users, 1); - -#if OPAL_PROGRESS_USE_TIMERS - /* force an update next round (we'll be past the delta) */ - event_progress_last_time -= event_progress_delta; -#else - /* always reset the tick rate - can't hurt */ - event_progress_counter = 0; -#endif - - return OPAL_SUCCESS; -} - - -int -opal_progress_event_decrement() -{ - int32_t val; - val = opal_atomic_sub_32(&event_num_mpi_users, 1); - -#if !OPAL_PROGRESS_USE_TIMERS - /* start now in delaying if it's easy */ - if (val >= 0) { - event_progress_counter = event_progress_delta; - } -#endif - - return OPAL_SUCCESS; -} diff --git a/opal/runtime/opal_progress.h b/opal/runtime/opal_progress.h index 3eca01c801..29d4ffb02b 100644 --- a/opal/runtime/opal_progress.h +++ b/opal/runtime/opal_progress.h @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -22,11 +25,13 @@ * Progress engine for Open MPI */ -#ifndef _OMPI_PROGRESS_H_ -#define _OMPI_PROGRESS_H_ +#ifndef OPAL_RUNTIME_OPAL_PROGRESS_H +#define OPAL_RUNTIME_OPAL_PROGRESS_H + #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif + #include "opal/threads/mutex.h" /** @@ -39,35 +44,6 @@ extern "C" { */ OPAL_DECLSPEC int opal_progress_init(void); -/** - * Configure the progress engine for executing MPI applications - * - * Register to receive any needed information from the GPR and - * intialize any data structures required for MPI applications. - * - * \note opal_progress_init() must be called before calling - * this function. Failure to do so is an error. - */ -OPAL_DECLSPEC int opal_progress_mpi_init(void); - -/** - * Turn on optimizations for MPI progress - * - * Turn on optimizations for MPI applications. This includes lowering - * the rate at which the event library is ticked if it is not under - * active use and possibly disabling the sched_yield call when the - * progress engine is idle - */ -OPAL_DECLSPEC int opal_progress_mpi_enable(void); - -/** - * Turn off all optimizations enabled by opal_progress_mpi_enable(). - * - * Completely reverses all optimizations enabled by - * opal_progress_mpi_enable(). The event library resumes constant - * ticking and the progress engine yields the CPU when idle. - */ -OPAL_DECLSPEC int opal_progress_mpi_disable(void); /** * Shut down the progress engine @@ -78,45 +54,126 @@ OPAL_DECLSPEC int opal_progress_mpi_disable(void); */ OPAL_DECLSPEC int opal_progress_finalize(void); -/** - * Control how the event library is called - */ -OPAL_DECLSPEC void opal_progress_events(int); /** * Progress all pending events + * + * Progress all pending events. All registered event handlers will be + * called every call into opal_progress(). 
The event library will be + * called if the number of event library users is greater than 0 (adjustments + * can be made by calling opal_progress_event_users_increment() and + * opal_progress_event_users_decrement()) or the time since the last call + * into the event library is greater than the progress tick rate (by + * default, 10ms). */ OPAL_DECLSPEC void opal_progress(void); + +/** + * Control how the event library is called + * + * Adjust the flags argument used to call opal_event_loop() from + * opal_progress(). The default argument is OPAL_EVLOOP_ONELOOP, + * meaning that the call to opal_event_loop() will block pending + * events, but may block for a period of time. + * + * @param flags One of the valid flags arguments to + * opal_event_loop(). + * @return Previous value of flags used to call + * opal_event_loop(). + */ +OPAL_DECLSPEC int opal_progress_set_event_flag(int flags); + + +/** + * Increase the number of users of the event library + * + * Increase the number of users of the event library. This count is + * used by opal_progress to determine if opal_event_loop() should be + * called every call to opal_progress() or only after a time has + * elapsed since the last call (by default, 10ms). The count defaults + * to 0, meaning that opal_progress_event_users_increment() must be + * called at least once for the event loop to be called on every entry + * to opal_progress(). + * + */ +OPAL_DECLSPEC void opal_progress_event_users_increment(void); + + +/** + * Decrease the number of users of the event library + * + * Decrease the number of users of the event library. This count is + * used by opal_progress to determine if opal_event_loop() should be + * called every call to opal_progress() or only after a time has + * elapsed since the last call (by default, 10ms). 
+ */ +OPAL_DECLSPEC void opal_progress_event_users_decrement(void); + + +/** + * Set whether opal_progress() should yield when idle + * + * Set whether opal_progress() should yield the processor (either by + * sched_yield() or SwitchToThread()) if no events were progressed + * during the progress loop. The return value of the callback + * functions is used to determine whether or not yielding is required. + * By default, the event loop will yield when the progress function is + * idle. + * + * @param yieldopt Whether to yield when idle. + * @return Previous value of the yield_when_idle option. + */ +OPAL_DECLSPEC bool opal_progress_set_yield_when_idle(bool yieldopt); + + +/** + * Set time between calls into the event library + * + * Set time between calls into the event library when there are no + * users of the event library (set by + * opal_progress_event_users_increment() and + * opal_progress_event_users_decrement()). + * + * @param microseconds Time (in microseconds) between calls to the event + * library + */ +OPAL_DECLSPEC void opal_progress_set_event_poll_rate(int microseconds); + + +/** + * Progress callback function typedef + * + * Prototype for a progress function callback. Progress function + * callbacks can be registered with opal_progress_register() and + * deregistered with opal_progress_unregister(). It should be noted + * that either registering or deregistering a function callback is an + * extraordinarily expensive operation and should not be used for + * potentially short callback lifetimes. + * + * @return Number of events progressed during the callback + */ typedef int (*opal_progress_callback_t)(void); + /** * Register an event to be progressed + * + * Register an event to be progressed during calls to opal_progress(). + * Please read the note in opal_progress_callback_t. 
*/ OPAL_DECLSPEC int opal_progress_register(opal_progress_callback_t cb); /** - * Unregister previously registered event + * Deregister previously registered event + * + * Deregister an event to be progressed during calls to opal_progress(). + * Please read the note in opal_progress_callback_t. */ OPAL_DECLSPEC int opal_progress_unregister(opal_progress_callback_t cb); -/** - * Increase count of MPI users of the event library - */ -OPAL_DECLSPEC int opal_progress_event_increment(void); - -/** - * Decrease count of MPI users of the event library - */ -OPAL_DECLSPEC int opal_progress_event_decrement(void); - - -/** - * Progress until flag is true or poll iterations completed - */ - OPAL_DECLSPEC extern volatile int32_t opal_progress_thread_count; OPAL_DECLSPEC extern int opal_progress_spin_count; @@ -126,6 +183,9 @@ static inline bool opal_progress_threads(void) } +/** + * Progress until flag is true or poll iterations completed + */ static inline bool opal_progress_spin(volatile bool* complete) { int32_t c; diff --git a/orte/runtime/orte_init_stage1.c b/orte/runtime/orte_init_stage1.c index f5a91baeca..2db7958b76 100644 --- a/orte/runtime/orte_init_stage1.c +++ b/orte/runtime/orte_init_stage1.c @@ -9,6 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -159,6 +162,8 @@ int orte_init_stage1(bool infrastructure) error = "opal_progress_init"; goto error; } + /* we want to tick the event library whenever possible */ + opal_progress_event_users_increment(); /* * Internal startup