Fix the SPC initialization.
Use the PVAR ctx to store the SPC index, so that neither a lookup nor a restriction on the position of the SPC vars is needed. Make sure the PVARs are always registered. Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
родитель
cadf315ca9
Коммит
dbf89404d7
@ -1,11 +1,13 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2018 The University of Tennessee and The University
|
* Copyright (c) 2018-2019 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
*
|
*
|
||||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||||
* Copyright (c) 2018 Research Organization for Information Science
|
* Copyright (c) 2018 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies, Inc.
|
||||||
|
* All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -20,10 +22,8 @@ opal_timer_t sys_clock_freq_mhz = 0;
|
|||||||
static void ompi_spc_dump(void);
|
static void ompi_spc_dump(void);
|
||||||
|
|
||||||
/* Array for converting from SPC indices to MPI_T indices */
|
/* Array for converting from SPC indices to MPI_T indices */
|
||||||
OMPI_DECLSPEC int mpi_t_offset = -1;
|
static bool mpi_t_enabled = false;
|
||||||
OMPI_DECLSPEC bool mpi_t_enabled = false;
|
static ompi_communicator_t *ompi_spc_comm = NULL;
|
||||||
|
|
||||||
OPAL_DECLSPEC ompi_communicator_t *comm = NULL;
|
|
||||||
|
|
||||||
typedef struct ompi_spc_event_t {
|
typedef struct ompi_spc_event_t {
|
||||||
const char* counter_name;
|
const char* counter_name;
|
||||||
@ -185,6 +185,8 @@ static int ompi_spc_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, v
|
|||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
index = (int)(uintptr_t)pvar->ctx; /* Convert from MPI_T pvar index to SPC index */
|
||||||
|
|
||||||
/* For this event, we need to set count to the number of long long type
|
/* For this event, we need to set count to the number of long long type
|
||||||
* values for this counter. All SPC counters are one long long, so we
|
* values for this counter. All SPC counters are one long long, so we
|
||||||
* always set count to 1.
|
* always set count to 1.
|
||||||
@ -194,14 +196,10 @@ static int ompi_spc_notify(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, v
|
|||||||
}
|
}
|
||||||
/* For this event, we need to turn on the counter */
|
/* For this event, we need to turn on the counter */
|
||||||
else if(MCA_BASE_PVAR_HANDLE_START == event) {
|
else if(MCA_BASE_PVAR_HANDLE_START == event) {
|
||||||
/* Convert from MPI_T pvar index to SPC index */
|
|
||||||
index = pvar->pvar_index - mpi_t_offset;
|
|
||||||
SET_SPC_BIT(ompi_spc_attached_event, index);
|
SET_SPC_BIT(ompi_spc_attached_event, index);
|
||||||
}
|
}
|
||||||
/* For this event, we need to turn off the counter */
|
/* For this event, we need to turn off the counter */
|
||||||
else if(MCA_BASE_PVAR_HANDLE_STOP == event) {
|
else if(MCA_BASE_PVAR_HANDLE_STOP == event) {
|
||||||
/* Convert from MPI_T pvar index to SPC index */
|
|
||||||
index = pvar->pvar_index - mpi_t_offset;
|
|
||||||
CLEAR_SPC_BIT(ompi_spc_attached_event, index);
|
CLEAR_SPC_BIT(ompi_spc_attached_event, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -231,7 +229,7 @@ static int ompi_spc_get_count(const struct mca_base_pvar_t *pvar, void *value, v
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Convert from MPI_T pvar index to SPC index */
|
/* Convert from MPI_T pvar index to SPC index */
|
||||||
int index = pvar->pvar_index - mpi_t_offset;
|
int index = (int)(uintptr_t)pvar->ctx;
|
||||||
/* Set the counter value to the current SPC value */
|
/* Set the counter value to the current SPC value */
|
||||||
*counter_value = (long long)ompi_spc_events[index].value;
|
*counter_value = (long long)ompi_spc_events[index].value;
|
||||||
/* If this is a timer-based counter, convert from cycles to microseconds */
|
/* If this is a timer-based counter, convert from cycles to microseconds */
|
||||||
@ -268,7 +266,7 @@ void ompi_spc_events_init(void)
|
|||||||
ompi_spc_events[i].value = 0;
|
ompi_spc_events[i].value = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ompi_comm_dup(&ompi_mpi_comm_world.comm, &comm);
|
ompi_comm_dup(&ompi_mpi_comm_world.comm, &ompi_spc_comm);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Initializes the SPC data structures and registers all counters as MPI_T pvars.
|
/* Initializes the SPC data structures and registers all counters as MPI_T pvars.
|
||||||
@ -287,14 +285,6 @@ void ompi_spc_init(void)
|
|||||||
char **arg_strings = opal_argv_split(ompi_mpi_spc_attach_string, ',');
|
char **arg_strings = opal_argv_split(ompi_mpi_spc_attach_string, ',');
|
||||||
int num_args = opal_argv_count(arg_strings);
|
int num_args = opal_argv_count(arg_strings);
|
||||||
|
|
||||||
/* Reset all timer-based counters */
|
|
||||||
for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
|
|
||||||
CLEAR_SPC_BIT(ompi_spc_timer_event, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If this is a timer event, set the corresponding timer_event entry */
|
|
||||||
SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME);
|
|
||||||
|
|
||||||
/* If there is only one argument and it is 'all', then all counters
|
/* If there is only one argument and it is 'all', then all counters
|
||||||
* should be turned on. If the size is 0, then no counters will be enabled.
|
* should be turned on. If the size is 0, then no counters will be enabled.
|
||||||
*/
|
*/
|
||||||
@ -304,47 +294,43 @@ void ompi_spc_init(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Turn on only the counters that were specified in the MCA parameter */
|
|
||||||
for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
|
for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
|
||||||
if(all_on) {
|
/* Reset all timer-based counters */
|
||||||
found++;
|
CLEAR_SPC_BIT(ompi_spc_timer_event, i);
|
||||||
} else {
|
matched = all_on;
|
||||||
matched = 0;
|
|
||||||
/* Note: If no arguments were given, this will be skipped */
|
if( !matched ) {
|
||||||
|
/* Turn on only the counters that were specified in the MCA parameter */
|
||||||
for(j = 0; j < num_args; j++) {
|
for(j = 0; j < num_args; j++) {
|
||||||
if( 0 == strcmp(ompi_spc_events_names[i].counter_name, arg_strings[j]) ) {
|
if( 0 == strcmp(ompi_spc_events_names[i].counter_name, arg_strings[j]) ) {
|
||||||
found++;
|
|
||||||
matched = 1;
|
matched = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (all_on || matched) {
|
if (matched) {
|
||||||
SET_SPC_BIT(ompi_spc_attached_event, i);
|
SET_SPC_BIT(ompi_spc_attached_event, i);
|
||||||
mpi_t_enabled = true;
|
mpi_t_enabled = true;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
|
|
||||||
/* Registers the current counter as an MPI_T pvar regardless of whether it's been turned on or not */
|
/* Registers the current counter as an MPI_T pvar regardless of whether it's been turned on or not */
|
||||||
ret = mca_base_pvar_register("ompi", "runtime", "spc", ompi_spc_events_names[i].counter_name, ompi_spc_events_names[i].counter_description,
|
ret = mca_base_pvar_register("ompi", "runtime", "spc", ompi_spc_events_names[i].counter_name, ompi_spc_events_names[i].counter_description,
|
||||||
OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
|
OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
|
||||||
MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, MPI_T_BIND_NO_OBJECT,
|
MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, MPI_T_BIND_NO_OBJECT,
|
||||||
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
|
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
|
||||||
ompi_spc_get_count, NULL, ompi_spc_notify, NULL);
|
ompi_spc_get_count, NULL, ompi_spc_notify, (void*)(uintptr_t)i);
|
||||||
|
if( ret < 0 ) {
|
||||||
/* Check to make sure that ret is a valid index and not an error code */
|
mpi_t_enabled = false;
|
||||||
if( ret >= 0 ) {
|
opal_show_help("help-mpi-runtime.txt", "spc: MPI_T disabled", true);
|
||||||
if( mpi_t_offset == -1 ) {
|
break;
|
||||||
mpi_t_offset = ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if( (ret < 0) || (ret != (mpi_t_offset + found - 1)) ) {
|
|
||||||
mpi_t_enabled = false;
|
|
||||||
opal_show_help("help-mpi-runtime.txt", "spc: MPI_T disabled", true);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If this is a timer event, set the corresponding timer_event entry */
|
||||||
|
SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME);
|
||||||
|
|
||||||
opal_argv_free(arg_strings);
|
opal_argv_free(arg_strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -356,8 +342,8 @@ static void ompi_spc_dump(void)
|
|||||||
int i, j, world_size, offset;
|
int i, j, world_size, offset;
|
||||||
long long *recv_buffer = NULL, *send_buffer;
|
long long *recv_buffer = NULL, *send_buffer;
|
||||||
|
|
||||||
int rank = ompi_comm_rank(comm);
|
int rank = ompi_comm_rank(ompi_spc_comm);
|
||||||
world_size = ompi_comm_size(comm);
|
world_size = ompi_comm_size(ompi_spc_comm);
|
||||||
|
|
||||||
/* Convert from cycles to usecs before sending */
|
/* Convert from cycles to usecs before sending */
|
||||||
for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
|
for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
|
||||||
@ -384,10 +370,10 @@ static void ompi_spc_dump(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
(void)comm->c_coll->coll_gather(send_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG,
|
(void)ompi_spc_comm->c_coll->coll_gather(send_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG,
|
||||||
recv_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG,
|
recv_buffer, OMPI_SPC_NUM_COUNTERS, MPI_LONG_LONG,
|
||||||
0, comm,
|
0, ompi_spc_comm,
|
||||||
comm->c_coll->coll_gather_module);
|
ompi_spc_comm->c_coll->coll_gather_module);
|
||||||
|
|
||||||
/* Once rank 0 has all of the information, print the aggregated counter values for each rank in order */
|
/* Once rank 0 has all of the information, print the aggregated counter values for each rank in order */
|
||||||
if(rank == 0) {
|
if(rank == 0) {
|
||||||
@ -413,7 +399,7 @@ static void ompi_spc_dump(void)
|
|||||||
}
|
}
|
||||||
free(send_buffer);
|
free(send_buffer);
|
||||||
|
|
||||||
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
|
ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Frees any dynamically allocated OMPI SPC data structures */
|
/* Frees any dynamically allocated OMPI SPC data structures */
|
||||||
@ -424,7 +410,7 @@ void ompi_spc_fini(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
free(ompi_spc_events); ompi_spc_events = NULL;
|
free(ompi_spc_events); ompi_spc_events = NULL;
|
||||||
ompi_comm_free(&comm);
|
ompi_comm_free(&ompi_spc_comm);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Records an update to a counter using an atomic add operation. */
|
/* Records an update to a counter using an atomic add operation. */
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user