Add the ability to query PML monitoring results through the MPI Tools interface
using the performance variables "pml_monitoring_messages_count" and "pml_monitoring_messages_size". Per Brice's suggestion, make all data counts and message lengths uint64_t.
Этот коммит содержится в:
родитель
a47d69202f
Коммит
80343a0d39
@ -3,6 +3,7 @@
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Inria. All rights reserved.
|
||||
* Copyright (c) 2015 Bull SAS. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,14 +31,14 @@ int ompi_mca_pml_monitoring_flush(char* filename);
|
||||
MPI_Group group_world;
|
||||
|
||||
/* arrays for storing monitoring data */
|
||||
size_t *sent_data = NULL;
|
||||
int *messages_count = NULL;
|
||||
size_t *filtered_sent_data = NULL;
|
||||
int *filtered_messages_count = NULL;
|
||||
size_t *all_sent_data = NULL;
|
||||
int *all_messages_count = NULL;
|
||||
size_t *all_filtered_sent_data = NULL;
|
||||
int *all_filtered_messages_count = NULL;
|
||||
uint64_t* sent_data = NULL;
|
||||
uint64_t* messages_count = NULL;
|
||||
uint64_t* filtered_sent_data = NULL;
|
||||
uint64_t* filtered_messages_count = NULL;
|
||||
uint64_t* all_sent_data = NULL;
|
||||
uint64_t* all_messages_count = NULL;
|
||||
uint64_t* all_filtered_sent_data = NULL;
|
||||
uint64_t* all_filtered_messages_count = NULL;
|
||||
|
||||
int init_done = 0;
|
||||
int nbprocs = -1;
|
||||
@ -132,16 +133,16 @@ void finalize_monitoring( void ){
|
||||
}
|
||||
void initialize_monitoring( void ){
|
||||
|
||||
sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
|
||||
messages_count = (int*) calloc(nbprocs, sizeof(int));
|
||||
all_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
|
||||
all_messages_count = (int*) calloc(nbprocs, sizeof(int));
|
||||
sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
|
||||
messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
|
||||
all_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
|
||||
all_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
|
||||
|
||||
if(filter_monitoring()){
|
||||
filtered_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
|
||||
filtered_messages_count = (int*) calloc(nbprocs, sizeof(int));
|
||||
all_filtered_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
|
||||
all_filtered_messages_count = (int*) calloc(nbprocs, sizeof(int));
|
||||
filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
|
||||
filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
|
||||
all_filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
|
||||
all_filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
|
||||
}
|
||||
|
||||
init_done = 1;
|
||||
@ -167,13 +168,51 @@ void monitor_send_data(int world_rank, size_t data_size, int tag){
|
||||
|
||||
}
|
||||
|
||||
void output_monitoring( void ){
|
||||
int mca_pml_monitoring_get_messages_count (const struct mca_base_pvar_t *pvar, void *value, void *obj_handle)
|
||||
{
|
||||
ompi_communicator_t *comm = (ompi_communicator_t *) obj_handle;
|
||||
int comm_size = ompi_comm_size (comm);
|
||||
uint64_t *values = (uint64_t*) value;
|
||||
int i;
|
||||
|
||||
if(comm != &ompi_mpi_comm_world.comm)
|
||||
return OMPI_ERROR;
|
||||
|
||||
for (i = 0 ; i < comm_size ; ++i) {
|
||||
values[i] = messages_count[i];
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int mca_pml_monitoring_get_messages_size (const struct mca_base_pvar_t *pvar, void *value, void *obj_handle)
|
||||
{
|
||||
ompi_communicator_t *comm = (ompi_communicator_t *) obj_handle;
|
||||
int comm_size = ompi_comm_size (comm);
|
||||
uint64_t *values = (uint64_t*) value;
|
||||
int i;
|
||||
|
||||
if(comm != &ompi_mpi_comm_world.comm)
|
||||
return OMPI_ERROR;
|
||||
|
||||
for (i = 0 ; i < comm_size ; ++i) {
|
||||
values[i] = sent_data[i];
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
void output_monitoring( void )
|
||||
{
|
||||
int i;
|
||||
|
||||
if ( !init_done ) return;
|
||||
|
||||
for (i = 0 ; i < nbprocs ; i++) {
|
||||
all_sent_data[i] += sent_data[i];
|
||||
all_messages_count[i] += messages_count[i];
|
||||
if(all_sent_data[i] > 0) {
|
||||
fprintf(stderr, "I\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, all_sent_data[i], all_messages_count[i]); fflush(stderr);
|
||||
fprintf(stderr, "I\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_sent_data[i], all_messages_count[i]); fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,7 +221,7 @@ void output_monitoring( void ){
|
||||
all_filtered_sent_data[i] += filtered_sent_data[i];
|
||||
all_filtered_messages_count[i] += filtered_messages_count[i];
|
||||
if(all_filtered_sent_data[i] > 0) {
|
||||
fprintf(stderr, "E\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, all_filtered_sent_data[i], all_filtered_messages_count[i]); fflush(stderr);
|
||||
fprintf(stderr, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_filtered_sent_data[i], all_filtered_messages_count[i]); fflush(stderr);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -198,6 +237,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
|
||||
FILE *pf;
|
||||
int i;
|
||||
|
||||
if ( !init_done ) return -1;
|
||||
|
||||
pf = fopen(filename, "w");
|
||||
|
||||
@ -208,7 +248,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
|
||||
|
||||
for (i = 0 ; i < nbprocs ; i++) {
|
||||
if(sent_data[i] > 0) {
|
||||
fprintf(pf, "I\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, sent_data[i], messages_count[i]); fflush(pf);
|
||||
fprintf(pf, "I\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, sent_data[i], messages_count[i]); fflush(pf);
|
||||
/* aggregate data in general array*/
|
||||
all_sent_data[i] += sent_data[i];
|
||||
all_messages_count[i] += messages_count[i];
|
||||
@ -221,7 +261,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
|
||||
if(filter_monitoring()){
|
||||
for (i = 0 ; i < nbprocs ; i++) {
|
||||
if(filtered_sent_data[i] > 0) {
|
||||
fprintf(pf, "E\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, filtered_sent_data[i], filtered_messages_count[i]); fflush(pf);
|
||||
fprintf(pf, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, filtered_sent_data[i], filtered_messages_count[i]); fflush(pf);
|
||||
/* aggregate data in general array*/
|
||||
all_filtered_sent_data[i] += filtered_sent_data[i];
|
||||
all_filtered_messages_count[i] += filtered_messages_count[i];
|
||||
|
@ -3,6 +3,7 @@
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Inria. All rights reserved.
|
||||
* Copyright (c) 2015 Bull SAS. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -20,6 +21,7 @@ BEGIN_C_DECLS
|
||||
#include <ompi/datatype/ompi_datatype.h>
|
||||
#include <ompi/mca/pml/pml.h>
|
||||
#include <ompi/mca/pml/pml.h>
|
||||
#include <opal/mca/base/mca_base_pvar.h>
|
||||
|
||||
typedef mca_pml_base_module_t mca_pml_monitoring_module_t;
|
||||
|
||||
@ -136,6 +138,14 @@ extern int mca_pml_monitoring_dump(struct ompi_communicator_t* comm,
|
||||
extern int mca_pml_monitoring_start(size_t count,
|
||||
ompi_request_t** requests);
|
||||
|
||||
int mca_pml_monitoring_get_messages_count (const struct mca_base_pvar_t *pvar,
|
||||
void *value,
|
||||
void *obj_handle);
|
||||
|
||||
int mca_pml_monitoring_get_messages_size (const struct mca_base_pvar_t *pvar,
|
||||
void *value,
|
||||
void *obj_handle);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_PML_MONITORING_H */
|
||||
|
@ -3,6 +3,7 @@
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Inria. All rights reserved.
|
||||
* Copyright (c) 2015 Bull SAS. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,6 +18,7 @@
|
||||
#include <opal/mca/base/mca_base_component_repository.h>
|
||||
|
||||
static int mca_pml_monitoring_enabled = 0;
|
||||
static int mca_pml_monitoring_output_enabled = 0;
|
||||
static int mca_pml_monitoring_active = 0;
|
||||
mca_pml_base_component_t pml_selected_component;
|
||||
mca_pml_base_module_t pml_selected_module;
|
||||
@ -62,6 +64,16 @@ static int mca_pml_monitoring_component_open(void)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Notification callback shared by the monitoring performance variables.
 *
 * On a handle-bind event, reports through *count how many values the
 * variable exposes: one per rank of the communicator passed as obj_handle.
 * Every other event leaves *count untouched. Always returns OMPI_SUCCESS.
 */
static int mca_pml_monitoring_comm_size_notify (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj_handle, int *count)
{
    ompi_communicator_t *bound_comm;

    /* Only the bind event carries a value count to fill in. */
    if (MCA_BASE_PVAR_HANDLE_BIND != event) {
        return OMPI_SUCCESS;
    }

    bound_comm = (ompi_communicator_t *) obj_handle;
    *count = ompi_comm_size (bound_comm);

    return OMPI_SUCCESS;
}
|
||||
|
||||
static int mca_pml_monitoring_component_close(void)
|
||||
{
|
||||
if( mca_pml_monitoring_enabled ) {
|
||||
@ -98,7 +110,9 @@ static int mca_pml_monitoring_component_finish(void)
|
||||
{
|
||||
if( mca_pml_monitoring_enabled && mca_pml_monitoring_active ) {
|
||||
/* It is over... Output what has been monitored*/
|
||||
output_monitoring();
|
||||
if ( mca_pml_monitoring_output_enabled != 0) {
|
||||
output_monitoring();
|
||||
}
|
||||
/* Free internal data structure */
|
||||
finalize_monitoring();
|
||||
/* Call the original PML and then close */
|
||||
@ -123,6 +137,23 @@ static int mca_pml_monitoring_component_register(void)
|
||||
"Enable the monitoring at the PML level. This value should be different than 0 in order for the monitoring to be enabled (default disable)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_monitoring_enabled);
|
||||
|
||||
(void)mca_base_component_var_register(&mca_pml_monitoring_component.pmlm_version, "enable_output",
|
||||
"Enable the PML monitoring textual output at MPI_Finalize. This value should be different than 0 in order for the output to be enabled (default disable)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_monitoring_output_enabled);
|
||||
|
||||
(void) mca_base_pvar_register ("ompi", "pml", "monitoring", "messages_count", "Number of messages "
|
||||
"sent to each peer in a communicator", OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
|
||||
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MPI_T_BIND_MPI_COMM,
|
||||
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
|
||||
mca_pml_monitoring_get_messages_count, NULL, mca_pml_monitoring_comm_size_notify, NULL);
|
||||
|
||||
(void) mca_base_pvar_register ("ompi", "pml", "monitoring", "messages_size", "Size of messages "
|
||||
"sent to each peer in a communicator", OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
|
||||
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MPI_T_BIND_MPI_COMM,
|
||||
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
|
||||
mca_pml_monitoring_get_messages_size, NULL, mca_pml_monitoring_comm_size_notify, NULL);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user