1
1

add ability to query pml monitoring results with MPI Tools interface

using performance variables "pml_monitoring_messages_count" and
"pml_monitoring_messages_size"

Per Brice suggestion make all data count and message length be
uint64_t.
This commit is contained in:
George Bosilca 2015-04-02 12:21:47 -04:00
parent a47d69202f
commit 80343a0d39
3 changed files: 103 additions and 22 deletions

Просмотреть файл

@ -3,6 +3,7 @@
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013-2015 Inria. All rights reserved.
* Copyright (c) 2015 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -30,14 +31,14 @@ int ompi_mca_pml_monitoring_flush(char* filename);
MPI_Group group_world;
/* array for stroring monitoring data*/
size_t *sent_data = NULL;
int *messages_count = NULL;
size_t *filtered_sent_data = NULL;
int *filtered_messages_count = NULL;
size_t *all_sent_data = NULL;
int *all_messages_count = NULL;
size_t *all_filtered_sent_data = NULL;
int *all_filtered_messages_count = NULL;
uint64_t* sent_data = NULL;
uint64_t* messages_count = NULL;
uint64_t* filtered_sent_data = NULL;
uint64_t* filtered_messages_count = NULL;
uint64_t* all_sent_data = NULL;
uint64_t* all_messages_count = NULL;
uint64_t* all_filtered_sent_data = NULL;
uint64_t* all_filtered_messages_count = NULL;
int init_done = 0;
int nbprocs = -1;
@ -132,16 +133,16 @@ void finalize_monitoring( void ){
}
void initialize_monitoring( void ){
sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
messages_count = (int*) calloc(nbprocs, sizeof(int));
all_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
all_messages_count = (int*) calloc(nbprocs, sizeof(int));
sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
all_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
all_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
if(filter_monitoring()){
filtered_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
filtered_messages_count = (int*) calloc(nbprocs, sizeof(int));
all_filtered_sent_data = (size_t*)calloc(nbprocs, sizeof(size_t));
all_filtered_messages_count = (int*) calloc(nbprocs, sizeof(int));
filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
all_filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t));
all_filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t));
}
init_done = 1;
@ -167,13 +168,51 @@ void monitor_send_data(int world_rank, size_t data_size, int tag){
}
void output_monitoring( void ){
/**
 * MPI_T pvar read callback for "pml_monitoring_messages_count".
 *
 * Copies the per-peer sent-message counters into the caller-provided
 * buffer, one uint64_t entry per rank of the bound communicator.
 * Monitoring data is only collected for MPI_COMM_WORLD, so any other
 * communicator handle is rejected with OMPI_ERROR.
 *
 * @param pvar        registered performance variable (unused here)
 * @param value       destination buffer of ompi_comm_size(comm) uint64_t
 * @param obj_handle  communicator the pvar handle was bound to
 * @return OMPI_SUCCESS on success, OMPI_ERROR for a non-world communicator
 */
int mca_pml_monitoring_get_messages_count (const struct mca_base_pvar_t *pvar, void *value, void *obj_handle)
{
    ompi_communicator_t *comm = (ompi_communicator_t *) obj_handle;
    uint64_t *dest = (uint64_t*) value;
    int peer, size;

    /* Counters are indexed by world rank; only MPI_COMM_WORLD is supported. */
    if( &ompi_mpi_comm_world.comm != comm )
        return OMPI_ERROR;

    size = ompi_comm_size (comm);
    for( peer = 0; peer < size; peer++ ) {
        dest[peer] = messages_count[peer];
    }

    return OMPI_SUCCESS;
}
/**
 * MPI_T pvar read callback for "pml_monitoring_messages_size".
 *
 * Copies the per-peer cumulative sent-byte counters into the
 * caller-provided buffer, one uint64_t entry per rank of the bound
 * communicator. As with the message-count pvar, only MPI_COMM_WORLD
 * carries monitoring data; other communicators return OMPI_ERROR.
 *
 * @param pvar        registered performance variable (unused here)
 * @param value       destination buffer of ompi_comm_size(comm) uint64_t
 * @param obj_handle  communicator the pvar handle was bound to
 * @return OMPI_SUCCESS on success, OMPI_ERROR for a non-world communicator
 */
int mca_pml_monitoring_get_messages_size (const struct mca_base_pvar_t *pvar, void *value, void *obj_handle)
{
    ompi_communicator_t *comm = (ompi_communicator_t *) obj_handle;
    uint64_t *dest = (uint64_t*) value;
    int peer, size;

    /* Counters are indexed by world rank; only MPI_COMM_WORLD is supported. */
    if( &ompi_mpi_comm_world.comm != comm )
        return OMPI_ERROR;

    size = ompi_comm_size (comm);
    for( peer = 0; peer < size; peer++ ) {
        dest[peer] = sent_data[peer];
    }

    return OMPI_SUCCESS;
}
void output_monitoring( void )
{
int i;
if ( !init_done ) return;
for (i = 0 ; i < nbprocs ; i++) {
all_sent_data[i] += sent_data[i];
all_messages_count[i] += messages_count[i];
if(all_sent_data[i] > 0) {
fprintf(stderr, "I\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, all_sent_data[i], all_messages_count[i]); fflush(stderr);
fprintf(stderr, "I\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_sent_data[i], all_messages_count[i]); fflush(stderr);
}
}
@ -182,7 +221,7 @@ void output_monitoring( void ){
all_filtered_sent_data[i] += filtered_sent_data[i];
all_filtered_messages_count[i] += filtered_messages_count[i];
if(all_filtered_sent_data[i] > 0) {
fprintf(stderr, "E\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, all_filtered_sent_data[i], all_filtered_messages_count[i]); fflush(stderr);
fprintf(stderr, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_filtered_sent_data[i], all_filtered_messages_count[i]); fflush(stderr);
}
}
}
@ -198,6 +237,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
FILE *pf;
int i;
if ( !init_done ) return -1;
pf = fopen(filename, "w");
@ -208,7 +248,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
for (i = 0 ; i < nbprocs ; i++) {
if(sent_data[i] > 0) {
fprintf(pf, "I\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, sent_data[i], messages_count[i]); fflush(pf);
fprintf(pf, "I\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, sent_data[i], messages_count[i]); fflush(pf);
/* aggregate data in general array*/
all_sent_data[i] += sent_data[i];
all_messages_count[i] += messages_count[i];
@ -221,7 +261,7 @@ int ompi_mca_pml_monitoring_flush(char* filename) {
if(filter_monitoring()){
for (i = 0 ; i < nbprocs ; i++) {
if(filtered_sent_data[i] > 0) {
fprintf(pf, "E\t%d\t%d\t%ld bytes\t%d msgs sent\n", my_rank, i, filtered_sent_data[i], filtered_messages_count[i]); fflush(pf);
fprintf(pf, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, filtered_sent_data[i], filtered_messages_count[i]); fflush(pf);
/* aggregate data in general array*/
all_filtered_sent_data[i] += filtered_sent_data[i];
all_filtered_messages_count[i] += filtered_messages_count[i];

Просмотреть файл

@ -3,6 +3,7 @@
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013-2015 Inria. All rights reserved.
* Copyright (c) 2015 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -20,6 +21,7 @@ BEGIN_C_DECLS
#include <ompi/datatype/ompi_datatype.h>
#include <ompi/mca/pml/pml.h>
#include <ompi/mca/pml/pml.h>
#include <opal/mca/base/mca_base_pvar.h>
typedef mca_pml_base_module_t mca_pml_monitoring_module_t;
@ -136,6 +138,14 @@ extern int mca_pml_monitoring_dump(struct ompi_communicator_t* comm,
extern int mca_pml_monitoring_start(size_t count,
ompi_request_t** requests);
int mca_pml_monitoring_get_messages_count (const struct mca_base_pvar_t *pvar,
void *value,
void *obj_handle);
int mca_pml_monitoring_get_messages_size (const struct mca_base_pvar_t *pvar,
void *value,
void *obj_handle);
END_C_DECLS
#endif /* MCA_PML_MONITORING_H */

Просмотреть файл

@ -3,6 +3,7 @@
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013-2015 Inria. All rights reserved.
* Copyright (c) 2015 Bull SAS. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -17,6 +18,7 @@
#include <opal/mca/base/mca_base_component_repository.h>
static int mca_pml_monitoring_enabled = 0;
static int mca_pml_monitoring_output_enabled = 0;
static int mca_pml_monitoring_active = 0;
mca_pml_base_component_t pml_selected_component;
mca_pml_base_module_t pml_selected_module;
@ -62,6 +64,16 @@ static int mca_pml_monitoring_component_open(void)
return OMPI_SUCCESS;
}
/**
 * MPI_T pvar notify callback shared by the monitoring pvars.
 *
 * When a pvar handle is bound to a communicator, reports the number of
 * values the handle exposes: one entry per rank in that communicator.
 * All other pvar events require no action and simply succeed.
 *
 * @param pvar        registered performance variable (unused here)
 * @param event       pvar lifecycle event being notified
 * @param obj_handle  communicator the handle is (being) bound to
 * @param count       out: number of values exposed by the handle
 * @return OMPI_SUCCESS always
 */
static int mca_pml_monitoring_comm_size_notify (mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj_handle, int *count)
{
    if (MCA_BASE_PVAR_HANDLE_BIND != event) {
        return OMPI_SUCCESS;
    }
    /* One value per peer in the bound communicator. */
    *count = ompi_comm_size ((ompi_communicator_t *) obj_handle);
    return OMPI_SUCCESS;
}
static int mca_pml_monitoring_component_close(void)
{
if( mca_pml_monitoring_enabled ) {
@ -98,7 +110,9 @@ static int mca_pml_monitoring_component_finish(void)
{
if( mca_pml_monitoring_enabled && mca_pml_monitoring_active ) {
/* It is over... Output what has been monitored*/
output_monitoring();
if ( mca_pml_monitoring_output_enabled != 0) {
output_monitoring();
}
/* Free internal data structure */
finalize_monitoring();
/* Call the original PML and then close */
@ -123,6 +137,23 @@ static int mca_pml_monitoring_component_register(void)
"Enable the monitoring at the PML level. This value should be different than 0 in order for the monitoring to be enabled (default disable)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_monitoring_enabled);
(void)mca_base_component_var_register(&mca_pml_monitoring_component.pmlm_version, "enable_output",
"Enable the PML monitoring textual output at MPI_Finalize. This value should be different than 0 in order for the output to be enabled (default disable)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_monitoring_output_enabled);
(void) mca_base_pvar_register ("ompi", "pml", "monitoring", "messages_count", "Number of messages "
"sent to each peer in a communicator", OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MPI_T_BIND_MPI_COMM,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_pml_monitoring_get_messages_count, NULL, mca_pml_monitoring_comm_size_notify, NULL);
(void) mca_base_pvar_register ("ompi", "pml", "monitoring", "messages_size", "Size of messages "
"sent to each peer in a communicator", OPAL_INFO_LVL_4, MPI_T_PVAR_CLASS_SIZE,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, MPI_T_BIND_MPI_COMM,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_pml_monitoring_get_messages_size, NULL, mca_pml_monitoring_comm_size_notify, NULL);
return OMPI_SUCCESS;
}