1
1

Cleanup the CRCP Coord timing functionality. Provides a rough assessment of time each element of the algorithm is taking.

There are more details in the code regarding how to use this feature.

Also shift a few of the orte_output back to opal_output. I'm experiencing an odd problem with locks in the oob/tcp when using orte_output. I haven't had time to track it down yet.

This commit was SVN r18439.
Этот коммит содержится в:
Josh Hursey 2008-05-14 19:54:20 +00:00
родитель 7e8cd20a0a
Коммит 35a2af28d1
3 изменённых файлов: 175 добавлений и 152 удалений

Просмотреть файл

@ -51,7 +51,7 @@ extern "C" {
/* /*
* Local variables * Local variables
*/ */
extern bool timing_enabled; extern int timing_enabled;
/* /*
* Module functions * Module functions

Просмотреть файл

@ -28,7 +28,7 @@
const char *ompi_crcp_coord_component_version_string = const char *ompi_crcp_coord_component_version_string =
"OMPI CRCP coord MCA component version " OMPI_VERSION; "OMPI CRCP coord MCA component version " OMPI_VERSION;
bool timing_enabled = false; int timing_enabled = 0;
/* /*
* Local functionality * Local functionality
@ -113,7 +113,7 @@ static int crcp_coord_open(void)
false, false, false, false,
0, 0,
&val); &val);
timing_enabled = OPAL_INT_TO_BOOL(val); timing_enabled = val;
/* /*
* Debug Output * Debug Output

Просмотреть файл

@ -152,6 +152,8 @@
* send(2) | -- * send(2) | --
* send(3) | -- * send(3) | --
* send(4) | -- * send(4) | --
*
*
*/ */
#include "ompi_config.h" #include "ompi_config.h"
@ -416,22 +418,58 @@ static int wait_quiesce_drain_ack(void);
/************************************ /************************************
* A few timing structures * A few timing structures
*
* CRCP Timing Information
* -----------------------
* Pi Pj | Variable
* ---- -----+----------
* exchange_bookmark() | CRCP_TIMER_CKPT_EX_B
* -------------> | CRCP_TIMER_CKPT_EX_PEER_S
* <------------- | CRCP_TIMER_CKPT_EX_PEER_R
* -> wait_for_bk_done() |
* -- -- | CRCP_TIMER_CKPT_EX_WAIT
* check_bookmarks() |
* -- -- | CRCP_TIMER_CKPT_CHECK_B
* -> exchange_details (*) |
* -------------> | CRCP_TIMER_CKPT_CHECK_PEER_S
* <------------- | CRCP_TIMER_CKPT_CHECK_PEER_R
* post_drain[ack]() |
* -- -- | CRCP_TIMER_CKPT_POST_DRAIN
* wait_quiescence() |
* -- -- | CRCP_TIMER_CKPT_WAIT_QUI
* Finish checkpoint | -- Total Pre-Checkpoint
* -- -- | CRCP_TIMER_TOTAL_CKPT
* finalize_exchange() | -- Total Continue / Restart
* -- -- | CRCP_TIMER_TOTAL_CONT / _RST
*-----------------------------+
* (*) If needed.
*
* timing_enabled:
* < 0 : Off
* 1 : Summary only
* 2 : Per Peer messages
* 3 : Messages from all procs
*
************************************/ ************************************/
#define CRCP_TIMER_CKPT 0 #define CRCP_TIMER_TOTAL_CKPT 0
#define CRCP_TIMER_CKPT_EX_B 1 #define CRCP_TIMER_CKPT_EX_B 1
#define CRCP_TIMER_CKPT_CK_B 2 #define CRCP_TIMER_CKPT_EX_PEER_S 2
#define CRCP_TIMER_CKPT_POST 3 #define CRCP_TIMER_CKPT_EX_PEER_R 3
#define CRCP_TIMER_CKPT_WAIT 4 #define CRCP_TIMER_CKPT_EX_WAIT 4
#define CRCP_TIMER_CONT 5 #define CRCP_TIMER_CKPT_CHECK_B 5
#define CRCP_TIMER_CKPT_PEER_S 6 #define CRCP_TIMER_CKPT_CHECK_PEER_S 6
#define CRCP_TIMER_CKPT_PEER_R 7 #define CRCP_TIMER_CKPT_CHECK_PEER_R 7
#define CRCP_TIMER_CKPT_WAIT_B 8 #define CRCP_TIMER_CKPT_POST_DRAIN 8
#define CRCP_TIMER_MAX 9 #define CRCP_TIMER_CKPT_WAIT_QUI 9
#define CRCP_TIMER_TOTAL_CONT 10
#define CRCP_TIMER_TOTAL_RST 11
#define CRCP_TIMER_MAX 12
static double get_time(void); static double get_time(void);
static void start_time(int idx); static void start_time(int idx);
static void end_time(int idx); static void end_time(int idx);
static void display_indv_timer(int idx, int var); static void display_indv_timer(int idx, int proc, int msgs);
static void display_indv_timer_core(int idx, int proc, int msgs, bool direct);
static void display_all_timers(int state); static void display_all_timers(int state);
static void clear_timers(void); static void clear_timers(void);
@ -441,28 +479,28 @@ char * timer_label[CRCP_TIMER_MAX];
#define START_TIMER(idx) \ #define START_TIMER(idx) \
{ \ { \
if(OPAL_UNLIKELY(timing_enabled)) { \ if(OPAL_UNLIKELY(timing_enabled > 0)) { \
start_time(idx); \ start_time(idx); \
} \ } \
} }
#define END_TIMER(idx) \ #define END_TIMER(idx) \
{ \ { \
if(OPAL_UNLIKELY(timing_enabled)) { \ if(OPAL_UNLIKELY(timing_enabled > 0)) { \
end_time(idx); \ end_time(idx); \
} \ } \
} }
#define DISPLAY_INDV_TIMER(idx, var) \ #define DISPLAY_INDV_TIMER(idx, proc, msg) \
{ \ { \
if(OPAL_UNLIKELY(timing_enabled)) { \ if(OPAL_UNLIKELY(timing_enabled > 0)) { \
display_indv_timer(idx, var); \ display_indv_timer(idx, proc, msg);\
} \ } \
} }
#define DISPLAY_ALL_TIMERS(var) \ #define DISPLAY_ALL_TIMERS(var) \
{ \ { \
if(OPAL_UNLIKELY(timing_enabled)) { \ if(OPAL_UNLIKELY(timing_enabled > 0)) { \
display_all_timers(var); \ display_all_timers(var); \
} \ } \
} }
@ -806,15 +844,24 @@ int ompi_crcp_coord_pml_init(void) {
OBJ_CONSTRUCT(&drained_msg_list, opal_list_t); OBJ_CONSTRUCT(&drained_msg_list, opal_list_t);
clear_timers(); clear_timers();
timer_label[CRCP_TIMER_CKPT] = strdup("Ckpt");
timer_label[CRCP_TIMER_CKPT_EX_B] = strdup("Ckpt Exchange"); timer_label[CRCP_TIMER_TOTAL_CKPT] = strdup("Total Ckpt.");
timer_label[CRCP_TIMER_CKPT_CK_B] = strdup("Ckpt Check"); timer_label[CRCP_TIMER_CKPT_EX_B] = strdup("Exchange Bookmarks");
timer_label[CRCP_TIMER_CKPT_POST] = strdup("Ckpt Post"); timer_label[CRCP_TIMER_CKPT_EX_PEER_S] = strdup(" Ex.Bk. Send Peer");
timer_label[CRCP_TIMER_CKPT_WAIT] = strdup("Ckpt Wait"); timer_label[CRCP_TIMER_CKPT_EX_PEER_R] = strdup(" Ex.Bk. Recv Peer");
timer_label[CRCP_TIMER_CONT] = strdup("Continue"); timer_label[CRCP_TIMER_CKPT_EX_WAIT] = strdup(" Ex.Bk. Wait");
timer_label[CRCP_TIMER_CKPT_PEER_S] = strdup("Peer Send");
timer_label[CRCP_TIMER_CKPT_PEER_R] = strdup("Peer Recv"); timer_label[CRCP_TIMER_CKPT_CHECK_B] = strdup("Check Bookmarks");
timer_label[CRCP_TIMER_CKPT_WAIT_B] = strdup("Bookmark Wait"); timer_label[CRCP_TIMER_CKPT_CHECK_PEER_S] = strdup(" Ck.Bk. Send Peer");
timer_label[CRCP_TIMER_CKPT_CHECK_PEER_R] = strdup(" Ck.Bk. Recv Peer");
timer_label[CRCP_TIMER_CKPT_POST_DRAIN] = strdup("Post Drain Msgs.");
timer_label[CRCP_TIMER_CKPT_WAIT_QUI] = strdup("Wait for Quiescence");
timer_label[CRCP_TIMER_TOTAL_CONT] = strdup("Total Continue");
timer_label[CRCP_TIMER_TOTAL_RST] = strdup("Total Restart");
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -2726,7 +2773,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
goto DONE; goto DONE;
} }
START_TIMER(CRCP_TIMER_CKPT); START_TIMER(CRCP_TIMER_TOTAL_CKPT);
STEP_1: STEP_1:
step_to_return_to = 0; step_to_return_to = 0;
@ -2754,7 +2801,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
exit_status = OMPI_EXISTS; exit_status = OMPI_EXISTS;
goto DONE_STALL; goto DONE_STALL;
} }
END_TIMER(CRCP_TIMER_CKPT); END_TIMER(CRCP_TIMER_TOTAL_CKPT);
DISPLAY_ALL_TIMERS(state);
clear_timers();
} }
/***************************** /*****************************
* Continue after a checkpoint * Continue after a checkpoint
@ -2764,7 +2814,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
goto DONE; goto DONE;
} }
START_TIMER(CRCP_TIMER_CONT); START_TIMER(CRCP_TIMER_TOTAL_CONT);
/* /*
* Finish the coord protocol * Finish the coord protocol
*/ */
@ -2775,7 +2825,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
exit_status = ret; exit_status = ret;
goto DONE; goto DONE;
} }
END_TIMER(CRCP_TIMER_CONT); END_TIMER(CRCP_TIMER_TOTAL_CONT);
DISPLAY_ALL_TIMERS(state);
clear_timers();
} }
/***************************** /*****************************
* Restart from a checkpoint * Restart from a checkpoint
@ -2785,6 +2838,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
goto DONE; goto DONE;
} }
START_TIMER(CRCP_TIMER_TOTAL_RST);
/* /*
* Refresh the jobids * Refresh the jobids
*/ */
@ -2808,6 +2862,11 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
exit_status = ret; exit_status = ret;
goto DONE; goto DONE;
} }
END_TIMER(CRCP_TIMER_TOTAL_RST);
DISPLAY_ALL_TIMERS(state);
clear_timers();
} }
/***************************** /*****************************
* Terminating the process post checkpoint * Terminating the process post checkpoint
@ -2826,9 +2885,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_ft_event(
step_to_return_to = 0; step_to_return_to = 0;
ft_event_state = OPAL_CRS_RUNNING; ft_event_state = OPAL_CRS_RUNNING;
DISPLAY_ALL_TIMERS(state);
clear_timers();
DONE_STALL: DONE_STALL:
pml_state->error_code = exit_status; pml_state->error_code = exit_status;
return pml_state; return pml_state;
@ -2972,7 +3028,7 @@ static int ft_event_coordinate_peers(void)
/* /*
* Check exchanged bookmarks * Check exchanged bookmarks
*/ */
START_TIMER(CRCP_TIMER_CKPT_CK_B); START_TIMER(CRCP_TIMER_CKPT_CHECK_B);
if( OMPI_SUCCESS != (ret = ft_event_check_bookmarks() ) ) { if( OMPI_SUCCESS != (ret = ft_event_check_bookmarks() ) ) {
orte_output(mca_crcp_coord_component.super.output_handle, orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: Bookmark Check Failed %d", "crcp:coord: ft_event_coordinate_peers: Bookmark Check Failed %d",
@ -2980,12 +3036,12 @@ static int ft_event_coordinate_peers(void)
exit_status = ret; exit_status = ret;
goto DONE; goto DONE;
} }
END_TIMER(CRCP_TIMER_CKPT_CK_B); END_TIMER(CRCP_TIMER_CKPT_CHECK_B);
/* /*
* Post Drain Acks and Msgs * Post Drain Acks and Msgs
*/ */
START_TIMER(CRCP_TIMER_CKPT_POST); START_TIMER(CRCP_TIMER_CKPT_POST_DRAIN);
if( OMPI_SUCCESS != (ret = ft_event_post_drain_acks() ) ) { if( OMPI_SUCCESS != (ret = ft_event_post_drain_acks() ) ) {
orte_output(mca_crcp_coord_component.super.output_handle, orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: Bookmark Post Drain ACKS Failed %d", "crcp:coord: ft_event_coordinate_peers: Bookmark Post Drain ACKS Failed %d",
@ -3001,7 +3057,7 @@ static int ft_event_coordinate_peers(void)
exit_status = ret; exit_status = ret;
goto DONE; goto DONE;
} }
END_TIMER(CRCP_TIMER_CKPT_POST); END_TIMER(CRCP_TIMER_CKPT_POST_DRAIN);
/* /*
* Check if we need to stall for completion of tasks * Check if we need to stall for completion of tasks
@ -3023,7 +3079,7 @@ static int ft_event_coordinate_peers(void)
* - Outstanding Receives (to drain wire) -- Receiver * - Outstanding Receives (to drain wire) -- Receiver
* - Outstanding Irecvs (for drain ack) -- Sender * - Outstanding Irecvs (for drain ack) -- Sender
*/ */
START_TIMER(CRCP_TIMER_CKPT_WAIT); START_TIMER(CRCP_TIMER_CKPT_WAIT_QUI);
if( OMPI_SUCCESS != (ret = ft_event_wait_quiesce() ) ) { if( OMPI_SUCCESS != (ret = ft_event_wait_quiesce() ) ) {
orte_output(mca_crcp_coord_component.super.output_handle, orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: Wait Quiesce Failed %d", "crcp:coord: ft_event_coordinate_peers: Wait Quiesce Failed %d",
@ -3031,7 +3087,7 @@ static int ft_event_coordinate_peers(void)
exit_status = ret; exit_status = ret;
goto DONE; goto DONE;
} }
END_TIMER(CRCP_TIMER_CKPT_WAIT); END_TIMER(CRCP_TIMER_CKPT_WAIT_QUI);
ORTE_OUTPUT_VERBOSE((5, mca_crcp_coord_component.super.output_handle, ORTE_OUTPUT_VERBOSE((5, mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: %s Coordination Finished...\n", "crcp:coord: ft_event_coordinate_peers: %s Coordination Finished...\n",
@ -3140,14 +3196,13 @@ static int ft_event_exchange_bookmarks(void)
} }
/* Wait for all bookmarks to arrive */ /* Wait for all bookmarks to arrive */
START_TIMER(CRCP_TIMER_CKPT_WAIT_B); START_TIMER(CRCP_TIMER_CKPT_EX_WAIT);
while( total_recv_bookmarks > 0 ) { while( total_recv_bookmarks > 0 ) {
opal_progress(); opal_progress();
opal_event_loop(OPAL_EVLOOP_NONBLOCK); opal_event_loop(OPAL_EVLOOP_NONBLOCK);
} }
total_recv_bookmarks = 0; total_recv_bookmarks = 0;
END_TIMER(CRCP_TIMER_CKPT_WAIT_B); END_TIMER(CRCP_TIMER_CKPT_EX_WAIT);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_WAIT_B, 0);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -3914,7 +3969,7 @@ static int send_bookmarks(int peer_idx)
int exit_status = OMPI_SUCCESS; int exit_status = OMPI_SUCCESS;
int ret; int ret;
START_TIMER(CRCP_TIMER_CKPT_PEER_S); START_TIMER(CRCP_TIMER_CKPT_EX_PEER_S);
/* /*
* Find the peer structure for this peer * Find the peer structure for this peer
*/ */
@ -3976,14 +4031,12 @@ static int send_bookmarks(int peer_idx)
OBJ_RELEASE(buffer); OBJ_RELEASE(buffer);
} }
END_TIMER(CRCP_TIMER_CKPT_PEER_S); END_TIMER(CRCP_TIMER_CKPT_EX_PEER_S);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_PEER_S, peer_idx); DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_EX_PEER_S, peer_idx, 1);
return exit_status; return exit_status;
} }
/* JJH -- Blocking vs Non-Blocking Bookmark exchange */
#if 1
/* Paired with send_bookmarks */ /* Paired with send_bookmarks */
static int recv_bookmarks(int peer_idx) static int recv_bookmarks(int peer_idx)
{ {
@ -3991,7 +4044,7 @@ static int recv_bookmarks(int peer_idx)
int exit_status = OMPI_SUCCESS; int exit_status = OMPI_SUCCESS;
int ret; int ret;
START_TIMER(CRCP_TIMER_CKPT_PEER_R); START_TIMER(CRCP_TIMER_CKPT_EX_PEER_R);
peer_name.jobid = ORTE_PROC_MY_NAME->jobid; peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx; peer_name.vpid = peer_idx;
@ -4012,96 +4065,11 @@ static int recv_bookmarks(int peer_idx)
++total_recv_bookmarks; ++total_recv_bookmarks;
cleanup: cleanup:
END_TIMER(CRCP_TIMER_CKPT_PEER_R); END_TIMER(CRCP_TIMER_CKPT_EX_PEER_R);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_PEER_R, peer_idx); DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_EX_PEER_R, peer_idx, 1);
return exit_status; return exit_status;
} }
#else
/* Paired with send_bookmarks */
static int recv_bookmarks(int peer_idx)
{
ompi_crcp_coord_pml_peer_ref_t *peer_ref;
orte_process_name_t peer_name;
opal_buffer_t * buffer = NULL;
int exit_status = OMPI_SUCCESS;
int ret, tmp_int;
START_TIMER(CRCP_TIMER_CKPT_PEER_R);
/*
* Find the peer structure for this peer
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
if( NULL == (peer_ref = find_peer(peer_name))) {
orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: Could not find peer indexed %d\n",
peer_idx);
exit_status = OMPI_ERROR;
goto cleanup;
}
/*
* Receive the bookmark from peer
*/
if (NULL == (buffer = OBJ_NEW(opal_buffer_t))) {
exit_status = ORTE_ERROR;
goto cleanup;
}
if ( 0 > (ret = orte_rml.recv_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0) ) , 0) {
orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_name),
ret);
exit_status = ret;
goto cleanup;
}
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_send_msgs");
peer_ref->matched_send_msgs = tmp_int;
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_isend_msgs");
peer_ref->matched_isend_msgs = tmp_int;
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_send_init_msgs");
peer_ref->matched_send_init_msgs = tmp_int;
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_recv_msgs");
peer_ref->matched_recv_msgs = tmp_int;
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_irecv_msgs");
peer_ref->matched_irecv_msgs = tmp_int;
UNPACK_BUFFER(buffer, tmp_int, 1, OPAL_UINT32,
"crcp:coord: recv_bookmarks: Unable to unpack total_recv_init_msgs");
peer_ref->matched_recv_init_msgs = tmp_int;
ORTE_OUTPUT_VERBOSE((15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: %s <- %s Received bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_name),
peer_ref->matched_send_msgs,
peer_ref->matched_isend_msgs,
peer_ref->matched_send_init_msgs,
peer_ref->matched_recv_msgs,
peer_ref->matched_irecv_msgs,
peer_ref->matched_recv_init_msgs));
cleanup:
if(NULL != buffer) {
OBJ_RELEASE(buffer);
}
END_TIMER(CRCP_TIMER_CKPT_PEER_R);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_PEER_R, peer_idx);
return exit_status;
}
#endif
static void recv_bookmarks_cbfunc(int status, static void recv_bookmarks_cbfunc(int status,
orte_process_name_t* sender, orte_process_name_t* sender,
@ -4114,7 +4082,7 @@ static void recv_bookmarks_cbfunc(int status,
int ret, tmp_int; int ret, tmp_int;
orte_vpid_t peer_idx; orte_vpid_t peer_idx;
START_TIMER(CRCP_TIMER_CKPT_PEER_R); START_TIMER(CRCP_TIMER_CKPT_EX_PEER_R);
peer_idx = sender->vpid; peer_idx = sender->vpid;
@ -4161,8 +4129,8 @@ static void recv_bookmarks_cbfunc(int status,
peer_ref->matched_recv_init_msgs)); peer_ref->matched_recv_init_msgs));
cleanup: cleanup:
END_TIMER(CRCP_TIMER_CKPT_PEER_R); END_TIMER(CRCP_TIMER_CKPT_EX_PEER_R);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_PEER_R, peer_idx); DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_EX_PEER_R, peer_idx, 1);
--total_recv_bookmarks; --total_recv_bookmarks;
return; return;
@ -4179,12 +4147,15 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
int need, found; int need, found;
int exit_status = OMPI_SUCCESS; int exit_status = OMPI_SUCCESS;
int ret; int ret;
int total_details_sent = 0;
need = total_sent - total_matched; need = total_sent - total_matched;
found = 0; found = 0;
finished = false; finished = false;
assert(need > 0); assert(need > 0);
START_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_S);
/* /*
* Check the 'send_list' for this peer * Check the 'send_list' for this peer
*/ */
@ -4204,6 +4175,7 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
} }
found_match = false; found_match = false;
total_details_sent++;
if(OMPI_SUCCESS != (ret = do_send_msg_detail(peer_ref, msg_ref, &found_match, &finished)) ) { if(OMPI_SUCCESS != (ret = do_send_msg_detail(peer_ref, msg_ref, &found_match, &finished)) ) {
orte_output(mca_crcp_coord_component.super.output_handle, orte_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: send_msg_details: %s --> %s Failed to send message details to peer. Return %d\n", "crcp:coord: send_msg_details: %s --> %s Failed to send message details to peer. Return %d\n",
@ -4271,6 +4243,9 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
stall_for_completion = true; stall_for_completion = true;
} }
END_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_S);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_S, peer_ref->proc_name.vpid, total_details_sent);
return exit_status; return exit_status;
} }
@ -4404,11 +4379,14 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
int response; int response;
int exit_status = OMPI_SUCCESS; int exit_status = OMPI_SUCCESS;
int ret; int ret;
int total_details_recv = 0;
need = total_recv - total_matched; need = total_recv - total_matched;
found = 0; found = 0;
assert(need > 0); assert(need > 0);
START_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_R);
/* /*
* While we are still looking for messages to drain * While we are still looking for messages to drain
*/ */
@ -4422,6 +4400,7 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
/* /*
* Receive the message details from peer * Receive the message details from peer
*/ */
total_details_recv++;
if( OMPI_SUCCESS != (ret = do_recv_msg_detail(peer_ref, if( OMPI_SUCCESS != (ret = do_recv_msg_detail(peer_ref,
&p_rank, &p_comm_id, &p_rank, &p_comm_id,
&p_tag, &p_count, &p_tag, &p_count,
@ -4486,6 +4465,9 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
cleanup: cleanup:
END_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_R);
DISPLAY_INDV_TIMER(CRCP_TIMER_CKPT_CHECK_PEER_R, peer_ref->proc_name.vpid, total_details_recv);
return exit_status; return exit_status;
} }
@ -5074,28 +5056,69 @@ static void clear_timers(void) {
} }
static void display_all_timers(int state) { static void display_all_timers(int state) {
bool report_ready = false;
int i; int i;
for( i = 0; i <= CRCP_TIMER_CONT; ++i) { if( 0 != ORTE_PROC_MY_NAME->vpid && timing_enabled < 3) {
display_indv_timer(i, state); return;
} }
for( i = 0; i < CRCP_TIMER_MAX; ++i) {
if(timer_end[i] > 0.001) {
report_ready = true;
}
}
if( !report_ready ) {
return;
}
opal_output(0, "crcp:coord: timing(%20s): ******************** Begin: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
for( i = 0; i < CRCP_TIMER_MAX; ++i) {
display_indv_timer_core(i, 0, 0, false);
}
opal_output(0, "crcp:coord: timing(%20s): ******************** End: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
} }
static void display_indv_timer(int idx, int var) { static void display_indv_timer(int idx, int proc, int msgs) {
display_indv_timer_core(idx, proc, msgs, true);
}
static void display_indv_timer_core(int idx, int proc, int msgs, bool direct) {
double diff = timer_end[idx] - timer_start[idx]; double diff = timer_end[idx] - timer_start[idx];
char * str = NULL;
if( 0 != ORTE_PROC_MY_NAME->vpid ) { if( 0 != ORTE_PROC_MY_NAME->vpid && timing_enabled < 3 ) {
return; return;
} }
if( diff < 0.001 ) { /* Only display the timer if an end value was set */
if(timer_end[idx] <= 0.001) {
return; return;
} }
orte_output(0,
"crcp:coord: timing(%3d): %15s = %10.2f [%15.2f - %15.2f]\n", switch(idx) {
var, case CRCP_TIMER_CKPT_EX_PEER_S:
case CRCP_TIMER_CKPT_EX_PEER_R:
case CRCP_TIMER_CKPT_CHECK_PEER_S:
case CRCP_TIMER_CKPT_CHECK_PEER_R:
/* These timers do not mean anything in the aggregate, so only display
* them when directly asked for */
if( direct && timing_enabled >= 2) {
asprintf(&str, "Proc %2d, Msg %5d", proc, msgs);
} else {
return;
}
break;
default:
str = strdup("");
break;
}
opal_output(0,
"crcp:coord: timing(%20s): %20s = %10.2f s\n",
str,
timer_label[idx], timer_label[idx],
diff, diff);
timer_end[idx], free(str);
timer_start[idx]); str = NULL;
} }