Add some debugging to the grpcomm module. Setting grpcomm_base_verbose = 1 will now give you a trace through the functions as they are called. Setting it to 2 or more will give you details on what each function is doing as it works through its procedure.
This commit was SVN r17848.
Этот коммит содержится в:
родитель
6ad96df8bc
Коммит
8cd6142e6d
@ -73,7 +73,7 @@ static int xcast(orte_jobid_t job,
|
|||||||
struct timeval start, stop;
|
struct timeval start, stop;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
"%s xcast sent to job %s tag %ld",
|
"%s grpcomm:xcast sent to job %s tag %ld",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(job), (long)tag));
|
ORTE_JOBID_PRINT(job), (long)tag));
|
||||||
|
|
||||||
@ -87,7 +87,7 @@ static int xcast(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
"%s xcast: num_procs %ld linear xover: %ld binomial xover: %ld",
|
"%s grpcomm:xcast: num_procs %ld linear xover: %ld binomial xover: %ld",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(long)orte_process_info.num_procs,
|
(long)orte_process_info.num_procs,
|
||||||
(long)orte_grpcomm_basic.xcast_linear_xover,
|
(long)orte_grpcomm_basic.xcast_linear_xover,
|
||||||
@ -132,7 +132,7 @@ DONE:
|
|||||||
|
|
||||||
if (orte_timing) {
|
if (orte_timing) {
|
||||||
gettimeofday(&stop, NULL);
|
gettimeofday(&stop, NULL);
|
||||||
opal_output(0, "%s xcast: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
opal_output(0, "%s grpcomm:xcast: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
|
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
|
||||||
(stop.tv_usec - start.tv_usec)));
|
(stop.tv_usec - start.tv_usec)));
|
||||||
}
|
}
|
||||||
@ -149,7 +149,7 @@ static int xcast_binomial_tree(orte_jobid_t job,
|
|||||||
opal_buffer_t *buf;
|
opal_buffer_t *buf;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
"%s xcast_binomial",
|
"%s grpcomm:entering xcast_binomial",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* binomial xcast can only go through the daemons as app procs are
|
/* binomial xcast can only go through the daemons as app procs are
|
||||||
@ -208,7 +208,7 @@ static int xcast_binomial_tree(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
"%s xcast_binomial: buffer size %ld",
|
"%s grpcomm:xcast_binomial: buffer size %ld",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(long)buf->bytes_used));
|
(long)buf->bytes_used));
|
||||||
|
|
||||||
@ -216,8 +216,8 @@ static int xcast_binomial_tree(orte_jobid_t job,
|
|||||||
* will ensure everyone else gets it!
|
* will ensure everyone else gets it!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
"%s xcast_binomial: sending %s => %s",
|
"%s grpcomm:xcast_binomial: sending %s => %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
|
||||||
@ -242,6 +242,10 @@ static int xcast_binomial_tree(orte_jobid_t job,
|
|||||||
CLEANUP:
|
CLEANUP:
|
||||||
OBJ_RELEASE(buf);
|
OBJ_RELEASE(buf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm:xcast_binomial: completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -257,7 +261,7 @@ static int xcast_linear(orte_jobid_t job,
|
|||||||
orte_grpcomm_mode_t mode;
|
orte_grpcomm_mode_t mode;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
"%s xcast_linear",
|
"%s grpcomm:entering xcast_linear",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* since we have to pack some additional info into the buffer to be
|
/* since we have to pack some additional info into the buffer to be
|
||||||
@ -323,7 +327,7 @@ static int xcast_linear(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
"%s xcast_linear: buffer size %ld",
|
"%s grpcomm:xcast_linear: buffer size %ld",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(long)buf->bytes_used));
|
(long)buf->bytes_used));
|
||||||
|
|
||||||
@ -351,7 +355,7 @@ static int xcast_linear(orte_jobid_t job,
|
|||||||
for (i=0; i < range; i++) {
|
for (i=0; i < range; i++) {
|
||||||
dummy.vpid = i;
|
dummy.vpid = i;
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
"%s xcast_linear: %s => %s",
|
"%s grpcomm:xcast_linear: %s => %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&dummy)));
|
ORTE_NAME_PRINT(&dummy)));
|
||||||
@ -371,6 +375,11 @@ static int xcast_linear(orte_jobid_t job,
|
|||||||
CLEANUP:
|
CLEANUP:
|
||||||
/* release the buffer */
|
/* release the buffer */
|
||||||
OBJ_RELEASE(buf);
|
OBJ_RELEASE(buf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm:xcast_linear: completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -382,6 +391,10 @@ static int relay_via_hnp(orte_jobid_t job,
|
|||||||
orte_grpcomm_mode_t mode;
|
orte_grpcomm_mode_t mode;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: relaying buffer to HNP",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* since we have to pack some additional info into the buffer
|
/* since we have to pack some additional info into the buffer
|
||||||
* for this case, we create a new buffer to contain all the
|
* for this case, we create a new buffer to contain all the
|
||||||
* info needed plus the payload
|
* info needed plus the payload
|
||||||
@ -435,6 +448,11 @@ static int relay_via_hnp(orte_jobid_t job,
|
|||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
OBJ_RELEASE(buf);
|
OBJ_RELEASE(buf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: buffer relayed to HNP",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -450,7 +468,7 @@ static int xcast_direct(orte_jobid_t job,
|
|||||||
orte_rml_tag_t target=tag;
|
orte_rml_tag_t target=tag;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
"%s xcast_direct",
|
"%s grpcomm: entering xcast_direct",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* if I am an application proc */
|
/* if I am an application proc */
|
||||||
@ -650,6 +668,11 @@ CLEANUP:
|
|||||||
if (NULL != buf) {
|
if (NULL != buf) {
|
||||||
OBJ_RELEASE(buf);
|
OBJ_RELEASE(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: xcast_direct completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -660,6 +683,10 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
|||||||
orte_vpid_t i;
|
orte_vpid_t i;
|
||||||
opal_buffer_t tmpbuf;
|
opal_buffer_t tmpbuf;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: entering allgather",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* everything happens within my jobid */
|
/* everything happens within my jobid */
|
||||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
|
|
||||||
@ -743,6 +770,10 @@ static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
|||||||
/* don't need the received buffer - we already have what we need */
|
/* don't need the received buffer - we already have what we need */
|
||||||
OBJ_DESTRUCT(&tmpbuf);
|
OBJ_DESTRUCT(&tmpbuf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: allgather completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -754,6 +785,10 @@ static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t
|
|||||||
opal_buffer_t tmpbuf;
|
opal_buffer_t tmpbuf;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: entering allgather_list",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* the first entry on the list is the "root" that collects
|
/* the first entry on the list is the "root" that collects
|
||||||
* all the data - everyone else just sends and gets back
|
* all the data - everyone else just sends and gets back
|
||||||
* the results
|
* the results
|
||||||
@ -762,6 +797,10 @@ static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t
|
|||||||
|
|
||||||
if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
|
if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
|
||||||
/* everyone but root sends data */
|
/* everyone but root sends data */
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
|
"%s allgather_list: sending my data to %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&root->name)));
|
||||||
if (0 > (rc = orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0))) {
|
if (0 > (rc = orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
@ -771,6 +810,9 @@ static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
|
"%s allgather_list: received result",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -806,6 +848,10 @@ static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t
|
|||||||
OBJ_DESTRUCT(&tmpbuf);
|
OBJ_DESTRUCT(&tmpbuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
|
"%s allgather_list: received all data",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* broadcast the results */
|
/* broadcast the results */
|
||||||
for (item = opal_list_get_first(names);
|
for (item = opal_list_get_first(names);
|
||||||
item != opal_list_get_end(names);
|
item != opal_list_get_end(names);
|
||||||
@ -824,6 +870,10 @@ static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: allgather_list completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -835,6 +885,10 @@ static int barrier(void)
|
|||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: entering barrier",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* everything happens within the same jobid */
|
/* everything happens within the same jobid */
|
||||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
|
|
||||||
@ -912,6 +966,10 @@ static int barrier(void)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
OBJ_DESTRUCT(&buf);
|
OBJ_DESTRUCT(&buf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: barrier completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1303,6 +1361,10 @@ static int modex(opal_list_t *procs)
|
|||||||
modex_attr_data_t *attr_data;
|
modex_attr_data_t *attr_data;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: modex entered",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* setup the buffer that will actually be sent */
|
/* setup the buffer that will actually be sent */
|
||||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
OBJ_CONSTRUCT(&rbuf, opal_buffer_t);
|
OBJ_CONSTRUCT(&rbuf, opal_buffer_t);
|
||||||
@ -1331,6 +1393,10 @@ static int modex(opal_list_t *procs)
|
|||||||
}
|
}
|
||||||
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
|
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
|
"%s modex: executing allgather",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* exchange the buffer with the list of peers (if provided) or all my peers */
|
/* exchange the buffer with the list of peers (if provided) or all my peers */
|
||||||
if (NULL == procs) {
|
if (NULL == procs) {
|
||||||
if (ORTE_SUCCESS != (rc = allgather(&buf, &rbuf))) {
|
if (ORTE_SUCCESS != (rc = allgather(&buf, &rbuf))) {
|
||||||
@ -1344,6 +1410,10 @@ static int modex(opal_list_t *procs)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
|
||||||
|
"%s modex: processing modex info",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
/* process the results */
|
/* process the results */
|
||||||
/* extract the number of procs that put data in the buffer */
|
/* extract the number of procs that put data in the buffer */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
@ -1443,6 +1513,10 @@ cleanup:
|
|||||||
OBJ_DESTRUCT(&buf);
|
OBJ_DESTRUCT(&buf);
|
||||||
OBJ_DESTRUCT(&rbuf);
|
OBJ_DESTRUCT(&rbuf);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
|
||||||
|
"%s grpcomm: modex completed",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user