1
1

Make sure that the epoch is initialized everywhere so we don't get weird output
when running under valgrind. This shouldn't have caused any problems with any
actual execution; it only produced extra warnings in valgrind.

This commit was SVN r25015.
Этот коммит содержится в:
Wesley Bland 2011-08-08 15:11:55 +00:00
родитель 3a6e9b19ee
Коммит 09274cd047
29 изменённых файлов: 48 добавлений и 11 удалений

Просмотреть файл

@ -5284,6 +5284,7 @@ static int send_bookmarks(int peer_idx)
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if( NULL == (peer_ref = find_peer(peer_name))) {
@ -5345,6 +5346,7 @@ static int recv_bookmarks(int peer_idx)
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,

Просмотреть файл

@ -362,7 +362,7 @@ int ompi_proc_refresh(void) {
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.epoch = ORTE_EPOCH_INVALID;
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
/* Make sure to clear the local flag before we set it below */

Просмотреть файл

@ -351,6 +351,7 @@ static int alps_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,

1
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -392,6 +392,7 @@ static int env_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,

Просмотреть файл

@ -357,6 +357,7 @@ static int lsf_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
/* fix up the base name and make it the "real" name */

Просмотреть файл

@ -280,6 +280,7 @@ static int slave_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,

Просмотреть файл

@ -368,6 +368,7 @@ static int slurm_set_name(void)
/* fix up the vpid and make it the "real" vpid */
slurm_nodeid = atoi(getenv("SLURM_NODEID"));
ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,

Просмотреть файл

@ -168,7 +168,7 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
/* I send first */
peer.vpid = vpids[1];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* setup a temp buffer so I can inform the other side as to the
@ -226,7 +226,7 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
opal_dss.copy_payload(&buf, sendbuf);
peer.vpid = vpids[0];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
@ -320,7 +320,7 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
/* first send my current contents */
nv = (rank - distance + np) % np;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&buf, opal_buffer_t);
@ -340,7 +340,7 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
num_recvd = 0;
nv = (rank + distance) % np;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
@ -439,7 +439,7 @@ static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int
/* first send my current contents */
nv = rank ^ distance;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&buf, opal_buffer_t);
@ -646,6 +646,7 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
proc.jobid = jobid;
proc.vpid = 0;
while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
proc.epoch = ORTE_EPOCH_INVALID;
proc.epoch = orte_ess.proc_get_epoch(&proc);
/* get the daemon that hosts this proc */
@ -712,6 +713,7 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
/* send it */
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
my_parent.vpid = orte_routed.get_routing_tree(NULL);
my_parent.epoch = ORTE_EPOCH_INVALID;
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,

Просмотреть файл

@ -281,6 +281,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
&mca_iof_hnp_component.sinks);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = proc->node->daemon->name.vpid;
sink->daemon.epoch = ORTE_EPOCH_INVALID;
sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon);
}
}

Просмотреть файл

@ -734,6 +734,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
proc.jobid = jobdat->jobid;
for (j=0; j < jobdat->num_procs; j++) {
proc.vpid = j;
proc.epoch = ORTE_EPOCH_INVALID;
proc.epoch = orte_ess.proc_get_epoch(&proc);
/* get the vpid of the daemon that is to host this proc */
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) {

Просмотреть файл

@ -200,6 +200,7 @@ int orte_odls_base_open(void)
* will be in the job - we'll check later
*/
nm->name.vpid = rank;
nm->name.epoch = ORTE_EPOCH_INVALID;
nm->name.epoch = orte_ess.proc_get_epoch(&nm->name);
}
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);

Просмотреть файл

@ -377,6 +377,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
/* push stdin - the IOF will know what to do with the specified target */
name.jobid = job;
name.vpid = jdata->stdin_target;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {

Просмотреть файл

@ -163,6 +163,7 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
continue;
}
peer.vpid = v;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* don't worry about errors on the send here - just
@ -339,6 +340,7 @@ int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
continue;
}
peer.vpid = v;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* check to see if this daemon is known to be "dead" */
if (proc->state > ORTE_PROC_STATE_UNTERMINATED) {

Просмотреть файл

@ -394,6 +394,7 @@ static void process_msg(int fd, short event, void *data)
break;
}
name.vpid = vpid;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
/* unpack the pid */

Просмотреть файл

@ -559,6 +559,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (ORTE_NODE_RANK_INVALID == proc->name.epoch) {
@ -600,6 +601,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
}
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
@ -1012,6 +1014,7 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = jdata->num_procs; /* take the next available vpid */
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
proc->node = node;
proc->nodename = node->name;

Просмотреть файл

@ -502,6 +502,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
proc->name.vpid = rank;
/* Either init or update the epoch. */
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
proc->slot_list = strdup(rfmap->slot_list);

Просмотреть файл

@ -235,6 +235,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
}
/* assign the vpid */
proc->name.vpid = vpid++;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
/* add to the jdata proc array */

Просмотреть файл

@ -363,7 +363,6 @@ rml_oob_queued_progress(int fd, short event, void *arg)
origin = hdr->origin;
next = orte_routed.get_route(&hdr->destination);
#if 0
if (next.vpid == ORTE_VPID_INVALID) {
opal_output(0,
"%s:queued progress tried routing message from %s to %s:%d, can't find route",
@ -374,7 +373,6 @@ rml_oob_queued_progress(int fd, short event, void *arg)
opal_backtrace_print(stderr);
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
}
#endif
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
opal_output(0, "%s:queued progress trying to get message from %s to %s:%d, routing loop",
@ -475,7 +473,6 @@ rml_oob_recv_route_callback(int status,
next = orte_routed.get_route(&hdr->destination);
if (next.vpid == ORTE_VPID_INVALID) {
#if 0
opal_output(0, "%s:route_callback tried routing message from %s to %s:%d, can't find route",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&origin),
@ -483,7 +480,6 @@ rml_oob_recv_route_callback(int status,
hdr->tag);
opal_backtrace_print(stderr);
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
#endif
return;
}

Просмотреть файл

@ -274,6 +274,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
return ORTE_SUCCESS;
@ -289,6 +290,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
@ -459,6 +461,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
ret = &daemon;
found:
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
@ -1007,6 +1010,7 @@ static int update_routing_tree(orte_jobid_t jobid)
ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
orte_process_info.max_procs,
&num_children, &my_children, NULL, true, jobid);
ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {

Просмотреть файл

@ -257,6 +257,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
return ORTE_SUCCESS;
@ -272,6 +273,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
@ -365,6 +367,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
/* if the daemon is me, then send direct to the target! */
@ -811,6 +814,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = ORTE_EPOCH_INVALID;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
lifeline = &local_lifeline;

Просмотреть файл

@ -373,6 +373,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
/* if the daemon is me, then send direct to the target! */
@ -394,6 +395,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* we are at end of chain - wrap around */
daemon.vpid = 0;
}
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
}

Просмотреть файл

@ -413,6 +413,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
/* yep - we need to step through this child */
daemon.vpid = child->vpid;
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
goto found;
@ -424,6 +425,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
* any of our children, so we have to step up through our parent
*/
daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
@ -879,6 +881,7 @@ static int update_routing_tree(orte_jobid_t jobid)
ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel;
ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel);
}
ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
/* compute my direct children and the bitmap that shows which vpids

Просмотреть файл

@ -275,6 +275,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = ORTE_EPOCH_INVALID;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
lifeline = &local_lifeline;

Просмотреть файл

@ -1216,6 +1216,7 @@ static int orte_sstore_central_extract_global_metadata(orte_sstore_central_globa
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
vpid_snapshot->crs_comp = NULL;

Просмотреть файл

@ -1706,6 +1706,7 @@ static int orte_sstore_stage_extract_global_metadata(orte_sstore_stage_global_sn
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
/* JJH: Currently we do not have this information since we do not save

Просмотреть файл

@ -129,6 +129,7 @@ static void send_relay(opal_buffer_t *buf)
continue;
}
target.epoch = ORTE_EPOCH_INVALID;
if (ORTE_NODE_RANK_INVALID == (target.epoch = orte_ess.proc_get_epoch(&target))) {
/* If we are trying to send to a previously failed process it's
* better to fail silently. */

Просмотреть файл

@ -74,7 +74,7 @@ main(int argc, char *argv[]){
for (j=1; j < count+1; j++) {
peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* rank0 starts ring */

Просмотреть файл

@ -41,6 +41,7 @@ main(int argc, char *argv[]){
if( right_peer_orte_name.vpid >= num_peers ) {
right_peer_orte_name.vpid = 0;
}
right_peer_orte_name.epoch = ORTE_EPOCH_INVALID;
right_peer_orte_name.epoch = orte_ess.proc_get_epoch(&right_peer_orte_name);
left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid;
@ -48,6 +49,7 @@ main(int argc, char *argv[]){
if( ORTE_PROC_MY_NAME->vpid == 0 ) {
left_peer_orte_name.vpid = num_peers - 1;
}
left_peer_orte_name.epoch = ORTE_EPOCH_INVALID;
left_peer_orte_name.epoch = orte_ess.proc_get_epoch(&left_peer_orte_name);
printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is %s\n",

Просмотреть файл

@ -74,6 +74,7 @@ int main(int argc, char* argv[])
for (i=0; i < app->num_procs; i++) {
name.vpid = i;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
fprintf(stderr, "Parent: sending message to child %s\n", ORTE_NAME_PRINT(&name));
if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) {