1
1

Cleaned up a lot of dead code in the xcpu soh component (soh_xcpu.c). Checked the fix submitted by Ralph Castain for completing processes in soh_xcpu. It's working fine now.

This commit was SVN r9554.
Этот коммит содержится в:
Sushant Sharma 2006-04-06 16:26:25 +00:00
родитель d5b0da555a
Коммит 26d51d5041
2 изменённых файлов: 9 добавлений и 163 удалений

Просмотреть файл

@ -41,87 +41,6 @@
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t);
static int orte_soh_xcpu_finalize(void);
#if 0
/**
 * Record in the registry that this process and its job have terminated.
 *
 * Builds a single GPR value on the job's segment carrying three keyvals
 * (process state, process exit code, job state) and stores it with
 * orte_gpr.put().
 *
 * @param jobid      Job whose registry segment is written.
 * @param proc_name  Unused: the tokens are taken from
 *                   orte_process_info.my_name, not from this name.
 * @return ORTE_SUCCESS when the value was stored; otherwise the error code
 *         of the first call that failed (also reported via ORTE_ERROR_LOG).
 */
static int update_registry(orte_jobid_t jobid, char *proc_name){
    orte_gpr_value_t *value;
    char *segment;
    orte_proc_state_t state = ORTE_PROC_STATE_TERMINATED;
    orte_job_state_t jstate = ORTE_JOB_STATE_TERMINATED;
    int rc;

    (void)proc_name;

    /* The segment name is used right below, so do not continue without it. */
    if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* NOTE(review): segment is never released in this function; confirm who
     * owns the string handed back by get_job_segment_name. */

    /* Three keyvals, no pre-allocated tokens (they are filled in next). */
    if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
                                                    segment, 3, 0))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Without tokens the value cannot be addressed, so stop on failure
     * instead of only logging it. */
    if (ORTE_SUCCESS != (rc = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens),
                                                          orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_PROC_STATE_KEY,
                                                     ORTE_PROC_STATE, &state))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* NOTE(review): the other keyvals pass a pointer to their data, this one
     * passes a literal 0; confirm that is what create_keyval expects for an
     * exit code of zero. */
    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_EXIT_CODE_KEY,
                                                     ORTE_INT, 0))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[2]), ORTE_JOB_STATE_KEY,
                                                     ORTE_JOB_STATE, &jstate))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_gpr.put(1, &value))) {
        ORTE_ERROR_LOG(rc);
    }

cleanup:
    /* Single exit path: the value is released exactly once and the caller
     * always sees the real status. */
    OBJ_RELEASE(value);
    return rc;
}
#endif
/*
static int do_update(){
return 1;
}
static void orte_soh_xcpu_notify_handler(int fd, short flags, void *user)
{
}
*/
/**
* Register a callback to receive xcpu update notifications
*/
int orte_soh_xcpu_module_init(void)
{
int rc;
@ -133,33 +52,8 @@ int orte_soh_xcpu_module_init(void)
}
return ORTE_SUCCESS;
} /*
* Set initial node status
*/
/* if(!do_update()){
fprintf(stderr, "do_update error\n");
}
*/
/*
* Now regiser notify event
*/
}
/*` mca_soh_xcpu_component.notify_fd = 0;*/ /*bproc_notifier();*/
/*
memset(&mca_soh_xcpu_component.notify_event, 0, sizeof(opal_event_t));
opal_event_set(
&mca_soh_xcpu_component.notify_event,
mca_soh_xcpu_component.notify_fd,
OPAL_EV_READ|OPAL_EV_PERSIST,
orte_soh_xcpu_notify_handler,
0);
opal_event_add(&mca_soh_xcpu_component.notify_event, 0);
return ORTE_SUCCESS;
}
*/
orte_soh_base_module_t orte_soh_xcpu_module = {
orte_soh_base_get_proc_soh,
orte_soh_base_set_proc_soh,
@ -171,78 +65,30 @@ orte_soh_base_module_t orte_soh_xcpu_module = {
orte_soh_xcpu_finalize
};
/**
 * @begin_monitoring: right now, it only tries to update the registry so
 * that mpirun can exit normally.
 * pls_xcpu waits for all threads to finish before calling this function.
 *
 * The live part looks up the processes of the job with
 * orte_ns.get_job_peers() and calls orte_soh_base_set_proc_soh() on each
 * of them with ORTE_PROC_STATE_TERMINATED and an exit code of 0, then
 * frees the peer array.
 *
 * NOTE(review): this listing comes from a commit diff whose +/- markers
 * were lost, so removed and added lines sit side by side below (the two
 * set_proc_soh calls, the stray "}else", the two final returns). Read
 * adjacent near-duplicates as alternatives, not as sequential statements,
 * and confirm the exact flow against the repository.
 *
 * @param jobid  Job whose processes are marked as terminated.
 * @return the error code from get_job_peers when the lookup fails.
 *         NOTE(review): both "return ORTE_SUCCESS;" and "return rc;" appear
 *         at the end; which one is live cannot be told from this listing.
 */
static int orte_soh_xcpu_begin_monitoring_job(orte_jobid_t jobid){
#if 0
/* Compiled out: variant that walks the job map and calls update_registry
 * once per process slot of every node. */
int rc, nprocs, i;
opal_list_item_t *item, *temp;
orte_rmaps_base_map_t* map;
opal_list_t mapping;
OBJ_CONSTRUCT(&mapping, opal_list_t);
/* 1. get the map for this job */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_map(jobid, &mapping))) {
ORTE_ERROR_LOG(rc);
return rc;
}
fprintf(stdout, "soh_xcpu: begin monitoring\n");
/* store the cell id of this process in the component */
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mca_soh_xcpu_component.cellid, orte_process_info.my_name))) {
fprintf(stderr, "soh_xcpu: get_cell_id error\n");
ORTE_ERROR_LOG(rc);
return rc;
}else
/* outer loop: every map in the list; inner loop: every node of that map */
for(item = opal_list_get_first(&mapping);
item != opal_list_get_end(&mapping);
item = opal_list_get_next(item)) {
map = (orte_rmaps_base_map_t*) item;
for(temp = opal_list_get_first(&map->nodes);
temp != opal_list_get_end(&map->nodes);
temp = opal_list_get_next(temp)){
/* one update_registry call per entry in the node's process list */
nprocs=((orte_rmaps_base_node_t*)temp)->node_procs.opal_list_length;
for (i = 0; i<nprocs; ++i) {
/*fprintf(stdout, "%s\n", ((orte_rmaps_base_node_t*)temp)->node->node_name);*/
update_registry(((orte_rmaps_base_node_t*)temp)->node->node_name);
}
}
}
#endif
/** all that is needed is to set the proc soh for all procs (not nodes) in the job */
int rc;
size_t num_procs, i;
orte_process_name_t *peers;
/* fetch the names of all processes in the job */
if (ORTE_SUCCESS != (rc = orte_ns.get_job_peers(&peers, &num_procs, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}else
/* mark each peer as terminated with exit code 0 */
for (i=0; i < num_procs; i++) {
if (ORTE_SUCCESS != (rc = orte_soh_base_set_proc_soh(peers[i], ORTE_PROC_STATE_TERMINATED, 0))) {
ORTE_ERROR_LOG(rc);
return rc;
if (ORTE_SUCCESS != (rc = orte_soh_base_set_proc_soh(&peers[i], ORTE_PROC_STATE_TERMINATED, 0)) ) {
ORTE_ERROR_LOG(rc);
break;
}
}
/* the peer array is released here by this function */
free(peers);
return ORTE_SUCCESS;
return rc;
}
/**
 * Finalize the xcpu SOH module.
 *
 * Writes a one-line trace to stdout and performs no other teardown: the
 * removal of the component's notify event is left disabled below.
 *
 * @return ORTE_SUCCESS, unconditionally.
 */
static int orte_soh_xcpu_finalize(void)
{
    fputs("soh_xcpu: finalize\n", stdout);

    /* disabled: opal_event_del(&mca_soh_xcpu_component.notify_event); */

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -80,7 +80,7 @@ static int orte_soh_xcpu_open(void)
mca_soh_xcpu_component.debug =
orte_soh_xcpu_param_register_int("debug", 0);
mca_soh_xcpu_component.priority =
orte_soh_xcpu_param_register_int("priority", 1);
orte_soh_xcpu_param_register_int("priority", 100);
/*fprintf(stdout, "soh_xcpu: open\n");*/
return ORTE_SUCCESS;
}