Clean up some debugging output in the loadleveler ras module.
Error output strings were changed to be unique per code site. They are still fairly meaningless to the user, but developers can now at least tell which place in the code reported which error. This commit was SVN r20238.
Этот коммит содержится в:
родитель
c009b51ad3
Коммит
af45569366
@ -147,7 +147,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
/* Get the step ID from LOADL_STEP_ID environment variable. */
|
||||
if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: could not get LOADL_STEP_ID "
|
||||
"ras:loadleveler:get:hostlist: could not get LOADL_STEP_ID "
|
||||
"from environment!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -162,7 +162,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_query faild on JOBS!");
|
||||
"ras:loadleveler:get:hostlist: 1 ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -170,8 +170,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 > rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
"ras:loadleveler:get:hostlist: 1 ll_set_request failed: "
|
||||
"error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -179,21 +179,21 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs faild!");
|
||||
"ras:loadleveler:get:hostlist: 1 ll_get_objs faild!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"to match, got %d!", obj_count);
|
||||
"ras:loadleveler:get:hostlist: 1 ll_get_objs: expected "
|
||||
"one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
"ras:loadleveler:get:hostlist: 1 ll_get_data: failure. "
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (schedd_host_name != NULL) {
|
||||
@ -201,8 +201,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
job_step_list[1] = NULL;
|
||||
} else {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs() Error: Could not "
|
||||
"determine managing schedd for job %s.\n",
|
||||
"ras:loadleveler:get:hostlist: ll_get_data() Error: Could "
|
||||
"not determine managing schedd for job %s.\n",
|
||||
job_step_list[0]);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -215,7 +215,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_query faild on JOBS!");
|
||||
"ras:loadleveler:get:hostlist: 2 ll_query faild on JOBS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -223,8 +223,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
"ras:loadleveler:get:hostlist: 2 ll_set_request failed: "
|
||||
"error %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -233,49 +233,48 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
&err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
"ras:loadleveler:get:hostlist: 2 ll_get_objs faild!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"to match, got %d!", obj_count);
|
||||
"ras:loadleveler:get:hostlist: 2 ll_get_objs: expected "
|
||||
"one job to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
"ras:loadleveler:get:hostlist: 2 ll_get_data: failure. "
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (job_step_count != 1) { /* Only 1 Job Step object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"step to match, got %d!", obj_count);
|
||||
"ras:loadleveler:get:hostlist: 2 ll_get_data: expected "
|
||||
"one job step to match, got %d!", job_step_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step = NULL;
|
||||
if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
"ras:loadleveler:get:hostlist: 3 ll_get_data: failure on "
|
||||
"LL_JobGetFirstStep. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if(NULL == step) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ll_get_data() Error: Unable to obtain Job Step "
|
||||
"information.\n");
|
||||
"ras:loadleveler:get:hostlist: 3 ll_get_data: Error: "
|
||||
"Unable to obtain Job Step information.\n");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step_mode = -1;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"ras:loadleveler:get:hostlist: 4 ll_get_data: failure on "
|
||||
"LL_StepParallelMode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -283,7 +282,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
/* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */
|
||||
if ((step_mode != 0) && (step_mode != 1)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: We support only Serial and "
|
||||
"ras:loadleveler:get:hostlist: We support only Serial and "
|
||||
"Parallel LoadLeveler job types. PVM, NQS, and Blue Gene"
|
||||
"jobs are not supported by the LoadLeveler RAS!");
|
||||
return ORTE_ERROR;
|
||||
@ -293,31 +292,31 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_StepGetFirstNode. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_instance = NULL;
|
||||
rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName,
|
||||
&task_machine_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on serial LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
@ -327,8 +326,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_StepGetFirstNode. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -336,8 +335,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: failure "
|
||||
"on LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -346,8 +345,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure"
|
||||
" on LL_TaskIsMaster. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_TaskIsMaster. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -359,8 +358,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance,
|
||||
&task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: "
|
||||
"failure on LL_TaskGetFirstTaskInstance. "
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskGetFirstTaskInstance."
|
||||
" RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -373,7 +372,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
&task_machine_name);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data:"
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskInstanceMachineName"
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
@ -384,8 +383,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
&task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data:"
|
||||
" failure on LL_TaskGetNextInstance. "
|
||||
"ras:loadleveler:get:hostlist: ll_get_data:"
|
||||
" failure on LL_TaskGetNextTaskInstance. "
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -394,7 +393,7 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: "
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_NodeGetNextTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
@ -402,8 +401,8 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure "
|
||||
"on LL_StepGetNextNode. RC= %d!", rc);
|
||||
"ras:loadleveler:get:hostlist: ll_get_data: "
|
||||
"failure on LL_StepGetNextNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user