- Formatting and minor cleanup
- made it so we now set the architecture of each node we discover - remove debugging output This commit was SVN r11751.
Этот коммит содержится в:
родитель
601a6ca17a
Коммит
567676f3c1
@ -36,7 +36,7 @@ static int param_priority;
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_loadleveler_open(void);
|
||||
static orte_ras_base_module_t *ras_loadleveler_init(int*);
|
||||
static orte_ras_base_module_t *orte_ras_loadleveler_init(int*);
|
||||
|
||||
|
||||
orte_ras_base_component_t mca_ras_loadleveler_component = {
|
||||
@ -67,7 +67,7 @@ orte_ras_base_component_t mca_ras_loadleveler_component = {
|
||||
false
|
||||
},
|
||||
|
||||
ras_loadleveler_init
|
||||
orte_ras_loadleveler_init
|
||||
};
|
||||
|
||||
|
||||
@ -85,7 +85,7 @@ static int orte_ras_loadleveler_open(void)
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *ras_loadleveler_init(int* priority)
|
||||
static orte_ras_base_module_t *orte_ras_loadleveler_init(int* priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "ras_loadleveler.h"
|
||||
|
||||
|
||||
@ -54,6 +55,7 @@ static int orte_ras_loadleveler_allocate(orte_jobid_t jobid);
|
||||
static int orte_ras_loadleveler_deallocate(orte_jobid_t jobid);
|
||||
static int orte_ras_loadleveler_finalize(void);
|
||||
static int orte_ras_loadleveler_get_hostlist(int * num_hosts, char*** hostlist);
|
||||
static char* orte_ras_loadleveler_get_host_arch(char * hostname);
|
||||
|
||||
|
||||
/*
|
||||
@ -70,12 +72,10 @@ orte_ras_base_module_t orte_ras_loadleveler_module = {
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
/*
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
static int orte_ras_loadleveler_allocate(orte_jobid_t jobid)
|
||||
{
|
||||
int i, rc, ret;
|
||||
@ -110,7 +110,7 @@ static int orte_ras_loadleveler_allocate(orte_jobid_t jobid)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
node->node_name = strdup(hostlist[i]);
|
||||
node->node_arch = NULL;
|
||||
node->node_arch = orte_ras_loadleveler_get_host_arch(hostlist[i]);
|
||||
node->node_state = ORTE_NODE_STATE_UP;
|
||||
node->node_cellid = 0;
|
||||
node->node_slots_inuse = 0;
|
||||
@ -169,16 +169,17 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
/* Get the step ID from LOADL_STEP_ID environment variable. */
|
||||
if(NULL == (ll_step_id = getenv("LOADL_STEP_ID"))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: could not get LOADL_STEP_ID from environment!");
|
||||
"ras:loadleveler:allocate: could not get LOADL_STEP_ID "
|
||||
"from environment!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
job_step_list[0] = ll_step_id;
|
||||
job_step_list[1] = NULL;
|
||||
|
||||
/* STEP 1: Get Job object from Central Manager to find out the name of the Schedd */
|
||||
/* daemon that handles this job. In a Multicluster environment we can not get */
|
||||
/* the schedd name from the job step id. */
|
||||
/* STEP 1: Get Job object from Central Manager to find out the name of the
|
||||
* Schedd daemon that handles this job. In a Multicluster environment we
|
||||
* can not get the schedd name from the job step id. */
|
||||
|
||||
/* Initialize the LL API. Specify that query type is JOBS. */
|
||||
if(NULL == (queryObject = ll_query(JOBS))) {
|
||||
@ -188,14 +189,17 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
if(0 > (rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA))) {
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 > rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step. */
|
||||
if(NULL == (job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code))) {
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step */
|
||||
job = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs faild!");
|
||||
return ORTE_ERROR;
|
||||
@ -203,13 +207,15 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job to match, got %d!", obj_count);
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobSchedd, &schedd_host_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (schedd_host_name != NULL) {
|
||||
@ -236,59 +242,72 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
}
|
||||
|
||||
/* Specify that this is a QUERY_STEPID type of query. */
|
||||
if(0 != (rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA))) {
|
||||
rc = ll_set_request(queryObject, QUERY_STEPID, job_step_list, ALL_DATA);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step. */
|
||||
if(NULL == (job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count, &err_code))) {
|
||||
/* Get a Job object from LoadL_schedd that contains the relevant job step */
|
||||
job = ll_get_objs(queryObject, LL_SCHEDD, schedd_host_name, &obj_count,
|
||||
&err_code);
|
||||
if(NULL == job) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_set request failed: error %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_set request failed: error "
|
||||
"%d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if (obj_count != 1) { /* Only 1 Job object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job to match, got %d!", obj_count);
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
if(0 != (rc = ll_get_data(job, LL_JobStepCount, &job_step_count))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (job_step_count != 1) { /* Only 1 Job Step object is expected. */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job step to match, got %d!", obj_count);
|
||||
"ras:loadleveler:allocate: ll_get_objs: expected one job "
|
||||
"step to match, got %d!", obj_count);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step = NULL;
|
||||
if(0 != (rc = ll_get_data(job, LL_JobGetFirstStep, &step))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure. RC= %d!",
|
||||
rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (!step) {
|
||||
fprintf(stderr, "ll_get_data() Error: Unable to obtain Job Step information.\n");
|
||||
exit(2);
|
||||
if(NULL == step) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ll_get_data() Error: Unable to obtain Job Step "
|
||||
"information.\n");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
step_mode = -1;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepParallelMode, &step_mode))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepParallelMode. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_StepParallelMode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* Serial job step: step_mode == 0; Parallel: step_mode == 1; Others: 2, 3, 4. */
|
||||
/* Serial job step: step_mode==0; Parallel: step_mode==1; Others:2,3,4. */
|
||||
if ((step_mode != 0) && (step_mode != 1)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: We support only Serial and Parallel LoadLeveler job types."
|
||||
"PVM, NQS, and Blue Gene jobs are not supported by the LoadLeveler RAS!");
|
||||
"ras:loadleveler:allocate: We support only Serial and "
|
||||
"Parallel LoadLeveler job types. PVM, NQS, and Blue Gene"
|
||||
"jobs are not supported by the LoadLeveler RAS!");
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
@ -296,25 +315,31 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetFirstNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepGetFirstNode. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_StepGetFirstNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetFirstTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_NodeGetFirstTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance))) {
|
||||
rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskGetFirstInstance. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name))) {
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName,
|
||||
&task_machine_name))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
@ -342,38 +367,50 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
|
||||
while(task) { /* Loop through the "Task" objects. */
|
||||
ll_master_task = 0;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task))) {
|
||||
rc = ll_get_data(task, LL_TaskIsMaster, &ll_master_task);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskIsMaster. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure"
|
||||
" on LL_TaskIsMaster. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* The "master task" Task object is a LoadLeveler abstraction and is not relevant here. */
|
||||
/* Look at only Task objects that are not "master". */
|
||||
/* The "master task" Task object is a LoadLeveler abstraction
|
||||
* and is not relevant here. Look at only Task objects that
|
||||
* are not "master".*/
|
||||
if (!ll_master_task) {
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance, &task_instance))) {
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetFirstTaskInstance,
|
||||
&task_instance))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_TaskGetFirstTaskInstance."
|
||||
"ras:loadleveler:allocate: ll_get_data: "
|
||||
"failure on LL_TaskGetFirstTaskInstance. "
|
||||
" RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
while (task_instance) { /* Loop through the "Task Instance" objects. */
|
||||
/* Loop through the "Task Instance" objects. */
|
||||
while (task_instance) {
|
||||
task_machine_name = NULL;
|
||||
if(0 != (rc = ll_get_data(task_instance, LL_TaskInstanceMachineName, &task_machine_name))) {
|
||||
rc = ll_get_data(task_instance,
|
||||
LL_TaskInstanceMachineName,
|
||||
&task_machine_name);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskInstanceMachineName. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data:"
|
||||
" failure on LL_TaskInstanceMachineName"
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
opal_argv_append(num_hosts, hostlist, task_machine_name);
|
||||
printf("added %s\n", task_machine_name);
|
||||
task_instance = NULL;
|
||||
if(0 != (rc = ll_get_data(task, LL_TaskGetNextTaskInstance, &task_instance))) {
|
||||
rc = ll_get_data(task, LL_TaskGetNextTaskInstance,
|
||||
&task_instance);
|
||||
if(0 != rc) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on "
|
||||
"LL_TaskGetNextInstance. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data:"
|
||||
" failure on LL_TaskGetNextInstance. "
|
||||
"RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
@ -381,14 +418,16 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
task = NULL;
|
||||
if(0 != (rc = ll_get_data(node, LL_NodeGetNextTask, &task))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_NodeGetNextTask. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: "
|
||||
"failure on LL_NodeGetNextTask. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
node = NULL;
|
||||
if(0 != (rc = ll_get_data(step, LL_StepGetNextNode, &node))) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"ras:loadleveler:allocate: ll_get_data: failure on LL_StepGetNextNode. RC= %d!", rc);
|
||||
"ras:loadleveler:allocate: ll_get_data: failure "
|
||||
"on LL_StepGetNextNode. RC= %d!", rc);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
}
|
||||
@ -397,3 +436,50 @@ static int orte_ras_loadleveler_get_hostlist(int* num_hosts, char*** hostlist)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the machine arch from LoadLeveler
|
||||
* Will return NULL on error or a arch string that needs to be freed
|
||||
* (some code from the IBM documentation, licensed as above)
|
||||
*/
|
||||
static char* orte_ras_loadleveler_get_host_arch(char * hostname) {
|
||||
LL_element *queryObject, *machine;
|
||||
int rc, obj_count, err_code;
|
||||
char * hostlist[2];
|
||||
char * arch;
|
||||
|
||||
/* Initialize the query: Machine query */
|
||||
queryObject = ll_query(MACHINES);
|
||||
if(NULL == queryObject) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Set query parameters: query specific machines by name */
|
||||
hostlist[0] = hostname;
|
||||
hostlist[1] = NULL;
|
||||
|
||||
rc = ll_set_request(queryObject, QUERY_HOST, hostlist, ALL_DATA);
|
||||
if(0 != rc) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Get the machine objects from the LoadL_negotiator (central manager) daemon */
|
||||
machine = ll_get_objs(queryObject, LL_CM, NULL, &obj_count, &err_code);
|
||||
if(NULL == machine || 1 != obj_count) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Process the machine object */
|
||||
rc = ll_get_data(machine, LL_MachineArchitecture, &arch);
|
||||
if(0 != rc) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Free objects obtained from Negotiator */
|
||||
ll_free_objs(queryObject);
|
||||
|
||||
/* Free query element */
|
||||
ll_deallocate(queryObject);
|
||||
|
||||
return arch;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user