1
1
This commit was SVN r18889.
Этот коммит содержится в:
Ralph Castain 2008-07-11 15:40:25 +00:00
родитель 7834201f69
Коммит 58964b2bf8
4 изменённых файлов: 57 добавлений и 81 удалений

Просмотреть файл

@ -35,6 +35,7 @@
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "opal/class/opal_pointer_array.h" #include "opal/class/opal_pointer_array.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
@ -309,55 +310,38 @@ static int lsf_set_name(void)
{ {
int rc; int rc;
int id; int id;
char* name_string = NULL;
int lsf_nodeid; int lsf_nodeid;
orte_jobid_t jobid;
/* start by getting our jobid, and vpid (which is the orte_vpid_t vpid;
starting vpid for the list of daemons) */ char* jobid_string;
id = mca_base_param_register_string("orte", "ess", "name", NULL, NULL); char* vpid_string;
mca_base_param_lookup_string(id, &name_string);
if (name_string != NULL) {
if (ORTE_SUCCESS !=
(rc = orte_util_convert_string_to_process_name(&ORTE_PROC_MY_NAME, name_string))) {
ORTE_ERROR_LOG(rc);
free(name_string);
return rc;
}
free(name_string);
} else {
orte_jobid_t jobid;
orte_vpid_t vpid;
char* jobid_string;
char* vpid_string;
id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL);
mca_base_param_lookup_string(id, &jobid_string); mca_base_param_lookup_string(id, &jobid_string);
if (NULL == jobid_string) { if (NULL == jobid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS !=
(rc = orte_util_convert_string_to_jobid(&jobid, jobid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL);
mca_base_param_lookup_string(id, &vpid_string);
if (NULL == vpid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS !=
(rc = orte_util_convert_string_to_vpid(&vpid, vpid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
} }
if (ORTE_SUCCESS !=
(rc = orte_util_convert_string_to_jobid(&jobid, jobid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL);
mca_base_param_lookup_string(id, &vpid_string);
if (NULL == vpid_string) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (ORTE_SUCCESS !=
(rc = orte_util_convert_string_to_vpid(&vpid, vpid_string))) {
ORTE_ERROR_LOG(rc);
return(rc);
}
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
/* fix up the base name and make it the "real" name */ /* fix up the base name and make it the "real" name */
lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));

Просмотреть файл

@ -28,17 +28,17 @@
BEGIN_C_DECLS BEGIN_C_DECLS
struct orte_plm_lsf_component_t { struct orte_plm_lsf_component_t {
orte_plm_base_component_t super; orte_plm_base_component_t super;
int priority; int priority;
bool timing; bool timing;
char *orted; char *orted;
}; };
typedef struct orte_plm_lsf_component_t orte_plm_lsf_component_t; typedef struct orte_plm_lsf_component_t orte_plm_lsf_component_t;
/* Globally exported variables */ /* Globally exported variables */
ORTE_DECLSPEC extern orte_plm_lsf_component_t mca_plm_lsf_component; ORTE_DECLSPEC extern orte_plm_lsf_component_t mca_plm_lsf_component;
extern orte_plm_base_module_t orte_plm_lsf_module; extern orte_plm_base_module_t orte_plm_lsf_module;
END_C_DECLS END_C_DECLS

Просмотреть файл

@ -29,9 +29,9 @@
#include <lsf/lsbatch.h> #include <lsf/lsbatch.h>
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
@ -95,7 +95,6 @@ orte_plm_lsf_component_t mca_plm_lsf_component = {
static int plm_lsf_open(void) static int plm_lsf_open(void)
{ {
int tmp, value;
mca_base_component_t *comp = &mca_plm_lsf_component.super.base_version; mca_base_component_t *comp = &mca_plm_lsf_component.super.base_version;
mca_base_param_reg_int(comp, "priority", "Default selection priority", mca_base_param_reg_int(comp, "priority", "Default selection priority",
@ -106,15 +105,6 @@ static int plm_lsf_open(void)
false, false, "orted", false, false, "orted",
&mca_plm_lsf_component.orted); &mca_plm_lsf_component.orted);
tmp = mca_base_param_reg_int_name("orte", "timing",
"Request that critical timing loops be measured",
false, false, 0, &value);
if (value != 0) {
mca_plm_lsf_component.timing = true;
} else {
mca_plm_lsf_component.timing = false;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -131,7 +121,7 @@ static int orte_plm_lsf_component_query(mca_base_module_t **module, int *priorit
/* check if lsf is running here */ /* check if lsf is running here */
if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) { if (NULL == getenv("LSB_JOBID") || lsb_init("ORTE launcher") < 0) {
/* nope, not here */ /* nope, not here */
opal_output_verbose(10, orte_plm_base.plm_output, opal_output_verbose(10, orte_plm_globals.output,
"plm:lsf: NOT available for selection"); "plm:lsf: NOT available for selection");
*module = NULL; *module = NULL;
return ORTE_ERROR; return ORTE_ERROR;

Просмотреть файл

@ -55,12 +55,12 @@
#include "opal/mca/installdirs/installdirs.h" #include "opal/mca/installdirs/installdirs.h"
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "orte/util/show_help.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
#include "opal/util/path.h" #include "opal/util/path.h"
#include "opal/util/basename.h" #include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h" #include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -88,7 +88,7 @@ static int plm_lsf_finalize(void);
/* /*
* Global variable * Global variable
*/ */
orte_plm_base_module_1_3_0_t orte_plm_lsf_module = { orte_plm_base_module_t orte_plm_lsf_module = {
plm_lsf_init, plm_lsf_init,
orte_plm_base_set_hnp_name, orte_plm_base_set_hnp_name,
plm_lsf_launch_job, plm_lsf_launch_job,
@ -130,7 +130,6 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
int argc; int argc;
int rc; int rc;
char** env = NULL; char** env = NULL;
char* var;
char **nodelist_argv; char **nodelist_argv;
int nodelist_argc; int nodelist_argc;
char *vpid_string; char *vpid_string;
@ -147,7 +146,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
/* default to declaring the daemons failed*/ /* default to declaring the daemons failed*/
failed_job = ORTE_PROC_MY_NAME->jobid; failed_job = ORTE_PROC_MY_NAME->jobid;
if (mca_plm_lsf_component.timing) { if (orte_timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) { if (0 != gettimeofday(&joblaunchstart, NULL)) {
opal_output(0, "plm_lsf: could not obtain job start time"); opal_output(0, "plm_lsf: could not obtain job start time");
} }
@ -257,8 +256,8 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
don't support different --prefix'es for different nodes in don't support different --prefix'es for different nodes in
the SLURM plm) */ the SLURM plm) */
cur_prefix = NULL; cur_prefix = NULL;
for (i=0; i < map->num_apps; i++) { for (i=0; i < jdata->num_apps; i++) {
char * app_prefix_dir = map->apps[i]->prefix_dir; char * app_prefix_dir = apps[i]->prefix_dir;
/* Check for already set cur_prefix_dir -- if different, /* Check for already set cur_prefix_dir -- if different,
complain */ complain */
if (NULL != app_prefix_dir) { if (NULL != app_prefix_dir) {
@ -284,7 +283,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
/* setup environment */ /* setup environment */
env = opal_argv_copy(orte_launch_environ); env = opal_argv_copy(orte_launch_environ);
if (mca_plm_lsf_component.timing) { if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) { if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "plm_lsf: could not obtain start time"); opal_output(0, "plm_lsf: could not obtain start time");
} }
@ -306,7 +305,10 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
/* wait for daemons to callback */ /* wait for daemons to callback */
if (ORTE_SUCCESS != if (ORTE_SUCCESS !=
(rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
ORTE_ERROR_LOG(rc); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:lsf: daemon launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup; goto cleanup;
} }
@ -314,14 +316,17 @@ launch_apps:
/* daemons succeeded - any failure now would be from apps */ /* daemons succeeded - any failure now would be from apps */
failed_job = active_job; failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
ORTE_ERROR_LOG(rc); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:lsf: launch of apps failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup; goto cleanup;
} }
/* declare the launch a success */ /* declare the launch a success */
failed_launch = false; failed_launch = false;
if (mca_plm_lsf_component.timing) { if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) { if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "plm_lsf: could not obtain stop time"); opal_output(0, "plm_lsf: could not obtain stop time");
} else { } else {
@ -340,9 +345,6 @@ launch_apps:
} }
cleanup: cleanup:
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != argv) { if (NULL != argv) {
opal_argv_free(argv); opal_argv_free(argv);
} }