Checkpoint the bproc support. Everything now compiles except for the PLM module
This commit was SVN r18744.
Этот коммит содержится в:
родитель
dd563f9297
Коммит
9cebe0ca96
@ -21,63 +21,66 @@
|
||||
* See odls_bproc.h for an overview of how it works.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#include <pty.h>
|
||||
#endif
|
||||
#include <dirent.h>
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/iof_base_setup.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/univ_info.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "odls_bproc.h"
|
||||
|
||||
static int orte_odls_bproc_launch_local_procs(opal_buffer_t *data);
|
||||
static int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
static int orte_odls_bproc_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
|
||||
|
||||
/**
|
||||
* Initialization of the bproc_orted module with all the needed function pointers
|
||||
*/
|
||||
orte_odls_base_module_t orte_odls_bproc_module = {
|
||||
orte_odls_bproc_subscribe_launch_data,
|
||||
orte_odls_bproc_get_add_procs_data,
|
||||
orte_odls_base_default_get_add_procs_data,
|
||||
orte_odls_bproc_launch_local_procs,
|
||||
orte_odls_bproc_kill_local_procs,
|
||||
orte_odls_bproc_signal_local_procs
|
||||
orte_odls_bproc_signal_local_procs,
|
||||
orte_odls_base_default_deliver_message,
|
||||
orte_odls_base_default_require_sync,
|
||||
orte_odls_base_default_collect_data
|
||||
};
|
||||
|
||||
static int odls_bproc_make_dir(char *directory);
|
||||
static char * odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context);
|
||||
orte_std_cntr_t app_context);
|
||||
static void odls_bproc_delete_dir_tree(char * path);
|
||||
static int odls_bproc_remove_dir(void);
|
||||
static void odls_bproc_send_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata);
|
||||
opal_buffer_t* buffer, int tag, void* cbdata);
|
||||
static int odls_bproc_setup_stdio(orte_process_name_t *proc_name,
|
||||
int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context, bool connect_stdin);
|
||||
|
||||
|
||||
int orte_odls_bproc_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context, bool connect_stdin);
|
||||
|
||||
/* Local globals */
|
||||
static char *user = NULL;
|
||||
static char *frontend = NULL;
|
||||
|
||||
/**
|
||||
* Creates the passed directory. If the directory already exists, it and its
|
||||
@ -115,39 +118,36 @@ static char *
|
||||
odls_bproc_get_base_dir_name(int proc_rank, orte_jobid_t jobid,
|
||||
orte_std_cntr_t app_context)
|
||||
{
|
||||
char *path = NULL, *user = NULL, *job = NULL;
|
||||
char *path = NULL, *job = NULL;
|
||||
int rc;
|
||||
|
||||
/* ensure that system info is set */
|
||||
orte_sys_info();
|
||||
|
||||
if (NULL == orte_universe_info.name) { /* error condition */
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rc = orte_ns.convert_jobid_to_string(&job, jobid);
|
||||
rc = orte_util_convert_jobid_to_string(&job, jobid);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* get the username set by the bproc pls. We need to get it from here
|
||||
/* get the username set by the bproc plm. We need to get it from here
|
||||
* because on many bproc systems the method we use to get the username
|
||||
* from the system on the backend fails and we only get the uid. */
|
||||
rc = mca_base_param_register_string("pls", "bproc", "username", NULL,
|
||||
orte_system_info.user);
|
||||
mca_base_param_lookup_string(rc,&user);
|
||||
* from the system on the backend fails and we only get the uid
|
||||
*/
|
||||
mca_base_param_reg_string_name("orte", "plm_bproc_username",
|
||||
"Name of the user on the remote node",
|
||||
false, false, NULL, &user);
|
||||
|
||||
if (0 > asprintf(&path, OPAL_PATH_SEP"tmp"OPAL_PATH_SEP"openmpi-bproc-%s"OPAL_PATH_SEP"%s"OPAL_PATH_SEP"%s-%d"OPAL_PATH_SEP"%d",
|
||||
user, orte_universe_info.name,
|
||||
job, (int) app_context, proc_rank)) {
|
||||
if (0 > asprintf(&frontend, OPAL_PATH_SEP"%s"OPAL_PATH_SEP"openmpi-bproc-%s",
|
||||
orte_process_info.tmpdir_base, user)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
path = NULL;
|
||||
}
|
||||
if(0 < mca_odls_bproc_component.debug) {
|
||||
opal_output(0, "odls bproc io setup. Path: %s\n", path);
|
||||
|
||||
if (0 > asprintf(&path, "%s"OPAL_PATH_SEP"%s-%d"OPAL_PATH_SEP"%d",
|
||||
frontend, job, (int) app_context, proc_rank)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
path = NULL;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_odls_globals.output,
|
||||
"odls bproc io setup. Path: %s\n", path));
|
||||
free(user);
|
||||
free(job);
|
||||
return path;
|
||||
@ -199,26 +199,6 @@ odls_bproc_delete_dir_tree(char * path)
|
||||
static int
|
||||
odls_bproc_remove_dir()
|
||||
{
|
||||
char *frontend = NULL, *user = NULL, *filename = NULL;
|
||||
int id;
|
||||
|
||||
/* get the username set by the bproc pls. We need to get it from here
|
||||
* because on many bproc systems the method we use to get the username
|
||||
* from the system on the backend fails and we only get the uid. */
|
||||
id = mca_base_param_register_string("pls", "bproc", "username", NULL,
|
||||
orte_system_info.user);
|
||||
mca_base_param_lookup_string(id,&user);
|
||||
asprintf(&filename, "openmpi-bproc-%s", user );
|
||||
if( NULL == filename ) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
frontend = opal_os_path(false, "tmp", filename, NULL );
|
||||
free(filename); /* Always free the filename */
|
||||
if (NULL == frontend) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* we do our best to clean up the directory tree, but we ignore errors*/
|
||||
odls_bproc_delete_dir_tree(frontend);
|
||||
free(frontend);
|
||||
@ -236,7 +216,7 @@ odls_bproc_remove_dir()
|
||||
*/
|
||||
static void
|
||||
odls_bproc_send_cb(int status, orte_process_name_t * peer,
|
||||
orte_buffer_t* buffer, int tag, void* cbdata)
|
||||
opal_buffer_t* buffer, int tag, void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
@ -430,196 +410,49 @@ cleanup:
|
||||
}
|
||||
|
||||
|
||||
/* this entire function gets called within a GPR compound command,
|
||||
* so the subscription actually doesn't get done until the orted
|
||||
* executes the compound command
|
||||
*/
|
||||
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc)
|
||||
{
|
||||
char *segment;
|
||||
orte_gpr_value_t *values[1];
|
||||
orte_gpr_subscription_t *subs, sub=ORTE_GPR_SUBSCRIPTION_EMPTY;
|
||||
orte_gpr_trigger_t *trigs, trig=ORTE_GPR_TRIGGER_EMPTY;
|
||||
char* keys[] = {
|
||||
ORTE_PROC_NAME_KEY,
|
||||
ORTE_PROC_APP_CONTEXT_KEY,
|
||||
ORTE_NODE_NAME_KEY,
|
||||
};
|
||||
int num_keys = 3;
|
||||
int i, rc;
|
||||
|
||||
/* get the job segment name */
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* attach ourselves to the "standard" orted trigger */
|
||||
if (ORTE_SUCCESS !=
|
||||
(rc = orte_schema.get_std_trigger_name(&(trig.name),
|
||||
ORTED_LAUNCH_STAGE_GATE_TRIGGER, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* ask for return of all data required for launching local processes */
|
||||
subs = &sub;
|
||||
sub.action = ORTE_GPR_NOTIFY_DELETE_AFTER_TRIG;
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&(sub.name),
|
||||
ORTED_LAUNCH_STG_SUB,
|
||||
job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(trig.name);
|
||||
return rc;
|
||||
}
|
||||
sub.cnt = 1;
|
||||
sub.values = values;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[0]), ORTE_GPR_KEYS_OR | ORTE_GPR_TOKENS_OR,
|
||||
segment, num_keys, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
return rc;
|
||||
}
|
||||
for (i=0; i < num_keys; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[0]->keyvals[i]),
|
||||
keys[i], ORTE_UNDEF, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
OBJ_RELEASE(values[0]);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
sub.cbfunc = cbfunc;
|
||||
|
||||
trigs = &trig;
|
||||
|
||||
/* do the subscription */
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.subscribe(1, &subs, 1, &trigs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(segment);
|
||||
free(sub.name);
|
||||
free(trig.name);
|
||||
OBJ_RELEASE(values[0]);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup io for the current node, then tell orterun we are ready for the actual
|
||||
* processes.
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
int
|
||||
orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ)
|
||||
int orte_odls_bproc_launch_local_procs(opal_buffer_t *data)
|
||||
{
|
||||
odls_bproc_child_t *child;
|
||||
orte_odls_child_t *child;
|
||||
opal_list_item_t* item;
|
||||
orte_gpr_value_t *value, **values;
|
||||
orte_gpr_keyval_t *kval;
|
||||
char *node_name;
|
||||
int rc;
|
||||
orte_std_cntr_t i, j, kv, kv2, *sptr;
|
||||
int src = 0;
|
||||
orte_buffer_t *ack;
|
||||
opal_buffer_t *ack;
|
||||
bool connect_stdin;
|
||||
orte_jobid_t jobid;
|
||||
int cycle = 0;
|
||||
|
||||
/* first, retrieve the job number we are to launch from the
|
||||
* returned data - we can extract the jobid directly from the
|
||||
* subscription name we created
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, data->target))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* hack for bproc4, change process group so that we do not receive signals
|
||||
* from the parent/front-end process, as bproc4 does not currently allow the
|
||||
* process to intercept the signal
|
||||
*/
|
||||
setpgid(0,0);
|
||||
|
||||
/* loop through the returned data to find the global info and
|
||||
* the info for processes going onto this node
|
||||
*/
|
||||
values = (orte_gpr_value_t**)(data->values)->addr;
|
||||
for (j=0, i=0; i < data->cnt && j < (data->values)->size; j++) { /* loop through all returned values */
|
||||
if (NULL != values[j]) {
|
||||
i++;
|
||||
value = values[j];
|
||||
/* this must have come from one of the process containers, so it must
|
||||
* contain data for a proc structure - see if it belongs to this node
|
||||
*/
|
||||
for (kv=0; kv < value->cnt; kv++) {
|
||||
kval = value->keyvals[kv];
|
||||
if (strcmp(kval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||
/* Most C-compilers will bark if we try to directly compare the string in the
|
||||
* kval data area against a regular string, so we need to "get" the data
|
||||
* so we can access it */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&node_name, kval->value, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* if this is our node...must also protect against a zero-length string */
|
||||
if (NULL != node_name && 0 == strcmp(node_name, orte_system_info.nodename)) {
|
||||
/* ...harvest the info into a new child structure */
|
||||
child = OBJ_NEW(odls_bproc_child_t);
|
||||
for (kv2 = 0; kv2 < value->cnt; kv2++) {
|
||||
kval = value->keyvals[kv2];
|
||||
if(strcmp(kval->key, ORTE_PROC_NAME_KEY) == 0) {
|
||||
/* copy the name into the child object */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(child->name), kval->value->data, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(strcmp(kval->key, ORTE_PROC_APP_CONTEXT_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&sptr, kval->value, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
child->app_idx = *sptr; /* save the index into the app_context objects */
|
||||
continue;
|
||||
}
|
||||
} /* kv2 */
|
||||
/* protect operation on the global list of children */
|
||||
OPAL_THREAD_LOCK(&mca_odls_bproc_component.mutex);
|
||||
opal_list_append(&mca_odls_bproc_component.children, &child->super);
|
||||
opal_condition_signal(&mca_odls_bproc_component.cond);
|
||||
OPAL_THREAD_UNLOCK(&mca_odls_bproc_component.mutex);
|
||||
|
||||
}
|
||||
}
|
||||
} /* for kv */
|
||||
} /* for j */
|
||||
|
||||
/* construct the list of children we are to launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:bproc:launch:local failed to construct child list on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* set up the io files for our children */
|
||||
for(item = opal_list_get_first(&mca_odls_bproc_component.children);
|
||||
item != opal_list_get_end(&mca_odls_bproc_component.children);
|
||||
|
||||
/* set up the io files for our children */
|
||||
for(item = opal_list_get_first(&orte_odls_globals.children);
|
||||
item != opal_list_get_end(&orte_odls_globals.children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (odls_bproc_child_t *) item;
|
||||
if(0 < mca_odls_bproc_component.debug) {
|
||||
opal_output(0, "orte_odls_bproc_launch: setting up io for "
|
||||
"[%lu,%lu,%lu] proc rank %lu\n",
|
||||
ORTE_NAME_ARGS((child->name)),
|
||||
child->name->vpid);
|
||||
}
|
||||
child = (orte_odls_child_t *) item;
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
"%s odls:bproc:launch:local setting up io for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name)));
|
||||
/* only setup to forward stdin if it is rank 0, otherwise connect
|
||||
* to /dev/null */
|
||||
* to /dev/null
|
||||
*/
|
||||
if(0 == child->name->vpid) {
|
||||
connect_stdin = true;
|
||||
} else {
|
||||
@ -638,7 +471,7 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
|
||||
}
|
||||
|
||||
/* message to indicate that we are ready */
|
||||
ack = OBJ_NEW(orte_buffer_t);
|
||||
ack = OBJ_NEW(opal_buffer_t);
|
||||
rc = orte_dss.pack(ack, &src, 1, ORTE_INT);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -29,84 +29,22 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_odls_bproc_component_open(void);
|
||||
int orte_odls_bproc_component_close(void);
|
||||
int orte_odls_bproc_finalize(void);
|
||||
orte_odls_base_module_t* orte_odls_bproc_init(int *priority);
|
||||
int orte_odls_bproc_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
int orte_odls_bproc_finalize(void);
|
||||
ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_bproc_component;
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_odls_bproc_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc);
|
||||
int orte_odls_bproc_get_add_procs_data(orte_gpr_notify_data_t **data, orte_job_map_t *map);
|
||||
int orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_environ);
|
||||
int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state);
|
||||
int orte_odls_bproc_signal_local_procs(const orte_process_name_t* proc_name, int32_t signal);
|
||||
END_C_DECLS
|
||||
|
||||
/**
|
||||
* ODLS bproc_orted component
|
||||
*/
|
||||
struct orte_odls_bproc_component_t {
|
||||
orte_odls_base_component_t super;
|
||||
/**< The base class */
|
||||
int debug;
|
||||
/**< If greater than 0 print debugging information */
|
||||
int priority;
|
||||
/**< The priority of this component. This will be returned if we determine
|
||||
* that bproc is available and running on this node. */
|
||||
opal_mutex_t lock;
|
||||
/**< Lock used to prevent some race conditions */
|
||||
opal_condition_t cond;
|
||||
/**< Condition used to wake up waiting threads */
|
||||
opal_list_t children;
|
||||
/**< list of children on this node */
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct orte_odls_bproc_component_t orte_odls_bproc_component_t;
|
||||
|
||||
/*
|
||||
* List object to locally store the process names and pids of
|
||||
* our children. This can subsequently be used to order termination
|
||||
* or pass signals without looking the info up again.
|
||||
*/
|
||||
typedef struct odls_bproc_child_t {
|
||||
opal_list_item_t super; /* required to place this on a list */
|
||||
orte_process_name_t *name; /* the OpenRTE name of the proc */
|
||||
pid_t pid; /* local pid of the proc */
|
||||
orte_std_cntr_t app_idx; /* index of the app_context for this proc */
|
||||
bool alive; /* is this proc alive? */
|
||||
} odls_bproc_child_t;
|
||||
OBJ_CLASS_DECLARATION(odls_bproc_child_t);
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_odls_bproc_component_t mca_odls_bproc_component;
|
||||
extern orte_odls_base_module_t orte_odls_bproc_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif /* ORTE_ODLS_BPROC_H_ */
|
||||
|
||||
|
@ -21,30 +21,17 @@
|
||||
* Takes care of the component stuff for the MCA.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "odls_bproc.h"
|
||||
|
||||
/* instance the child list object */
|
||||
static void odls_bproc_child_constructor(odls_bproc_child_t *ptr)
|
||||
{
|
||||
ptr->name = NULL;
|
||||
ptr->app_idx = -1;
|
||||
ptr->alive = false;
|
||||
}
|
||||
static void odls_bproc_child_destructor(odls_bproc_child_t *ptr)
|
||||
{
|
||||
if (NULL != ptr->name) free(ptr->name);
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(odls_bproc_child_t,
|
||||
opal_list_item_t,
|
||||
odls_bproc_child_constructor,
|
||||
odls_bproc_child_destructor);
|
||||
extern orte_odls_base_module_t orte_odls_bproc_module;
|
||||
|
||||
/**
|
||||
* The bproc component data structure used to store all the relevant data
|
||||
@ -65,17 +52,14 @@ orte_odls_bproc_component_t mca_odls_bproc_component = {
|
||||
ORTE_RELEASE_VERSION,
|
||||
/* Component open and close functions */
|
||||
orte_odls_bproc_component_open,
|
||||
orte_odls_bproc_component_close
|
||||
orte_odls_bproc_component_close,
|
||||
orte_odls_bproc_component_query
|
||||
},
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
/* Initialization / querying functions */
|
||||
orte_odls_bproc_init,
|
||||
orte_odls_bproc_finalize
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
@ -84,42 +68,26 @@ orte_odls_bproc_component_t mca_odls_bproc_component = {
|
||||
*/
|
||||
int orte_odls_bproc_component_open(void)
|
||||
{
|
||||
/* initialize globals */
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&mca_odls_bproc_component.children, opal_list_t);
|
||||
|
||||
/* lookup parameters */
|
||||
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
|
||||
"priority", NULL, false, false, 100,
|
||||
&mca_odls_bproc_component.priority);
|
||||
mca_base_param_reg_int(&mca_odls_bproc_component.super.version,
|
||||
"debug", "If > 0 prints library debugging information",
|
||||
false, false, 0, &mca_odls_bproc_component.debug);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the module. We only want to run when we are not the seed,
|
||||
* bproc is running, and we are not on the master node.
|
||||
* Initializes the module.
|
||||
*/
|
||||
orte_odls_base_module_t *orte_odls_bproc_init(int *priority)
|
||||
int orte_odls_bproc_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
int ret;
|
||||
struct bproc_version_t version;
|
||||
|
||||
/* the base open/select logic protects us against operation when
|
||||
* we are NOT in a daemon, so we don't have to check that here
|
||||
*/
|
||||
|
||||
/* check to see if BProc is running here */
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_odls_bproc_component.priority;
|
||||
return &orte_odls_bproc_module;
|
||||
*priority = 30;
|
||||
*module = (mca_base_module_t *)&orte_odls_bproc_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -127,8 +95,10 @@ orte_odls_base_module_t *orte_odls_bproc_init(int *priority)
|
||||
*/
|
||||
int orte_odls_bproc_component_close(void)
|
||||
{
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.lock);
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.cond);
|
||||
OBJ_DESTRUCT(&mca_odls_bproc_component.children);
|
||||
/* cleanup state */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_odls_globals.children))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -19,8 +19,8 @@
|
||||
* @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_ODLS_H
|
||||
#define ORTE_ODLS_H
|
||||
#ifndef ORTE_ODLS_DEFAULT_H
|
||||
#define ORTE_ODLS_DEFAULT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
|
@ -32,9 +32,9 @@ endif
|
||||
|
||||
sources = \
|
||||
plm_bproc.h \
|
||||
plm_bproc_component.c \
|
||||
plm_bproc.c \
|
||||
plm_bproc_state.c \
|
||||
plm_bproc_component.c
|
||||
plm_bproc_state.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
|
@ -21,10 +21,11 @@
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Part of the bproc launcher. See pls_bproc.h for an overview of how it works.
|
||||
* Part of the bproc launcher. See plm_bproc.h for an overview of how it works.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#if HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
@ -60,72 +61,60 @@
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/sds/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/schema/schema_types.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
#include "pls_bproc.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "plm_bproc.h"
|
||||
|
||||
static bool daemons_launched;
|
||||
static bool bynode;
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
||||
int orte_pls_bproc_launch_threaded(orte_jobid_t);
|
||||
#endif
|
||||
static int plm_tm_init(void);
|
||||
static int plm_tm_launch_job(orte_job_t *jdata);
|
||||
static int plm_tm_terminate_job(orte_jobid_t jobid);
|
||||
static int plm_tm_terminate_orteds(void);
|
||||
static int plm_tm_signal_job(orte_jobid_t jobid, int32_t signal);
|
||||
static int plm_tm_finalize(void);
|
||||
|
||||
static int plm_tm_connect(void);
|
||||
static int plm_tm_disconnect(void);
|
||||
|
||||
/**
|
||||
* Initialization of the bproc module with all the needed function pointers
|
||||
*/
|
||||
orte_pls_base_module_t orte_pls_bproc_module = {
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
||||
orte_pls_bproc_launch_threaded,
|
||||
#else
|
||||
orte_pls_bproc_launch,
|
||||
#endif
|
||||
orte_pls_bproc_terminate_job,
|
||||
orte_pls_bproc_terminate_orteds,
|
||||
orte_pls_bproc_terminate_proc,
|
||||
orte_pls_bproc_signal_job,
|
||||
orte_pls_bproc_signal_proc,
|
||||
orte_pls_bproc_cancel_operation,
|
||||
orte_pls_bproc_finalize
|
||||
orte_plm_base_module_t orte_plm_tm_module = {
|
||||
plm_bproc_init,
|
||||
orte_plm_base_set_hnp_name,
|
||||
plm_bproc_launch_job,
|
||||
NULL,
|
||||
plm_bproc_terminate_job,
|
||||
plm_bproc_terminate_orteds,
|
||||
plm_bproc_signal_job,
|
||||
plm_bproc_finalize
|
||||
};
|
||||
|
||||
|
||||
static int orte_pls_bproc_node_list(orte_job_map_t *map,
|
||||
int *node_array, int * num_nodes,
|
||||
int num_procs);
|
||||
static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
||||
int node_rank, int app_context);
|
||||
static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data);
|
||||
static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data);
|
||||
#ifdef MCA_pls_bproc_scyld
|
||||
/* compatibility functions for scyld bproc and pre 3.2.0 LANL bproc */
|
||||
static int bproc_vexecmove_io(int nnodes, int *nodes, int *pids,
|
||||
struct bproc_io_t *io, int iolen, const char *cmd,
|
||||
char * const argv[], char * envp[]);
|
||||
static int bproc_vexecmove(int nnodes, int *nodes, int *pids, const char *cmd,
|
||||
char * const argv[], char * envp[]);
|
||||
#endif
|
||||
static void orte_pls_bproc_setup_env(char *** env);
|
||||
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp);
|
||||
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
orte_vpid_t vpid_start, int app_context);
|
||||
/**
|
||||
* Init the module
|
||||
*/
|
||||
static int plm_bproc_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a list of nodes from a job map that should participate in the next launch cycle.
|
||||
@ -134,10 +123,11 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
* @param num_nodes a pointer to the place where we will store the number of nodes in the array
|
||||
* @param num_procs the number of processes that a node must have to be placed on the list
|
||||
*/
|
||||
static int orte_pls_bproc_node_list(orte_job_map_t *map, int *node_array, int *num_nodes, int num_procs)
|
||||
static int bproc_node_list(orte_job_map_t *map, int *node_array, int *num_nodes, int num_procs)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_mapped_node_t *node;
|
||||
orte_node_t **nodes;
|
||||
orte_std_cntr_t i;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -146,15 +136,13 @@ static int orte_pls_bproc_node_list(orte_job_map_t *map, int *node_array, int *n
|
||||
memset((void*)node_array, -1, sizeof(int) * map->num_nodes);
|
||||
|
||||
/* build the node list */
|
||||
for(item = opal_list_get_first(&map->nodes);
|
||||
item != opal_list_get_end(&map->nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_mapped_node_t*)item;
|
||||
|
||||
if (node->num_procs >= num_procs) {
|
||||
node_array[(*num_nodes)++] = atoi(node->nodename);
|
||||
nodes = (orte_node_t**)map->nodes->addr;
|
||||
for (i=0; i < map->num_nodes; i++) {
|
||||
if (nodes[i]->num_procs >= num_procs) {
|
||||
node_array[(*num_nodes)++] = atoi(nodes[i]->name);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -168,21 +156,12 @@ static int orte_pls_bproc_node_list(orte_job_map_t *map, int *node_array, int *n
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
||||
int node_rank, int app_context) {
|
||||
static int bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
||||
int node_rank, int app_context) {
|
||||
char *frontend = NULL, *path = NULL, *job = NULL;
|
||||
int rc, i;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* ensure that system info is set */
|
||||
orte_sys_info();
|
||||
if (NULL == orte_system_info.user) { /* error condition */
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
if (NULL == orte_universe_info.name) { /* error condition */
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
rc = orte_ns.convert_jobid_to_string(&job, jobid);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
@ -204,7 +183,7 @@ static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
if (mca_pls_bproc_component.debug) {
|
||||
if (mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "mpirun bproc io setup. Path: %s\n", path);
|
||||
}
|
||||
io[i].fd = i;
|
||||
@ -245,7 +224,7 @@ static int orte_pls_bproc_setup_io(orte_jobid_t jobid, struct bproc_io_t * io,
|
||||
* @param status tells why the process died
|
||||
* @param data a pointer to the process's name
|
||||
*/
|
||||
static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
||||
static void orte_plm_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
||||
orte_process_name_t * proc = (orte_process_name_t*) data;
|
||||
int rc;
|
||||
|
||||
@ -270,7 +249,7 @@ static void orte_pls_bproc_waitpid_cb(pid_t wpid, int status, void *data) {
|
||||
* @param status tells why the daemon died
|
||||
* @param data a pointer to the node the daemon was on
|
||||
*/
|
||||
static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data) {
|
||||
static void orte_plm_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data) {
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -295,19 +274,19 @@ static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_LOCK(&mca_pls_bproc_component.lock);
|
||||
if(0 < mca_pls_bproc_component.num_daemons) {
|
||||
mca_pls_bproc_component.num_daemons--;
|
||||
OPAL_THREAD_LOCK(&mca_plm_bproc_component.lock);
|
||||
if(0 < mca_plm_bproc_component.num_daemons) {
|
||||
mca_plm_bproc_component.num_daemons--;
|
||||
}
|
||||
opal_condition_signal(&mca_pls_bproc_component.condition);
|
||||
OPAL_THREAD_UNLOCK(&mca_pls_bproc_component.lock);
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "in orte_pls_bproc_waitpid_daemon_cb, %d daemons left\n",
|
||||
mca_pls_bproc_component.num_daemons);
|
||||
opal_condition_signal(&mca_plm_bproc_component.condition);
|
||||
OPAL_THREAD_UNLOCK(&mca_plm_bproc_component.lock);
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "in orte_plm_bproc_waitpid_daemon_cb, %d daemons left\n",
|
||||
mca_plm_bproc_component.num_daemons);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MCA_pls_bproc_scyld
|
||||
#ifdef MCA_plm_bproc_scyld
|
||||
/**
|
||||
* compatibility function for scyld bproc and pre 3.2.0 LANL bproc. See the
|
||||
* bproc documentation for details
|
||||
@ -331,12 +310,12 @@ static int bproc_vexecmove_io(int nnodes, int *nodes, int *pids,
|
||||
opal_setenv("BPROC_RANK", rank, true, &envp);
|
||||
bproc_execmove_io(nodes[i], io, iolen, cmd, argv, envp);
|
||||
/* if we get here, there was an error */
|
||||
opal_show_help("help-pls-bproc.txt", "bproc-vexecmove-launch", true,
|
||||
opal_show_help("help-plm-bproc.txt", "bproc-vexecmove-launch", true,
|
||||
cmd, nodes[i], errno);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
exit(-1);
|
||||
} else if(-1 == pids[i]) {
|
||||
opal_show_help("help-pls-bproc.txt", "bproc-vexecmove-fork", true,
|
||||
opal_show_help("help-plm-bproc.txt", "bproc-vexecmove-fork", true,
|
||||
errno);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
return -1;
|
||||
@ -359,7 +338,7 @@ static int bproc_vexecmove(int nnodes, int *nodes, int *pids, const char *cmd,
|
||||
* Sets up the passed environment for processes launched by the bproc launcher.
|
||||
* @param env a pointer to the environment to setup
|
||||
*/
|
||||
static void orte_pls_bproc_setup_env(char *** env)
|
||||
static void orte_plm_bproc_setup_env(char *** env)
|
||||
{
|
||||
char ** merged;
|
||||
char * var;
|
||||
@ -386,7 +365,7 @@ static void orte_pls_bproc_setup_env(char *** env)
|
||||
|
||||
/* make sure the username used to create the bproc directory is the same on
|
||||
* the backend as the frontend */
|
||||
var = mca_base_param_environ_variable("pls","bproc","username");
|
||||
var = mca_base_param_environ_variable("plm","bproc","username");
|
||||
opal_setenv(var, orte_system_info.user, true, env);
|
||||
free(var);
|
||||
|
||||
@ -435,7 +414,7 @@ static void orte_pls_bproc_setup_env(char *** env)
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
static int orte_plm_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
int * daemon_list = NULL;
|
||||
int num_daemons = 0;
|
||||
int rc, i;
|
||||
@ -450,15 +429,15 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
orte_std_cntr_t idx;
|
||||
struct stat buf;
|
||||
opal_list_t daemons;
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
orte_plm_daemon_info_t *dmn;
|
||||
opal_list_item_t *item;
|
||||
struct timeval joblaunchstart, launchstart, launchstop;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (orte_pls_base.timing) {
|
||||
if (orte_plm_base.timing) {
|
||||
if (0 != gettimeofday(&joblaunchstart, NULL)) {
|
||||
opal_output(0, "pls_bproc: could not obtain start time");
|
||||
opal_output(0, "plm_bproc: could not obtain start time");
|
||||
}
|
||||
}
|
||||
|
||||
@ -514,7 +493,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
}
|
||||
|
||||
/* setup the daemon environment */
|
||||
orte_pls_bproc_setup_env(envp);
|
||||
orte_plm_bproc_setup_env(envp);
|
||||
|
||||
/* direct the daemons to drop contact files so the local procs
|
||||
* can learn how to contact them - this is used for routing
|
||||
@ -528,7 +507,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
* push that value into their environment */
|
||||
stride = 1;
|
||||
asprintf(&param, "%ld", (long)stride);
|
||||
var = mca_base_param_environ_variable("pls", "bproc", "stride");
|
||||
var = mca_base_param_environ_variable("plm", "bproc", "stride");
|
||||
opal_setenv(var, param, true, envp);
|
||||
free(param);
|
||||
free(var);
|
||||
@ -542,10 +521,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
}
|
||||
|
||||
argc = 0;
|
||||
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
|
||||
opal_argv_append(&argc, &argv, mca_plm_bproc_component.orted);
|
||||
/* check for debug flags */
|
||||
#if 0
|
||||
if (mca_pls_bproc_component.debug) {
|
||||
if (mca_plm_bproc_component.debug) {
|
||||
opal_argv_append(&argc, &argv, "--debug");
|
||||
opal_argv_append(&argc, &argv, "--debug-daemons");
|
||||
}
|
||||
@ -567,20 +546,20 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
opal_argv_append(&argc, &argv, "--no-daemonize");
|
||||
|
||||
/* find orted */
|
||||
if(0 == stat(mca_pls_bproc_component.orted, &buf)) {
|
||||
orted_path = strdup(mca_pls_bproc_component.orted);
|
||||
if(0 == stat(mca_plm_bproc_component.orted, &buf)) {
|
||||
orted_path = strdup(mca_plm_bproc_component.orted);
|
||||
} else {
|
||||
orted_path = opal_path_findv(mca_pls_bproc_component.orted, 0, environ, NULL);
|
||||
orted_path = opal_path_findv(mca_plm_bproc_component.orted, 0, environ, NULL);
|
||||
if(NULL == orted_path) {
|
||||
orted_path = opal_os_path( false, opal_install_dirs.bindir, mca_pls_bproc_component.orted, NULL );
|
||||
orted_path = opal_os_path( false, opal_install_dirs.bindir, mca_plm_bproc_component.orted, NULL );
|
||||
if( (NULL != orted_path) || (0 != stat(orted_path, &buf)) ) {
|
||||
char *path = getenv("PATH");
|
||||
if (NULL == path) {
|
||||
path = ("PATH is empty!");
|
||||
}
|
||||
opal_show_help("help-pls-bproc.txt", "no-orted", true,
|
||||
mca_pls_bproc_component.orted,
|
||||
mca_pls_bproc_component.orted, path, opal_install_dirs.bindir);
|
||||
opal_show_help("help-plm-bproc.txt", "no-orted", true,
|
||||
mca_plm_bproc_component.orted,
|
||||
mca_plm_bproc_component.orted, path, opal_install_dirs.bindir);
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -588,50 +567,50 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
}
|
||||
}
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "PLS_BPROC DEBUG: launching %d daemons. cmd: %s ",
|
||||
num_daemons, orted_path);
|
||||
}
|
||||
|
||||
/* launch the daemons */
|
||||
if (orte_pls_base.timing) {
|
||||
if (orte_plm_base.timing) {
|
||||
if (0 != gettimeofday(&launchstart, NULL)) {
|
||||
opal_output(0, "pls_bproc: could not obtain start time");
|
||||
opal_output(0, "plm_bproc: could not obtain start time");
|
||||
}
|
||||
}
|
||||
|
||||
if (mca_pls_bproc_component.do_not_launch) {
|
||||
if (mca_plm_bproc_component.do_not_launch) {
|
||||
for (i=0; i < num_daemons; i++) pids[i] = i+1;
|
||||
rc = num_daemons;
|
||||
} else {
|
||||
rc = bproc_vexecmove(num_daemons, daemon_list, pids, orted_path, argv, *envp);
|
||||
}
|
||||
|
||||
if (orte_pls_base.timing) {
|
||||
if (orte_plm_base.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "pls_bproc: could not obtain stop time");
|
||||
opal_output(0, "plm_bproc: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "pls_bproc: daemon launch time is %ld usec",
|
||||
opal_output(0, "plm_bproc: daemon launch time is %ld usec",
|
||||
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - launchstart.tv_usec));
|
||||
}
|
||||
}
|
||||
|
||||
if(rc != num_daemons) {
|
||||
opal_show_help("help-pls-bproc.txt", "daemon-launch-number", true,
|
||||
opal_show_help("help-plm-bproc.txt", "daemon-launch-number", true,
|
||||
num_daemons, rc, orted_path);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "PLS_BPROC DEBUG: %d daemons launched. First pid: %d\n",
|
||||
rc, *pids);
|
||||
}
|
||||
|
||||
for(i = 0; i < num_daemons; i++) {
|
||||
if(0 >= pids[i]) {
|
||||
opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,
|
||||
opal_show_help("help-plm-bproc.txt", "daemon-launch-bad-pid", true,
|
||||
daemon_list[i], pids[i], errno, orted_path);
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -642,13 +621,13 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);
|
||||
rc = orte_plm_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
dmn = OBJ_NEW(orte_pls_daemon_info_t);
|
||||
dmn = OBJ_NEW(orte_plm_daemon_info_t);
|
||||
rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,
|
||||
daemon_vpid_start + i);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
@ -665,7 +644,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
}
|
||||
|
||||
/* store the daemon info */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_store_active_daemons(&daemons))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -673,9 +652,9 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
* daemon info so that short-lived apps don't cause mpirun to
|
||||
* try and terminate the orteds before we record them
|
||||
*/
|
||||
if (!mca_pls_bproc_component.do_not_launch) {
|
||||
if (!mca_plm_bproc_component.do_not_launch) {
|
||||
for (i=0; i < num_daemons; i++) {
|
||||
rc = orte_wait_cb(pids[i], orte_pls_bproc_waitpid_daemon_cb,
|
||||
rc = orte_wait_cb(pids[i], orte_plm_bproc_waitpid_daemon_cb,
|
||||
&daemon_list[i]);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -705,17 +684,17 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
|
||||
if(-1 == src[0]) {
|
||||
/* one of the daemons has failed to properly launch. The error is sent
|
||||
* by orte_pls_bproc_waitpid_daemon_cb */
|
||||
* by orte_plm_bproc_waitpid_daemon_cb */
|
||||
if(-1 == src[1]) { /* did not die on a signal */
|
||||
opal_show_help("help-pls-bproc.txt", "daemon-died-no-signal", true,
|
||||
opal_show_help("help-plm-bproc.txt", "daemon-died-no-signal", true,
|
||||
src[2], src[3]);
|
||||
} else { /* died on a signal */
|
||||
opal_show_help("help-pls-bproc.txt", "daemon-died-signal", true,
|
||||
opal_show_help("help-plm-bproc.txt", "daemon-died-signal", true,
|
||||
src[2], src[3], src[1]);
|
||||
}
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
|
||||
orte_plm_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
@ -724,11 +703,11 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
/* indicate that the daemons have now launched */
|
||||
daemons_launched = true;
|
||||
|
||||
if (orte_pls_base.timing) {
|
||||
if (orte_plm_base.timing) {
|
||||
if (0 != gettimeofday(&launchstop, NULL)) {
|
||||
opal_output(0, "pls_bproc: could not obtain stop time");
|
||||
opal_output(0, "plm_bproc: could not obtain stop time");
|
||||
} else {
|
||||
opal_output(0, "pls_bproc: total job launch time is %ld usec",
|
||||
opal_output(0, "plm_bproc: total job launch time is %ld usec",
|
||||
(launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
|
||||
(launchstop.tv_usec - joblaunchstart.tv_usec));
|
||||
}
|
||||
@ -754,7 +733,7 @@ cleanup:
|
||||
|
||||
|
||||
static int
|
||||
orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
orte_plm_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
{
|
||||
orte_jobid_t job;
|
||||
|
||||
@ -778,10 +757,10 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
orte_schema.extract_jobid_from_std_trigger_name(&job, msg->target);
|
||||
|
||||
/* terminate all jobs in the job family */
|
||||
orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL);
|
||||
orte_plm_bproc_terminate_job(job, &orte_abort_timeout, NULL);
|
||||
|
||||
/* kill the daemons */
|
||||
orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL);
|
||||
orte_plm_bproc_terminate_job(0, &orte_abort_timeout, NULL);
|
||||
|
||||
/* shouldn't ever get here.. */
|
||||
exit(1);
|
||||
@ -803,7 +782,7 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
static int orte_plm_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
orte_vpid_t vpid_start, int app_context) {
|
||||
int *node_array, num_nodes, cycle;
|
||||
int rc, i, j, stride;
|
||||
@ -822,7 +801,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
|
||||
/* set up app context */
|
||||
asprintf(&param, "%d", app_context);
|
||||
var = mca_base_param_environ_variable("pls", "bproc", "app_context");
|
||||
var = mca_base_param_environ_variable("plm", "bproc", "app_context");
|
||||
opal_setenv(var, param, true, &env);
|
||||
free(param);
|
||||
free(var);
|
||||
@ -850,7 +829,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
}
|
||||
/* and push that value into the process' environment */
|
||||
asprintf(&param, "%ld", (long)stride);
|
||||
var = mca_base_param_environ_variable("pls", "bproc", "stride");
|
||||
var = mca_base_param_environ_variable("plm", "bproc", "stride");
|
||||
opal_setenv(var, param, true, &env);
|
||||
free(param);
|
||||
free(var);
|
||||
@ -883,20 +862,20 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
i = 1;
|
||||
num_processes = map->vpid_range;
|
||||
|
||||
rc = orte_pls_bproc_node_list(map, node_array, &num_nodes, i);
|
||||
rc = orte_plm_bproc_node_list(map, node_array, &num_nodes, i);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
opal_output_verbose(1, orte_pls_base.pls_output,
|
||||
opal_output_verbose(1, orte_plm_base.plm_output,
|
||||
"launching app %s", map->apps[app_context]->app);
|
||||
|
||||
while(0 != num_nodes) {
|
||||
if (0 < mca_pls_bproc_component.debug) {
|
||||
opal_output_verbose(1, orte_pls_base.pls_output,
|
||||
if (0 < mca_plm_bproc_component.debug) {
|
||||
opal_output_verbose(1, orte_plm_base.plm_output,
|
||||
"\tlaunching cycle %d", i);
|
||||
for (dbg=0; dbg<num_nodes; dbg++) {
|
||||
opal_output_verbose(1, orte_pls_base.pls_output,
|
||||
opal_output_verbose(1, orte_plm_base.plm_output,
|
||||
"\t\tlaunching on node %d", node_array[dbg]);
|
||||
}
|
||||
}
|
||||
@ -909,13 +888,13 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = orte_pls_bproc_setup_io(map->job, bproc_io, i - 1, app_context);
|
||||
rc = orte_plm_bproc_setup_io(map->job, bproc_io, i - 1, app_context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "pls_bproc: launching %d processes:", num_nodes);
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "plm_bproc: launching %d processes:", num_nodes);
|
||||
}
|
||||
|
||||
/* allocate space for bproc to return the pids */
|
||||
@ -926,7 +905,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (mca_pls_bproc_component.do_not_launch) {
|
||||
if (mca_plm_bproc_component.do_not_launch) {
|
||||
for (j=0; j < num_nodes; j++) pids[j] = j+1;
|
||||
rc = num_nodes;
|
||||
} else {
|
||||
@ -935,12 +914,12 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
map->apps[app_context]->argv, env);
|
||||
}
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "pls_bproc: %d processes launched. First pid: %d",
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "plm_bproc: %d processes launched. First pid: %d",
|
||||
rc, *pids);
|
||||
}
|
||||
if(rc != num_nodes) {
|
||||
opal_show_help("help-pls-bproc.txt", "proc-launch-number", true,
|
||||
opal_show_help("help-plm-bproc.txt", "proc-launch-number", true,
|
||||
num_nodes, rc, map->apps[app_context]->app);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
@ -948,7 +927,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
|
||||
for(j = 0; j < num_nodes; j++) {
|
||||
if(0 >= pids[j]) {
|
||||
opal_show_help("help-pls-bproc.txt", "proc-launch-bad-pid", true,
|
||||
opal_show_help("help-plm-bproc.txt", "proc-launch-bad-pid", true,
|
||||
node_array[j], pids[j], errno, map->apps[app_context]->app);
|
||||
rc = ORTE_ERROR;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -960,13 +939,13 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
orte_pls_bproc_set_proc_pid(proc_name, pids[j], node_array[j]);
|
||||
orte_plm_bproc_set_proc_pid(proc_name, pids[j], node_array[j]);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
if (!mca_pls_bproc_component.do_not_launch) {
|
||||
rc = orte_wait_cb(pids[j], orte_pls_bproc_waitpid_cb, proc_name);
|
||||
if (!mca_plm_bproc_component.do_not_launch) {
|
||||
rc = orte_wait_cb(pids[j], orte_plm_bproc_waitpid_cb, proc_name);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -998,7 +977,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
}
|
||||
}
|
||||
|
||||
rc = orte_pls_bproc_node_list(map, node_array, &num_nodes, i);
|
||||
rc = orte_plm_bproc_node_list(map, node_array, &num_nodes, i);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -1017,7 +996,7 @@ cleanup:
|
||||
}
|
||||
|
||||
/**
|
||||
* The main bproc launcher. See pls_bproc.h for a high level overview of how
|
||||
* The main bproc launcher. See plm_bproc.h for a high level overview of how
|
||||
* the bproc launching works.
|
||||
* Here we:
|
||||
* -# Launch the daemons on the backend nodes.
|
||||
@ -1029,7 +1008,7 @@ cleanup:
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
int orte_plm_bproc_launch(orte_jobid_t jobid) {
|
||||
orte_job_map_t* map;
|
||||
orte_mapped_node_t *map_node;
|
||||
orte_vpid_t vpid_launch;
|
||||
@ -1044,8 +1023,8 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* make sure the pls_bproc receive function has been started */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_bproc_comm_start())) {
|
||||
/* make sure the plm_bproc receive function has been started */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_bproc_comm_start())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -1126,7 +1105,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
while (NULL != (ras_node = (orte_ras_node_t*)opal_list_remove_first(&nodelist))) {
|
||||
if (num_slots != ras_node->node_slots) {
|
||||
/* mismatch - error out */
|
||||
opal_show_help("help-pls-bproc.txt", "mismatched-slots", true);
|
||||
opal_show_help("help-plm-bproc.txt", "mismatched-slots", true);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
||||
rc = ORTE_ERR_NOT_SUPPORTED;
|
||||
goto cleanup;
|
||||
@ -1136,8 +1115,8 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
OBJ_DESTRUCT(&nodelist);
|
||||
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "pls_bproc: --- starting to launch procs ---");
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "plm_bproc: --- starting to launch procs ---");
|
||||
}
|
||||
|
||||
/* save the daemon environment */
|
||||
@ -1145,19 +1124,19 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
|
||||
/* for each application context, setup its env */
|
||||
for(i=0; i < map->num_apps; i++) {
|
||||
orte_pls_bproc_setup_env(&map->apps[i]->env);
|
||||
orte_plm_bproc_setup_env(&map->apps[i]->env);
|
||||
}
|
||||
|
||||
/* tell the smr which nodes to monitor so we can be notified
|
||||
when the node's state changes, useful for aborting when
|
||||
a bproc node up and dies */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.begin_monitoring(map, orte_pls_bproc_node_failed, NULL))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.begin_monitoring(map, orte_plm_bproc_node_failed, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* launch the daemons on all nodes which have processes assigned to them */
|
||||
rc = orte_pls_bproc_launch_daemons(map, &daemon_env);
|
||||
rc = orte_plm_bproc_launch_daemons(map, &daemon_env);
|
||||
opal_argv_free(daemon_env);
|
||||
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
@ -1174,7 +1153,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = orte_pls_bproc_launch_app(map, num_slots, vpid_launch, context);
|
||||
rc = orte_plm_bproc_launch_app(map, num_slots, vpid_launch, context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -1187,7 +1166,7 @@ cleanup:
|
||||
|
||||
OBJ_RELEASE(map);
|
||||
|
||||
if (mca_pls_bproc_component.do_not_launch) {
|
||||
if (mca_plm_bproc_component.do_not_launch) {
|
||||
/* indicate that we failed to launch, but do so silently */
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
@ -1197,25 +1176,25 @@ cleanup:
|
||||
|
||||
/**
|
||||
* Terminate all processes associated with this job */
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) {
|
||||
int orte_plm_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs) {
|
||||
pid_t* pids;
|
||||
orte_std_cntr_t i, num_pids;
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc: terminating job %ld", jobid);
|
||||
if(0 < mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "orte_plm_bproc: terminating job %ld", jobid);
|
||||
}
|
||||
|
||||
/* kill application process */
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
||||
if(ORTE_SUCCESS != (rc = orte_plm_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
||||
return rc;
|
||||
for(i=0; i<num_pids; i++) {
|
||||
if(mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc: killing proc: %d\n", pids[i]);
|
||||
if(mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "orte_plm_bproc: killing proc: %d\n", pids[i]);
|
||||
}
|
||||
kill(pids[i], mca_pls_bproc_component.terminate_sig);
|
||||
kill(pids[i], mca_plm_bproc_component.terminate_sig);
|
||||
}
|
||||
if(NULL != pids)
|
||||
free(pids);
|
||||
@ -1227,7 +1206,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, op
|
||||
/**
|
||||
* Terminate the orteds for a given job
|
||||
*/
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
int orte_plm_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t daemons;
|
||||
@ -1237,13 +1216,13 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout,
|
||||
|
||||
/* construct the list of active daemons on this job */
|
||||
OBJ_CONSTRUCT(&daemons, opal_list_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_get_active_daemons(&daemons, jobid, attrs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* now tell them to die! */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(&daemons, timeout))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(&daemons, timeout))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -1258,15 +1237,15 @@ CLEANUP:
|
||||
/**
|
||||
* Terminate a specific process.
|
||||
*/
|
||||
int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
|
||||
int orte_plm_bproc_terminate_proc(const orte_process_name_t* proc_name) {
|
||||
int rc;
|
||||
pid_t pid;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pid(proc_name, &pid)))
|
||||
if(ORTE_SUCCESS != (rc = orte_plm_bproc_get_proc_pid(proc_name, &pid)))
|
||||
return rc;
|
||||
if(kill(pid, mca_pls_bproc_component.terminate_sig) != 0) {
|
||||
if(kill(pid, mca_plm_bproc_component.terminate_sig) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
@ -1284,7 +1263,7 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
|
||||
/**
|
||||
* Signal all processes associated with this job
|
||||
*/
|
||||
int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) {
|
||||
int orte_plm_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) {
|
||||
pid_t* pids;
|
||||
orte_std_cntr_t i, num_pids;
|
||||
int rc;
|
||||
@ -1292,11 +1271,11 @@ int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *a
|
||||
OPAL_TRACE(1);
|
||||
|
||||
/* signal application process */
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
||||
if(ORTE_SUCCESS != (rc = orte_plm_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
|
||||
return rc;
|
||||
for(i=0; i<num_pids; i++) {
|
||||
if(mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "orte_pls_bproc: signaling proc: %d\n", pids[i]);
|
||||
if(mca_plm_bproc_component.debug) {
|
||||
opal_output(0, "orte_plm_bproc: signaling proc: %d\n", pids[i]);
|
||||
}
|
||||
kill(pids[i], (int)signal);
|
||||
}
|
||||
@ -1310,13 +1289,13 @@ int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *a
|
||||
/**
|
||||
* Signal a specific process.
|
||||
*/
|
||||
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t signal) {
|
||||
int orte_plm_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t signal) {
|
||||
int rc;
|
||||
pid_t pid;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pid(proc_name, &pid)))
|
||||
if(ORTE_SUCCESS != (rc = orte_plm_bproc_get_proc_pid(proc_name, &pid)))
|
||||
return rc;
|
||||
if(kill(pid, (int)signal) != 0) {
|
||||
switch(errno) {
|
||||
@ -1336,13 +1315,13 @@ int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t sig
|
||||
/**
|
||||
* Cancel an operation involving comm to an orted
|
||||
*/
|
||||
int orte_pls_bproc_cancel_operation(void)
|
||||
int orte_plm_bproc_cancel_operation(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_cancel_operation())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_cancel_operation())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
@ -1353,7 +1332,7 @@ int orte_pls_bproc_cancel_operation(void)
|
||||
/**
|
||||
* Module cleanup
|
||||
*/
|
||||
int orte_pls_bproc_finalize(void)
|
||||
int orte_plm_bproc_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -1365,16 +1344,16 @@ int orte_pls_bproc_finalize(void)
|
||||
|
||||
#if OMPI_HAVE_POSIX_THREADS && OMPI_THREADS_HAVE_DIFFERENT_PIDS
|
||||
|
||||
struct orte_pls_bproc_stack_t {
|
||||
struct orte_plm_bproc_stack_t {
|
||||
opal_condition_t cond;
|
||||
opal_mutex_t mutex;
|
||||
bool complete;
|
||||
orte_jobid_t jobid;
|
||||
int rc;
|
||||
};
|
||||
typedef struct orte_pls_bproc_stack_t orte_pls_bproc_stack_t;
|
||||
typedef struct orte_plm_bproc_stack_t orte_plm_bproc_stack_t;
|
||||
|
||||
static void orte_pls_bproc_stack_construct(orte_pls_bproc_stack_t* stack)
|
||||
static void orte_plm_bproc_stack_construct(orte_plm_bproc_stack_t* stack)
|
||||
{
|
||||
OBJ_CONSTRUCT(&stack->mutex, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&stack->cond, opal_condition_t);
|
||||
@ -1382,40 +1361,40 @@ static void orte_pls_bproc_stack_construct(orte_pls_bproc_stack_t* stack)
|
||||
stack->complete = false;
|
||||
}
|
||||
|
||||
static void orte_pls_bproc_stack_destruct(orte_pls_bproc_stack_t* stack)
|
||||
static void orte_plm_bproc_stack_destruct(orte_plm_bproc_stack_t* stack)
|
||||
{
|
||||
OBJ_DESTRUCT(&stack->mutex);
|
||||
OBJ_DESTRUCT(&stack->cond);
|
||||
}
|
||||
|
||||
static OBJ_CLASS_INSTANCE(
|
||||
orte_pls_bproc_stack_t,
|
||||
orte_plm_bproc_stack_t,
|
||||
opal_object_t,
|
||||
orte_pls_bproc_stack_construct,
|
||||
orte_pls_bproc_stack_destruct);
|
||||
orte_plm_bproc_stack_construct,
|
||||
orte_plm_bproc_stack_destruct);
|
||||
|
||||
|
||||
static void orte_pls_bproc_launch_cb(int fd, short event, void* args)
|
||||
static void orte_plm_bproc_launch_cb(int fd, short event, void* args)
|
||||
{
|
||||
|
||||
orte_pls_bproc_stack_t *stack = (orte_pls_bproc_stack_t*)args;
|
||||
stack->rc = orte_pls_bproc_launch(stack->jobid);
|
||||
orte_plm_bproc_stack_t *stack = (orte_plm_bproc_stack_t*)args;
|
||||
stack->rc = orte_plm_bproc_launch(stack->jobid);
|
||||
OPAL_THREAD_LOCK(&stack->mutex);
|
||||
stack->complete = true;
|
||||
opal_condition_signal(&stack->cond);
|
||||
OPAL_THREAD_UNLOCK(&stack->mutex);
|
||||
}
|
||||
|
||||
int orte_pls_bproc_launch_threaded(orte_jobid_t jobid)
|
||||
int orte_plm_bproc_launch_threaded(orte_jobid_t jobid)
|
||||
{
|
||||
struct timeval tv = { 0, 0 };
|
||||
struct opal_event event;
|
||||
struct orte_pls_bproc_stack_t stack;
|
||||
struct orte_plm_bproc_stack_t stack;
|
||||
|
||||
OBJ_CONSTRUCT(&stack, orte_pls_bproc_stack_t);
|
||||
OBJ_CONSTRUCT(&stack, orte_plm_bproc_stack_t);
|
||||
|
||||
stack.jobid = jobid;
|
||||
opal_evtimer_set(&event, orte_pls_bproc_launch_cb, &stack);
|
||||
opal_evtimer_set(&event, orte_plm_bproc_launch_cb, &stack);
|
||||
opal_evtimer_add(&event, &tv);
|
||||
|
||||
OPAL_THREAD_LOCK(&stack.mutex);
|
||||
|
@ -20,131 +20,37 @@
|
||||
*/
|
||||
/**
|
||||
* @file:
|
||||
* Header file for the bproc launcher. This launcher is actually split into 2
|
||||
* modules: pls_bproc & pls_bproc_orted. The general idea behind this launcher is:
|
||||
* -# pls_bproc is called by orterun. It figures out the process mapping and
|
||||
* launches orted's on the nodes
|
||||
* -# pls_bproc_orted is called by orted. This module initializes either a pty or
|
||||
* pipes, places symlinks to them in well-known points of the filesystem, and
|
||||
* sets up the io forwarding. It then sends an ack back to orterun.
|
||||
* -# pls_bproc waits for an ack to come back from the orteds, then does several
|
||||
* parallel launches of the application processes. The number of launches is
|
||||
* equal to the maximum number of processes on a node. For example, if there
|
||||
* were 2 processes assigned to node 1, and 1 process assigned to node 2, we
|
||||
* would do a parallel launch that launches one process on each node, then
|
||||
* another which launches another process on node 1.
|
||||
*/
|
||||
|
||||
#ifndef ORTE_PLS_BPROC_H_
|
||||
#define ORTE_PLS_BPROC_H_
|
||||
#ifndef ORTE_PLM_BPROC_H_
|
||||
#define ORTE_PLM_BPROC_H_
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <sys/bproc.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
|
||||
#include "orte/mca/pls/base/base.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_pls_bproc_component_open(void);
|
||||
int orte_pls_bproc_component_close(void);
|
||||
|
||||
/*
|
||||
* Startup / Shutdown
|
||||
*/
|
||||
orte_pls_base_module_t* orte_pls_bproc_init(int *priority);
|
||||
int orte_pls_bproc_finalize(void);
|
||||
|
||||
/*
|
||||
* Interface
|
||||
*/
|
||||
int orte_pls_bproc_launch(orte_jobid_t);
|
||||
int orte_pls_bproc_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t*);
|
||||
int orte_pls_bproc_signal_job(orte_jobid_t, int32_t, opal_list_t*);
|
||||
int orte_pls_bproc_signal_proc(const orte_process_name_t* proc_name, int32_t);
|
||||
int orte_pls_bproc_cancel_operation(void);
|
||||
|
||||
/* Utility routine to get/set process pid */
|
||||
ORTE_DECLSPEC int orte_pls_bproc_set_proc_pid(const orte_process_name_t*, pid_t, int);
|
||||
ORTE_DECLSPEC int orte_pls_bproc_get_proc_pid(const orte_process_name_t*, pid_t*);
|
||||
/**
|
||||
* Utility routine to retrieve all process pids w/in a specified job.
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t** pids,
|
||||
orte_std_cntr_t* num_pids,
|
||||
opal_list_t *attrs);
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/**
|
||||
* Utility routine to get/set daemon pid
|
||||
* PLM bproc Component
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_pls_bproc_set_node_pid(orte_cellid_t cellid, char* node_name, orte_jobid_t jobid, pid_t pid);
|
||||
ORTE_DECLSPEC int orte_pls_bproc_get_node_pids(orte_jobid_t jobid, pid_t** pids, orte_std_cntr_t* num_pids);
|
||||
|
||||
/* utility functions for abort communications */
|
||||
int orte_pls_bproc_comm_start(void);
|
||||
int orte_pls_bproc_comm_stop(void);
|
||||
void orte_pls_bproc_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
/**
|
||||
* PLS bproc Component
|
||||
*/
|
||||
struct orte_pls_bproc_component_t {
|
||||
orte_pls_base_component_t super;
|
||||
struct orte_plm_bproc_component_t {
|
||||
/**< The base class */
|
||||
char * orted;
|
||||
orte_plm_base_component_t super;
|
||||
/**< The orted executable. This can be an absolute path, or if not found
|
||||
* we will look for it in the user's path */
|
||||
int debug;
|
||||
/**< If greater than 0 print debugging information */
|
||||
int priority;
|
||||
/**< The priority of this component. This will be returned if we determine
|
||||
* that bproc is available and running on this node, */
|
||||
int terminate_sig;
|
||||
/**< The signal that gets sent to a process to kill it. */
|
||||
opal_mutex_t lock;
|
||||
/**< Lock used to prevent some race conditions */
|
||||
opal_condition_t condition;
|
||||
/**< Condition that is signaled when all the daemons have died */
|
||||
bool recv_issued;
|
||||
/**< Indicates that the comm recv for reporting abnormal proc termination
|
||||
* has been issued
|
||||
*/
|
||||
bool do_not_launch;
|
||||
/**< for test purposes, do everything but the actual launch */
|
||||
orte_std_cntr_t num_daemons;
|
||||
/**< track the number of daemons being launched so we can tell when
|
||||
* all have reported in */
|
||||
char * orted;
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
*/
|
||||
typedef struct orte_pls_bproc_component_t orte_pls_bproc_component_t;
|
||||
typedef struct orte_plm_bproc_component_t orte_plm_bproc_component_t;
|
||||
|
||||
ORTE_DECLSPEC orte_pls_bproc_component_t mca_pls_bproc_component;
|
||||
ORTE_DECLSPEC orte_pls_base_module_t orte_pls_bproc_module;
|
||||
ORTE_DECLSPEC extern orte_plm_bproc_component_t mca_plm_bproc_component;
|
||||
ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_bproc_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif /* ORTE_PLS_BPROC_H_ */
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_PLM_BPROC_H_ */
|
||||
|
||||
|
@ -22,73 +22,65 @@
|
||||
* Takes care of the component stuff for the MCA.
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "pls_bproc.h"
|
||||
|
||||
#include "plm_bproc.h"
|
||||
|
||||
/*
|
||||
* Public string showing the plm ompi_bproc component version number
|
||||
*/
|
||||
const char *mca_plm_bproc_component_version_string =
|
||||
"Open MPI bproc plm MCA component version " ORTE_VERSION;
|
||||
|
||||
static int plm_bproc_open(void);
|
||||
static int plm_bproc_close(void);
|
||||
static int orte_plm_bproc_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/**
|
||||
* The bproc component data structure used to store all the relevant data about
|
||||
* this component.
|
||||
*/
|
||||
orte_pls_bproc_component_t mca_pls_bproc_component = {
|
||||
orte_plm_bproc_component_t mca_plm_bproc_component = {
|
||||
{
|
||||
{
|
||||
ORTE_PLS_BASE_VERSION_1_3_0,
|
||||
"bproc", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_pls_bproc_component_open, /* component open */
|
||||
orte_pls_bproc_component_close /* component close */
|
||||
ORTE_PLM_BASE_VERSION_1_0_0,
|
||||
"bproc", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
|
||||
/* Component open and close functions */
|
||||
plm_bproc_open,
|
||||
plm_bproc_close,
|
||||
orte_plm_bproc_component_query
|
||||
},
|
||||
{
|
||||
false /* checkpoint / restart */
|
||||
},
|
||||
orte_pls_bproc_init /* component init */
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Opens the pls_bproc component, setting all the needed mca parameters and
|
||||
* Opens the plm_bproc component, setting all the needed mca parameters and
|
||||
* finishes setting up the component struct.
|
||||
*/
|
||||
int orte_pls_bproc_component_open(void) {
|
||||
int rc;
|
||||
static int plm_bproc_component_open(void) {
|
||||
mca_base_component_t *c = &mca_plm_bproc_component.super.base_version;
|
||||
|
||||
/* init parameters */
|
||||
mca_base_component_t *c = &mca_pls_bproc_component.super.pls_version;
|
||||
mca_base_param_reg_int(c, "priority", NULL, false, false, 100,
|
||||
&mca_pls_bproc_component.priority);
|
||||
mca_base_param_reg_int(c, "debug",
|
||||
"If > 0 prints library debugging information",
|
||||
false, false, 0, &mca_pls_bproc_component.debug);
|
||||
mca_base_param_reg_int(c, "terminate_sig",
|
||||
"Signal sent to processes to terminate them", false,
|
||||
false, 9, &mca_pls_bproc_component.terminate_sig);
|
||||
mca_base_param_reg_string(c, "orted", "Path to where orted is installed",
|
||||
false, false, "orted", &mca_pls_bproc_component.orted);
|
||||
mca_base_param_reg_int(c, "nolaunch", NULL, false, false, (int)false,
|
||||
&rc);
|
||||
if ((int)false == rc) {
|
||||
mca_pls_bproc_component.do_not_launch = false;
|
||||
} else {
|
||||
mca_pls_bproc_component.do_not_launch = true;
|
||||
}
|
||||
|
||||
mca_pls_bproc_component.recv_issued = false;
|
||||
OBJ_CONSTRUCT(&mca_pls_bproc_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_pls_bproc_component.condition, opal_condition_t);
|
||||
|
||||
mca_base_param_reg_string(c, "orted", "Path to where orted is installed",
|
||||
false, false, "orted", &mca_plm_bproc_component.orted);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the pls_bproc component
|
||||
* Closes the plm_bproc component
|
||||
*/
|
||||
int orte_pls_bproc_component_close(void) {
|
||||
OBJ_DESTRUCT(&mca_pls_bproc_component.lock);
|
||||
OBJ_DESTRUCT(&mca_pls_bproc_component.condition);
|
||||
static int plm_bproc_component_close(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -96,28 +88,26 @@ int orte_pls_bproc_component_close(void) {
|
||||
* Initializes the module. We do not want to run unless we are the seed, bproc
|
||||
* is running, and we are the master node.
|
||||
*/
|
||||
orte_pls_base_module_t* orte_pls_bproc_init(int *priority) {
|
||||
static int orte_plm_bproc_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
int ret;
|
||||
struct bproc_version_t version;
|
||||
|
||||
/* are we the seed */
|
||||
if(orte_process_info.seed == false)
|
||||
return NULL;
|
||||
|
||||
/* okay, we are in an HNP - now check to see if BProc is running here */
|
||||
if (!mca_pls_bproc_component.do_not_launch) {
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
return NULL;
|
||||
}
|
||||
/* see if BProc is running here */
|
||||
ret = bproc_version(&version);
|
||||
if (ret != 0) {
|
||||
*module = NULL;
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
|
||||
/* only launch from the master node */
|
||||
if (bproc_currnode() != BPROC_NODE_MASTER) {
|
||||
return NULL;
|
||||
*module = NULL;
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
*priority = mca_pls_bproc_component.priority;
|
||||
return &orte_pls_bproc_module;
|
||||
*priority = 20;
|
||||
*module = (mca_base_module_t *) &orte_plm_bproc_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -35,9 +35,9 @@ endif
|
||||
AM_CPPFLAGS= $(ras_bjs_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_bjs.c \
|
||||
ras_bjs.h \
|
||||
ras_bjs_component.c
|
||||
ras_bjs_component.c \
|
||||
ras_bjs.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
|
@ -16,30 +16,34 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <errno.h>
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <sys/bproc.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/orte_types.h"
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_bjs.h"
|
||||
|
||||
/* API functions */
|
||||
static int allocate(opal_list_t *nodes);
|
||||
static int finalize(void);
|
||||
|
||||
orte_ras_base_module_t orte_ras_bjs_module = {
|
||||
allocate,
|
||||
finalize
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Query the bproc node status
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_node_state(int node)
|
||||
static int bjs_node_state(int node)
|
||||
{
|
||||
#if defined BPROC_API_VERSION && BPROC_API_VERSION >= 4
|
||||
char nodestatus[BPROC_STATE_LEN + 1];
|
||||
@ -67,124 +71,44 @@ static int orte_ras_bjs_node_state(int node)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parse the NODELIST to determine the number of process
|
||||
* slots/processors available on the node.
|
||||
*/
|
||||
|
||||
static size_t orte_ras_bjs_node_slots(char* node_name)
|
||||
static int allocate(opal_list_t *nodes)
|
||||
{
|
||||
static char** nodelist = NULL;
|
||||
char** ptr;
|
||||
size_t count = 0;
|
||||
if(nodelist == NULL)
|
||||
nodelist = opal_argv_split(getenv("NODELIST"), ',');
|
||||
ptr = nodelist;
|
||||
while(ptr && *ptr) {
|
||||
if(strcmp(*ptr, node_name) == 0)
|
||||
count++;
|
||||
ptr++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve the node name to node number.
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_node_resolve(char* node_name, int* node_num)
|
||||
{
|
||||
/* for now we expect this to be the node number */
|
||||
if(NULL == node_name || sscanf(node_name, "%d", node_num) != 1)
|
||||
return ORTE_ERROR;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover the available resources.
|
||||
* - validate any nodes specified via hostfile/commandline
|
||||
* - check for additional nodes that have already been allocated
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_discover(
|
||||
opal_list_t* nodelist,
|
||||
orte_app_context_t** context,
|
||||
size_t num_context)
|
||||
{
|
||||
char* nodes;
|
||||
char* nodelist;
|
||||
char* ptr;
|
||||
opal_list_item_t* item;
|
||||
opal_list_t new_nodes;
|
||||
orte_node_t *node;
|
||||
int rc;
|
||||
|
||||
/* query the nodelist from the registry */
|
||||
if(ORTE_SUCCESS != (rc = orte_ras_base_node_query(nodelist))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* validate that any user supplied nodes actually exist, etc. */
|
||||
item = opal_list_get_first(nodelist);
|
||||
while(item != opal_list_get_end(nodelist)) {
|
||||
opal_list_item_t* next = opal_list_get_next(item);
|
||||
int node_num;
|
||||
|
||||
orte_ras_node_t* node = (orte_ras_node_t*)item;
|
||||
if(ORTE_SUCCESS != orte_ras_bjs_node_resolve(node->node_name, &node_num)) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(orte_ras_bjs_node_state(node_num) != ORTE_NODE_STATE_UP) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(bproc_access(node_num, BPROC_X_OK) != 0) {
|
||||
opal_list_remove_item(nodelist,item);
|
||||
OBJ_DESTRUCT(item);
|
||||
item = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* try and determine the number of available slots */
|
||||
if(node->node_slots == 0) {
|
||||
node->node_slots = orte_ras_bjs_node_slots(node->node_name);
|
||||
}
|
||||
item = next;
|
||||
}
|
||||
|
||||
/* parse the node list and check node status/access */
|
||||
nodes = getenv("NODES");
|
||||
if (NULL == nodes) {
|
||||
nodelist = getenv("NODES");
|
||||
if (NULL == nodelist) {
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&new_nodes, opal_list_t);
|
||||
while(NULL != (ptr = strsep(&nodes,","))) {
|
||||
orte_ras_node_t *node;
|
||||
while(NULL != (ptr = strsep(&nodelist,","))) {
|
||||
orte_node_state_t node_state;
|
||||
int node_num;
|
||||
|
||||
/* is this node already in the list */
|
||||
for(item = opal_list_get_first(nodelist);
|
||||
item != opal_list_get_end(nodelist);
|
||||
for(item = opal_list_get_first(nodes);
|
||||
item != opal_list_get_end(nodes);
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_ras_node_t*)item;
|
||||
if(strcmp(node->node_name, ptr) == 0)
|
||||
node = (orte_node_t*)item;
|
||||
if(strcmp(node->name, ptr) == 0)
|
||||
break;
|
||||
}
|
||||
if(item != opal_list_get_end(nodelist))
|
||||
/* if it is in the list, then just increment the slot count */
|
||||
if(item != opal_list_get_end(nodes)) {
|
||||
node->slots++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* convert to an int node number */
|
||||
if(sscanf(ptr, "%d", &node_num) != 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(ORTE_NODE_STATE_UP != (node_state = orte_ras_bjs_node_state(node_num))) {
|
||||
if(ORTE_NODE_STATE_UP != (node_state = bjs_node_state(node_num))) {
|
||||
opal_output(0, "error: a specified node (%d) is not up.\n", node_num);
|
||||
rc = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
@ -196,96 +120,21 @@ static int orte_ras_bjs_discover(
|
||||
}
|
||||
|
||||
/* create a new node entry */
|
||||
node = OBJ_NEW(orte_ras_node_t);
|
||||
node->node_name = strdup(ptr);
|
||||
node->node_state = node_state;
|
||||
node->node_slots = orte_ras_bjs_node_slots(node->node_name);
|
||||
opal_list_append(&new_nodes, &node->super);
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(ptr);
|
||||
node->state = node_state;
|
||||
node->slots = 1;
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
|
||||
/* add any newly discovered nodes to the registry */
|
||||
if(opal_list_get_size(&new_nodes)) {
|
||||
rc = orte_ras_base_node_insert(&new_nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
/* append them to the nodelist */
|
||||
while(NULL != (item = opal_list_remove_first(&new_nodes)))
|
||||
opal_list_append(nodelist, item);
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&new_nodes);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
opal_list_t nodes;
|
||||
opal_list_item_t* item;
|
||||
int rc;
|
||||
orte_app_context_t **context = NULL;
|
||||
orte_std_cntr_t i, num_context = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
|
||||
rc = orte_rmgr.get_app_context(jobid, &context, &num_context);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_ras_bjs_discover(&nodes, context, num_context))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
rc = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
while(NULL != (item = opal_list_remove_first(&nodes))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
for(i=0; i<num_context; i++) {
|
||||
OBJ_RELEASE(context[i]);
|
||||
}
|
||||
if (NULL != context) {
|
||||
free(context);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int orte_ras_bjs_deallocate(orte_jobid_t jobid)
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_ras_bjs_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_bjs_module = {
|
||||
orte_ras_bjs_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_bjs_deallocate,
|
||||
orte_ras_bjs_finalize
|
||||
};
|
||||
|
||||
|
@ -24,27 +24,12 @@
|
||||
#define ORTE_RAS_BJS_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/**
|
||||
* RAS Component
|
||||
*/
|
||||
struct orte_ras_bjs_component_t {
|
||||
orte_ras_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
char *schedule_policy;
|
||||
};
|
||||
typedef struct orte_ras_bjs_component_t orte_ras_bjs_component_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_bjs_component_t mca_ras_bjs_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_bjs_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_bjs_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -17,115 +17,65 @@
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "ras_bjs.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_open(void);
|
||||
static int orte_ras_bjs_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_bjs_init(int* priority);
|
||||
static int ras_bjs_open(void);
|
||||
static int ras_bjs_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
orte_ras_bjs_component_t mca_ras_bjs_component = {
|
||||
orte_ras_base_component_t mca_ras_bjs_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
/* Indicate that we are a ras v2.0.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_2_0_0,
|
||||
|
||||
"bjs", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_bjs_open, /* component open */
|
||||
orte_ras_bjs_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
orte_ras_bjs_init
|
||||
|
||||
/* Component open and close functions */
|
||||
|
||||
ras_bjs_open, /* component open */
|
||||
NULL, /* component close */
|
||||
ras_bjs_component_query
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Convenience functions to look up MCA parameter values.
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_param_register_int(
|
||||
const char* param_name,
|
||||
int default_value)
|
||||
{
|
||||
int id = mca_base_param_register_int("ras","bjs",param_name,NULL,default_value);
|
||||
int param_value = default_value;
|
||||
mca_base_param_lookup_int(id,&param_value);
|
||||
return param_value;
|
||||
}
|
||||
|
||||
|
||||
static char* orte_ras_bjs_param_register_string(
|
||||
const char * a, const char *b, const char *c,
|
||||
const char* default_value)
|
||||
{
|
||||
char *param_value;
|
||||
int id = mca_base_param_register_string(a, b, c, NULL, default_value);
|
||||
mca_base_param_lookup_string(id, &param_value);
|
||||
return param_value;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_ras_bjs_open(void)
|
||||
static int ras_bjs_open(void)
|
||||
{
|
||||
mca_ras_bjs_component.debug = orte_ras_bjs_param_register_int("debug",1);
|
||||
mca_ras_bjs_component.priority = orte_ras_bjs_param_register_int("priority",75);
|
||||
/* JMS To be changed post-beta to LAM's C/N command line notation */
|
||||
mca_ras_bjs_component.schedule_policy =
|
||||
orte_ras_bjs_param_register_string("ras", "base", "schedule_policy", "slot");
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *orte_ras_bjs_init(int* priority)
|
||||
static int ras_bjs_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if 0
|
||||
if(getenv("NODES") == NULL) {
|
||||
return NULL;
|
||||
*module = NULL;
|
||||
return ORTE_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
#endif
|
||||
*priority = mca_ras_bjs_component.priority;
|
||||
return &orte_ras_bjs_module;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_ras_bjs_close(void)
|
||||
{
|
||||
*priority = 10;
|
||||
*module = (mca_base_module_t *) &orte_ras_bjs_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1 +0,0 @@
|
||||
rhc
|
@ -1,54 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_ras_lsf_bproc_DSO
|
||||
component_noinst =
|
||||
component_install = mca_ras_lsf_bproc.la
|
||||
else
|
||||
component_noinst = libmca_ras_lsf_bproc.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS= $(ras_lsf_bproc_CPPFLAGS)
|
||||
|
||||
proxy_SOURCES = \
|
||||
ras_lsf_bproc.c \
|
||||
ras_lsf_bproc.h \
|
||||
ras_lsf_bproc_component.c
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_ras_lsf_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
mca_ras_lsf_bproc_la_LIBADD = \
|
||||
$(ras_lsf_bproc_LIBS) \
|
||||
$(top_ompi_builddir)/orte/libopen-rte.la \
|
||||
$(top_ompi_builddir)/opal/libopen-pal.la
|
||||
mca_ras_lsf_bproc_la_LDFLAGS = -module -avoid-version $(ras_lsf_bproc_LDFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_ras_lsf_bproc_la_SOURCES = $(proxy_SOURCES)
|
||||
libmca_ras_lsf_bproc_la_LIBADD = $(ras_lsf_bproc_LIBS)
|
||||
libmca_ras_lsf_bproc_la_LDFLAGS = -module -avoid-version $(ras_lsf_bproc_LDFLAGS)
|
@ -1,38 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ras_lsf_bproc_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_ras_lsf_bproc_CONFIG],[
|
||||
OMPI_CHECK_BPROC([ras_lsf_bproc], [ras_lsf_bproc_good=1],
|
||||
[ras_lsf_bproc_good=1], [ras_lsf_bproc_good=0])
|
||||
|
||||
# if check worked, set wrapper flags if so.
|
||||
# Evaluate succeed / fail
|
||||
AS_IF([test "$ras_lsf_bproc_good" = "1"],
|
||||
[ras_lsf_bproc_WRAPPER_EXTRA_LDFLAGS="$ras_lsf_bproc_LDFLAGS"
|
||||
ras_lsf_bproc_WRAPPER_EXTRA_LIBS="$ras_lsf_bproc_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
# set build flags to use in makefile
|
||||
AC_SUBST([ras_lsf_bproc_CPPFLAGS])
|
||||
AC_SUBST([ras_lsf_bproc_LDFLAGS])
|
||||
AC_SUBST([ras_lsf_bproc_LIBS])
|
||||
])dnl
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_CONFIG_FILES="Makefile"
|
@ -1,55 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
#include "ras_lsf_bproc.h"
|
||||
|
||||
|
||||
static int orte_ras_lsf_bproc_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_ras_lsf_bproc_deallocate(orte_jobid_t jobid)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_ras_lsf_bproc_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
orte_ras_base_module_t orte_ras_lsf_bproc_module = {
|
||||
orte_ras_lsf_bproc_allocate,
|
||||
orte_ras_base_node_insert,
|
||||
orte_ras_base_node_query,
|
||||
orte_ras_base_node_query_alloc,
|
||||
orte_ras_base_node_lookup,
|
||||
orte_ras_lsf_bproc_deallocate,
|
||||
orte_ras_lsf_bproc_finalize
|
||||
};
|
||||
|
@ -1,49 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Allocation (LSF over BPROC)
|
||||
*/
|
||||
#ifndef ORTE_RAS_LSF_BPROC_H
|
||||
#define ORTE_RAS_LSF_BPROC_H
|
||||
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* RAS Component
|
||||
*/
|
||||
struct orte_ras_lsf_bproc_component_t {
|
||||
orte_ras_base_component_t super;
|
||||
int debug;
|
||||
int priority;
|
||||
};
|
||||
typedef struct orte_ras_lsf_bproc_component_t orte_ras_lsf_bproc_component_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component;
|
||||
ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_lsf_bproc_module;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@ -1,111 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ras_lsf_bproc.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_ras_lsf_bproc_open(void);
|
||||
static int orte_ras_lsf_bproc_close(void);
|
||||
static orte_ras_base_module_t* orte_ras_lsf_bproc_init(int* priority);
|
||||
|
||||
|
||||
orte_ras_lsf_bproc_component_t mca_ras_lsf_bproc_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta
|
||||
information about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a ras v1.3.0 component (which also
|
||||
implies a specific MCA version) */
|
||||
|
||||
ORTE_RAS_BASE_VERSION_1_3_0,
|
||||
|
||||
"lsf_bproc", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_ras_lsf_bproc_open, /* component open */
|
||||
orte_ras_lsf_bproc_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
orte_ras_lsf_bproc_init
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
 * Convenience function to register and look up an integer MCA parameter.
 *
 * Registers @p param_name for the "ras" framework / "lsf_bproc" component
 * with @p default_value as its default, then looks the parameter up.
 *
 * @param param_name     Name of the MCA parameter to register.
 * @param default_value  Default passed to the registration call; also the
 *                       value returned when the lookup does not overwrite it.
 *
 * @return The looked-up parameter value, or @p default_value if the lookup
 *         left the output untouched.
 */
static int orte_ras_lsf_bproc_param_register_int(
    const char* param_name,
    int default_value)
{
    int id = mca_base_param_register_int("ras", "lsf_bproc", param_name, NULL, default_value);

    /* Pre-seed with the default: the lookup's return code is not checked,
       so this is what the caller gets if the lookup stores nothing. */
    int param_value = default_value;

    /* The lookup writes through a pointer, so pass the ADDRESS of
       param_value (the scraped text had "&param" mangled into a
       pilcrow character, which is not valid C). */
    mca_base_param_lookup_int(id, &param_value);
    return param_value;
}
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_ras_lsf_bproc_open(void)
|
||||
{
|
||||
mca_ras_lsf_bproc_component.debug = orte_ras_lsf_bproc_param_register_int("debug",1);
|
||||
mca_ras_lsf_bproc_component.priority = orte_ras_lsf_bproc_param_register_int("priority",-1);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static orte_ras_base_module_t *orte_ras_lsf_bproc_init(int* priority)
|
||||
{
|
||||
/* if we are not an HNP, then we must not be selected */
|
||||
if (!orte_process_info.seed) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*priority = mca_ras_lsf_bproc_component.priority;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_ras_lsf_bproc_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user