1
1

minor clean up and treespawn support

This commit was SVN r13876.
Этот коммит содержится в:
Li-Ta Lo 2007-03-01 22:32:37 +00:00
родитель caa1522a22
Коммит a0e5b6a27c
4 изменённых файлов: 40 добавлений и 62 удалений

Просмотреть файл

@ -68,6 +68,7 @@
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include <math.h>
#include "pls_xcpu.h"
#include "spfs.h"
#include "spclient.h"
@ -76,9 +77,6 @@
extern char **environ;
/** external variable defined in libspclient */
extern int spc_chatty;
/**
* Initialization of the xcpu module with all the needed function pointers
*/
@ -96,7 +94,7 @@ orte_pls_base_module_t orte_pls_xcpu_module = {
/* array of *Xpcommand and Xpnodeset, each xcmd/nodeset correspond to one OMPI app_context */
Xpcommand **xcmd_sets;
Xpnodeset **node_sets;
int num_xcmds;
int num_apps;
void
pls_xcpu_stdout_cb(Xpsession *s, u8 *buf, u32 buflen)
@ -242,16 +240,12 @@ pls_xcpu_setup_env(char ***e)
int
orte_pls_xcpu_launch_job(orte_jobid_t jobid)
{
int i, n, rc;
int i, fanout, rc;
int num_processes = 0;
orte_cellid_t cellid;
opal_list_item_t *node_item, *proc_item;
orte_job_map_t *map;
orte_vpid_t vpid_start, vpid_range;
char **env;
if (mca_pls_xcpu_component.chatty)
spc_chatty = 1;
/* get the job map */
rc = orte_rmaps.get_job_map(&map, jobid);
@ -259,6 +253,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
ORTE_ERROR_LOG(rc);
return rc;
}
num_apps = map->num_apps;
/* next, get the vpid_start and range */
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
@ -271,13 +266,11 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
cellid = orte_process_info.my_name->cellid;
/* create num_apps of pointers to Xpnodeset and Xpcommand */
node_sets = (Xpnodeset **) malloc(map->num_apps * sizeof(Xpnodeset *));
xcmd_sets = (Xpcommand **) malloc(map->num_apps * sizeof(Xpcommand *));
num_xcmds = map->num_apps;
node_sets = (Xpnodeset **) malloc(num_apps * sizeof(Xpnodeset *));
xcmd_sets = (Xpcommand **) malloc(num_apps * sizeof(Xpcommand *));
/* create Xpnodeset for each app_context */
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
node_sets[i] = xp_nodeset_create();
}
@ -299,13 +292,13 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
}
/* setup envrionment variables for each app context */
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
/* FixME: how many layers of *? */
pls_xcpu_setup_env(&map->apps[i]->env);
num_processes += map->apps[i]->num_procs;
}
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
rc = orte_ns_nds_xcpu_put(cellid, jobid, vpid_start,
num_processes, &map->apps[i]->env);
if (rc != ORTE_SUCCESS) {
@ -315,9 +308,18 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
}
/* create Xpcommand for each app_context from Xpnodeset */
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
xcmd_sets[i] = xp_command_create(node_sets[i]);
/* caculate maximum fan out for tree spawn */
if (mca_pls_xcpu_component.maxsessions < 0) {
fanout = (int) sqrt(node_sets[i]->len);
if (fanout*fanout < node_sets[i]->len)
fanout++;
} else
fanout = mca_pls_xcpu_component.maxsessions;
xcmd_sets[i]->nspawn = fanout;
/* setup argc, argv and evn in xcpu command */
xcmd_sets[i]->cwd = strdup(map->apps[i]->cwd);
xcmd_sets[i]->env = process_env(map->apps[i]->env);
@ -341,7 +343,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
* FixME: we are blocked here so both success and faulure cases
* fall back to the error handler and all resources are freed.
* this should be changed when we have non-blocking command_wait() */
if (xp_commands_wait(map->num_apps, xcmd_sets) < 0) {
if (xp_commands_wait(num_apps, xcmd_sets) < 0) {
rc = ORTE_ERROR;
} else {
rc = ORTE_SUCCESS;
@ -349,7 +351,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
error:
/* error handling and clean up, kill all the processes */
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
if (xcmd_sets[i] != NULL) {
xp_command_wipe(xcmd_sets[i]);
xp_command_destroy(xcmd_sets[i]);
@ -363,18 +365,9 @@ error:
int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
{
int i, rc;
orte_job_map_t *map;
int i;
/* get the job map */
rc = orte_rmaps.get_job_map(&map, jobid);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
if (xcmd_sets[i] != NULL) {
xp_command_kill(xcmd_sets[i], SIGTERM);
}
@ -391,7 +384,7 @@ int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name)
{
fprintf(stderr, __FILE__ " terminate_proc\n");
/* libxcpu can not wipe individual process in an
/* libxcpu can not kill individual process in an
* Xpcommand/Xpsessionset, only to the whole session set */
return ORTE_SUCCESS;
@ -399,19 +392,11 @@ int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name)
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t *attrs)
{
int i, rc;
orte_job_map_t *map;
int i;
fprintf(stderr, __FILE__ " signal_job, sig = %d\n", sig);
/* get the job map */
rc = orte_rmaps.get_job_map(&map, jobid);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i = 0; i < map->num_apps; i++) {
for (i = 0; i < num_apps; i++) {
if (xcmd_sets[i] != NULL)
xp_command_kill(xcmd_sets[i], sig);
}

Просмотреть файл

@ -84,13 +84,10 @@ struct orte_pls_xcpu_component_t {
/* The priority of this component. This will be returned if
* we determine that xcpu is available and running on this node,
*/
int terminate_sig;
/* The signal that gets sent to a process to kill it. */
opal_mutex_t lock;
/* Lock used to prevent some race conditions */
opal_condition_t condition;
/* Condition that is signaled when all the daemons have died */
int chatty;
/* enable print out of 9P protocol */
int maxsessions;
/* maximum fan out for tree spawn */
};
typedef struct orte_pls_xcpu_component_t orte_pls_xcpu_component_t;

Просмотреть файл

@ -52,6 +52,9 @@ orte_pls_xcpu_component_t mca_pls_xcpu_component = {
}
};
/** external variable defined in libspclient */
extern int spc_chatty;
/**
* Opens the pls_xcpu component, setting all the needed mca parameters and
* finishes setting up the component struct.
@ -69,10 +72,15 @@ int orte_pls_xcpu_component_open(void)
mca_base_param_reg_int(c, "debug",
"If > 0 prints library debugging information",
false, false, 0, &mca_pls_xcpu_component.debug);
mca_base_param_reg_int(c, "chatty", "Prints 9P protocol transactions",
mca_base_param_reg_int(c, "chatty",
"Prints 9P protocol transactions",
false, false, 0, &mca_pls_xcpu_component.chatty);
OBJ_CONSTRUCT(&mca_pls_xcpu_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_pls_xcpu_component.condition, opal_condition_t);
mca_base_param_reg_int(c, "maxsession",
"Max fan out when using XCPUFS tree spawn",
false, false, -1, &mca_pls_xcpu_component.maxsessions);
if (mca_pls_xcpu_component.chatty)
spc_chatty = 1;
return rc;
}
@ -82,11 +90,6 @@ int orte_pls_xcpu_component_open(void)
*/
int orte_pls_xcpu_component_close(void)
{
//fprintf(stderr, "orte_pls_xcpu_component_close\n");
OBJ_DESTRUCT(&mca_pls_xcpu_component.lock);
OBJ_DESTRUCT(&mca_pls_xcpu_component.condition);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -30,7 +30,6 @@
#include <assert.h>
#include <limits.h>
#include "spfs.h"
//#include "spfsimpl.h"
#include "orte_config.h"
#include "opal/event/event.h"
#include "opal/runtime/opal_progress.h"
@ -97,7 +96,6 @@ spfd_add(int fd, void (*notify)(Spfd *, void *), void *aux)
if (!spfd)
return NULL;
// fprintf(stderr, "spfd_add spfd %p fd %d\n", spfd, fd);
fcntl(fd, F_SETFL, O_NONBLOCK);
spfd->fd = fd;
spfd->flags = 0;
@ -116,7 +114,6 @@ spfd_add(int fd, void (*notify)(Spfd *, void *), void *aux)
void
spfd_remove(Spfd *spfd)
{
// fprintf(stderr, "spfd_remove spfd %p\n", spfd);
if (spfd->prev)
spfd->prev->next = spfd->next;
else
@ -213,7 +210,6 @@ spfd_handler(int fd, short event, void *aux)
spfd = aux;
// fprintf(stderr, "spfd_handler spfd %p event %d events %d flags %d\n", spfd, event, spfd->events, spfd->flags);
flags = spfd->flags;
events = spfd->events;
@ -244,9 +240,6 @@ spfd_handler(int fd, short event, void *aux)
static void
sp_setup_event(Spfd *spfd)
{
// fprintf(stderr, "sp_setup_event ");
// sp_printtime(stderr);
// fprintf(stderr, " spfd %p events %d\n", spfd, spfd->events);
opal_event_set(&spfd->opevent, spfd->fd, spfd->events, spfd_handler, spfd);
opal_event_add(&spfd->opevent, sptval);
}