openmpi/orte/mca/pls/poe/pls_poe_module.c

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include <fcntl.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/orte_constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/pls/poe/pls_poe.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_base_map.h"
#include "orte/mca/rmgr/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/sds/base/base.h"
#include "orte/mca/soh/soh.h"
#include "orte/util/univ_info.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
extern char **environ;
/*
* Local functions
*/
static int pls_poe_launch(orte_jobid_t jobid);
static int pls_poe_terminate_job(orte_jobid_t jobid);
static int pls_poe_terminate_proc(const orte_process_name_t *name);
static int pls_poe_finalize(void);
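/*
 * Module entry points invoked by the PLS framework to launch and
 * manage jobs under POE.
 */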
orte_pls_base_module_1_0_0_t orte_pls_poe_module = {
pls_poe_launch,
pls_poe_terminate_job,
pls_poe_terminate_proc,
pls_poe_finalize
};
/**
poe_set_handler_default - set signal handler to default
@param sig signal [IN]
*/
static void poe_set_handler_default(int sig)
{
struct sigaction act;
act.sa_handler = SIG_DFL;
act.sa_flags = 0;
sigemptyset(&act.sa_mask);
sigaction(sig, &act, (struct sigaction *)0);
}
/**
 poe_argv_append_int - append an integer-valued option to an argument vector
 @param argc argument count [IN/OUT]
 @param argv argument vector [IN/OUT]
 @param varname integer value to append [IN]
 @param min minimum acceptable value [IN]
 @param argname option name [IN]
 @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if varname is below min
*/
static inline int poe_argv_append_int(int *argc, char ***argv, int varname, int min, char *argname)
{
char *tmp_string;
if(varname >= min) {
opal_argv_append(argc, argv, argname);
asprintf(&tmp_string, "%d", varname);
opal_argv_append(argc, argv, tmp_string);
free(tmp_string);
} else {
return ORTE_ERR_BAD_PARAM;
}
return ORTE_SUCCESS;
}
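/*
 * Example: poe_argv_append_int(&argc, &argv, 3, 0, "-retry") appends
 * "-retry" followed by "3"; a value below min appends nothing and
 * returns ORTE_ERR_BAD_PARAM.
 */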
/**
 @warning This function is not currently used. It will be enabled when the fault-tolerance feature is needed.
*/
#ifdef __FOR_LATER
int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
{
opal_list_t nodes, mapping_list;
opal_list_item_t* item;
size_t num_nodes;
orte_vpid_t vpid;
int node_name_index1;
int node_name_index2;
int proc_name_index;
char *tmp_string;
char *uri, *param;
char* name_string;
char** argv;
int argc;
int pid;
int rc;
int i;
int status;
FILE *hfp, *cfp;
/* Query the list of nodes allocated and mapped to this job.
* We need the entire mapping for a couple of reasons:
* - need the prefix to start with.
* - need to know if we are launching on a subset of the allocated nodes
* All other mapping responsibilities fall to orted in the fork PLS
*/
if((mca_pls_poe_component.hostfile=tempnam(NULL,NULL))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;
if((mca_pls_poe_component.cmdfile=tempnam(NULL,NULL))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;
if((hfp=fopen(mca_pls_poe_component.hostfile,"w"))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;
if((cfp=fopen(mca_pls_poe_component.cmdfile,"w"))==NULL) return ORTE_ERR_OUT_OF_RESOURCE;
OBJ_CONSTRUCT(&nodes, opal_list_t);
OBJ_CONSTRUCT(&mapping_list, opal_list_t);
rc = orte_rmaps_base_mapped_node_query(&mapping_list, &nodes, jobid);
if(ORTE_SUCCESS != rc) {
goto cleanup;
}
/*
* Allocate a range of vpids for the daemons.
*/
num_nodes = opal_list_get_size(&nodes);
if(num_nodes == 0) {
rc = ORTE_ERR_BAD_PARAM;
goto cleanup; /* release the node lists instead of leaking them */
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if(ORTE_SUCCESS != rc) {
goto cleanup;
}
/* application */
/* opal_argv_split already returns a freshly allocated vector, so no extra copy is needed */
argv = opal_argv_split(mca_pls_poe_component.orted, ' ');
argc = opal_argv_count(argv);
if (mca_pls_poe_component.debug) {
opal_argv_append(&argc, &argv, "--debug");
}
opal_argv_append(&argc, &argv, "--debug-daemons");
opal_argv_append(&argc, &argv, "--no-daemonize");
opal_argv_append(&argc, &argv, "--bootproxy");
/* need integer value for command line parameter - NOT hex */
asprintf(&tmp_string, "%lu", (unsigned long)jobid);
opal_argv_append(&argc, &argv, tmp_string);
free(tmp_string);
opal_argv_append(&argc, &argv, "--name");
proc_name_index = argc;
opal_argv_append(&argc, &argv, "");
opal_argv_append(&argc, &argv, "--nodename");
node_name_index2 = argc;
opal_argv_append(&argc, &argv, "");
/* pass along the universe name and location info */
opal_argv_append(&argc, &argv, "--universe");
asprintf(&tmp_string, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host, orte_universe_info.name);
opal_argv_append(&argc, &argv, tmp_string);
free(tmp_string);
/* setup ns contact info */
opal_argv_append(&argc, &argv, "--nsreplica");
if(NULL != orte_process_info.ns_replica_uri) {
uri = strdup(orte_process_info.ns_replica_uri);
} else {
uri = orte_rml.get_uri();
}
asprintf(&param, "\"%s\"", uri);
opal_argv_append(&argc, &argv, param);
free(param);
free(uri);
/* setup gpr contact info */
opal_argv_append(&argc, &argv, "--gprreplica");
if(NULL != orte_process_info.gpr_replica_uri) {
uri = strdup(orte_process_info.gpr_replica_uri);
} else {
uri = orte_rml.get_uri();
}
asprintf(&param, "\"%s\"", uri);
opal_argv_append(&argc, &argv, param);
free(param);
free(uri);
/*
* Iterate through each of the nodes and spin
* up a daemon.
*/
for(item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
orte_ras_node_t* node = (orte_ras_node_t*)item;
orte_process_name_t* name;
pid_t pid;
/* setup node name */
argv[node_name_index2] = node->node_name;
fprintf(hfp,"%s\n",node->node_name);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if(ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_poe: unable to get process name string");
goto cleanup; /* release the node lists instead of returning directly */
}
argv[proc_name_index] = name_string;
for(i=0;i<argc;i++) {
fprintf(cfp,"%s ",argv[i]);
}
fprintf(cfp,"\n");
if (mca_pls_poe_component.verbose) {
opal_output(0, "%s:cmdfile %s\n", __FUNCTION__, opal_argv_join(argv, ' '));
}
vpid++;
free(name);
}
fclose(cfp);
fclose(hfp);
argv = opal_argv_copy(mca_pls_poe_component.argv);
argc = mca_pls_poe_component.argc;
opal_argv_append(&argc, &argv, "-hostfile");
opal_argv_append(&argc, &argv, mca_pls_poe_component.hostfile);
opal_argv_append(&argc, &argv, "-cmdfile");
opal_argv_append(&argc, &argv, mca_pls_poe_component.cmdfile);
opal_argv_append(&argc, &argv, "-procs");
asprintf(&tmp_string, "%lu", (unsigned long)num_nodes); /* num_nodes is a size_t, so %d would mismatch */
opal_argv_append(&argc, &argv, tmp_string);
free(tmp_string);
opal_argv_append(&argc, &argv, "-pgmmodel");
opal_argv_append(&argc, &argv, "mpmd");
opal_argv_append(&argc, &argv, "-resd");
opal_argv_append(&argc, &argv, "no");
opal_argv_append(&argc, &argv, "-labelio");
opal_argv_append(&argc, &argv, "yes");
opal_argv_append(&argc, &argv, "-infolevel");
opal_argv_append(&argc, &argv, "6");
opal_argv_append(&argc, &argv, "-stdoutmode");
opal_argv_append(&argc, &argv, "ordered");
rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retry, 0, "-retry");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retrycount, 0, "-retrycount");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
if (mca_pls_poe_component.verbose) {
opal_output(0, "%s:cmdline %s\n", __FUNCTION__, opal_argv_join(argv, ' '));
}
pid = fork();
if(pid < 0) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* child */
if(pid == 0) {
execv(mca_pls_poe_component.path, argv);
opal_output(0, "orte_pls_poe: execv failed with errno=%d\n", errno);
exit(-1);
}
cleanup:
while(NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&nodes);
while(NULL != (item = opal_list_remove_first(&mapping_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mapping_list);
if (mca_pls_poe_component.verbose > 10) opal_output(0, "%s: --- END rc(%d) ---\n", __FUNCTION__, rc);
return rc;
}
#endif /* __FOR_LATER */
/**
 poe_wait_job - callback invoked when POE finishes
 @param pid pid of the POE process
 @param status exit status of the POE process
 @param cbdata callback data (unused)
*/
static void poe_wait_job(pid_t pid, int status, void* cbdata)
{
opal_list_t map;
opal_list_item_t* item;
int rc;
/* query allocation for the job */
OBJ_CONSTRUCT(&map, opal_list_t);
rc = orte_rmaps_base_get_map(mca_pls_poe_component.jobid,&map);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
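/* for every process in the map, clean up its session directory and
 * record it as aborted, carrying POE's exit status */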
for(item = opal_list_get_first(&map);
item != opal_list_get_end(&map);
item = opal_list_get_next(item)) {
orte_rmaps_base_map_t* jmap = (orte_rmaps_base_map_t*) item; /* renamed from "map", which shadowed the list above */
size_t i;
for(i = 0 ; i < jmap->num_procs ; ++i) {
orte_session_dir_finalize(&(jmap->procs[i])->proc_name);
rc = orte_soh.set_proc_soh(&(jmap->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc); /* check inside the loop so every failure is reported, not just the last */
}
}
}
OBJ_DESTRUCT(&map);
}
/**
 poe_create_cmd_file - append one process entry to the POE command file
 @param cfp command file pointer [IN]
 @param context application context [IN]
 @param proc process mapping entry [IN]
 @param vpid_start vpid start [IN]
 @param vpid_range vpid range [IN]
 @return error number
*/
static int poe_create_cmd_file(
FILE *cfp,
orte_app_context_t* context,
orte_rmaps_base_proc_t* proc,
orte_vpid_t vpid_start,
orte_vpid_t vpid_range)
{
int i;
char* param;
char* uri;
char **environ_copy;
/* setup base environment */
environ_copy = NULL;
param = mca_base_param_environ_variable("rmgr","bootproxy","jobid");
opal_unsetenv(param, &environ_copy);
free(param);
/* setup universe info */
if(NULL != orte_universe_info.name) {
param = mca_base_param_environ_variable("universe", NULL, NULL);
asprintf(&uri, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host,
orte_universe_info.name);
opal_setenv(param, uri, true, &environ_copy);
free(param);
free(uri);
}
/* setup ns contact info */
if(NULL != orte_process_info.ns_replica_uri) {
uri = strdup(orte_process_info.ns_replica_uri);
} else {
uri = orte_rml.get_uri();
}
param = mca_base_param_environ_variable("ns","replica","uri");
opal_setenv(param, uri, true, &environ_copy);
free(param);
free(uri);
/* setup gpr contact info */
if(NULL != orte_process_info.gpr_replica_uri) {
uri = strdup(orte_process_info.gpr_replica_uri);
} else {
uri = orte_rml.get_uri();
}
param = mca_base_param_environ_variable("gpr","replica","uri");
opal_setenv(param, uri, true, &environ_copy);
free(param);
free(uri);
/* push name into environment */
orte_ns_nds_env_put(&proc->proc_name, vpid_start, vpid_range, &environ_copy);
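/* if no argv was supplied, synthesize one holding just the application name */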
if (context->argv == NULL) {
context->argv = malloc(sizeof(char*)*2);
context->argv[0] = strdup(context->app);
context->argv[1] = NULL;
}
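/* emit one command-file line for this process:
 *   <component env prefix> VAR=value ... <app> <args...>
 */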
i=0;
fprintf(cfp,"%s",mca_pls_poe_component.env);
while(environ_copy[i]!=NULL) {
fprintf(cfp," %s",environ_copy[i++]);
}
opal_argv_free(environ_copy);
fprintf(cfp," %s",context->app);
i=1;
while(context->argv[i]!=NULL) {
fprintf(cfp," %s",context->argv[i++]);
}
/* POE gets upset if the file does not end with a newline. */
fprintf(cfp,"\n");
return ORTE_SUCCESS;
}
/**
poe_launch_interactive - launch an interactive job
@param jobid JOB Identifier [IN]
@return error number
*/
static inline int poe_launch_interactive(orte_jobid_t jobid)
{
opal_list_t map, nodes, mapping_list;
opal_list_item_t* item;
orte_vpid_t vpid_start, vpid_range;
size_t num_nodes, num_procs;
FILE *hfp, *cfp;
char** argv;
int argc;
int rc, pid;
sigset_t sigs;
if( (NULL==(mca_pls_poe_component.cmdfile=tempnam(NULL,NULL))) ||
(NULL==(cfp=fopen(mca_pls_poe_component.cmdfile,"w"))) ) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
mca_pls_poe_component.jobid = jobid;
/* construct all three lists up front so an early "goto cleanup"
never destructs an uninitialized list */
OBJ_CONSTRUCT(&map, opal_list_t);
OBJ_CONSTRUCT(&nodes, opal_list_t);
OBJ_CONSTRUCT(&mapping_list, opal_list_t);
rc = orte_rmaps_base_mapped_node_query(&mapping_list, &nodes, jobid);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
num_nodes = opal_list_get_size(&nodes);
if(!strncmp(mca_pls_poe_component.resource_allocation,"hostfile",8)) {
/* Create a temporary host list file if the user requested hostfile allocation */
if( (NULL==(mca_pls_poe_component.hostfile=tempnam(NULL,NULL))) ||
(NULL==(hfp=fopen(mca_pls_poe_component.hostfile,"w"))) ) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
for(item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
orte_ras_node_t* node = (orte_ras_node_t*)item;
fprintf(hfp,"%s\n",node->node_name);
}
fclose(hfp);
}
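/* number of process slots allocated to the job by the resource manager */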
rc = orte_rmgr_base_get_job_slots(jobid, &num_procs);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc = orte_rmaps_base_get_map(jobid,&map);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc = orte_rmaps_base_get_vpid_range(jobid, &vpid_start, &vpid_range);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
/* Write the temporary POE command file, one line per process */
for(item = opal_list_get_first(&map);
item != opal_list_get_end(&map);
item = opal_list_get_next(item)) {
orte_rmaps_base_map_t* map2 = (orte_rmaps_base_map_t*)item;
size_t i;
for(i=0; i<map2->num_procs; i++) {
rc = poe_create_cmd_file(cfp, map2->app, map2->procs[i], vpid_start, vpid_range);
if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
}
}
fclose(cfp);
/* Generate POE command line */
argv = opal_argv_copy(mca_pls_poe_component.argv);
argc = mca_pls_poe_component.argc;
if(!strncmp(mca_pls_poe_component.resource_allocation,"hostfile",8)) {
opal_argv_append(&argc, &argv, "-hostfile");
opal_argv_append(&argc, &argv, mca_pls_poe_component.hostfile);
opal_argv_append(&argc, &argv, "-resd");
opal_argv_append(&argc, &argv, "no");
rc=poe_argv_append_int(&argc, &argv, num_nodes, 1, "-nodes");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
}
opal_argv_append(&argc, &argv, "-pgmmodel");
opal_argv_append(&argc, &argv, "mpmd");
opal_argv_append(&argc, &argv, "-cmdfile");
opal_argv_append(&argc, &argv, mca_pls_poe_component.cmdfile);
opal_argv_append(&argc, &argv, "-labelio");
opal_argv_append(&argc, &argv, mca_pls_poe_component.mp_labelio);
opal_argv_append(&argc, &argv, "-stdoutmode");
opal_argv_append(&argc, &argv, mca_pls_poe_component.mp_stdoutmode);
rc=poe_argv_append_int(&argc, &argv, num_procs, 1, "-procs");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retry, 0, "-retry");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_retrycount, 0, "-retrycount");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
rc=poe_argv_append_int(&argc, &argv, mca_pls_poe_component.mp_infolevel, 0, "-infolevel");
if(ORTE_SUCCESS!=rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
if(mca_pls_poe_component.verbose>10) {
opal_output(0, "POE cmdline %s\n", opal_argv_join(argv, ' '));
}
/* Start job with POE */
pid = fork();
if(pid < 0) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup; /* release the lists instead of returning directly */
}
if(pid == 0) {
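/* child: restore default signal dispositions and unblock all
 * signals so POE starts with a clean signal state, then exec */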
poe_set_handler_default(SIGTERM);
poe_set_handler_default(SIGINT);
poe_set_handler_default(SIGHUP);
poe_set_handler_default(SIGCHLD);
poe_set_handler_default(SIGPIPE);
sigprocmask(0, 0, &sigs);
sigprocmask(SIG_UNBLOCK, &sigs, 0);
execv(mca_pls_poe_component.path, argv);
opal_output(0, "orte_pls_poe: execv failed with errno=%d\n", errno);
exit(-1);
} else {
orte_wait_cb(pid, poe_wait_job, NULL);
}
cleanup:
while(NULL != (item = opal_list_remove_first(&map))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&map);
while(NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&nodes);
while(NULL != (item = opal_list_remove_first(&mapping_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mapping_list);
return rc;
}
/**
 pls_poe_launch - launch a POE job
 @warning Currently supports the interactive class only.
 @param jobid JOB Identifier [IN]
 @return error number
*/
static int pls_poe_launch(orte_jobid_t jobid)
{
if(!strncmp(mca_pls_poe_component.class,"interactive",11)) {
return poe_launch_interactive(jobid);
}
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int pls_poe_terminate_job(orte_jobid_t jobid)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int pls_poe_terminate_proc(const orte_process_name_t *name)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/**
 pls_poe_finalize - clean up temporary files
 @return error number
*/
static int pls_poe_finalize(void)
{
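/* best-effort removal of the temporary command and host files */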
if(NULL != mca_pls_poe_component.cmdfile) unlink(mca_pls_poe_component.cmdfile);
if(NULL != mca_pls_poe_component.hostfile) unlink(mca_pls_poe_component.hostfile);
return ORTE_SUCCESS;
}