1
1

* More changes from the tim branch. Still has problems with ABORTed procs,
  but now tells you when it can't find orted. Also includes memory leak
  plugs, bproc fixes, and gm repairs.

This commit was SVN r4937.
Этот коммит содержится в:
Brian Barrett 2005-03-18 23:58:36 +00:00
родитель a1d11d9b54
Коммит 30af9a7b90
25 изменённых файлов: 550 добавлений и 377 удалений

16
ISSUES
Просмотреть файл

@ -3,11 +3,23 @@ Undecided timing:
- if an MPI process fails (e.g., it seg faults), it causes orterun to
hang. This is with the rsh pls.
--> Looks like the problem is with what happens when you set the
state of the process in the soh to ORTE_PROC_STATE_ABORTED.
--> Ralph is looking at this
- if the daemon is not found or fails to start, orterun will hang. No
indication is given to the users that something went wrong.
--> Brian thinks he fixed this, but since he sets the state to
ORTE_PROC_STATE_ABORTED, it won't be really clear until the
above issue is fixed. But it at least tells you what went
wrong.
- $prefix/etc/hosts vs. $prefix/etc/openmpi-default-hostfile
--> Brian temporarily added symlink in $prefix/etc/ for
openmpi-default-hostfile -> hosts if there isn't already
a hosts file so that he doesn't have to create one every
time he does "rm -rf $prefix && make install". Will file
bug so that this can be fixed (and will fix in the trunk)
Pre-milestone:
@ -15,6 +27,8 @@ Pre-milestone:
- singleton mpi doesn't work
- Ralph: Populate orte_finalize()
Post-milestone:
---------------
@ -43,8 +57,6 @@ Post-milestone:
- ?: Friendlier error messages (e.g., if no nodes -- need something
meaningful to tell the user)
- ?: Populate orte_finalize()
- Ralph: compare and set function in GPR
- Jeff: collapse MCA params from 3 names to 1 name

Просмотреть файл

@ -1,6 +1,36 @@
This file contains information on merging the branches/tim tree into the
trunk.
UPDATE MERGE 3:
svn merge -r4922:4933 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .
U src/runtime/orte_finalize.c
U src/runtime/orte_restart.c
U src/tools/orterun/orterun.c
U src/mca/oob/tcp/oob_tcp.c
U src/mca/gpr/replica/communications/gpr_replica_remote_msg.c
U src/mca/gpr/replica/gpr_replica_component.c
U src/mca/gpr/replica/functional_layer/gpr_replica_put_get_fn.c
U src/mca/gpr/replica/functional_layer/gpr_replica_messaging_fn.c
U src/mca/gpr/base/gpr_base_open.c
U src/mca/rds/hostfile/rds_hostfile.c
U src/mca/rds/hostfile/rds_hostfile_component.c
U src/mca/pls/rsh/pls_rsh_module.c
U src/mca/ptl/gm/src/ptl_gm_priv.c
U src/mca/ptl/gm/src/ptl_gm.c
U src/mca/ptl/gm/src/ptl_gm_component.c
U src/mca/ptl/gm/src/ptl_gm_proc.c
U src/mca/ptl/gm/src/Makefile.am
U src/mca/ptl/gm/src/ptl_gm_sendfrag.c
D src/mca/ptl/gm/src/ptl_gm_req.h
D src/mca/ptl/gm/src/ptl_gm_req.c
U src/mca/rmgr/base/rmgr_base_context.c
U src/util/session_dir.c
U etc/Makefile.am
U ISSUES
UPDATE MERGE 2:
svn merge -r4892:4922 svn+ssh://svn.open-mpi.org/l/svn/ompi/branches/tim .

Просмотреть файл

@ -38,3 +38,7 @@ install-data-local:
$(INSTALL_DATA) $$d$$file $(DESTDIR)$(sysconfdir)/$$f; \
fi; \
done
@ if test ! -r $(DESTDIR)$(sysconfdir)/hosts && test ! -h $(DESTDIR)$(sysconfdir)/hosts ; then \
echo "Creating hosts symlink" ; \
( cd $(DESTDIR)$(sysconfdir) ; ln -s openmpi-default-hostfile hosts ) \
fi

Просмотреть файл

@ -90,7 +90,7 @@ static void orte_gpr_value_destructor(orte_gpr_value_t* reg_val)
if (0 < reg_val->cnt && NULL != reg_val->keyvals) {
for (i=0; i < reg_val->cnt; i++) {
if(NULL != reg_val->keyvals[i])
if (NULL != reg_val->keyvals[i])
OBJ_RELEASE(reg_val->keyvals[i]);
}
free(reg_val->keyvals);
@ -134,7 +134,7 @@ static void orte_gpr_notify_data_destructor(orte_gpr_notify_data_t* ptr)
if (0 < ptr->cnt && NULL != ptr->values) {
for (i=0; i < ptr->cnt; i++) {
if(NULL != ptr->values[i])
if (NULL != ptr->values[i])
OBJ_RELEASE(ptr->values[i]);
}
free(ptr->values);
@ -215,7 +215,7 @@ static void orte_gpr_notify_message_destructor(orte_gpr_notify_message_t* msg)
if (0 < msg->cnt && NULL != msg->data) {
for (i=0; i < msg->cnt; i++) {
OBJ_RELEASE(msg->data[i]);
if (NULL != msg->data[i]) OBJ_RELEASE(msg->data[i]);
}
free(msg->data);
}

Просмотреть файл

@ -75,6 +75,5 @@ int orte_gpr_replica_remote_notify(orte_process_name_t *recipient, orte_gpr_noti
OBJ_DESTRUCT(&msg);
OBJ_RELEASE(message);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -74,8 +74,7 @@ int orte_gpr_replica_process_callbacks(void)
}
}
}
cb->message->data = NULL;
} else { /* remote request - send message back */
} else { /* remote request - send message back */
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "process_callbacks: remote to [%d,%d,%d]",
ORTE_NAME_ARGS(cb->requestor));
@ -93,7 +92,7 @@ CLEANUP:
OBJ_RELEASE(trig);
}
OBJ_RELEASE(cb);
OBJ_RELEASE(cb);
}
return ORTE_SUCCESS;

Просмотреть файл

@ -96,6 +96,7 @@ static void orte_gpr_replica_get_list_destructor(orte_gpr_replica_get_list_t* pt
while (NULL != (iptr = (orte_gpr_replica_ival_list_t*)ompi_list_remove_first(ptr->ival_list))) {
OBJ_RELEASE(iptr);
}
OBJ_RELEASE(ptr->ival_list);
}
@ -242,7 +243,7 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
orte_gpr_replica_itag_t *keytags, int num_keys,
int *cnt, orte_gpr_value_t ***values)
{
ompi_list_t *get_list;
ompi_list_t get_list;
orte_gpr_replica_get_list_t *gptr;
orte_gpr_replica_ival_list_t *ival_list;
orte_gpr_replica_container_t **cptr, *cptr2;
@ -282,11 +283,7 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
}
/* initialize the list of findings */
get_list = OBJ_NEW(ompi_list_t);
if (NULL == get_list) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
OBJ_CONSTRUCT(&get_list, ompi_list_t);
*cnt = 0;
*values = NULL;
tokmode = 0x004f & addr_mode;
@ -302,12 +299,13 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
if (ORTE_SUCCESS != (rc = orte_gpr_replica_find_containers(&num_found, seg, tokmode,
tokentags, num_tokens))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&get_list);
return rc;
}
/* if nothing found, then can return */
if (0 == num_found) {
OBJ_RELEASE(get_list);
OBJ_DESTRUCT(&get_list);
return ORTE_SUCCESS;
}
@ -336,7 +334,7 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
ompi_list_append(gptr->ival_list, &ival_list->item);
}
}
ompi_list_append(get_list, &gptr->item);
ompi_list_append(&get_list, &gptr->item);
(*cnt)++; /* update number of containers that had something found */
}
}
@ -354,7 +352,7 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
goto CLEANUP;
}
for (i=0; i < *cnt; i++) {
gptr = (orte_gpr_replica_get_list_t*)ompi_list_remove_first(get_list);
gptr = (orte_gpr_replica_get_list_t*)ompi_list_remove_first(&get_list);
if (NULL == gptr) {
rc = ORTE_ERROR;
goto CLEANUP;
@ -414,10 +412,10 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
CLEANUP:
while (NULL != (gptr = (orte_gpr_replica_get_list_t*)ompi_list_remove_first(get_list))) {
while (NULL != (gptr = (orte_gpr_replica_get_list_t*)ompi_list_remove_first(&get_list))) {
OBJ_RELEASE(gptr);
}
OBJ_RELEASE(get_list);
OBJ_DESTRUCT(&get_list);
if (orte_gpr_replica_globals.debug) {
ompi_output(0, "[%d,%d,%d] gpr replica-get: finished search", ORTE_NAME_ARGS(orte_process_info.my_name));

Просмотреть файл

@ -134,15 +134,31 @@ static void orte_gpr_replica_segment_construct(orte_gpr_replica_segment_t* seg)
/* destructor - used to free any resources held by instance */
static void orte_gpr_replica_segment_destructor(orte_gpr_replica_segment_t* seg)
{
int i;
orte_gpr_replica_dict_t **dptr;
orte_gpr_replica_container_t **cptr;
if (NULL != seg->name) {
free(seg->name);
}
if (NULL != seg->dict) {
OBJ_RELEASE(seg->dict);
dptr = (orte_gpr_replica_dict_t**)((seg->dict)->addr);
for (i=0; i < (seg->dict)->size; i++) {
if (NULL != dptr[i]) {
free(dptr[i]);
}
}
OBJ_RELEASE(seg->dict);
}
if (NULL != seg->containers) {
cptr = (orte_gpr_replica_container_t**)((seg->containers)->addr);
for (i=0; i < (seg->containers)->size; i++) {
if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
}
OBJ_RELEASE(seg->containers);
}
}
@ -174,7 +190,7 @@ static void orte_gpr_replica_container_construct(orte_gpr_replica_container_t* r
/* destructor - used to free any resources held by instance */
static void orte_gpr_replica_container_destructor(orte_gpr_replica_container_t* reg)
{
orte_gpr_replica_itagval_t *ptr;
orte_gpr_replica_itagval_t **ptr;
int i;
if (NULL != reg->itags) {
@ -182,12 +198,11 @@ static void orte_gpr_replica_container_destructor(orte_gpr_replica_container_t*
}
if (NULL != reg->itagvals) {
ptr = (orte_gpr_replica_itagval_t*)((reg->itagvals)->addr);
ptr = (orte_gpr_replica_itagval_t**)((reg->itagvals)->addr);
for (i=0; i < (reg->itagvals)->size; i++) {
if (NULL != ptr) {
OBJ_RELEASE(ptr);
if (NULL != ptr[i]) {
OBJ_RELEASE(ptr[i]);
}
ptr++;
}
OBJ_RELEASE(reg->itagvals);
}
@ -311,6 +326,7 @@ static void orte_gpr_replica_trigger_destructor(orte_gpr_replica_triggers_t* tri
{
int i;
orte_gpr_replica_subscribed_data_t **data;
orte_gpr_replica_counter_t **cntrs;
if (NULL != trig->requestor) {
free(trig->requestor);
@ -319,13 +335,17 @@ static void orte_gpr_replica_trigger_destructor(orte_gpr_replica_triggers_t* tri
if (NULL != trig->subscribed_data) {
data = (orte_gpr_replica_subscribed_data_t**)((trig->subscribed_data)->addr);
for (i=0; i < (trig->subscribed_data)->size; i++) {
if (NULL != data[i]) free(data[i]);
if (NULL != data[i]) OBJ_RELEASE(data[i]);
}
OBJ_RELEASE(trig->subscribed_data);
}
if (NULL != trig->counters) {
OBJ_RELEASE(trig->counters);
cntrs = (orte_gpr_replica_counter_t**)((trig->counters)->addr);
for (i=0; i < (trig->counters)->size; i++) {
if (NULL != cntrs[i]) OBJ_RELEASE(cntrs[i]);
}
OBJ_RELEASE(trig->counters);
}
}
@ -350,7 +370,7 @@ static void orte_gpr_replica_callbacks_construct(orte_gpr_replica_callbacks_t* c
static void orte_gpr_replica_callbacks_destructor(orte_gpr_replica_callbacks_t* cb)
{
if (NULL != cb->requestor) {
free(cb->requestor);
free(cb->requestor);
cb->requestor = NULL;
}
}

Просмотреть файл

@ -282,7 +282,6 @@ static void mca_oob_tcp_accept(void)
static int mca_oob_tcp_create_listen(void)
{
int flags;
int optval = 1;
struct sockaddr_in inaddr;
ompi_socklen_t addrlen;
@ -636,7 +635,7 @@ int mca_oob_tcp_init(void)
ompi_list_item_t* item;
/* random delay to stagger connections back to seed */
usleep((orte_process_info.num_procs % 100) * 10000);
usleep((orte_process_info.my_name->vpid % orte_process_info.num_procs % 1000) * 1000);
/* get my jobid */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid,

Просмотреть файл

@ -35,8 +35,10 @@
#include "mca/ns/ns.h"
#include "mca/pls/pls.h"
#include "mca/rml/rml.h"
#include "mca/errmgr/errmgr.h"
#include "mca/ras/base/ras_base_node.h"
#include "mca/rmaps/base/rmaps_base_map.h"
#include "mca/soh/soh.h"
#include "pls_rsh.h"
#define NUM_CONCURRENT 128
@ -58,6 +60,17 @@ orte_pls_base_module_1_0_0_t orte_pls_rsh_module = {
orte_pls_rsh_finalize
};
/* struct used to have enough information to clean up the state of the
universe if a daemon aborts.  One instance is created with OBJ_NEW for
each launched daemon and handed to orte_wait_cb() as the callback data
for orte_pls_rsh_wait_daemon(), which releases it when the daemon exits. */
struct rsh_daemon_info_t {
/* base object header; lets the struct be used with OBJ_NEW / OBJ_RELEASE */
ompi_object_t super;
/* node the daemon was started on; the launcher retains it before storing
it here and the wait callback releases it */
orte_ras_base_node_t* node;
/* job the daemon was started for; the wait callback uses it to look up
the process map of the node */
orte_jobid_t jobid;
};
typedef struct rsh_daemon_info_t rsh_daemon_info_t;
/* class instance derived from ompi_object_t; the two NULL arguments are
presumably the constructor and destructor slots (TODO confirm against the
OBJ_CLASS_INSTANCE definition), so node and jobid must be filled in by
whoever creates the object */
static OBJ_CLASS_INSTANCE(rsh_daemon_info_t,
ompi_object_t,
NULL, NULL);
/**
* Callback on daemon exit.
@ -65,6 +78,63 @@ orte_pls_base_module_1_0_0_t orte_pls_rsh_module = {
static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
rsh_daemon_info_t *info = (rsh_daemon_info_t*) cbdata;
ompi_list_t map;
ompi_list_item_t* item;
int rc;
/* get the mapping for our node so we can cancel the right things */
OBJ_CONSTRUCT(&map, ompi_list_t);
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
info->jobid,
info->node->node_name,
&map);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set state of all processes associated with the daemon as terminated */
for(item = ompi_list_get_first(&map);
item != ompi_list_get_end(&map);
item = ompi_list_get_next(item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
size_t i;
for (i = 0 ; i < map->num_procs ; ++i) {
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
ORTE_PROC_STATE_ABORTED, status);
}
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&map);
cleanup:
/* BWB - XXX - FIXME - this should be made prettier in some way. We
have something of a problem here, since it's a callback, so we
don't have a good way to propagate back up to the user :/ */
/* tell the user something went wrong */
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
ompi_output(0, "A daemon on node %s failed to start as expected."
"There may be more information available above from the"
"remote shell.", info->node->node_name);
if (WIFEXITED(status)) {
ompi_output(0, "The daemon exited unexpectedly with status %d.",
WEXITSTATUS(status));
} else if (WIFSIGNALED(status)) {
ompi_output(0, "The daemon received a signal %d.", WTERMSIG(status));
#ifdef WCOREDUMP
if (WCOREDUMP(status)) {
ompi_output(0, "The daemon process dumped core.");
}
#endif /* WCOREDUMP */
} else {
ompi_output(0, "No status information is available: %d.", status);
}
}
/* release any waiting threads */
OMPI_THREAD_LOCK(&mca_pls_rsh_component.lock);
if(mca_pls_rsh_component.num_children-- >= NUM_CONCURRENT ||
@ -72,6 +142,10 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
ompi_condition_signal(&mca_pls_rsh_component.cond);
}
OMPI_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
/* cleanup */
OBJ_RELEASE(info->node);
OBJ_RELEASE(info);
}
@ -209,11 +283,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
if (mca_pls_rsh_component.debug == 0) {
/* setup stdin/stdout/stderr */
/* setup stdin */
int fd = open("/dev/null", O_RDWR);
dup2(fd, 0);
dup2(fd, 1);
dup2(fd, 2);
close(fd);
}
@ -223,14 +295,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
exit(-1);
} else {
rsh_daemon_info_t *daemon_info;
OMPI_THREAD_LOCK(&mca_pls_rsh_component.lock);
if(mca_pls_rsh_component.num_children++ >= NUM_CONCURRENT)
ompi_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);
OMPI_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
daemon_info = OBJ_NEW(rsh_daemon_info_t);
OBJ_RETAIN(node);
orte_wait_cb(pid, orte_pls_rsh_wait_daemon, node);
daemon_info->node = node;
daemon_info->jobid = jobid;
orte_wait_cb(pid, orte_pls_rsh_wait_daemon, daemon_info);
vpid++;
/* if required - add delay to avoid problems w/ X11 authentication */

Просмотреть файл

@ -29,7 +29,6 @@ libmca_ptl_gm_la_SOURCES = \
ptl_gm_priv.c \
ptl_gm_proc.c \
ptl_gm_proc.h \
ptl_gm_req.c \
ptl_gm_req.h \
ptl_gm_sendfrag.c \
ptl_gm_sendfrag.h

Просмотреть файл

@ -26,7 +26,6 @@
#include "mca/ptl/base/ptl_base_header.h"
#include "ptl_gm.h"
#include "ptl_gm_proc.h"
#include "ptl_gm_req.h"
#include "ptl_gm_peer.h"
#include "ptl_gm_priv.h"
#include "ptl_gm_sendfrag.h"
@ -109,7 +108,7 @@ mca_ptl_gm_add_procs (struct mca_ptl_base_module_t *ptl,
&lid)) {
ompi_output( 0, "[%s:%d] error in converting global to local id \n",
__FILE__, __LINE__ );
return OMPI_ERR_BAD_PARAM;
}
ptl_peer->local_id = lid;
ptl_proc->peer_arr[ptl_proc->proc_peer_count] = ptl_peer;
@ -354,9 +353,14 @@ mca_ptl_gm_matched( mca_ptl_base_module_t * ptl,
hdr->hdr_ack.hdr_dst_addr.lval = 0L;
hdr->hdr_ack.hdr_dst_addr.pval = frag;
hdr->hdr_ack.hdr_dst_size = request->req_bytes_packed;
gm_send_to_peer_with_callback( ((mca_ptl_gm_module_t*)ptl)->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_base_ack_header_t), GM_LOW_PRIORITY,
peer->local_id, mca_ptl_gm_basic_ack_callback, (void *)hdr );
gm_send_with_callback( ((mca_ptl_gm_module_t*)ptl)->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_base_ack_header_t),
GM_LOW_PRIORITY,
peer->local_id,
peer->port_number,
mca_ptl_gm_basic_ack_callback,
(void *)hdr );
}
}

Просмотреть файл

@ -245,12 +245,18 @@ mca_ptl_gm_discover_boards( mca_ptl_gm_module_t** pptl,
/* open the first available gm port for this board */
for( port_no = 2; port_no < max_port; port_no++ ) {
if (port_no == 3) continue; /* port 0,1,3 reserved */
if( GM_SUCCESS == gm_open( &gm_port, board_no, port_no,
mca_ptl_gm_component.gm_port_name, GM_API_VERSION_2_0) )
if (3 == port_no) {
continue; /* port 0,1,3 reserved */
} else if (GM_SUCCESS ==
gm_open(&gm_port, board_no, port_no,
mca_ptl_gm_component.gm_port_name,
GM_API_VERSION_2_0) ) {
break;
}
}
if( port_no == max_port ) {
continue;
}
if( port_no == max_port ) continue;
/* Get node local Id */
if( GM_SUCCESS != gm_get_node_id( gm_port, &local_id) ) {
@ -265,8 +271,9 @@ mca_ptl_gm_discover_boards( mca_ptl_gm_module_t** pptl,
}
/* Create the ptl. If fail return the number of already created */
if( OMPI_SUCCESS != mca_ptl_gm_create( &(pptl[index]) ) )
if( OMPI_SUCCESS != mca_ptl_gm_create( &(pptl[index]) ) ) {
return index;
}
pptl[index]->port_id = port_no;
pptl[index]->gm_port = gm_port;
@ -274,13 +281,15 @@ mca_ptl_gm_discover_boards( mca_ptl_gm_module_t** pptl,
pptl[index]->global_id = global_id;
/* everything is OK let's mark it as usable and go to the next one */
if( (++index) >= max_ptls ) break;
if( (++index) >= max_ptls ) {
break;
}
}
return index;
}
static inline int
static int
mca_ptl_gm_init_sendrecv (mca_ptl_gm_module_t * ptl)
{
uint32_t i;
@ -420,14 +429,14 @@ mca_ptl_gm_init( mca_ptl_gm_component_t * gm )
mca_ptl_gm_component.gm_max_boards_number,
mca_ptl_gm_component.gm_max_port_number );
/* In the case when we are in a multi-threaded environment each PTL will have it's
* own thread. At this point all structures are correctly initialized, each thread
* will grab one and use it.
/* In the case when we are in a multi-threaded environment each
* PTL will have its own thread. At this point all structures are
* correctly initialized, each thread will grab one and use it.
*/
for( index = 0; index < mca_ptl_gm_component.gm_num_ptl_modules; index++ ) {
ptl = mca_ptl_gm_component.gm_ptl_modules[index];
/* Now prepost some received and allocate some sends. After this step the PTL
* is fully initialized.
/* Now prepost some received and allocate some sends. After
* this step the PTL is fully initialized.
*/
if( OMPI_SUCCESS != mca_ptl_gm_init_sendrecv( ptl ) )
break;
@ -436,8 +445,9 @@ mca_ptl_gm_init( mca_ptl_gm_component_t * gm )
ptl->thread.t_run = (ompi_thread_fn_t)mca_ptl_gm_thread_progress;
ptl->thread.t_arg = (void*)ptl;
#endif /* OMPI_HAVE_POSIX_THREADS */
if( OMPI_SUCCESS != ompi_thread_start( &(ptl->thread) ) )
if( OMPI_SUCCESS != ompi_thread_start( &(ptl->thread) ) ) {
break;
}
}
}
save_counter = index;

Просмотреть файл

@ -21,7 +21,6 @@
#include "mca/pml/base/pml_base_sendreq.h"
#include "mca/ptl/base/ptl_base_header.h"
#include "ptl_gm.h"
#include "ptl_gm_req.h"
#include "ptl_gm_peer.h"
#include "ptl_gm_proc.h"
#include "ptl_gm_sendfrag.h"
@ -195,10 +194,12 @@ int mca_ptl_gm_sender_advance_pipeline( mca_ptl_gm_send_frag_t* frag )
hdr->hdr_frag.hdr_frag_length = send_line->length;
hdr->registered_memory = send_line->local_memory;
gm_send_to_peer_with_callback( peer->peer_ptl->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_gm_frag_header_t),
GM_HIGH_PRIORITY, peer->local_id,
send_continue_callback, (void*)hdr );
gm_send_with_callback( peer->peer_ptl->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_gm_frag_header_t),
GM_HIGH_PRIORITY,
peer->local_id,
peer->port_number,
send_continue_callback, (void*)hdr );
send_line->flags ^= PTL_GM_PIPELINE_REMOTE;
frag->pipeline.pos_transfert = (frag->pipeline.pos_transfert + 1) % GM_PIPELINE_DEPTH;
@ -318,10 +319,14 @@ int mca_ptl_gm_peer_send_continue( mca_ptl_gm_peer_t *ptl_peer,
hdr->hdr_frag.hdr_common.hdr_flags |= PTL_FLAG_GM_LAST_FRAGMENT;
/* for the last piece set the header type to FIN */
gm_send_to_peer_with_callback( ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE, iov.iov_len + sizeof(mca_ptl_base_frag_header_t),
GM_LOW_PRIORITY, ptl_peer->local_id,
send_continue_callback, (void*)hdr );
gm_send_with_callback( ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE,
iov.iov_len +
sizeof(mca_ptl_base_frag_header_t),
GM_LOW_PRIORITY,
ptl_peer->local_id,
ptl_peer->port_number,
send_continue_callback, (void*)hdr );
item = NULL; /* force to retrieve a new one on the next loop */
}
*size = fragment->frag_bytes_processed;
@ -348,10 +353,14 @@ int mca_ptl_gm_peer_send_continue( mca_ptl_gm_peer_t *ptl_peer,
hdr->registered_memory.lval = 0L;
hdr->registered_memory.pval = NULL;
gm_send_to_peer_with_callback( ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_base_frag_header_t) + sizeof(ompi_ptr_t),
GM_LOW_PRIORITY, ptl_peer->local_id,
mca_ptl_gm_basic_frag_callback, (void *)hdr );
gm_send_with_callback( ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE,
sizeof(mca_ptl_base_frag_header_t) +
sizeof(ompi_ptr_t),
GM_LOW_PRIORITY,
ptl_peer->local_id,
ptl_peer->port_number,
mca_ptl_gm_basic_frag_callback, (void *)hdr );
pipeline->length = fragment->frag_send.frag_base.frag_size % mca_ptl_gm_component.gm_rdma_frag_size;
if( pipeline->length < (mca_ptl_gm_component.gm_rdma_frag_size >> 1) ) {
@ -463,10 +472,12 @@ int mca_ptl_gm_peer_send( struct mca_ptl_base_module_t* ptl,
size_out = iov.iov_len + header_length;
/* Send the first fragment */
gm_send_to_peer_with_callback( ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE, size_out, GM_LOW_PRIORITY, ptl_peer->local_id,
send_match_callback, (void *)hdr );
gm_send_with_callback(ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE, size_out, GM_LOW_PRIORITY,
ptl_peer->local_id,
ptl_peer->port_number,
send_match_callback, (void *)hdr );
if( !(flags & MCA_PTL_FLAGS_ACK) ) {
ptl_peer->peer_ptl->super.ptl_send_progress( (mca_ptl_base_module_t*)ptl_peer->peer_ptl,
sendreq,
@ -608,9 +619,12 @@ static int mca_ptl_gm_send_quick_fin_message( struct mca_ptl_gm_peer_t* ptl_peer
hdr->hdr_ack.hdr_dst_addr.pval = NULL;
hdr->hdr_ack.hdr_dst_size = frag->frag_header.hdr_frag.hdr_frag_length;
gm_send_to_peer_with_callback( ptl_peer->peer_ptl->gm_port, hdr, GM_SIZE, sizeof(mca_ptl_base_ack_header_t),
GM_HIGH_PRIORITY, ptl_peer->local_id,
recv_short_callback, (void*)hdr );
gm_send_with_callback(ptl_peer->peer_ptl->gm_port, hdr,
GM_SIZE, sizeof(mca_ptl_base_ack_header_t),
GM_HIGH_PRIORITY,
ptl_peer->local_id,
ptl_peer->port_number,
recv_short_callback, (void*)hdr );
return OMPI_SUCCESS;
}

Просмотреть файл

@ -131,7 +131,6 @@ mca_ptl_gm_proc_create (mca_ptl_gm_module_t * ptl, ompi_proc_t * ompi_proc)
}
ptl_proc->proc_addr_count = size / sizeof (mca_ptl_gm_addr_t);
/* allocate space for peer array - one for each exported address */
ptl_proc->peer_arr = (mca_ptl_gm_peer_t **)
malloc (ptl_proc->proc_addr_count * sizeof (mca_ptl_gm_peer_t *));
@ -143,8 +142,10 @@ mca_ptl_gm_proc_create (mca_ptl_gm_module_t * ptl, ompi_proc_t * ompi_proc)
return NULL;
}
if(NULL == mca_ptl_gm_component.gm_local && ompi_proc == ompi_proc_local() )
if(NULL == mca_ptl_gm_component.gm_local &&
ompi_proc == ompi_proc_local() ) {
mca_ptl_gm_component.gm_local = ptl_proc;
}
return ptl_proc;
}

Просмотреть файл

@ -1,53 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "include/types.h"
#include "mca/pml/base/pml_base_sendreq.h"
#include "ptl_gm.h"
#include "ptl_gm_req.h"
/*
static void mca_ptl_gm_send_request_construct (
mca_ptl_gm_send_request_t *);
static void mca_ptl_gm_send_request_destruct (
mca_ptl_gm_send_request_t *);
ompi_class_t mca_ptl_gm_send_request_t_class = {
"mca_ptl_gm_send_request_t",
OBJ_CLASS (mca_pml_base_send_request_t),
(ompi_construct_t) mca_ptl_gm_send_request_construct,
(ompi_destruct_t) mca_ptl_gm_send_request_destruct
};
void
mca_ptl_gm_send_request_construct (
mca_ptl_gm_send_request_t * request)
{
OBJ_CONSTRUCT (&request->req_frag, mca_ptl_gm_send_frag_t);
}
void
mca_ptl_gm_send_request_destruct (
mca_ptl_gm_send_request_t * request)
{
OBJ_DESTRUCT (&request->req_frag);
}
*/

Просмотреть файл

@ -1,44 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PTL_GM_SEND_REQUEST_H
#define MCA_PTL_GM_SEND_REQUEST_H
#include "mca/pml/base/pml_base_sendreq.h"
#include "ptl_gm_sendfrag.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION (mca_ptl_gm_send_request_t);
struct mca_ptl_gm_send_request_t {
mca_pml_base_send_request_t super;
/* add stuff here */
mca_ptl_gm_send_frag_t *req_frag;
int need_ack;
};
typedef struct mca_ptl_gm_send_request_t mca_ptl_gm_send_request_t;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -22,42 +22,25 @@
#include "ptl_gm_sendfrag.h"
#include "ptl_gm_priv.h"
static void mca_ptl_gm_send_frag_construct (mca_ptl_gm_send_frag_t* frag);
static void mca_ptl_gm_send_frag_destruct (mca_ptl_gm_send_frag_t* frag);
static void mca_ptl_gm_recv_frag_construct (mca_ptl_gm_recv_frag_t* frag);
static void mca_ptl_gm_recv_frag_destruct (mca_ptl_gm_recv_frag_t* frag);
/*
* send fragment constructor/destructors.
*/
static void
mca_ptl_gm_send_frag_construct (mca_ptl_gm_send_frag_t* frag)
{
}
OBJ_CLASS_INSTANCE(mca_ptl_gm_send_frag_t,
mca_ptl_base_send_frag_t,
NULL, NULL);
static void
mca_ptl_gm_send_frag_destruct (mca_ptl_gm_send_frag_t* frag)
{
}
ompi_class_t mca_ptl_gm_send_frag_t_class = {
"mca_ptl_gm_send_frag_t",
OBJ_CLASS (mca_ptl_base_send_frag_t),
(ompi_construct_t) mca_ptl_gm_send_frag_construct,
(ompi_destruct_t) mca_ptl_gm_send_frag_destruct
};
/* It's not yet clear for me what's the best solution here. Block until we
* get a free request or allocate a new one. The fist case allow us to never
* take care of the gm allocated DMA buffer as all send fragments already have
* one attached, but it can stop the application progression. The second case
* require special cases: we should set the data in the header inside the fragment
* and later when we get some free fragments with DMA memory attached we should
/* It's not yet clear for me what's the best solution here. Block
* until we get a free request or allocate a new one. The fist case
* allow us to never take care of the gm allocated DMA buffer as all
* send fragments already have one attached, but it can stop the
* application progression. The second case require special cases: we
* should set the data in the header inside the fragment and later
* when we get some free fragments with DMA memory attached we should
* put the header back there, and send it.
*
* I will implement the first case and add the second one in my TODO list.
* I will implement the first case and add the second one in my TODO
* list.
*/
mca_ptl_gm_send_frag_t*
mca_ptl_gm_alloc_send_frag( struct mca_ptl_gm_module_t* ptl,
@ -121,20 +104,6 @@ int mca_ptl_gm_put_frag_init( struct mca_ptl_gm_send_frag_t** putfrag,
* recv fragment constructor/destructors.
*/
static void
mca_ptl_gm_recv_frag_construct (mca_ptl_gm_recv_frag_t* frag)
{
}
static void
mca_ptl_gm_recv_frag_destruct (mca_ptl_gm_recv_frag_t* frag)
{
}
ompi_class_t mca_ptl_gm_recv_frag_t_class = {
"mca_ptl_gm_recv_frag_t",
OBJ_CLASS (mca_ptl_base_recv_frag_t),
(ompi_construct_t) mca_ptl_gm_recv_frag_construct,
(ompi_construct_t) mca_ptl_gm_recv_frag_destruct
};
OBJ_CLASS_INSTANCE(mca_ptl_gm_recv_frag_t,
mca_ptl_base_recv_frag_t,
NULL, NULL);

Просмотреть файл

@ -220,6 +220,9 @@ static int orte_rds_hostfile_query(void)
goto cleanup;
}
if (NULL != mca_rds_hostfile_component.path) {
free(mca_rds_hostfile_component.path);
}
rc = mca_base_param_find("rds", "hostfile", "path");
mca_base_param_lookup_string(rc, &mca_rds_hostfile_component.path);
rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);

Просмотреть файл

@ -96,6 +96,7 @@ static int orte_rds_hostfile_open(void)
mca_rds_hostfile_component.debug = orte_rds_hostfile_param_register_int("debug",1);
mca_rds_hostfile_component.path = orte_rds_hostfile_param_register_string("path", path);
mca_rds_hostfile_component.default_hostfile = (strcmp(mca_rds_hostfile_component.path,path) == 0);
free(path);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -217,6 +217,7 @@ int orte_rmgr_base_get_job_slots(orte_jobid_t jobid, size_t* proc_slots)
&num_values,
&values
);
if(rc != ORTE_SUCCESS) {
free(segment);
return rc;
@ -224,9 +225,11 @@ int orte_rmgr_base_get_job_slots(orte_jobid_t jobid, size_t* proc_slots)
if(0 == num_values) {
*proc_slots = 0;
free(segment);
return ORTE_SUCCESS;
}
if(1 != num_values || values[0]->cnt != 1) {
free(segment);
return ORTE_ERR_NOT_FOUND;
}
*proc_slots = values[0]->keyvals[0]->value.ui32;

Просмотреть файл

@ -60,6 +60,13 @@ int orte_finalize(void)
orte_proc_info_finalize();
orte_univ_info_finalize();
/* finalize the mca */
mca_base_close();
/* finalize the output system */
ompi_output_finalize();
/* finalize the memory allocator */
ompi_malloc_finalize();
return ORTE_SUCCESS;

Просмотреть файл

@ -34,6 +34,9 @@
#include "mca/gpr/base/base.h"
#include "mca/rmgr/base/base.h"
#include "util/proc_info.h"
#include "util/sys_info.h"
#include "util/univ_info.h"
#include "util/session_dir.h"
#include "runtime/runtime.h"
#include "runtime/runtime_internal.h"
@ -46,9 +49,10 @@
int orte_restart(orte_process_name_t *name, const char* uri)
{
int rc;
int rc, id;
orte_process_name_t* old_name;
orte_process_name_t* new_name;
char *jobid_str, *procid_str;
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&old_name, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
@ -97,13 +101,111 @@ int orte_restart(orte_process_name_t *name, const char* uri)
/*
* setup new global state
*/
orte_process_info.seed = false;
if(NULL == orte_process_info.ns_replica)
/* close the proc_info structure so it can be reinitialized */
if (ORTE_SUCCESS != (rc = orte_proc_info_finalize())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* set seed flag to false */
id = mca_base_param_register_int("seed", NULL, NULL, NULL, (int)false);
if (ORTE_SUCCESS != (rc = mca_base_param_set_int(id, (int)false))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if NULL, set ns_replica to old_name and set the corresponding uri parameter */
if (NULL == orte_process_info.ns_replica) {
orte_process_info.ns_replica = old_name;
if(NULL == orte_process_info.gpr_replica)
if (NULL != orte_process_info.ns_replica_uri) {
free(orte_process_info.ns_replica_uri);
orte_process_info.ns_replica_uri = NULL;
}
id = mca_base_param_register_string("ns", "replica", "uri", NULL, NULL);
if (ORTE_SUCCESS != (rc = mca_base_param_set_string(id, (char*)uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* if NULL, set gpr_replica to old_name and set the corresponding uri parameter */
if (NULL == orte_process_info.gpr_replica) {
orte_process_info.gpr_replica = old_name;
if (NULL != orte_process_info.gpr_replica_uri) {
free(orte_process_info.gpr_replica_uri);
orte_process_info.gpr_replica_uri = NULL;
}
id = mca_base_param_register_string("gpr", "replica", "uri", NULL, NULL);
if (ORTE_SUCCESS != (rc = mca_base_param_set_string(id, (char*)uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* call proc_info to reset the structure */
if (ORTE_SUCCESS != (rc = orte_proc_info())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* ensure my_name is set to the new_name */
if (NULL != orte_process_info.my_name) {
free(orte_process_info.my_name);
}
orte_process_info.my_name = new_name;
/* finalize the sys_info structure so it can be reinitialized */
if (ORTE_SUCCESS != (rc = orte_sys_info_finalize())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* call the sys_info function to load structure with any new info */
orte_system_info.init = false;
if (ORTE_SUCCESS != (rc = orte_sys_info())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* establish the session directory structure for this process */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (orte_debug_flag) {
ompi_output(0, "[%d,%d,%d] setting up session dir with",
ORTE_NAME_ARGS(orte_process_info.my_name));
if (NULL != orte_process_info.tmpdir_base) {
ompi_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base);
}
ompi_output(0, "\tuniverse %s", orte_universe_info.name);
ompi_output(0, "\tuser %s", orte_system_info.user);
ompi_output(0, "\thost %s", orte_system_info.nodename);
ompi_output(0, "\tjobid %s", jobid_str);
ompi_output(0, "\tprocid %s", procid_str);
}
if (ORTE_SUCCESS != (rc = orte_session_dir(true,
orte_process_info.tmpdir_base,
orte_system_info.user,
orte_system_info.nodename, NULL,
orte_universe_info.name,
jobid_str, procid_str))) {
ORTE_ERROR_LOG(rc);
if (jobid_str != NULL) free(jobid_str);
if (procid_str != NULL) free(procid_str);
return rc;
}
if (NULL != jobid_str) {
free(jobid_str);
}
if (NULL != procid_str) {
free(procid_str);
}
/*
* Re-open components.
*/
@@ -143,25 +245,13 @@ int orte_restart(orte_process_name_t *name, const char* uri)
}
/*
* Set contact info.
* Set contact info for our parent
*/
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL != orte_process_info.ns_replica_uri) {
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.ns_replica_uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (NULL != orte_process_info.gpr_replica_uri) {
if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.gpr_replica_uri))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/*
* Re-init selected modules.

Просмотреть файл

@@ -82,6 +82,7 @@ struct globals_t {
ompi_mutex_t lock;
ompi_condition_t cond;
} orterun_globals;
static bool globals_init = false;
ompi_cmd_line_init_t cmd_line_init[] = {
@@ -315,9 +316,20 @@ static int init_globals(void)
NULL
};
/* Only CONSTRUCT things once */
if (!globals_init) {
OBJ_CONSTRUCT(&orterun_globals.lock, ompi_mutex_t);
OBJ_CONSTRUCT(&orterun_globals.cond, ompi_condition_t);
}
/* Reset this every time */
orterun_globals = tmp;
OBJ_CONSTRUCT(&orterun_globals.lock, ompi_mutex_t);
OBJ_CONSTRUCT(&orterun_globals.cond, ompi_condition_t);
/* All done */
globals_init = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@@ -128,204 +128,218 @@ int orte_session_dir(bool create, char *prfx, char *usr, char *hostid,
orte_sys_info();
if (NULL == usr) { /* check if user set elsewhere */
if (NULL == orte_system_info.user) { /* error condition */
return OMPI_ERROR;
} else {
user = strdup(orte_system_info.user);
}
if (NULL == orte_system_info.user) { /* error condition */
return OMPI_ERROR;
} else {
user = strdup(orte_system_info.user);
}
} else {
user = strdup(usr);
user = strdup(usr);
}
if (NULL == univ) { /* see if universe set elsewhere */
if (NULL == orte_universe_info.name) { /* error condition */
return OMPI_ERROR;
} else {
universe = strdup(orte_universe_info.name);
}
if (NULL == orte_universe_info.name) { /* error condition */
return OMPI_ERROR;
} else {
universe = strdup(orte_universe_info.name);
}
} else {
universe = strdup(univ);
universe = strdup(univ);
}
if (NULL == job && NULL != proc) { /* can't give a proc without a job */
return OMPI_ERROR;
return OMPI_ERROR;
}
if (NULL == hostid) { /* check if hostname set elsewhere */
if (NULL == orte_system_info.nodename) { /* don't have a hostname anywhere - error */
return_code = OMPI_ERROR;
goto CLEANUP;
} else {
hostname = strdup(orte_system_info.nodename);
}
if (NULL == orte_system_info.nodename) { /* don't have a hostname anywhere - error */
return_code = OMPI_ERROR;
goto CLEANUP;
} else {
hostname = strdup(orte_system_info.nodename);
}
} else {
hostname = strdup(hostid);
hostname = strdup(hostid);
}
if (NULL == batchid) {
batchname = strdup("0");
batchname = strdup("0");
} else {
batchname = batchid;
batchname = batchid;
}
if (NULL == orte_process_info.top_session_dir) {
if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s", user, hostname, batchname)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s", user, hostname, batchname)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
} else {
frontend = strdup(orte_process_info.top_session_dir);
frontend = strdup(orte_process_info.top_session_dir);
}
if (NULL != proc) {
if (0 > asprintf(&sessions, "%s%s%s%s%s%s%s", frontend,
orte_system_info.path_sep, universe,
orte_system_info.path_sep, job,
orte_system_info.path_sep, proc)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
if (0 > asprintf(&sessions, "%s%s%s%s%s%s%s", frontend,
orte_system_info.path_sep, universe,
orte_system_info.path_sep, job,
orte_system_info.path_sep, proc)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
} else if (NULL != job) {
if (0 > asprintf(&sessions, "%s%s%s%s%s", frontend,
orte_system_info.path_sep, universe,
orte_system_info.path_sep, job)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
if (0 > asprintf(&sessions, "%s%s%s%s%s", frontend,
orte_system_info.path_sep, universe,
orte_system_info.path_sep, job)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
} else {
if (0 > asprintf(&sessions, "%s%s%s", frontend, orte_system_info.path_sep, universe)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
if (0 > asprintf(&sessions, "%s%s%s", frontend, orte_system_info.path_sep, universe)) {
return_code = OMPI_ERROR;
goto CLEANUP;
}
}
if (NULL != prefix) { /* if a prefix is specified, start looking here */
tmp = strdup(prefix);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL)); /* make sure it's an absolute pathname */
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
tmp = strdup(prefix);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL)); /* make sure it's an absolute pathname */
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
}
/* didn't find it, so first clear fulldirpath and tmp */
if (NULL != fulldirpath) {
free(fulldirpath); fulldirpath = NULL;
}
if (NULL != tmp) {
free(tmp); tmp = NULL;
}
/* no prefix was specified, so check other options in order */
if (NULL != orte_process_info.tmpdir_base) { /* stored value previously */
tmp = strdup(orte_process_info.tmpdir_base);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
tmp = strdup(orte_process_info.tmpdir_base);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
free(tmp); tmp = NULL;
free(fulldirpath); fulldirpath = NULL;
} else if (NULL != getenv("OMPI_PREFIX_ENV")) { /* we have prefix enviro var - try that next */
tmp = strdup(getenv("OMPI_PREFIX_ENV"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
tmp = strdup(getenv("OMPI_PREFIX_ENV"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
free(tmp); tmp = NULL;
free(fulldirpath); fulldirpath = NULL;
} else if (NULL != getenv("TMPDIR")) {
tmp = strdup(getenv("TMPDIR"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
tmp = strdup(getenv("TMPDIR"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
free(tmp); tmp = NULL;
free(fulldirpath); fulldirpath = NULL;
} else if (NULL != getenv("TMP")) {
tmp = strdup(getenv("TMP"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
tmp = strdup(getenv("TMP"));
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
free(tmp); tmp = NULL;
free(fulldirpath); fulldirpath = NULL;
} else {
tmp = strdup(OMPI_DEFAULT_TMPDIR);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
}
tmp = strdup(OMPI_DEFAULT_TMPDIR);
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
}
free(tmp); tmp = NULL;
free(fulldirpath); fulldirpath = NULL;
}
fulldirpath = strdup(orte_os_path(false, tmp, sessions, NULL));
if (OMPI_SUCCESS == orte_check_dir(create, fulldirpath)) { /* check for existence and access, or create it */
return_code = OMPI_SUCCESS;
goto COMPLETE;
} else {
/* couldn't find anything - return error */
return_code = OMPI_ERROR;
goto CLEANUP;
}
COMPLETE:
if (create) { /* if creating the dir tree, overwrite the fields */
if (NULL != orte_process_info.tmpdir_base) {
free(orte_process_info.tmpdir_base);
orte_process_info.tmpdir_base = NULL;
}
if (NULL != orte_process_info.top_session_dir) {
free(orte_process_info.top_session_dir);
orte_process_info.top_session_dir = NULL;
}
if (NULL != orte_process_info.tmpdir_base) {
free(orte_process_info.tmpdir_base);
orte_process_info.tmpdir_base = NULL;
}
if (NULL != orte_process_info.top_session_dir) {
free(orte_process_info.top_session_dir);
orte_process_info.top_session_dir = NULL;
}
}
if (NULL == orte_process_info.tmpdir_base) {
orte_process_info.tmpdir_base = strdup(tmp); /* fill in if empty */
orte_process_info.tmpdir_base = strdup(tmp); /* fill in if empty */
}
if (NULL == orte_process_info.top_session_dir) {
orte_process_info.top_session_dir = strdup(frontend);
orte_process_info.top_session_dir = strdup(frontend);
}
if (NULL != proc) {
if (create) { /* overwrite if creating */
if (NULL != orte_process_info.proc_session_dir) {
free(orte_process_info.proc_session_dir);
orte_process_info.proc_session_dir = NULL;
}
}
if (NULL == orte_process_info.proc_session_dir) {
orte_process_info.proc_session_dir = strdup(fulldirpath);
}
sav = strdup(fulldirpath);
free(fulldirpath);
fulldirpath = strdup(dirname(sav));
free(sav);
if (create) { /* overwrite if creating */
if (NULL != orte_process_info.proc_session_dir) {
free(orte_process_info.proc_session_dir);
orte_process_info.proc_session_dir = NULL;
}
}
if (NULL == orte_process_info.proc_session_dir) {
orte_process_info.proc_session_dir = strdup(fulldirpath);
}
sav = strdup(fulldirpath);
free(fulldirpath);
fulldirpath = strdup(dirname(sav));
free(sav);
}
if (NULL != job) {
if (create) { /* overwrite if creating */
if (NULL != orte_process_info.job_session_dir) {
free(orte_process_info.job_session_dir);
orte_process_info.job_session_dir = NULL;
}
}
if (NULL == orte_process_info.job_session_dir) {
orte_process_info.job_session_dir = strdup(fulldirpath);
}
sav = strdup(fulldirpath);
free(fulldirpath);
fulldirpath = strdup(dirname(sav));
free(sav);
if (create) { /* overwrite if creating */
if (NULL != orte_process_info.job_session_dir) {
free(orte_process_info.job_session_dir);
orte_process_info.job_session_dir = NULL;
}
}
if (NULL == orte_process_info.job_session_dir) {
orte_process_info.job_session_dir = strdup(fulldirpath);
}
sav = strdup(fulldirpath);
free(fulldirpath);
fulldirpath = strdup(dirname(sav));
free(sav);
}
if (create) { /* overwrite if creating */
if (NULL != orte_process_info.universe_session_dir) {
free(orte_process_info.universe_session_dir);
orte_process_info.universe_session_dir = NULL;
}
if (NULL != orte_process_info.universe_session_dir) {
free(orte_process_info.universe_session_dir);
orte_process_info.universe_session_dir = NULL;
}
}
if (NULL == orte_process_info.universe_session_dir) {
orte_process_info.universe_session_dir = strdup(fulldirpath);
}
if (orte_debug_flag) {
ompi_output(0, "procdir: %s", orte_process_info.proc_session_dir);
ompi_output(0, "jobdir: %s", orte_process_info.job_session_dir);
ompi_output(0, "unidir: %s", orte_process_info.universe_session_dir);
ompi_output(0, "top: %s", orte_process_info.top_session_dir);
ompi_output(0, "tmp: %s", orte_process_info.tmpdir_base);
ompi_output(0, "procdir: %s", orte_process_info.proc_session_dir);
ompi_output(0, "jobdir: %s", orte_process_info.job_session_dir);
ompi_output(0, "unidir: %s", orte_process_info.universe_session_dir);
ompi_output(0, "top: %s", orte_process_info.top_session_dir);
ompi_output(0, "tmp: %s", orte_process_info.tmpdir_base);
}
CLEANUP:
@@ -336,18 +350,24 @@ int orte_session_dir(bool create, char *prfx, char *usr, char *hostid,
free(fulldirpath);
}
if (frontend) {
free(frontend);
free(frontend);
}
if (batchname) {
free(batchname);
free(batchname);
}
if (hostname) {
free(hostname);
free(hostname);
}
if (universe) {
free(universe);
}
if (sessions) {
free(sessions);
free(sessions);
}
if (user) {
free(user);
}
return return_code;
}