1
1
openmpi/orte/mca/filem/base/filem_base_fns.c
Ralph Castain 9613b3176c Effectively revert the orte_output system and return to direct use of opal_output at all levels. Retain the orte_show_help subsystem to allow aggregation of show_help messages at the HNP.
After much work by Jeff and myself, and quite a lot of discussion, it has become clear that we simply cannot resolve the infinite loops caused by RML-involved subsystems calling orte_output. The original rationale for the change to orte_output has also been reduced by shifting the output of XML-formatted vs human readable messages to an alternative approach.

I have globally replaced the orte_output/ORTE_OUTPUT calls in the code base, as well as the corresponding .h file name. I have test compiled and run this on the various environments within my reach, so hopefully this will prove minimally disruptive.

This commit was SVN r18619.
2008-06-09 14:53:58 +00:00

410 строки
11 KiB
C

/*
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <time.h>
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/os_dirpath.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h"
/******************
* Local Functions
******************/
/******************
* Object Stuff
******************/
/*
 * Class instance for orte_filem_base_process_set_t.
 * The macro arguments are, in order: the class type, opal_list_item_t,
 * and the construct/destruct functions defined below in this file.
 */
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_process_set_t,
opal_list_item_t,
orte_filem_base_process_set_construct,
orte_filem_base_process_set_destruct);
/*
 * Constructor for a process set: both endpoints start out as the
 * invalid process name.
 */
ORTE_DECLSPEC void orte_filem_base_process_set_construct(orte_filem_base_process_set_t *req)
{
    /* The two fields are independent; each receives its own copy. */
    req->sink   = *ORTE_NAME_INVALID;
    req->source = *ORTE_NAME_INVALID;
}
/*
 * Destructor for a process set: reset both endpoints to the invalid
 * process name. Nothing is freed here.
 */
ORTE_DECLSPEC void orte_filem_base_process_set_destruct(orte_filem_base_process_set_t *req)
{
    req->source = req->sink = *ORTE_NAME_INVALID;
}
/*
 * Class instance for orte_filem_base_file_set_t.
 * The macro arguments are, in order: the class type, opal_list_item_t,
 * and the construct/destruct functions defined below in this file.
 */
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_file_set_t,
opal_list_item_t,
orte_filem_base_file_set_construct,
orte_filem_base_file_set_destruct);
/*
 * Constructor for a file set: no local or remote target yet, and the
 * target type is unknown.
 */
ORTE_DECLSPEC void orte_filem_base_file_set_construct(orte_filem_base_file_set_t *req)
{
    req->target_flag   = ORTE_FILEM_TYPE_UNKNOWN;
    req->remote_target = NULL;
    req->local_target  = NULL;
}
/*
 * Destructor for a file set: release both target strings and return
 * the object to its freshly-constructed state.
 */
ORTE_DECLSPEC void orte_filem_base_file_set_destruct(orte_filem_base_file_set_t *req)
{
    /* free(NULL) is a no-op, so the pointers need no guard. */
    free(req->local_target);
    req->local_target = NULL;

    free(req->remote_target);
    req->remote_target = NULL;

    req->target_flag = ORTE_FILEM_TYPE_UNKNOWN;
}
/*
 * Class instance for orte_filem_base_request_t.
 * The macro arguments are, in order: the class type, opal_list_item_t,
 * and the construct/destruct functions defined below in this file.
 */
ORTE_DECLSPEC OBJ_CLASS_INSTANCE(orte_filem_base_request_t,
opal_list_item_t,
orte_filem_base_construct,
orte_filem_base_destruct);
/*
 * Constructor for a FileM request: two empty lists (process sets and
 * file sets), no per-movement bookkeeping arrays, and an unknown
 * movement type.
 */
void orte_filem_base_construct(orte_filem_base_request_t *req)
{
    /* Embedded lists */
    OBJ_CONSTRUCT(&req->process_sets, opal_list_t);
    OBJ_CONSTRUCT(&req->file_sets, opal_list_t);

    /* Per-movement bookkeeping: nothing allocated yet */
    req->num_mv      = 0;
    req->exit_status = NULL;
    req->is_active   = NULL;
    req->is_done     = NULL;

    req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
}
/*
 * Destructor for a FileM request: release every item still held on the
 * two lists, destruct the lists, free the bookkeeping arrays and reset
 * the scalar fields.
 */
void orte_filem_base_destruct(orte_filem_base_request_t *req)
{
    opal_list_item_t *entry = NULL;

    /* Drain and tear down the process-set list. */
    for (;;) {
        entry = opal_list_remove_first(&req->process_sets);
        if (NULL == entry) {
            break;
        }
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&req->process_sets);

    /* Drain and tear down the file-set list. */
    for (;;) {
        entry = opal_list_remove_first(&req->file_sets);
        if (NULL == entry) {
            break;
        }
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&req->file_sets);

    req->num_mv = 0;

    /* free(NULL) is a no-op, so the arrays need no guard. */
    free(req->is_done);
    req->is_done = NULL;

    free(req->is_active);
    req->is_active = NULL;

    free(req->exit_status);
    req->exit_status = NULL;

    req->movement_type = ORTE_FILEM_MOVE_TYPE_UNKNOWN;
}
/***********************
* None component stuff
************************/
/*
 * "None" component open hook: does nothing and always returns
 * ORTE_SUCCESS.
 */
int orte_filem_base_none_open(void) {
    return ORTE_SUCCESS;
}
/*
 * "None" component close hook: does nothing and always returns
 * ORTE_SUCCESS.
 */
int orte_filem_base_none_close(void) {
    return ORTE_SUCCESS;
}
/*
 * "None" component query hook.
 *
 * module    out: always set to NULL (no module is offered)
 * priority  out: always set to 0
 *
 * Always returns OPAL_SUCCESS. Note that this is the OPAL constant,
 * whereas the other functions in this group return ORTE_SUCCESS.
 */
int orte_filem_base_none_query(mca_base_module_t **module, int *priority) {
    *priority = 0;
    *module   = NULL;

    return OPAL_SUCCESS;
}
/*
 * No-op module initialization: always returns ORTE_SUCCESS.
 */
int orte_filem_base_module_init(void) {
    return ORTE_SUCCESS;
}
/*
 * No-op module finalization: always returns ORTE_SUCCESS.
 */
int orte_filem_base_module_finalize(void) {
    return ORTE_SUCCESS;
}
/*
 * "None" component put: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_put(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component put_nb: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_put_nb(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component get: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_get(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component get_nb: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_get_nb(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component rm: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_rm(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component rm_nb: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_rm_nb(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component wait: the request is ignored and ORTE_SUCCESS is
 * returned unconditionally.
 */
int orte_filem_base_none_wait(orte_filem_base_request_t *request) {
    (void)request;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/*
 * "None" component wait_all: the request list is ignored and
 * ORTE_SUCCESS is returned unconditionally.
 */
int orte_filem_base_none_wait_all(opal_list_t *request_list) {
    (void)request_list;  /* intentionally unused */
    return ORTE_SUCCESS;
}
/********************
* Utility functions
********************/
/*
 * Look up the name of the node on which a process resides.
 *
 * proc          the process to look up
 * machine_name  out: set to NULL on entry. On success it receives the
 *               node name; in the HNP branch this is a strdup()'d copy,
 *               in the remote branch it is whatever the OPAL_STRING
 *               unpack produced (presumably heap-allocated - verify
 *               against the DSS implementation).
 *
 * When running as the HNP the answer is read directly from the local
 * job data. Otherwise a ORTE_FILEM_GET_PROC_NODE_NAME_CMD request is
 * sent to the HNP and the reply is unpacked.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_NOT_FOUND (HNP branch: unknown job,
 * vpid outside the job's proc array, or no proc/node recorded for the
 * vpid), or the error code of the failing pack/send/recv/unpack call.
 */
int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine_name) {
    int ret;
    orte_std_cntr_t count;
    opal_buffer_t request, answer;
    orte_filem_cmd_flag_t command=ORTE_FILEM_GET_PROC_NODE_NAME_CMD;

    /* set default answer */
    *machine_name = NULL;

    if (orte_process_info.hnp) {
        /* if I am the HNP, then all the data structures are local to me - no
         * need to send messages around to get the info
         */
        orte_job_t *jdata;
        orte_proc_t **procs;
        int64_t vpid;

        /* get the job data object for this proc */
        if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }

        /* Refuse to index outside the proc array (e.g. for an invalid or
         * wildcard vpid). The comparison is done in a wide signed type so
         * that it is correct whether the vpid type is signed or unsigned.
         * NOTE(review): assumes procs->size is the number of slots behind
         * procs->addr - confirm against the pointer array definition.
         */
        vpid = (int64_t)proc->vpid;
        if (vpid < 0 || vpid >= (int64_t)jdata->procs->size) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }

        /* get the proc object for it */
        procs = (orte_proc_t**)jdata->procs->addr;
        if (NULL == procs[vpid] || NULL == procs[vpid]->node) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        *machine_name = strdup(procs[vpid]->node->name);
        return ORTE_SUCCESS;
    }

    /* if I am not the HNP, then I have to send a request to the HNP
     * for the information
     */
    OBJ_CONSTRUCT(&request, opal_buffer_t);
    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    /* request = command flag followed by the process name */
    if (ORTE_SUCCESS != (ret = opal_dss.pack(&request, &command, 1, ORTE_FILEM_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(&request, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }
    if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &request, ORTE_RML_TAG_FILEM_BASE, 0))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* wait for answer */
    if (0 > (ret = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

    /* unpack the machine name; on success this also resets ret, which the
     * send/recv calls above may have left at a non-negative value
     */
    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, machine_name, &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        goto CLEANUP;
    }

CLEANUP:
    OBJ_DESTRUCT(&answer);
    OBJ_DESTRUCT(&request);
    return ret;
}
/*
 * Ask a peer to resolve a file reference.
 *
 * This function is paired with the filem_base_process_get_remote_path_cmd()
 * function on the remote machine.
 *
 * remote_ref  in/out: on entry the string that is packed into the
 *             request. On success the old string is freed and replaced
 *             by the string unpacked from the reply (the caller owns
 *             and must free() it). Left untouched on failure.
 * peer        process the request is sent to and the reply is awaited from
 * flag        out: on success receives the integer unpacked after the
 *             path (the file type on the remote machine). Left
 *             untouched on failure.
 *
 * Returns ORTE_SUCCESS, or the error code of the first failing
 * pack/send/recv/unpack call.
 */
int orte_filem_base_get_remote_path(char **remote_ref, orte_process_name_t *peer, int *flag) {
    int ret, exit_status = ORTE_SUCCESS;
    char *tmp_ref = NULL;
    orte_std_cntr_t n;
    opal_buffer_t request, answer;
    int tmp_flag;
    orte_filem_cmd_flag_t command=ORTE_FILEM_GET_REMOTE_PATH_CMD;

    /*
     * Ask the peer for the remote file information
     */
    OBJ_CONSTRUCT(&request, opal_buffer_t);
    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(&request, &command, 1, ORTE_FILEM_CMD))) {
        ORTE_ERROR_LOG(ret);
        /* propagate the failure; without this the function would
         * report ORTE_SUCCESS even though nothing was sent
         */
        exit_status = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(&request, remote_ref, 1, OPAL_STRING))) {
        exit_status = ret;
        goto cleanup;
    }
    if (0 > (ret = orte_rml.send_buffer(peer, &request, ORTE_RML_TAG_FILEM_BASE, 0))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Get the response
     */
    if( 0 > (ret = orte_rml.recv_buffer(peer, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0)) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * The absolute path for the remote file
     */
    n = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &tmp_ref, &n, OPAL_STRING)) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * The file type on the remote machine
     */
    n = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &tmp_flag, &n, OPAL_INT)) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Hand the unpacked string over to the caller instead of duplicating
     * it: this avoids an unchecked strdup() (and a strdup(NULL) should
     * the reply carry no string) as well as a needless copy.
     */
    free(*remote_ref);
    *remote_ref = tmp_ref;
    tmp_ref = NULL;   /* ownership transferred; do not free below */

    *flag = tmp_flag;

cleanup:
    OBJ_DESTRUCT(&answer);
    OBJ_DESTRUCT(&request);
    /* only non-NULL when we bailed out after unpacking the path */
    free(tmp_ref);
    return exit_status;
}
/*
 * Size and reset the per-movement bookkeeping arrays of a request.
 *
 * request    the request to prepare; its process_sets and file_sets
 *            lists determine the number of movements
 * move_type  stored in request->movement_type on success. For
 *            ORTE_FILEM_MOVE_TYPE_RM there is one movement per process
 *            set; for every other type one per (process set, file set)
 *            pair.
 *
 * On success is_done/is_active are all false, exit_status is all 0 and
 * num_mv holds the number of movements.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERROR if the movement count is not
 * positive (request left untouched) or if memory could not be
 * allocated (arrays left NULL and num_mv set to 0).
 */
int orte_filem_base_prepare_request(orte_filem_base_request_t *request, int move_type)
{
    int num_reqs = 0;

    if( ORTE_FILEM_MOVE_TYPE_RM == move_type ) {
        num_reqs = opal_list_get_size(&request->process_sets);
    }
    else {
        num_reqs = opal_list_get_size(&request->process_sets) * opal_list_get_size(&request->file_sets);
    }

    if( 0 >= num_reqs ) {
        return ORTE_ERROR;
    }

    /* Drop any arrays left over from a previous preparation. */
    free(request->is_done);
    free(request->is_active);
    free(request->exit_status);

    /* calloc zero-fills, which yields false / 0 for every entry. */
    request->is_done     = calloc((size_t)num_reqs, sizeof *request->is_done);
    request->is_active   = calloc((size_t)num_reqs, sizeof *request->is_active);
    request->exit_status = calloc((size_t)num_reqs, sizeof *request->exit_status);

    if( NULL == request->is_done ||
        NULL == request->is_active ||
        NULL == request->exit_status ) {
        /* Out of memory: leave the request in a consistent, empty state
         * rather than with a partial set of arrays.
         */
        free(request->is_done);
        request->is_done = NULL;
        free(request->is_active);
        request->is_active = NULL;
        free(request->exit_status);
        request->exit_status = NULL;
        request->num_mv = 0;
        return ORTE_ERROR;
    }

    request->num_mv = num_reqs;
    request->movement_type = move_type;

    return ORTE_SUCCESS;
}