1
1
openmpi/ompi/mca/crcp/base/crcp_base_fns.c
Josh Hursey 98fb9f26ef Some cleanup.
- Remove an old comment from crcp_base_fns.c
- Let ob1 have its very own ft_event function (which I'll fill in shortly)
- Make sure ob1 finalizes the bsend stuff so we don't leave a bunch of memory sitting around
- PML base - destruct the array upon finalize. Shrink the include search so it stops after finding a match

This commit was SVN r14222.
2007-04-05 13:52:05 +00:00

616 строки
22 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <time.h>
#include <ctype.h>
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/util/os_dirpath.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/gpr/gpr.h"
#include "ompi/communicator/communicator.h"
#include "ompi/proc/proc.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "ompi/mca/crcp/crcp.h"
#include "ompi/mca/crcp/base/base.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
#include "ompi/mca/pml/base/pml_base_request.h"
/******************
* Local Functions
******************/
/******************
* Object stuff
******************/
/*
 * Class instance for the PML wrapper state object.  The parent class is
 * ompi_free_list_item_t; both trailing arguments are NULL (presumably the
 * constructor and destructor slots of the OPAL object macro).
 */
OBJ_CLASS_INSTANCE(ompi_crcp_base_pml_state_t, ompi_free_list_item_t, NULL, NULL);
/*
 * Class instance for the BTL wrapper state object.  The parent class is
 * ompi_free_list_item_t; both trailing arguments are NULL (presumably the
 * constructor and destructor slots of the OPAL object macro).
 */
OBJ_CLASS_INSTANCE(ompi_crcp_base_btl_state_t, ompi_free_list_item_t, NULL, NULL);
/***********************
* None component stuff
************************/
/**
 * Open routine of the "none" CRCP component.
 *
 * @return OMPI_SUCCESS unconditionally; no other work is performed.
 */
int
ompi_crcp_base_none_open(void)
{
    const int status = OMPI_SUCCESS;

    return status;
}
/**
 * Close routine of the "none" CRCP component.
 *
 * @return OMPI_SUCCESS unconditionally; no other work is performed.
 */
int
ompi_crcp_base_none_close(void)
{
    const int status = OMPI_SUCCESS;

    return status;
}
/**
 * Module initialization routine; it has nothing to set up.
 *
 * @return OMPI_SUCCESS unconditionally.
 */
int
ompi_crcp_base_module_init(void)
{
    const int status = OMPI_SUCCESS;

    return status;
}
/**
 * Module finalization routine; it has nothing to release.
 *
 * @return OMPI_SUCCESS unconditionally.
 */
int
ompi_crcp_base_module_finalize(void)
{
    const int status = OMPI_SUCCESS;

    return status;
}
/****************
* PML Wrapper
****************/
/**
 * "None" PML wrapper for enable.
 *
 * The enable flag is ignored.  The routine only stores OMPI_SUCCESS in
 * pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_enable(bool enable,
                               ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)enable;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for add_comm.
 *
 * The communicator is not inspected.  The routine only stores OMPI_SUCCESS
 * in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_add_comm(struct ompi_communicator_t *comm,
                                 ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)comm;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for del_comm.
 *
 * The communicator is not inspected.  The routine only stores OMPI_SUCCESS
 * in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_del_comm(struct ompi_communicator_t *comm,
                                 ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)comm;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for add_procs.
 *
 * The process array and its length are ignored.  The routine only stores
 * OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_add_procs(struct ompi_proc_t **procs,
                                  size_t nprocs,
                                  ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)procs;
    (void)nprocs;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for del_procs.
 *
 * The process array and its length are ignored.  The routine only stores
 * OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_del_procs(struct ompi_proc_t **procs,
                                  size_t nprocs,
                                  ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)procs;
    (void)nprocs;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for progress.
 *
 * The routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_progress(ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for iprobe.
 *
 * None of the probe arguments are read, and neither *matched nor *status
 * is written.  The routine only stores OMPI_SUCCESS in
 * pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_iprobe(int dst, int tag,
                               struct ompi_communicator_t *comm,
                               int *matched, ompi_status_public_t *status,
                               ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)dst;
    (void)tag;
    (void)comm;
    (void)matched;
    (void)status;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for probe.
 *
 * None of the probe arguments are read and *status is not written.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_probe(int dst, int tag,
                              struct ompi_communicator_t *comm,
                              ompi_status_public_t *status,
                              ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)dst;
    (void)tag;
    (void)comm;
    (void)status;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for isend_init.
 *
 * The send arguments are ignored and *request is left untouched.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_isend_init(void *buf, size_t count,
                                   ompi_datatype_t *datatype,
                                   int dst, int tag,
                                   mca_pml_base_send_mode_t mode,
                                   struct ompi_communicator_t *comm,
                                   struct ompi_request_t **request,
                                   ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)dst;
    (void)tag;
    (void)mode;
    (void)comm;
    (void)request;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for isend.
 *
 * The send arguments are ignored and *request is left untouched.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_isend(void *buf, size_t count,
                              ompi_datatype_t *datatype,
                              int dst, int tag,
                              mca_pml_base_send_mode_t mode,
                              struct ompi_communicator_t *comm,
                              struct ompi_request_t **request,
                              ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)dst;
    (void)tag;
    (void)mode;
    (void)comm;
    (void)request;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for send.
 *
 * The send arguments are ignored.  The routine only stores OMPI_SUCCESS
 * in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_send(void *buf, size_t count,
                             ompi_datatype_t *datatype,
                             int dst, int tag,
                             mca_pml_base_send_mode_t mode,
                             struct ompi_communicator_t *comm,
                             ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)dst;
    (void)tag;
    (void)mode;
    (void)comm;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for irecv_init.
 *
 * The receive arguments are ignored and *request is left untouched.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_irecv_init(void *buf, size_t count,
                                   ompi_datatype_t *datatype,
                                   int src, int tag,
                                   struct ompi_communicator_t *comm,
                                   struct ompi_request_t **request,
                                   ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)src;
    (void)tag;
    (void)comm;
    (void)request;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for irecv.
 *
 * The receive arguments are ignored and *request is left untouched.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_irecv(void *buf, size_t count,
                              ompi_datatype_t *datatype,
                              int src, int tag,
                              struct ompi_communicator_t *comm,
                              struct ompi_request_t **request,
                              ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)src;
    (void)tag;
    (void)comm;
    (void)request;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for recv.
 *
 * The receive arguments are ignored and *status is not written.  The
 * routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_recv(void *buf, size_t count,
                             ompi_datatype_t *datatype,
                             int src, int tag,
                             struct ompi_communicator_t *comm,
                             ompi_status_public_t *status,
                             ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)buf;
    (void)count;
    (void)datatype;
    (void)src;
    (void)tag;
    (void)comm;
    (void)status;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for dump.
 *
 * The communicator and verbosity arguments are ignored and nothing is
 * printed.  The routine only stores OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_dump(struct ompi_communicator_t *comm,
                             int verbose,
                             ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)comm;
    (void)verbose;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for start.
 *
 * The request array and its length are ignored.  The routine only stores
 * OMPI_SUCCESS in pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_start(size_t count,
                              ompi_request_t **requests,
                              ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)count;
    (void)requests;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" PML wrapper for ft_event.
 *
 * The state value is ignored.  The routine only stores OMPI_SUCCESS in
 * pml_state->error_code.
 *
 * @return The pml_state pointer that was passed in.
 */
ompi_crcp_base_pml_state_t *
ompi_crcp_base_none_pml_ft_event(int state,
                                 ompi_crcp_base_pml_state_t *pml_state)
{
    ompi_crcp_base_pml_state_t *result = pml_state;

    (void)state;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/********************
* Request Interface
********************/
/**
 * "None" request-completion hook.
 *
 * The request is not inspected or modified.
 *
 * @return OMPI_SUCCESS unconditionally.
 */
int
ompi_crcp_base_none_request_complete(struct ompi_request_t *request)
{
    const int status = OMPI_SUCCESS;

    (void)request;

    return status;
}
/********************
* BTL Interface
********************/
/**
 * "None" BTL wrapper for add_procs.
 *
 * The module, process list, endpoint array and reachability bitmap are all
 * ignored.  The routine only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_add_procs(struct mca_btl_base_module_t *btl,
                                  size_t nprocs,
                                  struct ompi_proc_t **procs,
                                  struct mca_btl_base_endpoint_t **endpoints,
                                  struct ompi_bitmap_t *reachable,
                                  ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)nprocs;
    (void)procs;
    (void)endpoints;
    (void)reachable;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for del_procs.
 *
 * The module, process list and endpoint array are ignored.  The routine
 * only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_del_procs(struct mca_btl_base_module_t *btl,
                                  size_t nprocs,
                                  struct ompi_proc_t **procs,
                                  struct mca_btl_base_endpoint_t **endpoints,
                                  ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)nprocs;
    (void)procs;
    (void)endpoints;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for register.
 *
 * The tag, callback and callback data are ignored, and the callback is
 * never invoked here.  The routine only stores OMPI_SUCCESS in
 * btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_register(struct mca_btl_base_module_t *btl,
                                 mca_btl_base_tag_t tag,
                                 mca_btl_base_module_recv_cb_fn_t cbfunc,
                                 void *cbdata,
                                 ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)tag;
    (void)cbfunc;
    (void)cbdata;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for finalize.
 *
 * The module is not touched.  The routine only stores OMPI_SUCCESS in
 * btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_finalize(struct mca_btl_base_module_t *btl,
                                 ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for alloc.
 *
 * No allocation is performed; the module and size are ignored.  The
 * routine only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_alloc(struct mca_btl_base_module_t *btl,
                              size_t size,
                              ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)size;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for free.
 *
 * Nothing is released; the module and descriptor are ignored.  The routine
 * only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_free(struct mca_btl_base_module_t *btl,
                             mca_btl_base_descriptor_t *descriptor,
                             ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)descriptor;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for prepare_src.
 *
 * All descriptor-preparation arguments are ignored and *size is not
 * written.  The routine only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_prepare_src(struct mca_btl_base_module_t *btl,
                                    struct mca_btl_base_endpoint_t *endpoint,
                                    mca_mpool_base_registration_t *registration,
                                    struct ompi_convertor_t *convertor,
                                    size_t reserve,
                                    size_t *size,
                                    ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)registration;
    (void)convertor;
    (void)reserve;
    (void)size;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for prepare_dst.
 *
 * All descriptor-preparation arguments are ignored and *size is not
 * written.  The routine only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_prepare_dst(struct mca_btl_base_module_t *btl,
                                    struct mca_btl_base_endpoint_t *endpoint,
                                    mca_mpool_base_registration_t *registration,
                                    struct ompi_convertor_t *convertor,
                                    size_t reserve,
                                    size_t *size,
                                    ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)registration;
    (void)convertor;
    (void)reserve;
    (void)size;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for send.
 *
 * The module, endpoint, descriptor and tag are ignored.  The routine only
 * stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_send(struct mca_btl_base_module_t *btl,
                             struct mca_btl_base_endpoint_t *endpoint,
                             struct mca_btl_base_descriptor_t *descriptor,
                             mca_btl_base_tag_t tag,
                             ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)descriptor;
    (void)tag;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for put.
 *
 * The module, endpoint and descriptor are ignored.  The routine only
 * stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_put(struct mca_btl_base_module_t *btl,
                            struct mca_btl_base_endpoint_t *endpoint,
                            struct mca_btl_base_descriptor_t *descriptor,
                            ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)descriptor;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for get.
 *
 * The module, endpoint and descriptor are ignored.  The routine only
 * stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_get(struct mca_btl_base_module_t *btl,
                            struct mca_btl_base_endpoint_t *endpoint,
                            struct mca_btl_base_descriptor_t *descriptor,
                            ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)descriptor;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for dump.
 *
 * The module, endpoint and verbosity are ignored and nothing is printed.
 * The routine only stores OMPI_SUCCESS in btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_dump(struct mca_btl_base_module_t *btl,
                             struct mca_btl_base_endpoint_t *endpoint,
                             int verbose,
                             ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)btl;
    (void)endpoint;
    (void)verbose;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/**
 * "None" BTL wrapper for ft_event.
 *
 * The state value is ignored.  The routine only stores OMPI_SUCCESS in
 * btl_state->error_code.
 *
 * @return The btl_state pointer that was passed in.
 */
ompi_crcp_base_btl_state_t *
ompi_crcp_base_none_btl_ft_event(int state,
                                 ompi_crcp_base_btl_state_t *btl_state)
{
    ompi_crcp_base_btl_state_t *result = btl_state;

    (void)state;

    result->error_code = OMPI_SUCCESS;
    return result;
}
/********************
* Utility functions
********************/
/**
 * Tear down and re-initialize the PML/BML/BTL stack.
 *
 * Sequence, as implemented below:
 *  1. disable the PML, then remove every valid communicator and every
 *     known process from it;
 *  2. close the PML base, finalize the BML and the PML modex;
 *  3. release/reset the per-process modex, BML, PML and hostname fields
 *     and clear the process flags;
 *  4. re-open the PML base, re-init the modex, re-select a PML,
 *     re-exchange the modex and pass stage gate 1;
 *  5. re-enable the PML, add the processes and communicators back and
 *     pass stage gate 2.
 *
 * The process arrays obtained from ompi_proc_world() are released on
 * every exit path, and any failure is reported through opal_output().
 *
 * @param pml_state  Not referenced by this routine.
 *
 * @return OMPI_SUCCESS on success.  Otherwise the error code of the first
 *         step that failed, or OMPI_ERROR when ompi_proc_world() returned
 *         NULL (that call provides no error code of its own).
 */
int ompi_crcp_base_reboot_pml(ompi_crcp_base_pml_state_t* pml_state) {
    int ret;
    ompi_proc_t** procs = NULL;   /* freed in cleanup, so must start NULL */
    size_t nprocs;
    const char *error_msg = NULL;
    int return_code = OMPI_SUCCESS;
    ompi_communicator_t *tmp_comm;
    int comm_size = 0;
    uint32_t c = 0;

    opal_output_verbose(5, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML/BML/BTL are rebooting [PML = %s]",
                        mca_pml_base_selected_component.pmlm_version.mca_component_name);

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Disable PML");

    /* Disable the PML */
    if( OMPI_SUCCESS != (ret = MCA_PML_CALL(enable(false)) ) ) {
        error_msg = "PML control failed";
        return_code = ret;
        goto cleanup;
    }

    /* Get all the processes that we know about */
    nprocs = 0;
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error_msg = "ompi_proc_world() failed";
        /* 'ret' still holds OMPI_SUCCESS from the call above, so it must
         * not be used as the error code here. */
        return_code = OMPI_ERROR;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Delete all communicators from the PML");

    /* Walk every slot of the communicator array, skipping invalid entries */
    comm_size = ompi_pointer_array_get_size(&ompi_mpi_communicators);
    for( c = 0; c < (uint32_t)comm_size; ++c) {
        tmp_comm = ompi_comm_lookup(c);
        if( ompi_comm_invalid(tmp_comm) ) {
            continue;
        }
        if( OMPI_SUCCESS != (ret = MCA_PML_CALL(del_comm(tmp_comm)) ) ) {
            error_msg = "PML del comm failed";
            return_code = ret;
            goto cleanup;
        }
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Delete all %d processes from the PML",
                        (int)nprocs);

    /* Delete all the procs */
    if( OMPI_SUCCESS != (ret = MCA_PML_CALL(del_procs(procs, nprocs)) ) ) {
        error_msg = "PML del procs failed";
        return_code = ret;
        goto cleanup;
    }

    /* Shutdown the PML/BML/BTL */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Shutdown the PML");
    mca_pml_base_close();

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Shutdown the BML");
    mca_bml.bml_finalize();

    if (OMPI_SUCCESS != (ret = mca_pml_base_modex_finalize())) {
        error_msg = "PML base_modex_finalize failed";
        return_code = ret;
        goto cleanup;
    }

    /* Reset the transport-related fields of every ompi_proc structure.
     * The original author notes that the communicators point at these
     * same structures, so they see the update as well. */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Refresh Process List");
    for(c = 0; c < nprocs; ++c) {
        if( procs[c]->proc_modex != NULL ) {
            OBJ_RELEASE(procs[c]->proc_modex);
            procs[c]->proc_modex = NULL;
        }
        if( procs[c]->proc_bml != NULL ) {
            OBJ_RELEASE( procs[c]->proc_bml);
            procs[c]->proc_bml = NULL;
        }
        if( procs[c]->proc_pml != NULL ) {
            free( procs[c]->proc_pml);
            procs[c]->proc_pml = NULL;
        }
        if( procs[c]->proc_hostname != NULL ) {
            free( procs[c]->proc_hostname );
            procs[c]->proc_hostname = NULL;
        }
        /*procs[c]->proc_arch = ompi_mpi_local_arch;*/
        procs[c]->proc_flags = 0;
    }

    /* Restart the PML/BML/BTL */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML base Open");
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error_msg = "PML base_open failed";
        return_code = ret;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML init modex");
    if (OMPI_SUCCESS != (ret = mca_pml_base_modex_init())) {
        error_msg = "PML base_modex_init failed";
        return_code = ret;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML base select");
    if (OMPI_SUCCESS !=
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_MPI_THREADS))) {
        error_msg = "PML base_select failed";
        return_code = ret;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML modex exchange");
    if (OMPI_SUCCESS != (ret = mca_pml_base_modex_exchange())) {
        error_msg = "PML base_modex_exchange failed";
        return_code = ret;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Enter Stage Gate 1");
    if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
                                                       ORTE_PROC_STATE_AT_STG1, 0))) {
        error_msg = "PML Stage Gate 1 SOH failed";
        return_code = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = orte_rml.xcast(ORTE_PROC_MY_NAME->jobid, true,
                                              NULL, orte_gpr.deliver_notify_msg))) {
        error_msg = "PML RML.Xcast(Stage1) failed";
        return_code = ret;
        goto cleanup;
    }

    /* Enable the PML */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML Enable");
    if( OMPI_SUCCESS != (ret = MCA_PML_CALL(enable(true)) ) ) {
        error_msg = "PML control failed";
        return_code = ret;
        goto cleanup;
    }

    /* Add back the processes */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML get new list of processes");

    /* The first array is no longer needed; release it before 'procs' is
     * overwritten so that it is not leaked. */
    free(procs);
    procs = NULL;

    /* Get all the processes that we know about */
    nprocs = 0;
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error_msg = "ompi_proc_world() failed";
        /* As above: 'ret' holds the success code of the previous call. */
        return_code = OMPI_ERROR;
        goto cleanup;
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML Add back the %d processes",
                        (int)nprocs);
    if( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_procs(procs, nprocs)) ) ) {
        error_msg = "PML add procs failed";
        return_code = ret;
        goto cleanup;
    }
    free(procs);
    procs = NULL;   /* prevent a double free in cleanup */

    /* Add back the communicators */
    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML Add back the communicators");
    for( c = 0; c < (uint32_t)comm_size; ++c) {
        tmp_comm = ompi_comm_lookup(c);
        if( ompi_comm_invalid(tmp_comm) ) {
            continue;
        }
        if( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(tmp_comm)) ) ) {
            error_msg = "PML add comm failed";
            return_code = ret;
            goto cleanup;
        }
    }

    opal_output_verbose(25, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): Enter Stage Gate 2");
    if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name,
                                                       ORTE_PROC_STATE_AT_STG2, 0))) {
        error_msg = "PML Stage Gate 2 SOH failed";
        return_code = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = orte_rml.xcast(ORTE_PROC_MY_NAME->jobid, false,
                                              NULL, orte_gpr.deliver_notify_msg))) {
        error_msg = "PML RML.Xcast(Stage2) failed";
        return_code = ret;
        goto cleanup;
    }

    opal_output_verbose(5, ompi_crcp_base_output,
                        "crcp:coord: reboot_pml(): PML/BML/BTL have been rebooted [PML = %s]",
                        mca_pml_base_selected_component.pmlm_version.mca_component_name);

 cleanup:
    /* error_msg is only set on a failure path; report it so the failing
     * step is visible instead of being silently dropped. */
    if( NULL != error_msg ) {
        opal_output(ompi_crcp_base_output,
                    "crcp:coord: reboot_pml(): Error: %s (ret = %d)",
                    error_msg, return_code);
    }
    /* Releases whichever process array is still outstanding (NULL is ok). */
    free(procs);

    return return_code;
}