/* -*- C -*-
 *
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file:
 *
 * The Open MPI Name Server
 *
 * The Open MPI Name Server provides unique name ranges for processes
 * within the universe. Each universe will have one name server
 * running within the seed daemon. This is done to prevent the
 * inadvertent duplication of names.
 */

/*
 * includes
 */
#include "orte_config.h"

#include "orte/orte_constants.h"
#include "orte/orte_types.h"

#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"

#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"


/*
 * Struct of function pointers that need to be initialized
 */
mca_ns_base_component_t mca_ns_replica_component = {
    {
        MCA_NS_BASE_VERSION_2_0_0,

        "replica",               /* MCA module name */
        ORTE_MAJOR_VERSION,      /* MCA module major version */
        ORTE_MINOR_VERSION,      /* MCA module minor version */
        ORTE_RELEASE_VERSION,    /* MCA module release version */
        orte_ns_replica_open,    /* module open */
        orte_ns_replica_close    /* module close */
    },
    {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    },
    orte_ns_replica_init,        /* module init */
    orte_ns_replica_finalize     /* module shutdown */
};

/*
 * Setup the function pointers for the module.  The initializer is
 * positional, so the order of the entries below must not be changed.
 */
static mca_ns_base_module_t orte_ns_replica_module = {
    /* init */
    orte_ns_replica_module_init,
    /* cell functions */
    orte_ns_replica_create_cellid,
    orte_ns_replica_get_cell_info,
    orte_ns_base_get_cellid_string,
    orte_ns_base_convert_cellid_to_string,
    orte_ns_base_convert_string_to_cellid,
    /* node functions */
    orte_ns_replica_create_nodeids,
    orte_ns_replica_get_node_info,
    orte_ns_base_convert_nodeid_to_string,
    orte_ns_base_convert_string_to_nodeid,
    /* jobid functions */
    orte_ns_replica_create_jobid,
    orte_ns_replica_get_job_descendants,
    orte_ns_replica_get_job_children,
    orte_ns_replica_get_root_job,
    orte_ns_replica_get_parent_job,
    orte_ns_replica_get_job_family,
    orte_ns_base_get_jobid_string,
    orte_ns_base_convert_jobid_to_string,
    orte_ns_base_convert_string_to_jobid,
    orte_ns_replica_reserve_range,
    orte_ns_replica_get_vpid_range,
    /* vpid functions */
    orte_ns_base_get_vpid_string,
    orte_ns_base_convert_vpid_to_string,
    orte_ns_base_convert_string_to_vpid,
    /* name functions */
    orte_ns_base_create_process_name,
    orte_ns_replica_create_my_name,
    orte_ns_base_convert_string_to_process_name,
    orte_ns_base_get_proc_name_string,
    orte_ns_base_compare_fields,
    /* peer functions */
    orte_ns_replica_get_peers,
    /* tag server functions */
    orte_ns_replica_assign_rml_tag,
    /* data type functions */
    orte_ns_replica_define_data_type,
    /* diagnostic functions */
    orte_ns_replica_dump_cells,
    orte_ns_replica_dump_jobs,
    orte_ns_replica_dump_tags,
    orte_ns_replica_dump_datatypes,
    /* checkpoint/restart event function */
    orte_ns_replica_ft_event
};

/*
 * Whether or not we allowed this component to be selected.  Set to true
 * by a successful orte_ns_replica_init() and reset by
 * orte_ns_replica_finalize().
 */
static bool initialized = false;

/*
 * class instantiations
 */
#include "ns_replica_class_instances.h"

/*
 * globals needed within replica component
 */
orte_ns_replica_globals_t orte_ns_replica;

/*
 * Component open: register the "ns_replica" MCA parameters (debug,
 * isolate, maxsize, blocksize) and cache their current values in the
 * orte_ns_replica globals.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_ns_replica_open(void)
{
    int id, param;

    id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false);
    mca_base_param_lookup_int(id, &orte_ns_replica.debug);

    id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false);
    mca_base_param_lookup_int(id, &param);
    if (param) {
        orte_ns_replica.isolate = true;
    } else {
        orte_ns_replica.isolate = false;
    }

    /* NOTE(review): maxsize and blocksize are cast to size_t without a
     * range check, so a negative user-supplied value becomes a very
     * large size - confirm whether validation is wanted here.
     */
    id = mca_base_param_register_int("ns", "replica", "maxsize", NULL,
                                     ORTE_NS_ARRAY_MAX_SIZE);
    mca_base_param_lookup_int(id, &param);
    orte_ns_replica.max_size = (size_t)param;

    id = mca_base_param_register_int("ns", "replica", "blocksize", NULL,
                                     ORTE_NS_ARRAY_BLOCK_SIZE);
    mca_base_param_lookup_int(id, &param);
    orte_ns_replica.block_size = (size_t)param;

    return ORTE_SUCCESS;
}

/*
 * Component close: nothing to do.  Kept as a placeholder in case
 * something needs to be done here later.
 */
int orte_ns_replica_close(void)
{
    return ORTE_SUCCESS;
}

/*
 * Component init: decide whether this component wants to be selected
 * and, if so, set up the replica's tracking storage.
 *
 * @param priority  (OUT) set to 50 whenever no replica URI is recorded
 *                  in orte_process_info (even if the setup below then
 *                  fails); left untouched otherwise.
 *
 * @return pointer to the replica module on success; NULL if a replica
 *         URI is already recorded (we are not to host the replica) or
 *         if any of the tracking arrays could not be created.  On
 *         failure everything created so far is released again.
 */
mca_ns_base_module_t* orte_ns_replica_init(int *priority)
{
    int rc;
    orte_std_cntr_t block_size, max_size;

    /* If we are to host a replica, then we want to be selected; if a
     * replica URI is already known we do not, so return NULL.
     */
    if (NULL != orte_process_info.ns_replica_uri) {
        return NULL;
    }

    /* Choose an arbitrary, positive priority -- it's only relevant
     * compared to other ns components.
     */
    *priority = 50;

    block_size = (orte_std_cntr_t)orte_ns_replica.block_size;
    max_size = (orte_std_cntr_t)orte_ns_replica.max_size;

    /* initialize the cell info tracker */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
                                                      block_size, max_size,
                                                      block_size))) {
        ORTE_ERROR_LOG(rc);
        return NULL;
    }
    orte_ns_replica.num_cells = 0;

    /* initialize the job tracking system */
    OBJ_CONSTRUCT(&orte_ns_replica.jobs, opal_list_t);
    orte_ns_replica.num_jobids = 0;

    /* initialize the taglist */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags),
                                                      block_size, max_size,
                                                      block_size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup_jobs;
    }
    orte_ns_replica.num_tags = 0;

    /* initialize the dtlist */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts),
                                                      block_size, max_size,
                                                      block_size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup_tags;
    }
    orte_ns_replica.num_dts = 0;

    /* setup the thread lock */
    OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t);

    /* Return the module */
    initialized = true;
    return &orte_ns_replica_module;

    /* Error exits: "initialized" stays false, so finalize will not free
     * anything for us -- undo the partial setup here instead.
     */
cleanup_tags:
    OBJ_RELEASE(orte_ns_replica.tags);
cleanup_jobs:
    OBJ_DESTRUCT(&orte_ns_replica.jobs);
    OBJ_RELEASE(orte_ns_replica.cells);
    return NULL;
}

/*
 * Module init: unless running isolated, post the persistent
 * non-blocking receive on ORTE_RML_TAG_NS that delivers incoming
 * messages to orte_ns_replica_recv.
 *
 * @return ORTE_SUCCESS, or the (negative) code returned by the RML if
 *         the receive could not be posted.
 */
int orte_ns_replica_module_init(void)
{
    int rc;

    if (orte_ns_replica.isolate) {
        return ORTE_SUCCESS;
    }

    /* issue non-blocking receive for call_back function */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS,
                                 ORTE_RML_PERSISTENT, orte_ns_replica_recv, NULL);
    if (rc < 0) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}

/*
 * Finalize routine: release the tracking storage created by
 * orte_ns_replica_init() and, unless running isolated, cancel the
 * receive on ORTE_RML_TAG_NS.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_ns_replica_finalize(void)
{
    orte_ns_replica_cell_tracker_t **cptr;
    opal_list_item_t *item;
    orte_ns_replica_tagitem_t **tag;
    orte_ns_replica_dti_t **dti;
    orte_std_cntr_t i;

    /* free all tracking storage, but only if this component was initialized */
    if (initialized) {
        cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
        for (i = 0; i < (orte_ns_replica.cells)->size; i++) {
            if (NULL != cptr[i]) {
                OBJ_RELEASE(cptr[i]);
            }
        }
        OBJ_RELEASE(orte_ns_replica.cells);

        while (NULL != (item = opal_list_remove_first(&orte_ns_replica.jobs))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&orte_ns_replica.jobs);

        tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
        for (i = 0; i < (orte_ns_replica.tags)->size; i++) {
            if (NULL != tag[i]) {
                OBJ_RELEASE(tag[i]);
            }
        }
        OBJ_RELEASE(orte_ns_replica.tags);

        dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
        for (i = 0; i < (orte_ns_replica.dts)->size; i++) {
            if (NULL != dti[i]) {
                OBJ_RELEASE(dti[i]);
            }
        }
        OBJ_RELEASE(orte_ns_replica.dts);

        /* NOTE(review): orte_ns_replica.mutex is constructed in init but
         * never destructed here - confirm whether that is intentional
         * (e.g. because the receive is only cancelled below).
         */
        initialized = false;
    }

    /* All done */
    if (orte_ns_replica.isolate) {
        return ORTE_SUCCESS;
    }

    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS);
    return ORTE_SUCCESS;
}

/*
 * Checkpoint/restart event handler.  Every state is currently a no-op;
 * the branches are placeholders for future per-state work.
 *
 * @param state  one of the OPAL_CRS_* states
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_ns_replica_ft_event(int state)
{
    if (OPAL_CRS_CHECKPOINT == state) {
        ;
    }
    else if (OPAL_CRS_CONTINUE == state) {
        ;
    }
    else if (OPAL_CRS_RESTART == state) {
        ;
    }
    else if (OPAL_CRS_TERM == state) {
        ;
    }
    else {
        ;
    }

    return ORTE_SUCCESS;
}