1
1

Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the job that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawn'ed job in turn calls comm_spawn, then the newly spawned job has a parent (the job that spawned it) and a "root" job (the original job that started things).

Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.

I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).

This commit was SVN r12597.
Этот коммит содержится в:
Ralph Castain 2006-11-14 19:34:59 +00:00
родитель e2666f8394
Коммит 6d6cebb4a7
144 изменённых файлов: 6909 добавлений и 3847 удалений

Просмотреть файл

@ -307,9 +307,7 @@ void ompi_attr_create_predefined_callback(
/* Set some default values */
if (ORTE_SUCCESS != orte_ns.get_jobid(&job, orte_process_info.my_name)) {
return;
}
job = ORTE_PROC_MY_NAME->jobid;
/* Query the gpr to find out how many CPUs there will be.
This will only return a non-empty list in a persistent

Просмотреть файл

@ -1040,7 +1040,7 @@ int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high )
theirproc = intercomm->c_remote_group->grp_proc_pointers[0];
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
rc = orte_ns.compare (mask, &(ourproc->proc_name), &(theirproc->proc_name));
rc = orte_ns.compare_fields(mask, &(ourproc->proc_name), &(theirproc->proc_name));
if ( 0 > rc ) {
flag = true;
}

Просмотреть файл

@ -312,7 +312,7 @@ orte_process_name_t *ompi_comm_get_rport (orte_process_name_t *port, int send_fi
if (NULL == rbuf) {
return NULL;
}
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer(ORTE_RML_NAME_ANY, rbuf, tag))) {
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, rbuf, tag))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(rbuf);
return NULL;
@ -565,6 +565,16 @@ ompi_comm_start_processes(int count, char **array_of_commands,
/* cleanup */
if (NULL != base_prefix) free(base_prefix);
/* tell the RTE that we want to be a child of this process' job */
if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_NS_USE_PARENT,
ORTE_JOBID, &(orte_process_info.my_name->jobid),
ORTE_RMGR_ATTR_OVERRIDE))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&attributes);
opal_progress_event_decrement();
return MPI_ERR_SPAWN;
}
/* tell the RTE that we want the children to run inside of our allocation -
* don't go get one just for them
*/
@ -857,7 +867,7 @@ void ompi_comm_disconnect_waitall (int count, ompi_comm_disconnect_obj **objs)
#define OMPI_COMM_MAXJOBIDS 64
void ompi_comm_mark_dyncomm (ompi_communicator_t *comm)
{
int i, j, numjobids=0, rc;
int i, j, numjobids=0;
int size, rsize;
int found;
orte_jobid_t jobids[OMPI_COMM_MAXJOBIDS], thisjobid;
@ -875,40 +885,34 @@ void ompi_comm_mark_dyncomm (ompi_communicator_t *comm)
of different jobids. */
grp = comm->c_local_group;
for (i=0; i< size; i++) {
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&thisjobid, &(grp->grp_proc_pointers[i]->proc_name)))) {
ORTE_ERROR_LOG(rc);
return;
}
found = 0;
for ( j=0; j<numjobids; j++) {
if ( thisjobid == jobids[j]) {
found = 1;
break;
thisjobid = grp->grp_proc_pointers[i]->proc_name.jobid;
found = 0;
for ( j=0; j<numjobids; j++) {
if (thisjobid == jobids[j]) {
found = 1;
break;
}
}
if (!found ) {
jobids[numjobids++] = thisjobid;
}
}
if (!found ) {
jobids[numjobids++] = thisjobid;
}
}
/* if inter-comm, loop over all processes in remote_group
and count number of different jobids */
grp = comm->c_remote_group;
for (i=0; i< rsize; i++) {
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&thisjobid, &(grp->grp_proc_pointers[i]->proc_name)))) {
ORTE_ERROR_LOG(rc);
return;
}
found = 0;
for ( j=0; j<numjobids; j++) {
if ( thisjobid == jobids[j]) {
found = 1;
break;
thisjobid = grp->grp_proc_pointers[i]->proc_name.jobid;
found = 0;
for ( j=0; j<numjobids; j++) {
if ( thisjobid == jobids[j]) {
found = 1;
break;
}
}
if (!found ) {
jobids[numjobids++] = thisjobid;
}
}
if (!found ) {
jobids[numjobids++] = thisjobid;
}
}
/* if the number of jobids is larger than one, set the disconnect flag */

Просмотреть файл

@ -27,7 +27,7 @@
#include "mpi.h"
#include "ompi/group/group.h"
#include "ompi/mca/coll/coll.h"
#include "orte/mca/oob/oob_types.h"
#include "orte/mca/rml/rml_types.h"
#include "ompi/proc/proc.h"
#if defined(c_plusplus) || defined(__cplusplus)

Просмотреть файл

@ -579,7 +579,7 @@ static void mca_btl_mvapi_endpoint_recv(
opal_list_get_end(&mca_btl_mvapi_component.ib_procs);
ib_proc = (mca_btl_mvapi_proc_t*)opal_list_get_next(ib_proc)) {
if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) {
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) {
bool found = false;
/* Try to get the endpoint instance of this proc */
@ -690,7 +690,7 @@ static void mca_btl_mvapi_endpoint_recv(
void mca_btl_mvapi_post_recv()
{
orte_rml.recv_buffer_nb(
ORTE_RML_NAME_ANY,
ORTE_NAME_WILDCARD,
ORTE_RML_TAG_DYNAMIC-1,
ORTE_RML_PERSISTENT,
mca_btl_mvapi_endpoint_recv,

Просмотреть файл

@ -662,7 +662,7 @@ static void mca_btl_openib_endpoint_recv(
opal_list_get_end(&mca_btl_openib_component.ib_procs);
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) {
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) {
bool found = false;
/* Try to get the endpoint instance of this proc */
@ -776,7 +776,7 @@ void mca_btl_openib_post_recv()
{
orte_rml.recv_buffer_nb(
ORTE_RML_NAME_ANY,
ORTE_NAME_WILDCARD,
ORTE_RML_TAG_DYNAMIC-1,
ORTE_RML_PERSISTENT,
mca_btl_openib_endpoint_recv,

Просмотреть файл

@ -307,7 +307,7 @@ bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, struct s
if((btl_addr = btl_endpoint->endpoint_addr) != NULL &&
btl_addr->addr_inet.s_addr == addr->sin_addr.s_addr) {
mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc;
cmpval = orte_ns.compare(mask,
cmpval = orte_ns.compare_fields(mask,
&endpoint_proc->proc_ompi->proc_name,
&this_proc->proc_ompi->proc_name);
if((btl_endpoint->endpoint_sd < 0) ||

Просмотреть файл

@ -207,7 +207,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
opal_list_get_end(&mca_btl_udapl_component.udapl_procs);
proc = (mca_btl_udapl_proc_t*)opal_list_get_next(proc)) {
if(0 == orte_ns.compare(ORTE_NS_CMP_ALL, &proc->proc_guid, endpoint)) {
if(ORTE_EQUAL == orte_ns.compare_fields(ORTE_NS_CMP_ALL, &proc->proc_guid, endpoint)) {
for(i = 0; i < proc->proc_endpoint_count; i++) {
ep = proc->proc_endpoints[i];
@ -231,7 +231,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
void mca_btl_udapl_endpoint_post_oob_recv(void)
{
orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_DYNAMIC-1,
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DYNAMIC-1,
ORTE_RML_PERSISTENT, mca_btl_udapl_endpoint_recv, NULL);
}
@ -246,7 +246,7 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint)
/* Nasty test to prevent deadlock and unwanted connection attempts */
/* This right here is the whole point of using the ORTE/RML handshake */
if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state &&
0 > orte_ns.compare(ORTE_NS_CMP_ALL,
0 > orte_ns.compare_fields(ORTE_NS_CMP_ALL,
&endpoint->endpoint_proc->proc_guid,
&ompi_proc_local()->proc_name)) ||
(MCA_BTL_UDAPL_CLOSED != endpoint->endpoint_state &&
@ -370,7 +370,7 @@ static int mca_btl_udapl_endpoint_finish_eager(
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
/* Only one side does dat_ep_connect() */
if(0 < orte_ns.compare(ORTE_NS_CMP_ALL,
if(0 < orte_ns.compare_fields(ORTE_NS_CMP_ALL,
&endpoint->endpoint_proc->proc_guid,
&ompi_proc_local()->proc_name)) {

Просмотреть файл

@ -626,7 +626,7 @@ static int bootstrap_comm(ompi_communicator_t *comm)
empty_index = -1;
for (i = 0; i < mca_coll_sm_component.sm_bootstrap_num_segments; ++i) {
if (comm->c_contextid == bshe->smbhe_keys[i].mcsbck_cid &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL,
ORTE_EQUAL == orte_ns.compare_fields(ORTE_NS_CMP_ALL,
rank0,
&bshe->smbhe_keys[i].mcsbck_rank0_name)) {
found = true;

Просмотреть файл

@ -426,10 +426,7 @@ static int mca_pml_base_modex_subscribe(orte_process_name_t* name)
OPAL_UNLOCK(&mca_pml_base_modex_lock);
/* otherwise - subscribe to get this jobid's contact info */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
jobid = name->jobid;
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&sub_name,
OMPI_MODEX_SUBSCRIPTION, jobid))) {
@ -520,10 +517,7 @@ int mca_pml_base_modex_send(
orte_byte_object_t bo;
orte_data_value_t value = ORTE_DATA_VALUE_EMPTY;
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
jobid = ORTE_PROC_MY_NAME->jobid;
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -172,9 +172,9 @@ int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
processes, but will work for initial job start */
idx = ompi_pointer_array_add(&mca_pml_dr.endpoints,
(void*) endpoint);
if(orte_ns.compare(ORTE_NS_CMP_ALL,
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL,
orte_process_info.my_name,
&(endpoint->proc_ompi->proc_name)) == 0) {
&(endpoint->proc_ompi->proc_name)) == ORTE_EQUAL) {
mca_pml_dr.my_rank = idx;
}
endpoint->local = endpoint->dst = idx;

Просмотреть файл

@ -104,7 +104,7 @@ void ompi_proc_destruct(ompi_proc_t* proc)
int ompi_proc_init(void)
{
orte_process_name_t *peers;
orte_std_cntr_t i, npeers, self, num_tokens;
orte_std_cntr_t i, npeers, num_tokens;
orte_jobid_t jobid;
char *segment, **tokens;
orte_data_value_t value = { {OBJ_CLASS(orte_data_value_t),0}, ORTE_NULL, NULL};
@ -115,7 +115,7 @@ int ompi_proc_init(void)
OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t);
/* get all peers in this job */
if(ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, &self))) {
if(ORTE_SUCCESS != (rc = orte_ns.get_peers(&peers, &npeers, NULL))) {
opal_output(0, "ompi_proc_init: get_peers failed with errno=%d", rc);
return rc;
}
@ -124,7 +124,7 @@ int ompi_proc_init(void)
for( i = 0; i < npeers; i++ ) {
ompi_proc_t *proc = OBJ_NEW(ompi_proc_t);
proc->proc_name = peers[i];
if( i == self ) {
if( i == ORTE_PROC_MY_NAME->vpid ) {
ompi_proc_local_proc = proc;
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
}
@ -150,10 +150,7 @@ int ompi_proc_init(void)
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
jobid = ORTE_PROC_MY_NAME->jobid;
/* find the job segment on the registry */
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
@ -223,7 +220,7 @@ ompi_proc_t** ompi_proc_world(size_t *size)
for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) {
if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, &my_name)) {
++count;
}
}
@ -239,7 +236,7 @@ ompi_proc_t** ompi_proc_world(size_t *size)
for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
if (0 == orte_ns.compare(mask, &proc->proc_name, &my_name)) {
if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, &my_name)) {
procs[count++] = proc;
}
}
@ -298,7 +295,7 @@ ompi_proc_t * ompi_proc_find ( const orte_process_name_t * name )
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
if (0 == orte_ns.compare(mask, &proc->proc_name, name)) {
if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, name)) {
rproc = proc;
break;
}
@ -319,7 +316,7 @@ ompi_proc_t * ompi_proc_find_and_add ( const orte_process_name_t * name, bool* i
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
proc = (ompi_proc_t*)opal_list_get_next(proc)) {
if (0 == orte_ns.compare(mask, &proc->proc_name, name)) {
if (ORTE_EQUAL == orte_ns.compare_fields(mask, &proc->proc_name, name)) {
*isnew = false;
rproc = proc;
break;
@ -395,10 +392,7 @@ static int setup_registry_callback(void)
orte_gpr_subscription_id_t id;
orte_jobid_t jobid;
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, &local->proc_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
jobid = local->proc_name.jobid;
/* find the job segment on the registry */
if (ORTE_SUCCESS !=
@ -534,7 +528,7 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata)
/* find the associated proc entry and update its
arch flag. If the nodename of this info is
my local host, also set the LOCAL flag. */
if (0 == orte_ns.compare(mask, &name, &proc->proc_name)) {
if (ORTE_EQUAL == orte_ns.compare_fields(mask, &name, &proc->proc_name)) {
proc->proc_arch = arch;
if (0 == strcmp(str, orte_system_info.nodename)) {
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;

Просмотреть файл

@ -133,7 +133,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* put all the local procs in the abort list */
for (i = 0 ; i < ompi_comm_size(comm) ; ++i) {
if (0 != orte_ns.compare(ORTE_NS_CMP_ALL,
if (ORTE_EQUAL != orte_ns.compare_fields(ORTE_NS_CMP_ALL,
&comm->c_local_group->grp_proc_pointers[i]->proc_name,
orte_process_info.my_name)) {
assert(count <= nabort_procs);
@ -147,7 +147,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/* if requested, kill off remote procs too */
if (kill_remote_of_intercomm) {
for (i = 0 ; i < ompi_comm_remote_size(comm) ; ++i) {
if (0 != orte_ns.compare(ORTE_NS_CMP_ALL,
if (ORTE_EQUAL != orte_ns.compare_fields(ORTE_NS_CMP_ALL,
&comm->c_remote_group->grp_proc_pointers[i]->proc_name,
orte_process_info.my_name)) {
assert(count <= nabort_procs);

Просмотреть файл

@ -294,7 +294,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
if (!set) {
char *vpid;
orte_ns_base_get_vpid_string(&vpid, orte_process_info.my_name);
orte_ns.get_vpid_string(&vpid, orte_process_info.my_name);
opal_show_help("help-mpi-runtime",
"mpi_init:startup:paffinity-unavailable",
true, vpid);

Просмотреть файл

@ -17,7 +17,7 @@
*/
#include "ompi_config.h"
#include "orte/mca/oob/oob.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/iof/iof.h"
#include "ompi/constants.h"
@ -40,7 +40,7 @@ int ompi_mpi_init_io(void)
close(fds[0]);
rc = mca_iof.iof_publish(
MCA_OOB_NAME_SELF,
OMPI_PROC_MY_NAME,
MCA_IOF_SINK,
MCA_IOF_STDIN,
fds[1]);
@ -56,7 +56,7 @@ int ompi_mpi_init_io(void)
close(fds[1]);
rc = mca_iof.iof_publish(
MCA_OOB_NAME_SELF,
OMPI_PROC_MY_NAME,
MCA_IOF_SOURCE,
MCA_IOF_STDOUT,
fds[0]);
@ -72,7 +72,7 @@ int ompi_mpi_init_io(void)
close(fds[1]);
rc = mca_iof.iof_publish(
MCA_OOB_NAME_SELF,
OMPI_PROC_MY_NAME,
MCA_IOF_SOURCE,
MCA_IOF_STDERR,
fds[0]);

Просмотреть файл

@ -36,7 +36,7 @@ int orte_dss_pack(orte_buffer_t *buffer, void *src, orte_std_cntr_t num_vals,
int rc;
/* check for error */
if (NULL == buffer || NULL == src) {
if (NULL == buffer) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}

Просмотреть файл

@ -29,8 +29,8 @@ extern "C" {
#define ORTE_ERR_BASE OPAL_ERR_MAX
/* define the results values for comparisons so we can change them in only one place */
#define ORTE_VALUE1_GREATER -1
#define ORTE_VALUE2_GREATER +1
#define ORTE_VALUE1_GREATER +1
#define ORTE_VALUE2_GREATER -1
#define ORTE_EQUAL 0
enum {

Просмотреть файл

@ -38,6 +38,7 @@ typedef uint8_t orte_data_type_t; /** data type indicators used in ORTE */
typedef int32_t orte_std_cntr_t; /** standard counters used in ORTE */
#define ORTE_STD_CNTR_T ORTE_INT32
#define ORTE_STD_CNTR_MAX INT32_MAX
#define ORTE_STD_CNTR_MIN INT32_MIN
#define ORTE_STD_CNTR_INVALID -1
/* define a structure to hold generic byte objects */

Просмотреть файл

@ -50,7 +50,7 @@ int orte_errmgr_base_comm_start(void)
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY,
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_ERRMGR,
ORTE_RML_PERSISTENT,
orte_errmgr_base_recv,
@ -70,7 +70,7 @@ int orte_errmgr_base_comm_stop(void)
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_ERRMGR))) {
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ERRMGR))) {
ORTE_ERROR_LOG(rc);
}
recv_issued = false;

Просмотреть файл

@ -23,6 +23,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include "opal/class/opal_list.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"
@ -59,6 +60,8 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
NULL
};
orte_data_value_t dval = ORTE_DATA_VALUE_EMPTY;
opal_list_t attrs;
opal_list_item_t *item;
int rc;
OPAL_TRACE(1);
@ -79,11 +82,15 @@ int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
return rc;
}
/* tell the pls to terminate the job */
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) {
/* tell the pls to terminate the job AND ALL ITS DESCENDANTS */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) {
ORTE_ERROR_LOG(rc);
return rc;
}
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
/* orterun will only wakeup when all procs report terminated. The terminate_job
* function *should* have done that - however, it is possible during abnormal
@ -142,8 +149,10 @@ int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
return rc;
}
/* tell the pls to terminate the job */
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) {
/* tell the pls to terminate the job - just kill this job, not any descendants since
* the job is just trying to start
*/
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, NULL))) {
ORTE_ERROR_LOG(rc);
}

Просмотреть файл

@ -233,7 +233,7 @@ orte_gpr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_
ORTE_ERROR_LOG(ret);
return NULL;
}
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.gpr_replica, &name))) {
if(ORTE_SUCCESS != (ret = orte_dss.copy((void**)&orte_process_info.gpr_replica, &name, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
return NULL;
}
@ -299,7 +299,7 @@ int orte_gpr_proxy_module_init(void)
{
/* issue the non-blocking receive */
int rc;
rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY, ORTE_RML_PERSISTENT, orte_gpr_proxy_notify_recv, NULL);
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR_NOTIFY, ORTE_RML_PERSISTENT, orte_gpr_proxy_notify_recv, NULL);
if(rc < 0) {
ORTE_ERROR_LOG(rc);
return rc;
@ -359,7 +359,7 @@ int orte_gpr_proxy_finalize(void)
}
/* All done */
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR_NOTIFY);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR_NOTIFY);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -30,6 +30,7 @@
#include "orte/orte_constants.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/gpr/replica/api_layer/gpr_replica_api.h"

Просмотреть файл

@ -88,9 +88,7 @@ int orte_gpr_replica_cleanup_proc_fn(orte_process_name_t *proc)
}
/* find the job segment */
if (ORTE_SUCCESS != orte_ns.get_jobid(&jobid, proc)) {
return ORTE_ERR_BAD_PARAM;
}
jobid = proc->jobid;
if (ORTE_SUCCESS != orte_ns.convert_jobid_to_string(&jobidstring, jobid)) {
return ORTE_ERR_BAD_PARAM;

Просмотреть файл

@ -387,9 +387,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type,
if (((NULL == recipient && NULL == cb->requestor) &&
(msg_type == cb->message->msg_type)) ||
(((NULL != recipient && NULL != cb->requestor) &&
(0 == orte_ns.compare(ORTE_NS_CMP_ALL,
recipient,
cb->requestor))) &&
(ORTE_EQUAL == orte_dss.compare(recipient, cb->requestor, ORTE_NAME))) &&
(msg_type == cb->message->msg_type))) {
/* okay, a callback has been registered to send data to this
* recipient - return this location
@ -421,7 +419,7 @@ int orte_gpr_replica_define_callback(orte_gpr_notify_msg_type_t msg_type,
if (NULL == recipient) {
cb->requestor = NULL;
} else {
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(cb->requestor), recipient))) {
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(cb->requestor), recipient, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -216,8 +216,13 @@ ADDREQ:
i < (sub->requestors)->size; i++) {
if (NULL != reqs[i]) {
j++;
if ((NULL == reqs[i]->requestor && NULL != requestor) ||
(NULL != reqs[i]->requestor && NULL == requestor)) {
continue;
}
if (reqs[i]->idtag == subscription->id &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL, reqs[i]->requestor, requestor)) {
((NULL == reqs[i]->requestor && NULL == requestor) ||
(ORTE_EQUAL == orte_dss.compare(reqs[i]->requestor, requestor, ORTE_NAME)))) {
/* found this requestor - do not add it again */
goto DONESUB;
}
@ -234,8 +239,7 @@ ADDREQ:
}
if (NULL != requestor) {
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(req->requestor),
requestor))) {
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(req->requestor), requestor, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -546,8 +550,14 @@ ADDREQ:
i < (trig->attached)->size; i++) {
if (NULL != reqs[i]) {
j++;
/* if one is NULL and the other isn't, then they can't possibly match */
if ((NULL == reqs[i]->requestor && NULL != requestor) ||
(NULL != reqs[i]->requestor && NULL == requestor)) {
continue;
}
if (reqs[i]->idtag == trigger->id &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL, reqs[i]->requestor, requestor)) {
((NULL == reqs[i]->requestor && NULL == requestor) ||
(ORTE_EQUAL == orte_dss.compare(reqs[i]->requestor, requestor, ORTE_NAME)))) {
/* found this requestor - do not add it again */
goto DONETRIG;
}
@ -562,8 +572,7 @@ ADDREQ:
}
if (NULL != requestor) {
if (ORTE_SUCCESS != (rc = orte_ns.copy_process_name(&(req->requestor),
requestor))) {
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(req->requestor), requestor, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -657,8 +666,7 @@ orte_gpr_replica_remove_subscription(orte_process_name_t *requestor,
if (id == reqs[k]->idtag &&
((NULL == requestor && NULL == reqs[k]->requestor) ||
(NULL != requestor && NULL != reqs[k]->requestor &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL,
reqs[k]->requestor, requestor)))) {
ORTE_EQUAL == orte_dss.compare(reqs[k]->requestor, requestor, ORTE_NAME)))) {
/* this is the subscription */
sub = subs[i];
req = reqs[k];
@ -763,8 +771,7 @@ orte_gpr_replica_remove_trigger(orte_process_name_t *requestor,
if (id == reqs[k]->idtag &&
((NULL == requestor && NULL == reqs[k]->requestor) ||
(NULL != requestor && NULL != reqs[k]->requestor &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL,
reqs[k]->requestor, requestor)))) {
ORTE_EQUAL == orte_dss.compare(reqs[k]->requestor, requestor, ORTE_NAME)))) {
/* this is the trigger */
trig = trigs[i];
req = reqs[k];
@ -1330,7 +1337,7 @@ int orte_gpr_replica_purge_subscriptions(orte_process_name_t *proc)
}
OBJ_RELEASE(trig);
} else if (NULL != proc && NULL != trig[i]->requestor &&
0 == orte_ns.compare(ORTE_NS_CMP_ALL, proc, trig[i]->requestor)) {
ORTE_EQUAL == orte_dss.compare(proc, trig[i]->requestor, ORTE_NAME)) {
if (ORTE_SUCCESS != (rc = orte_pointer_array_set_item(orte_gpr_replica.triggers,
trig[i]->index, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -311,7 +311,7 @@ int orte_gpr_replica_module_init(void)
/* issue the non-blocking receive */
if (!orte_gpr_replica_globals.isolate) {
int rc = orte_rml.recv_buffer_nb(
ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR, ORTE_RML_PERSISTENT, orte_gpr_replica_recv, NULL);
ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR, ORTE_RML_PERSISTENT, orte_gpr_replica_recv, NULL);
if(rc < 0) {
ORTE_ERROR_LOG(rc);
return rc;
@ -437,7 +437,7 @@ int orte_gpr_replica_finalize(void)
return ORTE_SUCCESS;
}
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_GPR);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_GPR);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -59,7 +59,7 @@ int orte_iof_base_close(void)
OPAL_THREAD_UNLOCK(&orte_iof_base.iof_lock);
if (NULL != orte_iof_base.iof_service) {
orte_ns.free_name(&(orte_iof_base.iof_service));
free(orte_iof_base.iof_service);
}
return ORTE_SUCCESS;

Просмотреть файл

@ -42,7 +42,10 @@
#include <signal.h>
#endif /* HAVE_SIGNAL_H */
#include "opal/util/output.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/iof/base/iof_base_endpoint.h"
#include "orte/mca/iof/base/iof_base_fragment.h"
@ -172,7 +175,7 @@ static void orte_iof_base_endpoint_read_handler(int fd, short flags, void *cbdat
hdr = &frag->frag_hdr;
hdr->hdr_common.hdr_type = ORTE_IOF_BASE_HDR_MSG;
hdr->hdr_msg.msg_src = endpoint->ep_name;
hdr->hdr_msg.msg_proxy = *ORTE_RML_NAME_SELF;
hdr->hdr_msg.msg_proxy = *ORTE_PROC_MY_NAME;
hdr->hdr_msg.msg_tag = endpoint->ep_tag;
hdr->hdr_msg.msg_seq = endpoint->ep_seq;
hdr->hdr_msg.msg_len = frag->frag_len;
@ -294,7 +297,7 @@ static orte_iof_base_endpoint_t* orte_iof_base_endpoint_lookup(
item != opal_list_get_end(&orte_iof_base.iof_endpoints);
item = opal_list_get_next(item)) {
orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item;
if(orte_ns.compare(ORTE_NS_CMP_ALL,proc,&endpoint->ep_name) == 0 &&
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL,proc,&endpoint->ep_name) == 0 &&
endpoint->ep_tag == tag && endpoint->ep_mode == mode) {
OBJ_RETAIN(endpoint);
return endpoint;
@ -428,7 +431,7 @@ int orte_iof_base_endpoint_delete(
while(item != opal_list_get_end(&orte_iof_base.iof_endpoints)) {
opal_list_item_t* next = opal_list_get_next(item);
orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item;
if(orte_ns.compare(mask,proc,&endpoint->ep_name) == 0 &&
if(orte_ns.compare_fields(mask,proc,&endpoint->ep_name) == 0 &&
endpoint->ep_tag == tag) {
OBJ_RELEASE(endpoint);
opal_list_remove_item(&orte_iof_base.iof_endpoints,&endpoint->super);
@ -485,7 +488,7 @@ orte_iof_base_endpoint_t* orte_iof_base_endpoint_match(
item != opal_list_get_end(&orte_iof_base.iof_endpoints);
item = opal_list_get_next(item)) {
orte_iof_base_endpoint_t* endpoint = (orte_iof_base_endpoint_t*)item;
if(orte_ns.compare(dst_mask,dst_name,&endpoint->ep_name) == 0) {
if(orte_ns.compare_fields(dst_mask,dst_name,&endpoint->ep_name) == 0) {
if(endpoint->ep_tag == dst_tag || endpoint->ep_tag == ORTE_IOF_ANY || dst_tag == ORTE_IOF_ANY) {
OBJ_RETAIN(endpoint);
OPAL_THREAD_UNLOCK(&orte_iof_base.iof_lock);

Просмотреть файл

@ -144,7 +144,7 @@ int orte_iof_proxy_push(
/* send a subscription to server on behalf of the destination */
rc = orte_iof_proxy_svc_subscribe(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
dst_tag,
dst_name,
@ -156,7 +156,7 @@ int orte_iof_proxy_push(
/* setup a local endpoint to reflect registration */
rc = orte_iof_base_endpoint_create(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_IOF_SOURCE,
dst_tag,
fd);
@ -184,7 +184,7 @@ int orte_iof_proxy_pull(
/* setup a local endpoint */
int rc;
rc = orte_iof_base_endpoint_create(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_IOF_SINK,
src_tag,
fd);
@ -195,7 +195,7 @@ int orte_iof_proxy_pull(
/* publish this endpoint */
rc = orte_iof_proxy_svc_publish(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
src_tag);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
@ -207,7 +207,7 @@ int orte_iof_proxy_pull(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
if(rc != ORTE_SUCCESS) {
@ -247,7 +247,7 @@ int orte_iof_proxy_subscribe(
int rc;
/* create a local registration to reflect the callback */
rc = orte_iof_base_callback_create(ORTE_RML_NAME_SELF,src_tag,cbfunc,cbdata);
rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME,src_tag,cbfunc,cbdata);
if(rc != ORTE_SUCCESS)
return rc;
@ -256,7 +256,7 @@ int orte_iof_proxy_subscribe(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
return rc;
@ -274,13 +274,13 @@ int orte_iof_proxy_unsubscribe(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
if(rc != ORTE_SUCCESS)
return rc;
/* remove local callback */
return orte_iof_base_callback_delete(ORTE_RML_NAME_SELF,src_tag);
return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME,src_tag);
}

Просмотреть файл

@ -127,7 +127,7 @@ orte_iof_proxy_init(int* priority, bool *allow_multi_user_threads, bool *have_hi
mca_iof_proxy_component.proxy_iov[0].iov_len = 0;
rc = orte_rml.recv_nb(
ORTE_RML_NAME_ANY,
ORTE_NAME_WILDCARD,
mca_iof_proxy_component.proxy_iov,
1,
ORTE_RML_TAG_IOF_SVC,
@ -152,7 +152,7 @@ static int orte_iof_proxy_close(void)
int rc = ORTE_SUCCESS;
if (initialized) {
rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC);
}
return rc;
}

Просмотреть файл

@ -58,7 +58,7 @@ int orte_iof_proxy_svc_publish(
hdr.hdr_common.hdr_type = ORTE_IOF_BASE_HDR_PUB;
hdr.hdr_common.hdr_status = 0;
hdr.hdr_pub.pub_name = *name;
hdr.hdr_pub.pub_proxy = *ORTE_RML_NAME_SELF;
hdr.hdr_pub.pub_proxy = *ORTE_PROC_MY_NAME;
hdr.hdr_pub.pub_mask = ORTE_NS_CMP_ALL;
hdr.hdr_pub.pub_tag = tag;
ORTE_IOF_BASE_HDR_PUB_NTOH(hdr.hdr_pub);
@ -96,7 +96,7 @@ int orte_iof_proxy_svc_unpublish(
hdr.hdr_common.hdr_type = ORTE_IOF_BASE_HDR_PUB;
hdr.hdr_common.hdr_status = 0;
hdr.hdr_pub.pub_name = *name;
hdr.hdr_pub.pub_proxy = *ORTE_RML_NAME_SELF;
hdr.hdr_pub.pub_proxy = *ORTE_PROC_MY_NAME;
hdr.hdr_pub.pub_mask = mask;
hdr.hdr_pub.pub_tag = tag;
ORTE_IOF_BASE_HDR_PUB_NTOH(hdr.hdr_pub);
@ -257,7 +257,7 @@ static void orte_iof_proxy_svc_msg(
unsigned char* data)
{
orte_iof_base_endpoint_t* endpoint;
endpoint = orte_iof_base_endpoint_match(ORTE_RML_NAME_ANY, ORTE_NS_CMP_NONE, msg->msg_tag);
endpoint = orte_iof_base_endpoint_match(ORTE_NAME_WILDCARD, ORTE_NS_CMP_NONE, msg->msg_tag);
if(endpoint != NULL) {
orte_iof_base_endpoint_forward(endpoint,src,msg,data);
OBJ_RELEASE(endpoint);

Просмотреть файл

@ -80,7 +80,7 @@ int orte_iof_svc_publish(
if(mode == ORTE_IOF_SINK) {
rc = orte_iof_svc_pub_create(
name,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
tag);
}
@ -106,7 +106,7 @@ int orte_iof_svc_unpublish(
int rc;
rc = orte_iof_svc_pub_delete(
name,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
mask,
tag);
if(rc != ORTE_SUCCESS)
@ -142,7 +142,7 @@ int orte_iof_svc_push(
/* setup a subscription */
rc = orte_iof_svc_sub_create(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
dst_tag,
dst_name,
@ -153,7 +153,7 @@ int orte_iof_svc_push(
/* setup a local endpoint to reflect registration */
rc = orte_iof_base_endpoint_create(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_IOF_SOURCE,
dst_tag,
fd);
@ -181,7 +181,7 @@ int orte_iof_svc_pull(
/* setup a local endpoint */
rc = orte_iof_base_endpoint_create(
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_IOF_SINK,
src_tag,
fd);
@ -193,7 +193,7 @@ int orte_iof_svc_pull(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
return rc;
@ -230,7 +230,7 @@ int orte_iof_svc_subscribe(
int rc;
/* create a local registration to reflect the callback */
rc = orte_iof_base_callback_create(ORTE_RML_NAME_SELF,src_tag,cbfunc,cbdata);
rc = orte_iof_base_callback_create(ORTE_PROC_MY_NAME,src_tag,cbfunc,cbdata);
if(rc != ORTE_SUCCESS)
return rc;
@ -239,7 +239,7 @@ int orte_iof_svc_subscribe(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
return rc;
@ -257,12 +257,12 @@ int orte_iof_svc_unsubscribe(
src_name,
src_mask,
src_tag,
ORTE_RML_NAME_SELF,
ORTE_PROC_MY_NAME,
ORTE_NS_CMP_ALL,
src_tag);
if(ORTE_SUCCESS != rc)
return rc;
/* cleanup any locally registered callback */
return orte_iof_base_callback_delete(ORTE_RML_NAME_SELF,src_tag);
return orte_iof_base_callback_delete(ORTE_PROC_MY_NAME,src_tag);
}

Просмотреть файл

@ -120,7 +120,7 @@ static int orte_iof_svc_close(void)
OBJ_RELEASE(item);
}
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_IOF_SVC);
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_SVC);
}
return ORTE_SUCCESS;
@ -164,7 +164,7 @@ orte_iof_svc_init(int* priority, bool *allow_multi_user_threads, bool *have_hidd
mca_iof_svc_component.svc_iov[0].iov_len = 0;
rc = orte_rml.recv_nb(
ORTE_RML_NAME_ANY,
ORTE_NAME_WILDCARD,
mca_iof_svc_component.svc_iov,
1,
ORTE_RML_TAG_IOF_SVC,

Просмотреть файл

@ -151,7 +151,7 @@ static void orte_iof_svc_proxy_msg(
continue;
/* source match */
if(orte_ns.compare(sub->src_mask,&sub->src_name,&hdr->msg_src) == 0) {
if(orte_ns.compare_fields(sub->src_mask,&sub->src_name,&hdr->msg_src) == 0) {
if(mca_iof_svc_component.svc_debug > 1) {
opal_output(0, "[%lu,%lu,%lu] orte_iof_svc_proxy_msg: tag %d sequence %d\n",
ORTE_NAME_ARGS(&sub->src_name),hdr->msg_tag,hdr->msg_seq);
@ -227,7 +227,7 @@ static void orte_iof_svc_proxy_ack(
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item;
opal_list_item_t *f_item;
if (orte_ns.compare(sub->src_mask,&sub->src_name,&hdr->msg_src) != 0 ||
if (orte_ns.compare_fields(sub->src_mask,&sub->src_name,&hdr->msg_src) != 0 ||
sub->src_tag != hdr->msg_tag) {
continue;
}
@ -238,8 +238,8 @@ static void orte_iof_svc_proxy_ack(
f_item = opal_list_get_next(f_item)) {
orte_iof_svc_fwd_t* fwd = (orte_iof_svc_fwd_t*)f_item;
orte_iof_svc_pub_t* pub = fwd->fwd_pub;
if (orte_ns.compare(pub->pub_mask,&pub->pub_name,src) == 0 ||
orte_ns.compare(ORTE_NS_CMP_ALL,&pub->pub_proxy,src) == 0) {
if (orte_ns.compare_fields(pub->pub_mask,&pub->pub_name,src) == 0 ||
orte_ns.compare_fields(ORTE_NS_CMP_ALL,&pub->pub_proxy,src) == 0) {
value.uval = hdr->msg_seq + hdr->msg_len;
orte_hash_table_set_proc(&fwd->fwd_seq,
&hdr->msg_src, &value.vval);
@ -259,7 +259,7 @@ static void orte_iof_svc_proxy_ack(
*/
if(seq_min == hdr->msg_seq+hdr->msg_len) {
if(orte_ns.compare(ORTE_NS_CMP_ALL,orte_process_info.my_name,&hdr->msg_src) == 0) {
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL,orte_process_info.my_name,&hdr->msg_src) == 0) {
orte_iof_base_endpoint_t* endpoint;
/*
* Local delivery

Просмотреть файл

@ -48,8 +48,8 @@ int orte_iof_svc_pub_create(
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
item = opal_list_get_next(item)) {
pub = (orte_iof_svc_pub_t*)item;
if(orte_ns.compare(pub_mask,pub_name,&pub->pub_name) == 0 &&
orte_ns.compare(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 &&
if(orte_ns.compare_fields(pub_mask,pub_name,&pub->pub_name) == 0 &&
orte_ns.compare_fields(ORTE_NS_CMP_ALL,pub_proxy,&pub->pub_proxy) == 0 &&
pub_tag == pub->pub_tag) {
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
return ORTE_SUCCESS;
@ -96,8 +96,8 @@ orte_iof_svc_pub_t* orte_iof_svc_pub_lookup(
item != opal_list_get_end(&mca_iof_svc_component.svc_published);
item = opal_list_get_next(item)) {
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)item;
if (orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 &&
orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) == 0 &&
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,pub_name) == 0 &&
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,pub_proxy) == 0 &&
pub->pub_mask == pub_mask &&
pub->pub_tag == pub_tag) {
return pub;
@ -157,8 +157,8 @@ void orte_iof_svc_pub_delete_all(
opal_list_item_t* p_next = opal_list_get_next(p_item);
orte_iof_svc_pub_t* pub = (orte_iof_svc_pub_t*)p_item;
if (orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 ||
orte_ns.compare(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) {
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_name,name) == 0 ||
orte_ns.compare_fields(ORTE_NS_CMP_ALL, &pub->pub_proxy,name) == 0) {
opal_list_item_t* s_item;
for(s_item = opal_list_get_first(&mca_iof_svc_component.svc_subscribed);

Просмотреть файл

@ -81,10 +81,10 @@ int orte_iof_svc_sub_create(
item = opal_list_get_next(item)) {
sub = (orte_iof_svc_sub_t*)item;
if (sub->src_mask == src_mask &&
orte_ns.compare(sub->src_mask,&sub->src_name,src_name) == 0 &&
orte_ns.compare_fields(sub->src_mask,&sub->src_name,src_name) == 0 &&
sub->src_tag == src_tag &&
sub->dst_mask == dst_mask &&
orte_ns.compare(sub->dst_mask,&sub->dst_name,dst_name) == 0 &&
orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,dst_name) == 0 &&
sub->dst_tag == dst_tag) {
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
return ORTE_SUCCESS;
@ -134,10 +134,10 @@ int orte_iof_svc_sub_delete(
opal_list_item_t* next = opal_list_get_next(item);
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
if (sub->src_mask == src_mask &&
orte_ns.compare(sub->src_mask,&sub->src_name,src_name) == 0 &&
orte_ns.compare_fields(sub->src_mask,&sub->src_name,src_name) == 0 &&
sub->src_tag == src_tag &&
sub->dst_mask == dst_mask &&
orte_ns.compare(sub->dst_mask,&sub->dst_name,dst_name) == 0 &&
orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,dst_name) == 0 &&
sub->dst_tag == dst_tag) {
opal_list_remove_item(&mca_iof_svc_component.svc_subscribed, item);
OBJ_RELEASE(item);
@ -159,9 +159,9 @@ int orte_iof_svc_sub_delete_all(
opal_list_item_t* next = opal_list_get_next(item);
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)item;
if ((sub->src_mask == ORTE_NS_CMP_ALL &&
orte_ns.compare(ORTE_NS_CMP_ALL,&sub->src_name,name) == 0) ||
orte_ns.compare_fields(ORTE_NS_CMP_ALL,&sub->src_name,name) == 0) ||
(sub->dst_mask == ORTE_NS_CMP_ALL &&
orte_ns.compare(ORTE_NS_CMP_ALL,&sub->dst_name,name) == 0)) {
orte_ns.compare_fields(ORTE_NS_CMP_ALL,&sub->dst_name,name) == 0)) {
opal_list_remove_item(&mca_iof_svc_component.svc_subscribed, item);
OBJ_RELEASE(item);
}
@ -283,7 +283,7 @@ bool orte_iof_svc_fwd_match(
orte_iof_svc_sub_t* sub,
orte_iof_svc_pub_t* pub)
{
if (orte_ns.compare(sub->dst_mask,&sub->dst_name,&pub->pub_name) == 0 &&
if (orte_ns.compare_fields(sub->dst_mask,&sub->dst_name,&pub->pub_name) == 0 &&
sub->src_tag == pub->pub_tag) {
return true;
} else {

Просмотреть файл

@ -17,13 +17,18 @@
#
headers += \
base/base.h
base/base.h \
base/ns_private.h
libmca_ns_la_SOURCES += \
base/ns_base_close.c \
base/ns_base_select.c \
base/ns_base_open.c \
base/ns_base_local_fns.c \
base/ns_base_cell_fns.c \
base/ns_base_job_fns.c \
base/ns_base_vpid_name_fns.c \
base/ns_base_general_fns.c \
base/ns_base_diag_fns.c \
base/data_type_support/ns_data_type_compare_fns.c \
base/data_type_support/ns_data_type_copy_fns.c \
base/data_type_support/ns_data_type_print_fns.c \

Просмотреть файл

@ -42,38 +42,6 @@
extern "C" {
#endif
/* default limits */
#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX
#define ORTE_NS_ARRAY_BLOCK_SIZE 512
/*
* Internal definitions
*/
typedef uint8_t orte_ns_cmd_bitmask_t;
/*
* packing type definitions
*/
/* CAUTION - any changes here must also change corresponding
* typedefs above
*/
#define ORTE_NS_CMD ORTE_INT8
/*
* define flag values for remote commands - only used internally
*/
#define ORTE_NS_CREATE_CELLID_CMD (int8_t)0x01
#define ORTE_NS_GET_CELL_INFO_CMD (int8_t)0x02
#define ORTE_NS_CREATE_JOBID_CMD (int8_t)0x03
#define ORTE_NS_RESERVE_RANGE_CMD (int8_t)0x04
#define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t)0x08
#define ORTE_NS_GET_JOB_PEERS_CMD (int8_t)0x0A
#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t)0x10
#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t)0x20
#define ORTE_NS_DUMP_CELLS_CMD (int8_t)0x21
#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t)0x22
#define ORTE_NS_DUMP_TAGS_CMD (int8_t)0x23
#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t)0x24
/*
* function definitions
@ -82,176 +50,6 @@ ORTE_DECLSPEC int orte_ns_base_open(void);
ORTE_DECLSPEC int orte_ns_base_select(void);
ORTE_DECLSPEC int orte_ns_base_close(void);
/*
* Base functions that are common to all implementations - can be overridden
*/
ORTE_DECLSPEC int orte_ns_base_assign_cellid_to_process(orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
ORTE_DECLSPEC int orte_ns_base_copy_process_name(orte_process_name_t **dest,
orte_process_name_t* src);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
const char* name_string);
ORTE_DECLSPEC int orte_ns_base_get_proc_name_string(char **name_string,
const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring);
ORTE_DECLSPEC int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring);
ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring);
ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid);
ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);
ORTE_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name);
ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer);
/* not available functions */
ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void);
ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid,
char *site, char *resource);
ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
char **site, char **resource);
ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid);
ORTE_DECLSPEC int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job,
orte_vpid_t range,
orte_vpid_t *startvpid);
ORTE_DECLSPEC int orte_ns_base_derive_vpid(orte_vpid_t *vpid,
orte_vpid_t base_vpid,
int offset);
ORTE_DECLSPEC int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name);
ORTE_DECLSPEC int orte_ns_base_define_data_type_not_available(
const char *name,
orte_data_type_t *type);
ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
ORTE_DECLSPEC int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_jobid_t job);
ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void);
/* Base functions used everywhere */
ORTE_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_std_cntr_t *self);
ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
/*
* copy functions
*/
int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src, orte_data_type_t type);
int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type);
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type);
int orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type);
/*
* compare functions
*/
int orte_ns_base_compare_name(orte_process_name_t *value1,
orte_process_name_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_vpid(orte_vpid_t *value1,
orte_vpid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_jobid(orte_jobid_t *value1,
orte_jobid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
orte_cellid_t *value2,
orte_data_type_t type);
/*
* size functions
*/
int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type);
/*
* release functions
*/
void orte_ns_base_std_release(orte_data_value_t *value);
/*
* print functions
*/
int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *name, orte_data_type_t type);
/*
* globals that might be needed
*/

Просмотреть файл

@ -25,7 +25,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* NUMERIC COMPARE FUNCTIONS
@ -42,42 +42,13 @@ int orte_ns_base_compare_name(orte_process_name_t *value1,
return ORTE_VALUE1_GREATER;
}
/** we have to take care of the special case where one of the
* values is ORTE_NAME_WILDCARD. If any of the fields are wildcard,
* then we want to just ignore that one field. However, in the case
* of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this
* would automatically result in ORTE_EQUAL for any name in the other
* value - a totally useless result.
*
* Instead, what we want to know in this case is if the value actually
* *is* ORTE_NAME_WILDCARD. So, we need to detect if one of the values
* is ORTE_NAME_WILDCARD, and then specifically check the other one
* to see if it matches
*/
if (value2->cellid == ORTE_CELLID_WILDCARD &&
value2->jobid == ORTE_JOBID_WILDCARD &&
value2->vpid == ORTE_VPID_WILDCARD) {
if (value1->cellid == ORTE_CELLID_WILDCARD &&
value1->jobid == ORTE_JOBID_WILDCARD &&
value1->vpid == ORTE_VPID_WILDCARD) {
return ORTE_EQUAL;
} else {
return ORTE_VALUE1_GREATER;
}
} else if (value1->cellid == ORTE_CELLID_WILDCARD &&
value1->jobid == ORTE_JOBID_WILDCARD &&
value1->vpid == ORTE_VPID_WILDCARD) {
if (value2->cellid == ORTE_CELLID_WILDCARD &&
value2->jobid == ORTE_JOBID_WILDCARD &&
value2->vpid == ORTE_VPID_WILDCARD) {
return ORTE_EQUAL;
} else {
return ORTE_VALUE2_GREATER;
}
}
/** now that the special cases are done, go through the progression */
/* If any of the fields are wildcard,
* then we want to just ignore that one field. In the case
* of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this
* will automatically result in ORTE_EQUAL for any name in the other
* value - a totally useless result, but consistent in behavior.
*/
/** check the cellids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
@ -163,3 +134,18 @@ int orte_ns_base_compare_cellid(orte_cellid_t *value1,
return ORTE_EQUAL;
}
/*
 * Compare two node ids.
 *
 * A wildcard on either side matches anything and yields ORTE_EQUAL.
 * Otherwise the result says which of the two values is larger, or
 * ORTE_EQUAL when they are the same. The type argument is not used.
 */
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
                                orte_nodeid_t *value2,
                                orte_data_type_t type)
{
    /* wildcard on either side short-circuits the comparison */
    if (ORTE_NODEID_WILDCARD == *value1 || ORTE_NODEID_WILDCARD == *value2) {
        return ORTE_EQUAL;
    }

    if (*value1 == *value2) {
        return ORTE_EQUAL;
    }

    return (*value1 > *value2) ? ORTE_VALUE1_GREATER : ORTE_VALUE2_GREATER;
}

Просмотреть файл

@ -23,7 +23,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* VPID
@ -63,6 +63,25 @@ int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data
return ORTE_SUCCESS;
}
/*
* NODEID
*/
/*
 * Duplicate a node id into freshly malloc'd storage.
 *
 * On success *dest points at the new copy and ORTE_SUCCESS is returned.
 * If the allocation fails, the error is logged, *dest is left untouched
 * and ORTE_ERR_OUT_OF_RESOURCE is returned. The type argument is not used.
 */
int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type)
{
    orte_nodeid_t *copy = (orte_nodeid_t*)malloc(sizeof(orte_nodeid_t));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
* JOBID
*/

Просмотреть файл

@ -28,7 +28,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* NAME
@ -51,11 +51,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src,
}
proc = (orte_process_name_t*)src;
for (i=0; i < num_vals; i++) {
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&cellid[i], proc))) {
ORTE_ERROR_LOG(rc);
free(cellid);
return rc;
}
cellid[i] = proc->cellid;
proc++;
}
/* now pack them in one shot */
@ -75,11 +71,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src,
}
proc = (orte_process_name_t*)src;
for (i=0; i < num_vals; i++) {
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid[i], proc))) {
ORTE_ERROR_LOG(rc);
free(jobid);
return rc;
}
jobid[i] = proc->jobid;
proc++;
}
/* now pack them in one shot */
@ -99,11 +91,7 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src,
}
proc = (orte_process_name_t*)src;
for (i=0; i < num_vals; i++) {
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid(&vpid[i], proc))) {
ORTE_ERROR_LOG(rc);
free(vpid);
return rc;
}
vpid[i] = proc->vpid;
proc++;
}
/* now pack them in one shot */
@ -128,13 +116,30 @@ int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src,
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) {
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_CELLID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
/*
* NODEID
*/
/*
 * Pack num_vals node ids from src into buffer.
 *
 * Forwards to orte_dss_pack_buffer() using the underlying ORTE_NODEID_T
 * type; the incoming type argument is not used. Any failure is logged
 * and its code is returned to the caller.
 */
int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, void *src,
                             orte_std_cntr_t num_vals, orte_data_type_t type)
{
    int status;

    /* hand off to the packer for the real underlying type */
    status = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_NODEID_T);
    if (ORTE_SUCCESS == status) {
        return status;
    }

    ORTE_ERROR_LOG(status);
    return status;
}
/*
* JOBID
*/
@ -145,7 +150,7 @@ int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src,
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) {
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_JOBID_T))) {
ORTE_ERROR_LOG(ret);
}
@ -162,7 +167,7 @@ int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src,
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) {
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_VPID_T))) {
ORTE_ERROR_LOG(ret);
}

Просмотреть файл

@ -24,7 +24,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
static void orte_ns_base_quick_print(char **output, char *type_name, char *pfx, void *src, size_t src_size);
@ -49,6 +49,10 @@ int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_typ
orte_ns_base_quick_print(output, "ORTE_CELLID", prefix, src, sizeof(orte_cellid_t));
break;
case ORTE_NODEID:
orte_ns_base_quick_print(output, "ORTE_NODEID", prefix, src, sizeof(orte_nodeid_t));
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);
return ORTE_ERR_UNKNOWN_DATA_TYPE;
@ -69,9 +73,9 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: NULL",
(NULL == prefix ? " " : prefix));
} else {
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%lu,%lu,%lu]",
(NULL == prefix ? " " : prefix), (unsigned long)name->cellid,
(unsigned long)name->jobid, (unsigned long)name->vpid);
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld,%ld]",
(NULL == prefix ? " " : prefix), (long)name->cellid,
(long)name->jobid, (long)name->vpid);
}
return ORTE_SUCCESS;
@ -80,10 +84,10 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na
static void orte_ns_base_quick_print(char **output, char *type_name, char *prefix, void *src, size_t src_size)
{
uint8_t *ui8;
uint16_t *ui16;
uint32_t *ui32;
uint64_t *ui64;
int8_t *i8;
int16_t *i16;
int32_t *i32;
int64_t *i64;
char *pfx;
/* set default result */
@ -99,23 +103,23 @@ static void orte_ns_base_quick_print(char **output, char *type_name, char *prefi
switch(src_size) {
case 1:
ui8 = (uint8_t*)src;
asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *ui8);
i8 = (int8_t*)src;
asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *i8);
break;
case 2:
ui16 = (uint16_t*)src;
asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *ui16);
i16 = (int16_t*)src;
asprintf(output, "%sData type: %s\tValue: %d", pfx, type_name, (int) *i16);
break;
case 4:
ui32 = (uint32_t*)src;
asprintf(output, "%sData type: %s\tValue: %lu", pfx, type_name, (unsigned long) *ui32);
i32 = (int32_t*)src;
asprintf(output, "%sData type: %s\tValue: %ld", pfx, type_name, (long) *i32);
break;
case 8:
ui64 = (uint64_t*)src;
asprintf(output, "%sData type: %s\tValue: %lu", pfx, type_name, (unsigned long) *ui64);
i64 = (int64_t*)src;
asprintf(output, "%sData type: %s\tValue: %ld", pfx, type_name, (long) *i64);
break;
default:

Просмотреть файл

@ -24,7 +24,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* STANDARD RELEASE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED

Просмотреть файл

@ -24,7 +24,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* STANDARD SIZE FUNCTION - WORKS FOR EVERYTHING NON-STRUCTURED
@ -44,6 +44,10 @@ int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type)
*size = sizeof(orte_cellid_t);
break;
case ORTE_NODEID:
*size = sizeof(orte_nodeid_t);
break;
case ORTE_NAME:
*size = sizeof(orte_process_name_t);
break;

Просмотреть файл

@ -26,7 +26,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/dss/dss_internal.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* NAME
@ -123,14 +123,29 @@ int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) {
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_CELLID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
/*
* NODEID
*/
/*
 * Unpack node ids from buffer into dest.
 *
 * Forwards to orte_dss_unpack_buffer() using the underlying ORTE_NODEID_T
 * type; the incoming type argument is not used. Any failure is logged
 * and its code is returned to the caller.
 */
int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest,
                               orte_std_cntr_t *num_vals, orte_data_type_t type)
{
    int status;

    /* hand off to the unpacker for the real underlying type */
    status = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_NODEID_T);
    if (ORTE_SUCCESS == status) {
        return status;
    }

    ORTE_ERROR_LOG(status);
    return status;
}
/*
* JOBID
*/
@ -140,8 +155,7 @@ int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest,
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) {
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_JOBID_T))) {
ORTE_ERROR_LOG(ret);
}
@ -157,8 +171,7 @@ int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest,
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (
ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) {
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_VPID_T))) {
ORTE_ERROR_LOG(ret);
}

232
orte/mca/ns/base/ns_base_cell_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,232 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* "not available" functions
*/
/*
 * "Not available" stand-in for cellid creation.
 *
 * Sets *cellid to ORTE_CELLID_INVALID, logs ORTE_ERR_UNREACH and returns
 * it. The site and resource arguments are not used.
 */
int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource)
{
    const int rc = ORTE_ERR_UNREACH;

    *cellid = ORTE_CELLID_INVALID;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for the cell info lookup.
 *
 * Clears both output strings to NULL, logs ORTE_ERR_UNREACH and returns
 * it. The cellid argument is not used.
 */
int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
                                             char **site, char **resource)
{
    const int rc = ORTE_ERR_UNREACH;

    *site = NULL;
    *resource = NULL;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for nodeid creation.
 *
 * Reports an empty result (*nodeids = NULL, *nnodes = 0), logs
 * ORTE_ERR_UNREACH and returns it. The cellid and nodename arguments
 * are not used.
 */
int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
                                              orte_cellid_t cellid, char **nodename)
{
    const int rc = ORTE_ERR_UNREACH;

    *nodeids = NULL;
    *nnodes = 0;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for the node info lookup.
 *
 * Sets *nodenames to NULL, logs ORTE_ERR_UNREACH and returns it. The
 * cellid, num_nodeids and nodeids arguments are not used.
 */
int orte_ns_base_get_node_info_not_available(char ***nodenames, orte_cellid_t cellid,
                                             orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
{
    const int rc = ORTE_ERR_UNREACH;

    *nodenames = NULL;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/**** CELL STRING FUNCTIONS ****/
/*
 * Produce a newly allocated string form of the cellid field of a
 * process name.
 *
 * cellid_string  receives the allocated string; set to NULL on any
 *                failure. The caller owns (and must free) the result.
 * name           process name whose cellid is converted; must not be NULL.
 *
 * The wildcard and invalid cellid values map to
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING; any other
 * value is printed as a signed decimal.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM when name is NULL, or
 * ORTE_ERR_OUT_OF_RESOURCE when the string cannot be allocated.
 */
int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name)
{
    if (NULL == name) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid_string = NULL;
        return ORTE_ERR_BAD_PARAM;
    }

    *cellid_string = NULL;

    if (ORTE_CELLID_WILDCARD == name->cellid) {
        /* check for wildcard value - handle appropriately */
        *cellid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_CELLID_INVALID == name->cellid) {
        /* check for invalid value - handle appropriately */
        *cellid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(cellid_string, "%ld", (long) name->cellid)) {
        /* the pointer's content is not reliable after a failed asprintf */
        *cellid_string = NULL;
    }

    /* strdup and asprintf failures are both reported as out of resource,
     * instead of returning success with a NULL string */
    if (NULL == *cellid_string) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Produce a newly allocated string form of a cellid.
 *
 * cellid_string  receives the allocated string; set to NULL on failure.
 *                The caller owns (and must free) the result.
 * cellid         value to convert.
 *
 * The wildcard and invalid cellid values map to
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING; any other
 * value is printed as a signed decimal.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE when the string
 * cannot be allocated.
 */
int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid)
{
    *cellid_string = NULL;

    if (ORTE_CELLID_WILDCARD == cellid) {
        /* check for wildcard value - handle appropriately */
        *cellid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_CELLID_INVALID == cellid) {
        /* check for invalid value - handle appropriately */
        *cellid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(cellid_string, "%ld", (long) cellid)) {
        /* the pointer's content is not reliable after a failed asprintf */
        *cellid_string = NULL;
    }

    /* strdup and asprintf failures are both reported as out of resource,
     * instead of returning success with a NULL string */
    if (NULL == *cellid_string) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse the string form of a cellid.
 *
 * cellid        receives the parsed value; set to ORTE_CELLID_INVALID
 *               whenever an error is returned.
 * cellidstring  text to parse; must not be NULL.
 *
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING are
 * recognised and map to ORTE_CELLID_WILDCARD / ORTE_CELLID_INVALID
 * (both returned with ORTE_SUCCESS). Anything else is parsed as a
 * base-10 integer and must lie in [ORTE_CELLID_MIN, ORTE_CELLID_MAX].
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM for a NULL string, a
 * string with no leading number, or an out-of-range value.
 */
int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring)
{
    long int tmpint;
    char *endptr = NULL;

    if (NULL == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /** check for wildcard string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_WILDCARD;
        return ORTE_SUCCESS;
    }

    /** check for invalid string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_SUCCESS;
    }

    tmpint = strtol(cellidstring, &endptr, 10);

    /* strtol returns 0 when nothing could be converted; without this
     * check an empty or non-numeric string would silently become cellid 0.
     * Characters trailing a valid number are still tolerated, as before. */
    if (endptr == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
        *cellid = (orte_cellid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}
/**** NODEID STRING FUNCTIONS ****/
/*
 * Produce a newly allocated string form of a nodeid.
 *
 * string  receives the allocated string; set to NULL on failure. The
 *         caller owns (and must free) the result.
 * nodeid  value to convert.
 *
 * The wildcard and invalid nodeid values map to
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING; any other
 * value is printed as a signed decimal.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE when the string
 * cannot be allocated.
 */
int orte_ns_base_convert_nodeid_to_string(char **string, const orte_nodeid_t nodeid)
{
    *string = NULL;

    if (ORTE_NODEID_WILDCARD == nodeid) {
        /* check for wildcard value - handle appropriately */
        *string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_NODEID_INVALID == nodeid) {
        /* check for invalid value - handle appropriately */
        *string = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(string, "%ld", (long)nodeid)) {
        /* the pointer's content is not reliable after a failed asprintf */
        *string = NULL;
    }

    /* strdup and asprintf failures are both reported as out of resource,
     * instead of returning success with a NULL string */
    if (NULL == *string) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse the string form of a nodeid.
 *
 * nodeid  receives the parsed value; set to ORTE_NODEID_INVALID
 *         whenever an error is returned.
 * string  text to parse; must not be NULL.
 *
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING are
 * recognised and map to ORTE_NODEID_WILDCARD / ORTE_NODEID_INVALID
 * (both returned with ORTE_SUCCESS). Anything else is parsed as a
 * base-10 integer and must lie in [ORTE_NODEID_MIN, ORTE_NODEID_MAX].
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM for a NULL string, a
 * string with no leading number, or an out-of-range value.
 */
int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *nodeid, const char* string)
{
    long int tmpint;
    char *endptr = NULL;

    if (NULL == string) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *nodeid = ORTE_NODEID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /** check for wildcard character - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, string)) {
        *nodeid = ORTE_NODEID_WILDCARD;
        return ORTE_SUCCESS;
    }

    /* check for invalid value */
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, string)) {
        *nodeid = ORTE_NODEID_INVALID;
        return ORTE_SUCCESS;
    }

    tmpint = strtol(string, &endptr, 10);

    /* strtol returns 0 when nothing could be converted; without this
     * check an empty or non-numeric string would silently become nodeid 0.
     * Characters trailing a valid number are still tolerated, as before. */
    if (endptr == string) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *nodeid = ORTE_NODEID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    if (ORTE_NODEID_MAX >= tmpint && ORTE_NODEID_MIN <= tmpint) {
        *nodeid = (orte_nodeid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *nodeid = ORTE_NODEID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}

98
orte/mca/ns/base/ns_base_diag_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,98 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* "not available" functions
*/
/*
 * "Not available" stand-in for the cell dump: logs ORTE_ERR_UNREACH
 * and returns it.
 */
int orte_ns_base_dump_cells_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for the job dump: logs ORTE_ERR_UNREACH
 * and returns it.
 */
int orte_ns_base_dump_jobs_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for the tag dump: logs ORTE_ERR_UNREACH
 * and returns it.
 */
int orte_ns_base_dump_tags_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stand-in for the data type dump: logs ORTE_ERR_UNREACH
 * and returns it.
 */
int orte_ns_base_dump_datatypes_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/**** DIAGNOSTIC FUNCTIONS ****/
/*
 * Print every entry held in a dump buffer.
 *
 * For as long as orte_dss.peek() succeeds on the buffer, one value is
 * unpacked as an ORTE_STRING, written to the mca_ns_base_output stream
 * and then freed.
 *
 * Returns ORTE_SUCCESS once peek no longer succeeds, or the (logged)
 * error code of the first unpack that fails.
 */
int orte_ns_base_print_dump(orte_buffer_t *buffer)
{
    orte_data_type_t next_type;
    orte_std_cntr_t count;
    char *text;
    int status;

    /* count is reset to 1 before every peek: one string per iteration */
    for (count = 1;
         ORTE_SUCCESS == orte_dss.peek(buffer, &next_type, &count);
         count = 1) {
        status = orte_dss.unpack(buffer, &text, &count, ORTE_STRING);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }
        opal_output(mca_ns_base_output, "%s", text);
        free(text);
    }

    return ORTE_SUCCESS;
}

78
orte/mca/ns/base/ns_base_general_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,78 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* "not available" functions
*/
/*
 * "Not available" stub for module_init: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_module_init_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for assign_rml_tag.
 *
 * @param tag   Output; set to ORTE_RML_TAG_MAX.
 * @param name  Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name)
{
    const int rc = ORTE_ERR_UNREACH;

    *tag = ORTE_RML_TAG_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for define_data_type.
 *
 * @param name  Unused.
 * @param type  Output; set to ORTE_DSS_ID_MAX.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_define_data_type_not_available(const char *name,
                                                orte_data_type_t *type)
{
    const int rc = ORTE_ERR_UNREACH;

    *type = ORTE_DSS_ID_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/**** GET PEERS ****/
/*
 * "Not available" stub for get_peers.
 *
 * @param procs       Output; set to NULL.
 * @param num_procs   Output; set to 0.
 * @param attributes  Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_peers_not_available(orte_process_name_t **procs,
                                         orte_std_cntr_t *num_procs,
                                         opal_list_t *attributes)
{
    const int rc = ORTE_ERR_UNREACH;

    *procs = NULL;
    *num_procs = 0;
    ORTE_ERROR_LOG(rc);
    return rc;
}

169
orte/mca/ns/base/ns_base_job_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,169 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* "not available" functions
*/
/*
 * "Not available" stub for create_jobid.
 *
 * @param jobid  Output; set to ORTE_JOBID_INVALID.
 * @param attrs  Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs)
{
    const int rc = ORTE_ERR_UNREACH;

    *jobid = ORTE_JOBID_INVALID;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_vpid_range.
 *
 * @param job        Unused.
 * @param range      Unused.
 * @param startvpid  Output; set to ORTE_VPID_INVALID.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job,
                                              orte_vpid_t range,
                                              orte_vpid_t *startvpid)
{
    const int rc = ORTE_ERR_UNREACH;

    *startvpid = ORTE_VPID_INVALID;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_job_descendants.
 *
 * @param descendants  Output; set to NULL.
 * @param num_desc     Output; set to 0.
 * @param job          Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_job_descendants_not_available(orte_jobid_t** descendants,
                                                   orte_std_cntr_t *num_desc,
                                                   orte_jobid_t job)
{
    const int rc = ORTE_ERR_UNREACH;

    *descendants = NULL;
    *num_desc = 0;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_job_children.
 *
 * @param children    Output; set to NULL.
 * @param num_childs  Output; set to 0.
 * @param job         Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_job_children_not_available(orte_jobid_t** children,
                                                orte_std_cntr_t *num_childs,
                                                orte_jobid_t job)
{
    const int rc = ORTE_ERR_UNREACH;

    *children = NULL;
    *num_childs = 0;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_root_job.
 *
 * @param root_job  Output; set to ORTE_JOBID_INVALID.
 * @param job       Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_root_job_not_available(orte_jobid_t *root_job, orte_jobid_t job)
{
    const int rc = ORTE_ERR_UNREACH;

    *root_job = ORTE_JOBID_INVALID;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_parent_job.
 *
 * @param parent  Output; set to ORTE_JOBID_INVALID.
 * @param job     Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_parent_job_not_available(orte_jobid_t *parent, orte_jobid_t job)
{
    const int rc = ORTE_ERR_UNREACH;

    *parent = ORTE_JOBID_INVALID;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/**** JOB STRING FUNCTIONS ****/
/*
 * Render the jobid field of a process name as a newly allocated string.
 *
 * A wildcard jobid is rendered as ORTE_SCHEMA_WILDCARD_STRING; any other
 * value is printed as a signed decimal number.
 *
 * @param jobid_string  Output; receives the allocated string (caller frees),
 *                      or NULL on any failure.
 * @param name          Process name to read; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the string could not be allocated.
 */
int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name)
{
    if (NULL == name) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *jobid_string = NULL;
        return ORTE_ERR_BAD_PARAM;
    }

    /* check for wildcard value - handle appropriately */
    if (ORTE_JOBID_WILDCARD == name->jobid) {
        *jobid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
        if (NULL == *jobid_string) {
            /* do not report success with a NULL string */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return ORTE_SUCCESS;
    }

    if (0 > asprintf(jobid_string, "%ld", (long) name->jobid)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        /* the pointer's value is unspecified after a failed asprintf */
        *jobid_string = NULL;
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Render a jobid as a newly allocated string.
 *
 * A wildcard jobid is rendered as ORTE_SCHEMA_WILDCARD_STRING; any other
 * value is printed as a signed decimal number.
 *
 * @param jobid_string  Output; receives the allocated string (caller frees),
 *                      or NULL on failure.
 * @param jobid         Value to convert.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the string could
 *         not be allocated.
 */
int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid)
{
    /* check for wildcard value - handle appropriately */
    if (ORTE_JOBID_WILDCARD == jobid) {
        *jobid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
        if (NULL == *jobid_string) {
            /* do not report success with a NULL string */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return ORTE_SUCCESS;
    }

    if (0 > asprintf(jobid_string, "%ld", (long) jobid)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        /* the pointer's value is unspecified after a failed asprintf */
        *jobid_string = NULL;
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse a jobid from its string form.
 *
 * Accepts ORTE_SCHEMA_WILDCARD_STRING (the form the jobid-to-string
 * functions emit for a wildcard jobid), ORTE_SCHEMA_INVALID_STRING, or a
 * signed decimal number.
 *
 * @param jobid        Output; the parsed value, or ORTE_JOBID_INVALID on
 *                     error.
 * @param jobidstring  String to parse; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if the string is NULL or the
 *         number lies outside [ORTE_JOBID_MIN, ORTE_JOBID_MAX].
 */
int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring)
{
    long int tmpint;

    if (NULL == jobidstring) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *jobid = ORTE_JOBID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /* check for wildcard string - otherwise it would parse as the number 0 */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, jobidstring)) {
        *jobid = ORTE_JOBID_WILDCARD;
        return ORTE_SUCCESS;
    }

    /* check for the symbolic invalid string */
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, jobidstring)) {
        *jobid = ORTE_JOBID_INVALID;
        return ORTE_SUCCESS;
    }

    /* signed conversion: the value is stored in, and compared as, a long */
    tmpint = strtol(jobidstring, NULL, 10);

    /* check for invalid value given in numeric form */
    if (ORTE_JOBID_INVALID == tmpint) {
        *jobid = ORTE_JOBID_INVALID;
        return ORTE_SUCCESS;
    }

    if (ORTE_JOBID_MAX >= tmpint && ORTE_JOBID_MIN <= tmpint) {
        *jobid = (orte_jobid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *jobid = ORTE_JOBID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,678 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
/**
* globals
*/
/*
* "not available" functions
*/
/*
 * "Not available" stub for module_init: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_module_init_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for create_cellid.
 *
 * @param cellid    Output; set to ORTE_CELLID_MAX.
 * @param site      Unused.
 * @param resource  Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource)
{
    const int rc = ORTE_ERR_UNREACH;

    *cellid = ORTE_CELLID_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_cell_info.
 *
 * @param cellid    Unused.
 * @param site      Output; set to NULL.
 * @param resource  Output; set to NULL.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
                                             char **site, char **resource)
{
    const int rc = ORTE_ERR_UNREACH;

    *site = NULL;
    *resource = NULL;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for create_jobid.
 *
 * @param jobid  Output; set to ORTE_JOBID_MAX.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid)
{
    const int rc = ORTE_ERR_UNREACH;

    *jobid = ORTE_JOBID_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_vpid_range.
 *
 * @param job        Unused.
 * @param range      Unused.
 * @param startvpid  Output; set to ORTE_VPID_MAX.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job,
                                              orte_vpid_t range,
                                              orte_vpid_t *startvpid)
{
    const int rc = ORTE_ERR_UNREACH;

    *startvpid = ORTE_VPID_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for assign_rml_tag.
 *
 * @param tag   Output; set to ORTE_RML_TAG_MAX.
 * @param name  Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name)
{
    const int rc = ORTE_ERR_UNREACH;

    *tag = ORTE_RML_TAG_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for define_data_type.
 *
 * @param name  Unused.
 * @param type  Output; set to ORTE_DSS_ID_MAX.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_define_data_type_not_available(const char *name,
                                                orte_data_type_t *type)
{
    const int rc = ORTE_ERR_UNREACH;

    *type = ORTE_DSS_ID_MAX;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for create_my_name: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_create_my_name_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for get_job_peers.
 *
 * @param procs      Output; set to NULL.
 * @param num_procs  Output; set to 0.
 * @param job        Unused.
 * @return ORTE_ERR_UNREACH (also logged).
 */
int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
                                             orte_std_cntr_t *num_procs,
                                             orte_jobid_t job)
{
    const int rc = ORTE_ERR_UNREACH;

    *procs = NULL;
    *num_procs = 0;
    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for dump_cells: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_dump_cells_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for dump_jobs: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_dump_jobs_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for dump_tags: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_dump_tags_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
 * "Not available" stub for dump_datatypes: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_dump_datatypes_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/*
* functions
*/
/*
 * Set the cellid field of a process name to 0.
 *
 * @param name  Process name to update; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM (logged) if name is NULL.
 */
int orte_ns_base_assign_cellid_to_process(orte_process_name_t *name)
{
    if (NULL != name) {
        name->cellid = 0;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Allocate a process name and fill in its three fields.
 *
 * @param name  Output; receives the malloc'ed name (caller frees), or NULL
 *              on failure.
 * @param cell  Cellid; must not exceed ORTE_CELLID_MAX.
 * @param job   Jobid; must not exceed ORTE_JOBID_MAX.
 * @param vpid  Vpid; must not exceed ORTE_VPID_MAX.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if a field is out of range, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the allocation fails.
 */
int orte_ns_base_create_process_name(orte_process_name_t **name,
                                     orte_cellid_t cell,
                                     orte_jobid_t job,
                                     orte_vpid_t vpid)
{
    orte_process_name_t *created;

    *name = NULL;

    if (cell > ORTE_CELLID_MAX ||
        job > ORTE_JOBID_MAX ||
        vpid > ORTE_VPID_MAX) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    created = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
    if (NULL == created) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    created->cellid = cell;
    created->jobid = job;
    created->vpid = vpid;
    *name = created;

    return ORTE_SUCCESS;
}
/*
 * Compute a vpid as a base vpid plus an offset.
 *
 * @param vpid       Output; base_vpid plus offset (offset is cast to
 *                   orte_vpid_t before the addition).
 * @param base_vpid  Starting vpid.
 * @param offset     Offset to add.
 * @return ORTE_SUCCESS always.
 */
int orte_ns_base_derive_vpid(orte_vpid_t *vpid, orte_vpid_t base_vpid, int offset)
{
    const orte_vpid_t delta = (orte_vpid_t)offset;

    *vpid = base_vpid + delta;
    return ORTE_SUCCESS;
}
/*
 * Create a newly allocated copy of a process name.
 *
 * @param dest  Output; receives the copy (caller frees), or NULL on failure.
 * @param src   Name to copy; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if src is NULL or a field cannot
 *         be read, or the error from orte_ns_base_create_process_name().
 */
int orte_ns_base_copy_process_name(orte_process_name_t **dest,
                                   orte_process_name_t* src)
{
    orte_cellid_t src_cell;
    orte_jobid_t src_job;
    orte_vpid_t src_vpid;
    int status;

    *dest = NULL;

    if (NULL == src) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* read the fields in cellid, jobid, vpid order; stop at the first failure */
    if (ORTE_SUCCESS != orte_ns_base_get_cellid(&src_cell, src) ||
        ORTE_SUCCESS != orte_ns_base_get_jobid(&src_job, src) ||
        ORTE_SUCCESS != orte_ns_base_get_vpid(&src_vpid, src)) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    status = orte_ns_base_create_process_name(dest, src_cell, src_job, src_vpid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    return status;
}
/*
 * Render a process name as a newly allocated "cellid.jobid.vpid" string.
 *
 * @param name_string  Output; receives the allocated string (caller frees).
 *                     Set to NULL before anything else is attempted.
 * @param name         Process name to render; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_get_proc_name_string(char **name_string,
                                      const orte_process_name_t* name)
{
    *name_string = NULL;

    if (NULL != name) {
        if (0 <= asprintf(name_string, "%lu.%lu.%lu", ORTE_NAME_ARGS(name))) {
            return ORTE_SUCCESS;
        }
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Parse a "cellid.jobid.vpid" string into a newly allocated process name.
 *
 * @param name         Output; receives the allocated name (caller frees),
 *                     or NULL on failure.
 * @param name_string  String to parse; must not be NULL and must contain
 *                     three '.'-separated fields.
 * @return ORTE_SUCCESS; ORTE_ERR_BAD_PARAM if the string is NULL, a field
 *         is missing, or a field exceeds its maximum;
 *         ORTE_ERR_OUT_OF_RESOURCE if the working copy cannot be allocated;
 *         or the error from orte_ns_base_create_process_name().
 */
int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
                                                const char* name_string)
{
    char *temp, *token;
    orte_cellid_t cell;
    orte_jobid_t job;
    orte_vpid_t vpid;
    unsigned long int tmpint;
    int return_code=ORTE_SUCCESS;
    const char delimiters[] = ".";

    *name = NULL;

    /* check for NULL string - error */
    if (NULL == name_string) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* strtok modifies its input, so work on a private copy */
    temp = strdup(name_string);
    if (NULL == temp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    token = strtok(temp, delimiters); /* get first field -> cellid */
    if (NULL == token) {
        /* a missing field must not reach strtoul as a NULL pointer */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* convert to largest possible unsigned int - unsigned long long is only supported
     * in C99, so we have to use unsigned long for backward compatibility - then
     * check to ensure it is within range of cellid_t before casting */
    tmpint = strtoul(token, NULL, 10);
    if (ORTE_CELLID_MAX >= tmpint) {
        cell = (orte_cellid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    token = strtok(NULL, delimiters); /* get second field -> jobid */
    if (NULL == token) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* convert to largest possible unsigned int - then
     * check to ensure it is within range of jobid_t before casting */
    tmpint = strtoul(token, NULL, 10);
    if (ORTE_JOBID_MAX >= tmpint) {
        job = (orte_jobid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    token = strtok(NULL, delimiters); /* get third field -> vpid */
    if (NULL == token) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* convert to largest possible unsigned int then
     * check to ensure it is within range of vpid_t before casting */
    tmpint = strtoul(token, NULL, 10);
    if (ORTE_VPID_MAX >= tmpint) {
        vpid = (orte_vpid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (return_code = orte_ns_base_create_process_name(name, cell, job, vpid))) {
        ORTE_ERROR_LOG(return_code);
    }

CLEANUP:
    free(temp);

    return return_code;
}
/*
 * Render the vpid field of a process name as a newly allocated decimal
 * string.
 *
 * @param vpid_string  Output; receives the allocated string (caller frees).
 *                     Set to NULL before anything else is attempted.
 * @param name         Process name to read; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name)
{
    *vpid_string = NULL;

    if (NULL != name) {
        if (0 <= asprintf(vpid_string, "%lu", (unsigned long) name->vpid)) {
            return ORTE_SUCCESS;
        }
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Render a vpid as a newly allocated decimal string.
 *
 * @param vpid_string  Output; receives the allocated string (caller frees).
 *                     Set to NULL before the conversion is attempted.
 * @param vpid         Value to convert.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid)
{
    *vpid_string = NULL;

    if (0 <= asprintf(vpid_string, "%lu", (unsigned long) vpid)) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Parse a vpid from an unsigned decimal string.
 *
 * @param vpid        Output; the parsed value, or ORTE_VPID_MAX on error.
 * @param vpidstring  String to parse; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if the string is NULL or the
 *         number exceeds ORTE_VPID_MAX.
 */
int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring)
{
    unsigned long int parsed;

    *vpid = ORTE_VPID_MAX;

    if (NULL == vpidstring) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    parsed = strtoul(vpidstring, NULL, 10);
    if (ORTE_VPID_MAX < parsed) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *vpid = ORTE_VPID_MAX;
        return ORTE_ERR_BAD_PARAM;
    }

    *vpid = (orte_vpid_t)parsed;
    return ORTE_SUCCESS;
}
/*
 * Render the jobid field of a process name as a newly allocated decimal
 * string.
 *
 * @param jobid_string  Output; receives the allocated string (caller frees).
 *                      Set to NULL before anything else is attempted.
 * @param name          Process name to read; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name)
{
    *jobid_string = NULL;

    if (NULL != name) {
        if (0 <= asprintf(jobid_string, "%lu", (unsigned long) name->jobid)) {
            return ORTE_SUCCESS;
        }
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Render a jobid as a newly allocated decimal string.
 *
 * @param jobid_string  Output; receives the allocated string (caller frees).
 *                      Set to NULL before the conversion is attempted.
 * @param jobid         Value to convert.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid)
{
    *jobid_string = NULL;

    if (0 <= asprintf(jobid_string, "%lu", (unsigned long) jobid)) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Parse a jobid from an unsigned decimal string.
 *
 * @param jobid        Output; the parsed value, or ORTE_JOBID_MAX on error.
 * @param jobidstring  String to parse; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if the string is NULL or the
 *         number exceeds ORTE_JOBID_MAX.
 */
int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring)
{
    unsigned long int parsed;

    *jobid = ORTE_JOBID_MAX;

    if (NULL == jobidstring) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    parsed = strtoul(jobidstring, NULL, 10);
    if (ORTE_JOBID_MAX < parsed) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *jobid = ORTE_JOBID_MAX;
        return ORTE_ERR_BAD_PARAM;
    }

    *jobid = (orte_jobid_t)parsed;
    return ORTE_SUCCESS;
}
/*
 * Render the cellid field of a process name as a newly allocated decimal
 * string.
 *
 * @param cellid_string  Output; receives the allocated string (caller
 *                       frees). Set to NULL before anything else is
 *                       attempted.
 * @param name           Process name to read; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name)
{
    *cellid_string = NULL;

    if (NULL != name) {
        if (0 <= asprintf(cellid_string, "%lu", (unsigned long) name->cellid)) {
            return ORTE_SUCCESS;
        }
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Render a cellid as a newly allocated decimal string.
 *
 * @param cellid_string  Output; receives the allocated string (caller
 *                       frees). Set to NULL before the conversion is
 *                       attempted.
 * @param cellid         Value to convert.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if asprintf fails.
 */
int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid)
{
    *cellid_string = NULL;

    if (0 <= asprintf(cellid_string, "%lu", (unsigned long) cellid)) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Parse a cellid from an unsigned decimal string.
 *
 * @param cellid        Output; the parsed value, or ORTE_CELLID_MAX on
 *                      error.
 * @param cellidstring  String to parse; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if the string is NULL or the
 *         number exceeds ORTE_CELLID_MAX.
 */
int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring)
{
    unsigned long int parsed;

    *cellid = ORTE_CELLID_MAX;

    if (NULL == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    parsed = strtoul(cellidstring, NULL, 10);
    if (ORTE_CELLID_MAX < parsed) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_MAX;
        return ORTE_ERR_BAD_PARAM;
    }

    *cellid = (orte_cellid_t)parsed;
    return ORTE_SUCCESS;
}
/*
 * Read the vpid field of a process name.
 *
 * @param vpid  Output; the name's vpid, or ORTE_VPID_MAX if name is NULL.
 * @param name  Process name to read.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM (logged) if name is NULL.
 */
int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name)
{
    /* value reported when the lookup fails */
    *vpid = ORTE_VPID_MAX;

    if (NULL != name) {
        *vpid = name->vpid;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Read the jobid field of a process name.
 *
 * @param jobid  Output; the name's jobid, or ORTE_JOBID_MAX if name is NULL.
 * @param name   Process name to read.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM (logged) if name is NULL.
 */
int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name)
{
    /* value reported when the lookup fails */
    *jobid = ORTE_JOBID_MAX;

    if (NULL != name) {
        *jobid = name->jobid;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Read the cellid field of a process name.
 *
 * @param cellid  Output; the name's cellid, or ORTE_CELLID_MAX if name is
 *                NULL.
 * @param name    Process name to read.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM (logged) if name is NULL.
 */
int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name)
{
    /* value reported when the lookup fails */
    *cellid = ORTE_CELLID_MAX;

    if (NULL != name) {
        *cellid = name->cellid;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    return ORTE_ERR_BAD_PARAM;
}
/*
 * Compare selected fields of two process names.
 *
 * Fields are examined in cellid, jobid, vpid order, and only those whose
 * ORTE_NS_CMP_* bit is set in the mask take part. The first selected field
 * that differs decides the result.
 *
 * @param fields  Bitmask of ORTE_NS_CMP_CELLID / _JOBID / _VPID.
 * @param name1   First name (may be NULL).
 * @param name2   Second name (may be NULL).
 * @return -1 if name1 orders before name2, 1 if after, 0 if every selected
 *         field is equal. A NULL name orders before a non-NULL one; two
 *         NULL names compare equal.
 */
int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields,
                         const orte_process_name_t* name1,
                         const orte_process_name_t* name2)
{
    /* NULL handling */
    if (NULL == name1) {
        return (NULL == name2) ? 0 : -1;
    }
    if (NULL == name2) {
        return 1;
    }

    /* cellid decides first, when selected */
    if ((ORTE_NS_CMP_CELLID & fields) && name1->cellid != name2->cellid) {
        return (name1->cellid < name2->cellid) ? -1 : 1;
    }

    /* then jobid: reached when cellids are equal or not being checked */
    if ((ORTE_NS_CMP_JOBID & fields) && name1->jobid != name2->jobid) {
        return (name1->jobid < name2->jobid) ? -1 : 1;
    }

    /* finally vpid */
    if ((ORTE_NS_CMP_VPID & fields) && name1->vpid != name2->vpid) {
        return (name1->vpid < name2->vpid) ? -1 : 1;
    }

    /* every field that was selected matched */
    return 0;
}
/*
 * Free a process name and clear the caller's pointer to it.
 *
 * @param name  Address of the pointer to free. May be NULL, and may point
 *              to a NULL pointer; both cases are no-ops.
 * @return ORTE_SUCCESS always.
 */
int orte_ns_base_free_name(orte_process_name_t **name)
{
    /* nothing to free or clear when no pointer was supplied */
    if (NULL == name) {
        return ORTE_SUCCESS;
    }

    free(*name); /* free(NULL) is a no-op */
    *name = NULL;

    return ORTE_SUCCESS;
}
/*
 * Build the list of process names for this process's peers.
 *
 * The list has orte_process_info.num_procs entries that share this
 * process's cellid and jobid, with vpids counting up from
 * orte_process_info.vpid_start.
 *
 * @param procs      Output; receives the malloc'ed array (caller frees), or
 *                   NULL on failure.
 * @param num_procs  Output; number of entries in the array.
 * @param self       Output; index of this process within the array.
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the array cannot be
 *         allocated, or the error from the failing name lookup.
 */
int orte_ns_base_get_peers(orte_process_name_t **procs,
                           orte_std_cntr_t *num_procs, orte_std_cntr_t *self)
{
    orte_std_cntr_t i;
    int rc;
    orte_cellid_t mycellid;
    orte_jobid_t myjobid;
    orte_vpid_t myvpid;
    orte_process_name_t *peers;

    *procs = NULL;

    /* resolve our own name first so that nothing is allocated if this fails */
    if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mycellid, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* capture rc for every lookup so the real error is logged and returned */
    if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&myjobid, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns.get_vpid(&myvpid, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    peers = (orte_process_name_t*)malloc(orte_process_info.num_procs *
                                         sizeof(orte_process_name_t));
    if (NULL == peers) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    for (i=0; i < orte_process_info.num_procs; i++) {
        peers[i].cellid = mycellid;
        peers[i].jobid = myjobid;
        peers[i].vpid = orte_process_info.vpid_start + i;
    }

    *procs = peers;
    *num_procs = orte_process_info.num_procs;
    *self = (orte_std_cntr_t)(myvpid - orte_process_info.vpid_start);

    return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC FUNCTIONS
*/
/*
 * Send every string packed in a dump buffer to the name service's
 * output stream, one opal_output() call per string.
 *
 * @param buffer  Buffer to drain. Strings are unpacked for as long as
 *                orte_dss.peek() reports success.
 * @return ORTE_SUCCESS once peek stops succeeding, or the (logged)
 *         error code returned by orte_dss.unpack().
 */
int orte_ns_base_print_dump(orte_buffer_t *buffer)
{
    orte_data_type_t next_type;
    orte_std_cntr_t count = 1;
    char *text;
    int status;

    for (;;) {
        if (ORTE_SUCCESS != orte_dss.peek(buffer, &next_type, &count)) {
            break;
        }

        status = orte_dss.unpack(buffer, &text, &count, ORTE_STRING);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }

        opal_output(mca_ns_base_output, "%s", text);
        free(text);

        /* count is passed by pointer above, so reset it for the next pass */
        count = 1;
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -29,6 +29,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
/*
@ -43,7 +44,9 @@
* globals
*/
orte_process_name_t orte_name_all = {ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_MAX};
orte_process_name_t orte_ns_name_wildcard = {ORTE_CELLID_WILDCARD, ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
orte_process_name_t orte_ns_name_invalid = {ORTE_CELLID_INVALID, ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
orte_process_name_t orte_ns_name_my_hnp = {0, 0, 0};
/*
* Global variables
@ -54,35 +57,37 @@ mca_ns_base_module_t orte_ns = {
orte_ns_base_module_init_not_available,
/* cell functions */
orte_ns_base_create_cellid_not_available,
orte_ns_base_get_cellid,
orte_ns_base_get_cell_info_not_available,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* node functions */
orte_ns_base_create_nodeids_not_available,
orte_ns_base_get_node_info_not_available,
orte_ns_base_convert_nodeid_to_string,
orte_ns_base_convert_string_to_nodeid,
/* jobid functions */
orte_ns_base_create_jobid_not_available,
orte_ns_base_get_jobid,
orte_ns_base_get_job_descendants_not_available,
orte_ns_base_get_job_children_not_available,
orte_ns_base_get_root_job_not_available,
orte_ns_base_get_parent_job_not_available,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_base_get_vpid_range_not_available,
orte_ns_base_get_vpid,
/* vpid functions */
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_base_create_my_name_not_available,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_compare,
orte_ns_base_compare_fields,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_base_get_job_peers_not_available,
orte_ns_base_get_peers_not_available,
/* tag server functions */
orte_ns_base_assign_rml_tag_not_available,
/* data type functions */

397
orte/mca/ns/base/ns_base_vpid_name_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,397 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/mca/mca.h"
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/ns_private.h"
/*
* "not available" functions
*/
/*
 * "Not available" stub for create_my_name: logs ORTE_ERR_UNREACH and
 * returns that same code.
 */
int orte_ns_base_create_my_name_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
/**** NAME STRING FUNCTIONS ****/
/*
 * Render a process name as a newly allocated string of the form
 * <cellid><delim><jobid><delim><vpid>, where <delim> is
 * ORTE_SCHEMA_DELIMITER_CHAR.
 *
 * A field holding its WILDCARD or INVALID value is rendered as
 * ORTE_SCHEMA_WILDCARD_STRING or ORTE_SCHEMA_INVALID_STRING respectively;
 * any other value is printed as a signed decimal number.
 *
 * @param name_string  Output; receives the allocated string (caller frees),
 *                     or NULL on failure.
 * @param name         Process name to render; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if any allocation fails.
 */
int orte_ns_base_get_proc_name_string(char **name_string,
                                      const orte_process_name_t* name)
{
    char *tmp = NULL, *tmp2 = NULL;
    int rc;

    *name_string = NULL;

    if (NULL == name) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* handle the more typical case where none of the fields
     * contain WILDCARD or INVALID values
     */
    if ((ORTE_CELLID_WILDCARD != name->cellid && ORTE_CELLID_INVALID != name->cellid) &&
        (ORTE_JOBID_WILDCARD != name->jobid && ORTE_JOBID_INVALID != name->jobid) &&
        (ORTE_VPID_WILDCARD != name->vpid && ORTE_VPID_INVALID != name->vpid)) {
        if (0 > asprintf(name_string, "%ld%c%ld%c%ld", (long)name->cellid,
                         ORTE_SCHEMA_DELIMITER_CHAR, (long)name->jobid,
                         ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid)) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            *name_string = NULL;
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return ORTE_SUCCESS;
    }

    /* okay, now handle the corner cases - start with the cellid field */
    if (ORTE_CELLID_WILDCARD == name->cellid) {
        tmp = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_CELLID_INVALID == name->cellid) {
        tmp = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(&tmp, "%ld", (long)name->cellid)) {
        tmp = NULL; /* pointer value is unspecified after a failed asprintf */
    }
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* append the jobid field */
    if (ORTE_JOBID_WILDCARD == name->jobid) {
        rc = asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR,
                      ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_JOBID_INVALID == name->jobid) {
        rc = asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR,
                      ORTE_SCHEMA_INVALID_STRING);
    } else {
        rc = asprintf(&tmp2, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR,
                      (long)name->jobid);
    }
    free(tmp);
    if (0 > rc) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* append the vpid field to produce the final string */
    if (ORTE_VPID_WILDCARD == name->vpid) {
        rc = asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR,
                      ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_VPID_INVALID == name->vpid) {
        rc = asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR,
                      ORTE_SCHEMA_INVALID_STRING);
    } else {
        rc = asprintf(name_string, "%s%c%ld", tmp2, ORTE_SCHEMA_DELIMITER_CHAR,
                      (long)name->vpid);
    }
    free(tmp2);
    if (0 > rc) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        *name_string = NULL;
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse a process-name string into a newly allocated process name.
 *
 * The string holds three fields (cellid, jobid, vpid) separated by
 * ORTE_SCHEMA_DELIMITER_STRING. Each field is either
 * ORTE_SCHEMA_WILDCARD_STRING, ORTE_SCHEMA_INVALID_STRING, or a signed
 * decimal number within that field's MIN..MAX range.
 *
 * @param name         Output; receives the allocated name (caller frees),
 *                     or NULL on failure.
 * @param name_string  String to parse; must not be NULL.
 * @return ORTE_SUCCESS; ORTE_ERR_BAD_PARAM if the string is NULL, a field
 *         is missing, or a number is out of range;
 *         ORTE_ERR_OUT_OF_RESOURCE if the working copy cannot be allocated;
 *         or the error from orte_ns_base_create_process_name().
 */
int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
                                                const char* name_string)
{
    char *temp, *token;
    orte_cellid_t cell;
    orte_jobid_t job;
    orte_vpid_t vpid;
    long int tmpint;
    int return_code=ORTE_SUCCESS;

    /* make sure the caller never sees an indeterminate pointer on failure */
    *name = NULL;

    /* check for NULL string - error */
    if (NULL == name_string) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    temp = strdup(name_string); /** copy input string as the strtok process is destructive */
    if (NULL == temp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> cellid */

    /* check for error - leave through CLEANUP so the copy is released */
    if (NULL == token) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* check for WILDCARD / INVALID strings first; otherwise convert to the
     * largest possible int and check that it is within range of cellid_t
     * before casting
     */
    if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
        cell = ORTE_CELLID_WILDCARD;
    } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
        cell = ORTE_CELLID_INVALID;
    } else {
        tmpint = strtol(token, NULL, 10);
        if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
            cell = (orte_cellid_t)tmpint;
        } else {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return_code = ORTE_ERR_BAD_PARAM;
            goto CLEANUP;
        }
    }

    token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> jobid */

    /* check for error */
    if (NULL == token) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* same scheme as above, within range of jobid_t */
    if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
        job = ORTE_JOBID_WILDCARD;
    } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
        job = ORTE_JOBID_INVALID;
    } else {
        tmpint = strtol(token, NULL, 10);
        if (ORTE_JOBID_MAX >= tmpint && ORTE_JOBID_MIN <= tmpint) {
            job = (orte_jobid_t)tmpint;
        } else {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return_code = ORTE_ERR_BAD_PARAM;
            goto CLEANUP;
        }
    }

    token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> vpid */

    /* check for error */
    if (NULL == token) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return_code = ORTE_ERR_BAD_PARAM;
        goto CLEANUP;
    }

    /* same scheme as above, within range of vpid_t */
    if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
        vpid = ORTE_VPID_WILDCARD;
    } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
        vpid = ORTE_VPID_INVALID;
    } else {
        tmpint = strtol(token, NULL, 10);
        if (ORTE_VPID_MAX >= tmpint && ORTE_VPID_MIN <= tmpint) {
            vpid = (orte_vpid_t)tmpint;
        } else {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return_code = ORTE_ERR_BAD_PARAM;
            goto CLEANUP;
        }
    }

    if (ORTE_SUCCESS != (return_code =
                         orte_ns_base_create_process_name(name, cell, job, vpid))) {
        ORTE_ERROR_LOG(return_code);
    }

CLEANUP:
    free(temp);

    return return_code;
}
/**** CREATE PROCESS NAME ****/
/*
 * Allocate a process name and fill in its three fields. No range checking
 * is done on the supplied values.
 *
 * @param name  Output; receives the malloc'ed name (caller frees), or NULL
 *              if the allocation fails.
 * @param cell  Cellid to store.
 * @param job   Jobid to store.
 * @param vpid  Vpid to store.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE (logged) if the
 *         allocation fails.
 */
int orte_ns_base_create_process_name(orte_process_name_t **name,
                                     orte_cellid_t cell,
                                     orte_jobid_t job,
                                     orte_vpid_t vpid)
{
    orte_process_name_t *created;

    created = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
    *name = created;
    if (NULL == created) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    created->cellid = cell;
    created->jobid = job;
    created->vpid = vpid;

    return ORTE_SUCCESS;
}
/**** VPID STRING FUNCTIONS ****/
/*
 * Render the vpid field of a process name as a newly allocated string.
 *
 * WILDCARD and INVALID vpids are rendered as ORTE_SCHEMA_WILDCARD_STRING
 * and ORTE_SCHEMA_INVALID_STRING; any other value is printed as a signed
 * decimal number.
 *
 * @param vpid_string  Output; receives the allocated string (caller frees),
 *                     or NULL on any failure.
 * @param name         Process name to read; must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name is NULL, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the string could not be allocated.
 */
int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name)
{
    if (NULL == name) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *vpid_string = NULL;
        return ORTE_ERR_BAD_PARAM;
    }

    if (ORTE_VPID_WILDCARD == name->vpid) {
        /* check for wildcard value - handle appropriately */
        *vpid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_VPID_INVALID == name->vpid) {
        /* check for invalid value - handle appropriately */
        *vpid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(vpid_string, "%ld", (long) name->vpid)) {
        /* the pointer's value is unspecified after a failed asprintf */
        *vpid_string = NULL;
    }

    /* every branch above leaves NULL behind when its allocation failed */
    if (NULL == *vpid_string) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Render a vpid as a newly allocated string.
 *
 * WILDCARD and INVALID vpids are rendered as ORTE_SCHEMA_WILDCARD_STRING
 * and ORTE_SCHEMA_INVALID_STRING; any other value is printed as a signed
 * decimal number.
 *
 * @param vpid_string  Output; receives the allocated string (caller frees),
 *                     or NULL on failure.
 * @param vpid         Value to convert.
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the string could
 *         not be allocated.
 */
int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid)
{
    if (ORTE_VPID_WILDCARD == vpid) {
        /* check for wildcard value - handle appropriately */
        *vpid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
    } else if (ORTE_VPID_INVALID == vpid) {
        /* check for invalid value - handle appropriately */
        *vpid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
    } else if (0 > asprintf(vpid_string, "%ld", (long) vpid)) {
        /* the pointer's value is unspecified after a failed asprintf */
        *vpid_string = NULL;
    }

    /* every branch above leaves NULL behind when its allocation failed */
    if (NULL == *vpid_string) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse a vpid from its string form.
 *
 * Accepts ORTE_SCHEMA_WILDCARD_STRING, ORTE_SCHEMA_INVALID_STRING, or a
 * signed decimal number within [ORTE_VPID_MIN, ORTE_VPID_MAX].
 *
 * @param vpid        Output; the parsed value, or ORTE_VPID_INVALID on
 *                    error.
 * @param vpidstring  String to parse; must not be NULL.
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM (logged) if the string is
 *         NULL or the number is out of range.
 */
int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring)
{
    long int parsed;

    if (NULL == vpidstring) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *vpid = ORTE_VPID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /* symbolic forms: wildcard first, then invalid */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, vpidstring)) {
        *vpid = ORTE_VPID_WILDCARD;
        return ORTE_SUCCESS;
    }
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, vpidstring)) {
        *vpid = ORTE_VPID_INVALID;
        return ORTE_SUCCESS;
    }

    /* numeric form: reject anything outside the vpid range */
    parsed = strtol(vpidstring, NULL, 10);
    if (ORTE_VPID_MAX < parsed || ORTE_VPID_MIN > parsed) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *vpid = ORTE_VPID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    *vpid = (orte_vpid_t)parsed;
    return ORTE_SUCCESS;
}
/**** COMPARE NAME FIELDS ****/
/*
 * Compare selected fields of two process names for exact equality.
 *
 * Fields are examined in cellid, jobid, vpid order, and only those whose
 * ORTE_NS_CMP_* bit is set in the mask take part. Wildcard values get no
 * special treatment here: they are compared as the specific values they
 * are stored as.
 *
 * @param fields  Bitmask of ORTE_NS_CMP_CELLID / _JOBID / _VPID.
 * @param name1   First name (may be NULL).
 * @param name2   Second name (may be NULL).
 * @return ORTE_VALUE1_GREATER, ORTE_VALUE2_GREATER, or ORTE_EQUAL. A NULL
 *         name is treated as smaller than a non-NULL one; two NULL names
 *         are equal.
 */
int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
                                const orte_process_name_t* name1,
                                const orte_process_name_t* name2)
{
    /* handle the NULL pointer case */
    if (NULL == name1) {
        return (NULL == name2) ? ORTE_EQUAL : ORTE_VALUE2_GREATER;
    }
    if (NULL == name2) {
        return ORTE_VALUE1_GREATER;
    }

    /* cellid decides first, when selected */
    if ((ORTE_NS_CMP_CELLID & fields) && name1->cellid != name2->cellid) {
        return (name1->cellid < name2->cellid) ? ORTE_VALUE2_GREATER
                                               : ORTE_VALUE1_GREATER;
    }

    /* then jobid: reached when cellids are equal or not being checked */
    if ((ORTE_NS_CMP_JOBID & fields) && name1->jobid != name2->jobid) {
        return (name1->jobid < name2->jobid) ? ORTE_VALUE2_GREATER
                                             : ORTE_VALUE1_GREATER;
    }

    /* finally vpid */
    if ((ORTE_NS_CMP_VPID & fields) && name1->vpid != name2->vpid) {
        return (name1->vpid < name2->vpid) ? ORTE_VALUE2_GREATER
                                           : ORTE_VALUE1_GREATER;
    }

    /* every field that was selected matched */
    return ORTE_EQUAL;
}

285
orte/mca/ns/base/ns_private.h Обычный файл
Просмотреть файл

@ -0,0 +1,285 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_NS_PRIVATE_H
#define MCA_NS_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/dss/dss_types.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/rml/rml_types.h"
/*
* Global functions for MCA overall collective open and close
*/
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* default limits */
/* NOTE(review): INT_MAX is defined by <limits.h>, which this header does
 * not include directly - presumably it arrives through orte_config.h or
 * another include above; confirm.
 * NOTE(review): judging by the names, these look like the upper bound and
 * the growth increment for arrays kept by the name service - confirm
 * against the code that uses them.
 */
#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX
#define ORTE_NS_ARRAY_BLOCK_SIZE 512
/*
* Internal definitions
*/
/* Both command types are unsigned 8-bit integers.
 * NOTE(review): the command values defined further down in this header are
 * cast to int8_t and ORTE_NS_CMD maps to ORTE_INT8, whereas
 * orte_ns_cmd_flag_t is uint8_t - check that the signedness difference is
 * intentional.
 */
typedef uint8_t orte_ns_cmd_bitmask_t;
typedef uint8_t orte_ns_cmd_flag_t;
/*
* packing type definitions
*/
/* CAUTION - any changes here must also change corresponding
* typedefs above and in ns_types.h
*/
/* Each name service type is mapped onto a fixed-width ORTE data type:
 * commands use ORTE_INT8, all id types use ORTE_INT32.
 */
#define ORTE_NS_CMD ORTE_INT8
#define ORTE_CELLID_T ORTE_INT32
#define ORTE_NODEID_T ORTE_INT32
#define ORTE_JOBID_T ORTE_INT32
#define ORTE_VPID_T ORTE_INT32
/*
* define flag values for remote commands - only used internally
*/
/* Command codes, numbered consecutively from 1 to 18. Every value is an
 * int8_t. Each replacement list is wrapped in parentheses so the macro
 * expands to a single primary expression and stays intact next to any
 * operator (e.g. "sizeof ORTE_NS_GET_PEERS_CMD").
 */
#define ORTE_NS_CREATE_CELLID_CMD       ((int8_t)  1)
#define ORTE_NS_GET_CELL_INFO_CMD       ((int8_t)  2)
#define ORTE_NS_CREATE_NODEID_CMD       ((int8_t)  3)
#define ORTE_NS_GET_NODE_INFO_CMD       ((int8_t)  4)
#define ORTE_NS_CREATE_JOBID_CMD        ((int8_t)  5)
#define ORTE_NS_GET_JOB_DESC_CMD        ((int8_t)  6)
#define ORTE_NS_GET_JOB_CHILD_CMD       ((int8_t)  7)
#define ORTE_NS_GET_ROOT_JOB_CMD        ((int8_t)  8)
#define ORTE_NS_GET_PARENT_JOB_CMD      ((int8_t)  9)
#define ORTE_NS_RESERVE_RANGE_CMD       ((int8_t) 10)
#define ORTE_NS_ASSIGN_OOB_TAG_CMD      ((int8_t) 11)
#define ORTE_NS_GET_PEERS_CMD           ((int8_t) 12)
#define ORTE_NS_DEFINE_DATA_TYPE_CMD    ((int8_t) 13)
#define ORTE_NS_CREATE_MY_NAME_CMD      ((int8_t) 14)
#define ORTE_NS_DUMP_CELLS_CMD          ((int8_t) 15)
#define ORTE_NS_DUMP_JOBIDS_CMD         ((int8_t) 16)
#define ORTE_NS_DUMP_TAGS_CMD           ((int8_t) 17)
#define ORTE_NS_DUMP_DATATYPES_CMD      ((int8_t) 18)
/*
* Base functions that are common to all implementations - can be overridden
*/
ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
const char* name_string);
ORTE_DECLSPEC int orte_ns_base_get_proc_name_string(char **name_string,
const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_vpid_string(char **vpid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring);
ORTE_DECLSPEC int orte_ns_base_get_jobid_string(char **jobid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, const orte_jobid_t jobid);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring);
ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring);
ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid);
ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *cellid, const char *string);
ORTE_DECLSPEC int orte_ns_base_convert_nodeid_to_string(char **nodeid_string, const orte_nodeid_t nodeid);
ORTE_DECLSPEC int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);
ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer);
/* not available functions */
/* One "_not_available" counterpart per name service operation: module
 * init, cell, node, job, vpid range, RML tag, data type, my-name, peers
 * and the dump diagnostics.
 * NOTE(review): going by the suffix, these presumably act as placeholders
 * for the module function pointers until a real NS component is selected;
 * what they return is not visible in this header - confirm in the
 * implementation.
 */
ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void);
/* cell operations */
ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid,
char *site, char *resource);
ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
char **site, char **resource);
/* node operations */
ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodename);
ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
/* job operations, including the job-family queries (descendants, children,
 * root, parent)
 */
ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs);
ORTE_DECLSPEC int orte_ns_base_get_job_descendants_not_available(orte_jobid_t** descendants,
orte_std_cntr_t *num_desc,
orte_jobid_t job);
ORTE_DECLSPEC int orte_ns_base_get_job_children_not_available(orte_jobid_t** children,
orte_std_cntr_t *num_childs,
orte_jobid_t job);
ORTE_DECLSPEC int orte_ns_base_get_root_job_not_available(orte_jobid_t *root_job, orte_jobid_t job);
ORTE_DECLSPEC int orte_ns_base_get_parent_job_not_available(orte_jobid_t *parent, orte_jobid_t job);
/* vpid range reservation */
ORTE_DECLSPEC int orte_ns_base_get_vpid_range_not_available(orte_jobid_t job,
orte_vpid_t range,
orte_vpid_t *startvpid);
/* RML tag and data type assignment */
ORTE_DECLSPEC int orte_ns_base_assign_rml_tag_not_available(orte_rml_tag_t *tag, char *name);
ORTE_DECLSPEC int orte_ns_base_define_data_type_not_available(
const char *name,
orte_data_type_t *type);
/* naming and peer lookup */
ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
ORTE_DECLSPEC int orte_ns_base_get_peers_not_available(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, opal_list_t *attributes);
/* diagnostics */
ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void);
/* Base functions used everywhere */
/* Pack routines, one per name service type (name, cellid, nodeid, jobid,
 * vpid). All share the shape (buffer, src, num_vals, type): src is an
 * untyped pointer and num_vals is passed by value.
 * NOTE(review): presumably these are the DSS pack callbacks for the NS data
 * types - confirm where they are registered.
 */
ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_jobid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
/* Matching unpack routines. Here the shape is (buffer, dest, num_vals,
 * type) and num_vals is a pointer rather than a value.
 * NOTE(review): num_vals is presumably in/out (capacity in, count
 * unpacked out) - confirm against the implementation.
 */
ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_jobid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_vpid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
/*
* copy functions
*/
/* One copy routine per name service type. Each takes the address of a
 * destination pointer, a source pointer and the data type.
 * NOTE(review): the double-pointer dest suggests the routine allocates the
 * copy and the caller owns it - confirm against the implementation.
 * Unlike the declarations earlier in this header, the prototypes from here
 * on carry no ORTE_DECLSPEC.
 */
int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src, orte_data_type_t type);
int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type);
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type);
int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type);
int orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type);
/*
* compare functions
*/
/* One compare routine per name service type, each taking two values of
 * that type by pointer plus the data type.
 * NOTE(review): the int result is presumably one of ORTE_EQUAL /
 * ORTE_VALUE1_GREATER / ORTE_VALUE2_GREATER, as for
 * orte_ns_base_compare_fields - confirm.
 */
int orte_ns_base_compare_name(orte_process_name_t *value1,
orte_process_name_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_vpid(orte_vpid_t *value1,
orte_vpid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_jobid(orte_jobid_t *value1,
orte_jobid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
orte_cellid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
orte_nodeid_t *value2,
orte_data_type_t type);
/*
* size functions
*/
/* A single size routine for all name service types: the size is written
 * through the size pointer, and src is an untyped pointer.
 */
int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type);
/*
* release functions
*/
/* A single release routine operating on an orte_data_value_t; it returns
 * nothing.
 */
void orte_ns_base_std_release(orte_data_value_t *value);
/*
* print functions
*/
/* Print routines hand their result back through the output string pointer;
 * prefix is supplied by the caller. The generic form takes an untyped src,
 * the name form takes a process name.
 * NOTE(review): output is presumably a newly allocated string that the
 * caller must free - confirm.
 */
int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_type_t type);
int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *name, orte_data_type_t type);
/*
* external API functions will be documented in the mca/ns/ns.h file
*/
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -39,7 +39,7 @@
#include "orte/dss/dss.h"
#include "opal/mca/mca.h"
#include "orte/mca/oob/oob_types.h"
#include "orte/mca/rml/rml_types.h"
#include "ns_types.h"
@ -59,26 +59,18 @@ typedef int (*orte_ns_base_module_init_fn_t)(void);
/**** CELL FUNCTIONS ****/
/**
* Create a new cell id.
* The create_cellid() function allocates a new cell id for use by the caller.
* The function checks to find the next available cell id, reserves it, and returns that
* number. No memory for names is allocated by this process. The range of answers is from
* 1 to MCA_NS_BASE_CELLID_MAX-1 (zero is reserved for the seed name and cannot therefore be
* allocated).
* Allocates a new cell id for use by the caller. The function returns an
* existing cellid if the specified site/resource already has been assigned
* one.
*
* @param site The name of the site where the cell is located.
* @param resource The name of the resource associated with this cell (e.g., the name
* of the cluster).
* @param cellid The numerical value of the allocated cell id. A value of
* MCA_NS_BASE_CELLID_MAX indicates
* that an error occurred - this represents a very unlikely
* event meaning that the system ran out of cell id's. This probably indicates
* an error in the calling program as the number of available cell id's is extremely large.
* @param cellid The location where the cellid is to be stored.
*
* @retval ORTE_SUCCESS A cellid was created and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*
* @code
* new_cellid = ompi_name_server.create_cellid()
* @endcode
*/
typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid,
@ -97,34 +89,6 @@ typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid,
typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid,
char **site, char **resource);
/**
* Get the cell id for a process.
* The cellid designator represents the physical location of the process - it is associated with
* the hardware/system where the process is executing. Each process name contains this identifier
* so that the system can issue commands (e.g., "die") to a collection of processes that are
* executing on a common platform.
*
* Given that usage, it is necessary that the system have a way of telling a process its cellid.
* The create_cellid() function is used by the system to associate a "cellid" identifier with
* each platform. This function - assign_cellid_to_process() - is used to inform the process
* of its cellid.
*
* Given a process name, this function will lookup its current platform and update the name with the
* cellid.
*
* @param name Pointer to an ompi_process_name structure. The function will update the cellid
* entry in the structure.
*
* @retval ORTE_SUCCESS Update was successful.
* @retval OMPI_ERROR Update failed, most likely due to either a NULL process name pointer or the
* inability to locate the process name in the lookup table.
*
* @code
* return_value = ompi_name_server.assign_cellid_to_process(ompi_process_name_t* name);
* @endcode
*/
typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
@ -181,50 +145,105 @@ typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string,
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**
* Get the cell id as a numberic value.
* The get_cellid() function returns the cell id in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval cellid The cell id field of the provided name.
* @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* cellid = ompi_name_server.get_cellid(&name)
* @endcode
/**** NODE FUNCTIONS ****/
/*
* Get an array of node id's
* Given the cell and a NULL-terminated array of names of nodes within it, this function assigns an id to represent
* each node within the cell.
*/
typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name);
typedef int (*orte_ns_base_module_create_nodeids_fn_t)(orte_nodeid_t **nodes, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodename);
/*
* Get node info
* Retrieve the names of an array of nodes given their cellid and nodeids. The cellid
* is required as the nodeids are only unique within a given cell.
*
* @param cellid The id of the cell of the node.
* @param nodeids The ids of the node.
* @param nodenames Returns a pointer to a NULL-terminated array of strdup'd strings containing the node names.
* @retval ORTE_SUCCESS The nodename was created and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*/
typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
/*
* Convert nodeid to character string
* Returns the nodeid in a character string representation. The string is created
* by expressing the provided nodeid in decimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param nodeid The nodeid to be converted.
*
* @param *nodeid_string A pointer to a character string representation of the nodeid.
* @retval ORTE_SUCCESS The string was created and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*/
typedef int (*orte_ns_base_module_convert_nodeid_to_string_fn_t)(char **nodeid_string, const orte_nodeid_t nodeid);
/*
* Convert a string to a nodeid.
* Converts a character string into a nodeid. The character string must be a
* decimal representation of a valid nodeid.
*
* @param nodeidstring The string to be converted.
*
* @param nodeid A pointer to a location where the resulting nodeid is to be stored.
* @retval ORTE_SUCCESS The string was converted and the nodeid was stored.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*/
typedef int (*orte_ns_base_module_convert_string_to_nodeid_fn_t)(orte_nodeid_t *nodeid, const char *nodeidstring);
/**** JOB ID FUNCTIONS ****/
/**
* Create a new job id.
* The create_jobid() function allocates a new job id for use by the caller.
* The function checks to find the next available job id, reserves it, and returns that
* number. No memory for names is allocated by this process. The range of answers is from
* 1 to MCA_NS_BASE_JOBID_MAX-1 (zero is reserved for the seed name and cannot therefore be
* allocated).
*
* Allocate a new job id for use by the caller.
*
* The 0 job id is reserved for daemons within the system and will not be allocated.
* Developers should therefore assume that the daemon job id is automatically allocated
* and proceed to request names against it.
*
* @param None
* @retval jobid The numerical value of the allocated job id. A value of
* MCA_NS_BASE_JOBID_MAX indicates
* that an error occurred - this represents a very unlikely
* event meaning that the system ran out of job id's. This probably indicates
* an error in the calling program as the number of available job id's is extremely large.
*
* @code
* new_jobid = ompi_name_server.create_jobid()
* @endcode
* @param jobid A pointer to the location where the jobid is to be returned.
* @param attrs A list of attributes that describe any conditions to be placed on
* the assigned jobid. For example, specifying USE_PARENT indicates that the specified
* jobid is to be identified as the parent of the new jobid. USE_ROOT indicates that
* the root of the job family of the specified jobid is to be identified as the parent.
*/
typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid);
typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid, opal_list_t *attrs);
/*
* Get job descendants
* Given a jobid, return the array of jobids that descend from this one.
*/
typedef int (*orte_ns_base_module_get_job_descendants_fn_t)(orte_jobid_t** descendants,
orte_std_cntr_t *num_desc,
orte_jobid_t job);
/*
* Get job children
* Given a jobid, return the array of jobids that are direct children of that job
*/
typedef int (*orte_ns_base_module_get_job_children_fn_t)(orte_jobid_t** children,
orte_std_cntr_t *num_childs,
orte_jobid_t job);
/*
* Get root job from job family
* Given a jobid, return the jobid at the head of this job's family. If the jobid provided is the
* root for that family, that value will be returned.
*/
typedef int (*orte_ns_base_module_get_root_job_fn_t)(orte_jobid_t *root_job, orte_jobid_t job);
/*
* Get parent jobid
* Given a jobid, return the parent job from which it descended. If the provided jobid is the
* root (i.e., has no parent), this function will return that same value.
*/
typedef int (*orte_ns_base_module_get_parent_job_fn_t)(orte_jobid_t *parent, orte_jobid_t job);
/**
* Reserve a range of process id's.
@ -305,23 +324,6 @@ typedef int (*orte_ns_base_module_convert_jobid_to_string_fn_t)(char **jobid_str
*/
typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring);
/**
* Get the job id as a numeric value.
* The get_jobid() function returns the job id in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval jobid The job id field of the provided name.
* @retval MCA_NS_BASE_JOBID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* jobid = ompi_name_server.get_jobid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name);
/**** NAME FUNCTIONS ****/
@ -365,20 +367,6 @@ typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **n
*/
typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
/**
* Make a copy of a process name.
* Given a process name, this function creates a copy of it and returns a pointer
* to the duplicate structure.
*
* @param *name Pointer to an existing process name structure.
*
* @retval *newname Pointer to the duplicate structure, with all fields transferred.
* @retval NULL Indicates an error - most likely due to a NULL process name
* pointer being supplied as input.
*/
typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest,
orte_process_name_t* src);
/**
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
@ -399,30 +387,6 @@ typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_proc
const char* name_string);
/**
* Free (release) a process name.
* The free_name() function releases the process name from the "used" list
* maintained within the name server for the jobid contained in the specified
* name. The memory for the name is also released at that time.
*
* Name values are currently \em not re-used. Hence, free-ing a name
* does not provide any noticeable benefit other than releasing the memory. In
* the future, names may be re-used if this becomes desirable.
*
* @param *name A pointer to the name structure containing the name being released.
*
* @retval ORTE_SUCCESS Indicates the release was succesfully accomplished.
* @retval OMPI_ERROR Indicates the release failed - most likely due to an
* error when free-ing the memory allocation.
*
* @code
* if (OMPI_ERROR == ompi_name_server.free_name(&name) {
* report error
* }
* @endcode
*/
typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name);
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
@ -478,9 +442,9 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
* result = ompi_name_server.compare(bit_mask, &name1, &name2)
* @endcode
*/
typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);
typedef int (*orte_ns_base_module_compare_fields_fn_t)(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);
/**** VPID FUNCTIONS ****/
@ -539,22 +503,7 @@ typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, cons
*/
typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring);
/**
* Get the virtual process id as a numeric value.
* The get_vpid() function returns the vpid in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval vpid The vpid field of the provided name.
* @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* vpid = ompi_name_server.get_vpid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name);
/**** TAG SERVER ****/
/*
@ -576,24 +525,33 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
/**** PEER RETRIEVAL ****/
/*
* Get my peers
/**
* Get the process names of all processes that meet the specified conditions. It is
* sometimes necessary for a process to communicate to all processes of a
* given job, all processes in a given cell or on a given node, etc. The RML
* communication system utilizes the process name as its "pointer" for
* sending messages to another process. This function returns an array of
* process name pointers that contains the names of all processes that
* meet the specified combination of attributes.
*
* @param procs The location where the address of the array of pointers
* is to be stored. The function will dynamically allocate space for the
* array - the caller is responsible for releasing this space.
* @param num_procs The location where the number of entries in the
* returned array is to be stored.
* @param attributes A list of conditions to be used in defining the
* peers to be included in the returned array. This can include a
* request that all peers for the parent job be returned, for example.
* More common options would be to specify a cell or job.
*
* NOTE The combination of ORTE_CELLID_WILDCARD and ORTE_JOBID_WILDCARD
* in the attribute list will cause the function to return the names of *all*
* processes currently active in the universe.
*
* THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE
* OF O(N) ARRAYS IN THE SYSTEM
*/
typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_std_cntr_t *self);
/*
* Get the list of peers from a specified job
*
* THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE
* OF O(N) ARRAYS IN THE SYSTEM
*/
typedef int (*orte_ns_base_module_get_job_peers_fn_t)(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_jobid_t job);
orte_std_cntr_t *num_procs,
opal_list_t *attributes);
/*
@ -609,55 +567,57 @@ typedef int (*orte_ns_base_module_dump_datatypes_fn_t)(void);
/*
* Ver 1.0.0
* Ver 2.0
*/
struct mca_ns_base_module_1_0_0_t {
struct mca_ns_base_module_2_0_0_t {
/* init */
orte_ns_base_module_init_fn_t init;
orte_ns_base_module_init_fn_t init;
/* cell functions */
orte_ns_base_module_create_cellid_fn_t create_cellid;
orte_ns_base_module_get_cellid_fn_t get_cellid;
orte_ns_base_module_get_cell_info_fn_t get_cell_info;
orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
orte_ns_base_module_create_cellid_fn_t create_cellid;
orte_ns_base_module_get_cell_info_fn_t get_cell_info;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
/** node functions */
orte_ns_base_module_create_nodeids_fn_t create_nodeids;
orte_ns_base_module_get_node_info_fn_t get_node_info;
orte_ns_base_module_convert_nodeid_to_string_fn_t convert_nodeid_to_string;
orte_ns_base_module_convert_string_to_nodeid_fn_t convert_string_to_nodeid;
/* jobid functions */
orte_ns_base_module_create_jobid_fn_t create_jobid;
orte_ns_base_module_get_jobid_fn_t get_jobid;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
orte_ns_base_module_create_jobid_fn_t create_jobid;
orte_ns_base_module_get_job_descendants_fn_t get_job_descendants;
orte_ns_base_module_get_job_children_fn_t get_job_children;
orte_ns_base_module_get_root_job_fn_t get_root_job;
orte_ns_base_module_get_parent_job_fn_t get_parent_job;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
orte_ns_base_module_reserve_range_fn_t reserve_range;
/* vpid functions */
orte_ns_base_module_reserve_range_fn_t reserve_range;
orte_ns_base_module_get_vpid_fn_t get_vpid;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
/* name functions */
orte_ns_base_module_create_proc_name_fn_t create_process_name;
orte_ns_base_module_create_my_name_fn_t create_my_name;
orte_ns_base_module_copy_proc_name_fn_t copy_process_name;
orte_ns_base_module_create_proc_name_fn_t create_process_name;
orte_ns_base_module_create_my_name_fn_t create_my_name;
orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name;
orte_ns_base_module_free_name_fn_t free_name;
orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string;
orte_ns_base_module_compare_fn_t compare;
orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string;
orte_ns_base_module_compare_fields_fn_t compare_fields;
/* peer functions */
orte_ns_base_module_get_peers_fn_t get_peers;
orte_ns_base_module_get_job_peers_fn_t get_job_peers;
orte_ns_base_module_get_peers_fn_t get_peers;
/* tag server functions */
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
/* data type functions */
orte_ns_base_module_define_data_type_fn_t define_data_type;
orte_ns_base_module_define_data_type_fn_t define_data_type;
/* diagnostic functions */
orte_ns_base_module_dump_cells_fn_t dump_cells;
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
orte_ns_base_module_dump_tags_fn_t dump_tags;
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;
orte_ns_base_module_dump_cells_fn_t dump_cells;
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
orte_ns_base_module_dump_tags_fn_t dump_tags;
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;
};
typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t;
typedef mca_ns_base_module_1_0_0_t mca_ns_base_module_t;
typedef struct mca_ns_base_module_2_0_0_t mca_ns_base_module_2_0_0_t;
typedef mca_ns_base_module_2_0_0_t mca_ns_base_module_t;
/*
* NS Component
@ -677,26 +637,26 @@ typedef int (*mca_ns_base_component_finalize_fn_t)(void);
* the standard component data structure
*/
struct mca_ns_base_component_1_0_0_t {
struct mca_ns_base_component_2_0_0_t {
mca_base_component_t ns_version;
mca_base_component_data_1_0_0_t ns_data;
mca_ns_base_component_init_fn_t ns_init;
mca_ns_base_component_finalize_fn_t ns_finalize;
};
typedef struct mca_ns_base_component_1_0_0_t mca_ns_base_component_1_0_0_t;
typedef mca_ns_base_component_1_0_0_t mca_ns_base_component_t;
typedef struct mca_ns_base_component_2_0_0_t mca_ns_base_component_2_0_0_t;
typedef mca_ns_base_component_2_0_0_t mca_ns_base_component_t;
/*
* Macro for use in components that are of type ns v1.0.0
* Macro for use in components that are of type ns v2.0.0
*/
#define MCA_NS_BASE_VERSION_1_0_0 \
/* ns v1.0 is chained to MCA v1.0 */ \
#define MCA_NS_BASE_VERSION_2_0_0 \
/* ns v2.0 is chained to MCA v1.0 */ \
MCA_BASE_VERSION_1_0_0, \
/* ns v1.0 */ \
"ns", 1, 0, 0
/* ns v2.0 */ \
"ns", 2, 0, 0
/* Global structure for accessing name server functions
*/

Просмотреть файл

@ -47,10 +47,20 @@
extern "C" {
#endif
/**** NS ATTRIBUTES ****/
#define ORTE_NS_USE_PARENT "orte-ns-use-parent"
#define ORTE_NS_USE_ROOT "orte-ns-use-root"
#define ORTE_NS_USE_CELL "orte-ns-use-cell"
#define ORTE_NS_USE_JOBID "orte-ns-use-job"
#define ORTE_NS_USE_NODE "orte-ns-use-node"
#define ORTE_NS_INCLUDE_DESCENDANTS "orte-ns-include-desc"
#define ORTE_NS_INCLUDE_CHILDREN "orte-ns-include-child"
#define ORTE_NAME_ARGS(n) \
(unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->cellid), \
(unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->jobid), \
(unsigned long) ((NULL == n) ? -1 : (int32_t)(n)->vpid)
(long) ((NULL == n) ? (long)-1 : (long)(n)->cellid), \
(long) ((NULL == n) ? (long)-1 : (long)(n)->jobid), \
(long) ((NULL == n) ? (long)-1 : (long)(n)->vpid)
/*
@ -69,18 +79,19 @@ extern "C" {
/** Set the allowed range for ids in each space
*
* NOTE: Be sure to update the ORTE_NAME_ARGS #define (above) and all
* uses of it if these types change to be larger than (unsigned long)!
* uses of it if these types change to be larger than (long)!
*/
typedef orte_std_cntr_t orte_jobid_t;
typedef orte_std_cntr_t orte_cellid_t;
typedef orte_std_cntr_t orte_nodeid_t;
typedef orte_std_cntr_t orte_vpid_t;
typedef uint8_t orte_ns_cmp_bitmask_t; /**< Bit mask for comparing process names */
typedef uint8_t orte_ns_cmd_flag_t;
struct orte_process_name_t {
orte_cellid_t cellid; /**< Cell number */
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process number */
orte_cellid_t cellid; /**< Cell number */
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process number */
};
typedef struct orte_process_name_t orte_process_name_t;
@ -90,6 +101,15 @@ typedef struct orte_process_name_t orte_process_name_t;
#define ORTE_CELLID_MAX ORTE_STD_CNTR_MAX
#define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX
#define ORTE_VPID_MAX ORTE_STD_CNTR_MAX
#define ORTE_NODEID_MAX ORTE_STD_CNTR_MAX
/*
* define minimum value for id's in any field
*/
#define ORTE_CELLID_MIN ORTE_STD_CNTR_MIN
#define ORTE_JOBID_MIN ORTE_STD_CNTR_MIN
#define ORTE_VPID_MIN ORTE_STD_CNTR_MIN
#define ORTE_NODEID_MIN ORTE_STD_CNTR_MIN
/*
* define invalid values
@ -97,19 +117,31 @@ typedef struct orte_process_name_t orte_process_name_t;
#define ORTE_CELLID_INVALID -999
#define ORTE_JOBID_INVALID -999
#define ORTE_VPID_INVALID -999
#define ORTE_NODEID_INVALID -999
/*
* define wildcard values
* define wildcard values (should be -1)
*/
#define ORTE_CELLID_WILDCARD -1
#define ORTE_JOBID_WILDCARD -1
#define ORTE_VPID_WILDCARD -1
#define ORTE_NODEID_WILDCARD -1
ORTE_DECLSPEC extern orte_process_name_t orte_name_all;
#define ORTE_NAME_ALL &orte_name_all
/*
* Shortcut for some commonly used names
*/
#define ORTE_NAME_WILDCARD &orte_ns_name_wildcard
ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_wildcard; /** instantiated in orte/mca/ns/base/ns_base_open.c */
#define ORTE_NAME_INVALID &orte_ns_name_invalid
ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_invalid; /** instantiated in orte/mca/ns/base/ns_base_open.c */
#define ORTE_PROC_MY_NAME orte_process_info.my_name
#define ORTE_PROC_MY_HNP &orte_ns_name_my_hnp
ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated in orte/mca/ns/base/ns_base_open.c */
/**
* Convert process name from host to network byte order.
*

Просмотреть файл

@ -16,37 +16,34 @@
# $HEADER$
#
# Use the top-level Makefile.options
sources =
include src/Makefile.extra
sources = \
ns_proxy.h \
ns_proxy_cell_fns.c \
ns_proxy_diag_fns.c \
ns_proxy_general_fns.c \
ns_proxy_job_fns.c \
ns_proxy_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_ns_proxy_DSO
lib =
lib_sources =
component = mca_ns_proxy.la
component_sources = $(sources)
component_noinst =
component_install = mca_ns_proxy.la
else
lib = libmca_ns_proxy.la
lib_sources = $(sources)
component =
component_sources =
component_noinst = libmca_ns_proxy.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component)
mca_ns_proxy_la_SOURCES = $(component_sources)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ns_proxy_la_SOURCES = $(sources)
mca_ns_proxy_la_LDFLAGS = -module -avoid-version
mca_ns_proxy_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(lib)
libmca_ns_proxy_la_SOURCES = $(lib_sources)
noinst_LTLIBRARIES = $(component_noinst)
libmca_ns_proxy_la_SOURCES =$(sources)
libmca_ns_proxy_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -19,5 +19,6 @@
# Specific to this module
PARAM_INIT_FILE=src/ns_proxy.c
PARAM_INIT_FILE=ns_proxy.c
PARAM_CONFIG_HEADER_FILE="ns_proxy.h"
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -22,27 +22,20 @@
#include "orte_config.h"
#include "opal/types.h"
#include "orte/orte_constants.h"
#include "opal/types.h"
#include "opal/class/opal_list.h"
#include "orte/dss/dss.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/ns/base/ns_private.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
struct orte_ns_proxy_cell_info_t {
opal_object_t super;
orte_cellid_t cellid;
char *site;
char *resource;
};
typedef struct orte_ns_proxy_cell_info_t orte_ns_proxy_cell_info_t;
OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t);
struct orte_ns_proxy_tagitem_t {
opal_object_t super;
orte_rml_tag_t tag; /**< OOB tag */
@ -81,7 +74,6 @@ int orte_ns_proxy_finalize(void);
*/
typedef struct {
size_t max_size, block_size;
orte_process_name_t *my_replica;
int debug;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
@ -94,6 +86,12 @@ typedef struct {
extern orte_ns_proxy_globals_t orte_ns_proxy;
/*
* simplifying define
*/
#define ORTE_NS_MY_REPLICA orte_process_info.ns_replica
/*
* proxy function prototypes
*/
@ -101,13 +99,26 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, char **site, char **resource);
int orte_ns_proxy_create_jobid(orte_jobid_t *jobid);
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames);
int orte_ns_proxy_get_node_info(char ***nodename, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
int orte_ns_proxy_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs);
int orte_ns_proxy_get_job_descendants(orte_jobid_t** descendants, orte_std_cntr_t *ndesc, orte_jobid_t job);
int orte_ns_proxy_get_job_children(orte_jobid_t** descendants, orte_std_cntr_t *ndesc, orte_jobid_t job);
int orte_ns_proxy_get_root_job(orte_jobid_t *root_job, orte_jobid_t job);
int orte_ns_proxy_get_parent_job(orte_jobid_t *parent, orte_jobid_t job);
int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_vpid_t *startvpid);
int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_jobid_t job);
int orte_ns_proxy_get_peers(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, opal_list_t *attrs);
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name);

440
orte/mca/ns/proxy/ns_proxy_cell_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,440 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <string.h>
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_proxy.h"
/**
* globals
*/
/*
* functions
*/
/**
 * Request a new cellid from the name service replica.
 *
 * Packs an ORTE_NS_CREATE_CELLID_CMD request carrying the site and resource
 * strings, sends it to ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS, and unpacks the
 * cellid from the reply.
 *
 * @param cellid   (OUT) Set to ORTE_CELLID_INVALID on entry, then overwritten
 *                 with the value unpacked from the reply.
 * @param site     Site string packed into the request.
 * @param resource Resource string packed into the request.
 *
 * @retval ORTE_SUCCESS             The cellid was unpacked from the reply.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE    The send or receive failed, or the reply
 *                                  carried a different command.
 * @retval other                    Error code returned by orte_dss pack/unpack.
 */
int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_buffer_t *cmd;
    orte_buffer_t *answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* set the default value of error */
    *cellid = ORTE_CELLID_INVALID;

    command = ORTE_NS_CREATE_CELLID_CMD;

    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* build the request: command, site, resource */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &site, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &resource, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_CREATE_CELLID_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, cellid, &count, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    rc = ORTE_SUCCESS;

cleanup:
    /* This function never acquires orte_ns_proxy.mutex, so it must not
     * release it either: the previous unlock on the success path was
     * unbalanced.
     */
    OBJ_RELEASE(answer);
    return rc;
}
/**
 * Ask the name service replica for the site/resource strings of a cell.
 *
 * Packs an ORTE_NS_GET_CELL_INFO_CMD request carrying the cellid, sends it to
 * ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS, and unpacks two strings from the
 * reply.
 *
 * @param cellid   Cell being queried.
 * @param site     (OUT) Receives the first string unpacked from the reply.
 * @param resource (OUT) Receives the second string unpacked from the reply.
 *
 * @retval ORTE_SUCCESS             Both strings were unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE    The send or receive failed, or the reply
 *                                  carried a different command.
 * @retval other                    Error code returned by orte_dss pack/unpack.
 */
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
                                char **site, char **resource)
{
    orte_buffer_t *cmd;
    orte_buffer_t *answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* NOTE: orte_ns_proxy.mutex is never acquired in this function, so none
     * of the exit paths may release it (the previous code unlocked it on
     * every return).
     */

    command = ORTE_NS_GET_CELL_INFO_CMD;

    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* build the request: command, cellid */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_GET_CELL_INFO_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, site, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, resource, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    rc = ORTE_SUCCESS;

cleanup:
    /* the reply buffer is released on every path, including success
     * (it used to be leaked when the call succeeded)
     */
    OBJ_RELEASE(answer);
    return rc;
}
/**
 * Ask the name service replica to create nodeids for a set of node names.
 *
 * Packs an ORTE_NS_CREATE_NODEID_CMD request carrying the cellid, the number
 * of names (opal_argv_count of nodenames) and the names themselves, sends it
 * to ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS, and unpacks a count followed by
 * that many nodeids from the reply. orte_ns_proxy.mutex is held for the
 * duration of the call and released on every exit path.
 *
 * @param nodeids   (OUT) On success, a malloc'ed array holding the unpacked
 *                  nodeids (NULL when the replica reported no ids). The
 *                  caller owns the array and must free() it.
 * @param nnodes    (OUT) On success, the number of entries in *nodeids.
 *                  May be NULL if the caller does not need the count.
 * @param cellid    Cell the nodes belong to.
 * @param nodenames Argv-style array of node names.
 *
 * @retval ORTE_SUCCESS             The reply was unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer or the id array could not be
 *                                  allocated.
 * @retval ORTE_ERR_COMM_FAILURE    The send or receive failed, or the reply
 *                                  carried a different command.
 * @retval other                    Error code returned by orte_dss pack/unpack.
 */
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
                                 orte_cellid_t cellid, char **nodenames)
{
    orte_buffer_t *cmd = NULL;
    orte_buffer_t *answer = NULL;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count, index;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    command = ORTE_NS_CREATE_NODEID_CMD;

    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* build the request: command, cellid, number of names, names */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = opal_argv_count(nodenames);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &count, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, nodenames, count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    OBJ_RELEASE(cmd);
    cmd = NULL;   /* already released - keep cleanup from releasing it again */

    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_CREATE_NODEID_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &index, &count, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* nothing to unpack - do not hand a non-positive size to malloc */
    if (0 >= index) {
        *nodeids = NULL;
        if (NULL != nnodes) {
            *nnodes = 0;
        }
        rc = ORTE_SUCCESS;
        goto cleanup;
    }

    /** allocate the space for the nodeids */
    *nodeids = (orte_nodeid_t*)malloc(index * sizeof(orte_nodeid_t));
    if (NULL == *nodeids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* unpack into the array we just allocated (*nodeids), not into the
     * caller's pointer variable (nodeids)
     */
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *nodeids, &index, ORTE_NODEID))) {
        ORTE_ERROR_LOG(rc);
        free(*nodeids);
        *nodeids = NULL;
        goto cleanup;
    }

    /* report how many ids were returned */
    if (NULL != nnodes) {
        *nnodes = index;
    }
    rc = ORTE_SUCCESS;

cleanup:
    /* single exit: release whatever buffers are still held and always drop
     * the mutex taken on entry
     */
    if (NULL != cmd) {
        OBJ_RELEASE(cmd);
    }
    if (NULL != answer) {
        OBJ_RELEASE(answer);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
/**
 * Ask the name service replica for the names of a set of nodes.
 *
 * Packs an ORTE_NS_GET_NODE_INFO_CMD request carrying the cellid, the number
 * of nodeids and the nodeids themselves, sends it to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS, and unpacks a count, that many strings, and a status
 * integer from the reply. orte_ns_proxy.mutex is held for the duration of
 * the call and released on every exit path.
 *
 * @param nodenames (OUT) Receives a malloc'ed array of the unpacked name
 *                  strings. Reset to NULL if unpacking the strings fails.
 * @param cellid    Cell the nodes belong to.
 * @param num_nodes Number of entries in nodeids.
 * @param nodeids   Array of nodeids to look up.
 *
 * @return The status integer unpacked from the reply when the exchange
 *         succeeds; otherwise ORTE_ERR_OUT_OF_RESOURCE,
 *         ORTE_ERR_COMM_FAILURE, or the error code returned by orte_dss
 *         pack/unpack.
 */
int orte_ns_proxy_get_node_info(char ***nodenames, orte_cellid_t cellid,
                                orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
{
    orte_buffer_t *cmd = NULL;
    orte_buffer_t *answer = NULL;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count, index;
    int rc, ret = ORTE_SUCCESS;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    command = ORTE_NS_GET_NODE_INFO_CMD;

    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* build the request: command, cellid, number of ids, ids */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &num_nodes, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, nodeids, num_nodes, ORTE_NODEID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    OBJ_RELEASE(cmd);
    cmd = NULL;   /* already released - keep cleanup from releasing it again */

    answer = OBJ_NEW(orte_buffer_t);
    if (NULL == answer) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_GET_NODE_INFO_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &index, &count, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /** create the space for the nodenames */
    *nodenames = (char**)malloc(index * sizeof(char*));
    if (NULL == *nodenames) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *nodenames, &index, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        /* do not hand a partially filled array back to the caller */
        free(*nodenames);
        *nodenames = NULL;
        goto cleanup;
    }

    /* the replica appends its own status code after the names */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ret, &count, ORTE_INT))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    rc = ret;

cleanup:
    /* single exit: the reply buffer used to be leaked on success and on
     * allocation failure; release everything still held and drop the mutex
     */
    if (NULL != cmd) {
        OBJ_RELEASE(cmd);
    }
    if (NULL != answer) {
        OBJ_RELEASE(answer);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}

Просмотреть файл

@ -47,7 +47,7 @@
*/
mca_ns_base_component_t mca_ns_proxy_component = {
{
MCA_NS_BASE_VERSION_1_0_0,
MCA_NS_BASE_VERSION_2_0_0,
"proxy", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
@ -71,35 +71,37 @@ static mca_ns_base_module_t orte_ns_proxy_module = {
orte_ns_proxy_module_init,
/* cell functions */
orte_ns_proxy_create_cellid,
orte_ns_base_get_cellid,
orte_ns_proxy_get_cell_info,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/** node functions */
orte_ns_proxy_create_nodeids,
orte_ns_proxy_get_node_info,
orte_ns_base_convert_nodeid_to_string,
orte_ns_base_convert_string_to_nodeid,
/* jobid functions */
orte_ns_proxy_create_jobid,
orte_ns_base_get_jobid,
orte_ns_proxy_get_job_descendants,
orte_ns_proxy_get_job_children,
orte_ns_proxy_get_root_job,
orte_ns_proxy_get_parent_job,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_proxy_reserve_range,
orte_ns_base_get_vpid,
/* vpid functions */
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_proxy_create_my_name,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_compare,
orte_ns_base_compare_fields,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_proxy_get_job_peers,
orte_ns_proxy_get_peers,
/* tag server functions */
orte_ns_proxy_assign_rml_tag,
/* data type functions */
@ -116,31 +118,6 @@ static mca_ns_base_module_t orte_ns_proxy_module = {
*/
static bool initialized = false;
/* constructor - used to initialize state of cell info list instance */
static void orte_ns_proxy_cell_info_construct(orte_ns_proxy_cell_info_t* ptr)
{
ptr->resource = NULL;
ptr->site = NULL;
}
/* destructor - used to free any resources held by instance */
static void orte_ns_proxy_cell_info_destructor(orte_ns_proxy_cell_info_t* ptr)
{
if (NULL != ptr->resource) {
free(ptr->resource);
}
if (NULL != ptr->site) {
free(ptr->site);
}
}
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_proxy_cell_info_t, /* type name */
opal_object_t, /* parent "class" name */
orte_ns_proxy_cell_info_construct, /* constructor */
orte_ns_proxy_cell_info_destructor); /* destructor */
/* constructor - used to initialize state of taglist instance */
static void orte_ns_proxy_tagitem_construct(orte_ns_proxy_tagitem_t* tagitem)
{
@ -245,14 +222,10 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
ORTE_ERROR_LOG(ret);
return NULL;
}
if(ORTE_SUCCESS != (ret = orte_ns.copy_process_name(&orte_process_info.ns_replica, &name))) {
if(ORTE_SUCCESS != (ret = orte_dss.copy((void**)&orte_process_info.ns_replica, &name, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
return NULL;
}
if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_proxy.my_replica,
orte_process_info.ns_replica)) { /* can't operate */
return NULL;
}
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells),
@ -315,7 +288,6 @@ int orte_ns_proxy_module_init(void)
*/
int orte_ns_proxy_finalize(void)
{
orte_ns_proxy_cell_info_t **cptr;
orte_ns_proxy_tagitem_t **tag;
orte_ns_proxy_dti_t **dti;
orte_std_cntr_t i;
@ -323,14 +295,6 @@ int orte_ns_proxy_finalize(void)
/* free all tracking storage, but only if this component was initialized */
if (initialized) {
cptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
for (i=0; i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
}
OBJ_RELEASE(orte_ns_proxy.cells);
tag = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0; i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != tag[i]) OBJ_RELEASE(tag[i]);

313
orte/mca/ns/proxy/ns_proxy_diag_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,313 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <string.h>
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/base/base.h"
#include "ns_proxy.h"
/*
* DIAGNOSTIC functions
*/
/**
 * Print the name service replica's cell tracker.
 *
 * Sends an ORTE_NS_DUMP_CELLS_CMD request to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS and passes the reply to orte_ns_base_print_dump().
 * orte_ns_proxy.mutex is held for the duration of the call and released on
 * every exit path.
 *
 * @retval ORTE_SUCCESS          The reply was printed.
 * @retval ORTE_ERR_COMM_FAILURE The send or receive failed, or the reply
 *                               carried a different command.
 * @retval other                 Error code returned by orte_dss pack/unpack
 *                               or by orte_ns_base_print_dump().
 */
int orte_ns_proxy_dump_cells(void)
{
    orte_buffer_t cmd;
    orte_buffer_t answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    command = ORTE_NS_DUMP_CELLS_CMD;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* dump name service replica cell tracker */
    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }
    OBJ_DESTRUCT(&cmd);

    OBJ_CONSTRUCT(&answer, orte_buffer_t);
    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_DUMP_CELLS_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    rc = ORTE_SUCCESS;

release_answer:
    /* the reply buffer is destructed on success as well as on failure */
    OBJ_DESTRUCT(&answer);
unlock:
    /* the mutex taken on entry must be dropped on every path; leaving it
     * held (as the unpack/print paths and the success path used to) blocks
     * the next caller that locks orte_ns_proxy.mutex
     */
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
/**
 * Print the name service replica's jobid tracker.
 *
 * Sends an ORTE_NS_DUMP_JOBIDS_CMD request to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS and passes the reply to orte_ns_base_print_dump().
 * orte_ns_proxy.mutex is held for the duration of the call and released on
 * every exit path.
 *
 * @retval ORTE_SUCCESS          The reply was printed.
 * @retval ORTE_ERR_COMM_FAILURE The send or receive failed, or the reply
 *                               carried a different command.
 * @retval other                 Error code returned by orte_dss pack/unpack
 *                               or by orte_ns_base_print_dump().
 */
int orte_ns_proxy_dump_jobs(void)
{
    orte_buffer_t cmd;
    orte_buffer_t answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    command = ORTE_NS_DUMP_JOBIDS_CMD;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* dump name service replica jobid tracker */
    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }
    OBJ_DESTRUCT(&cmd);

    OBJ_CONSTRUCT(&answer, orte_buffer_t);
    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_DUMP_JOBIDS_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    rc = ORTE_SUCCESS;

release_answer:
    /* the reply buffer is destructed on success as well as on failure */
    OBJ_DESTRUCT(&answer);
unlock:
    /* the mutex taken on entry must be dropped on every path; leaving it
     * held (as the unpack/print paths and the success path used to) blocks
     * the next caller that locks orte_ns_proxy.mutex
     */
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
/**
 * Print the name service replica's tag tracker, then the local one.
 *
 * Sends an ORTE_NS_DUMP_TAGS_CMD request to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS and passes the reply to orte_ns_base_print_dump(); then
 * walks orte_ns_proxy.tags and prints every non-NULL entry to
 * mca_ns_base_output. orte_ns_proxy.mutex is held for the duration of the
 * call (including the walk over the local tag array) and released on every
 * exit path.
 *
 * @retval ORTE_SUCCESS          Both dumps were printed.
 * @retval ORTE_ERR_COMM_FAILURE The send or receive failed, or the reply
 *                               carried a different command.
 * @retval other                 Error code returned by orte_dss pack/unpack
 *                               or by orte_ns_base_print_dump().
 */
int orte_ns_proxy_dump_tags(void)
{
    orte_buffer_t cmd;
    orte_buffer_t answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t i;
    orte_std_cntr_t count;
    orte_rml_tag_t j;
    orte_ns_proxy_tagitem_t **ptr;
    int rc;

    command = ORTE_NS_DUMP_TAGS_CMD;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* dump name service replica tag tracker */
    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }
    OBJ_DESTRUCT(&cmd);

    OBJ_CONSTRUCT(&answer, orte_buffer_t);
    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_DUMP_TAGS_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* dump local tag tracker
     *
     * ORTE_NAME_ARGS casts each field to (long), and the wildcard/invalid
     * ids are negative, so the name is printed with %ld rather than %lu.
     */
    opal_output(mca_ns_base_output, "\n\n[%ld,%ld,%ld] Dump of Local Tag Tracker\n",
                ORTE_NAME_ARGS(orte_process_info.my_name));
    ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
    /* i walks the slots of the pointer array, j counts the entries found;
     * stop as soon as all num_tags entries have been printed
     */
    for (i = 0, j = 0; j < orte_ns_proxy.num_tags &&
                       i < (orte_ns_proxy.tags)->size; i++) {
        if (NULL != ptr[i]) {
            j++;
            opal_output(mca_ns_base_output, "Num: %lu\tTag: %lu\tTag name: %s\n",
                        (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name);
        }
    }

    rc = ORTE_SUCCESS;

release_answer:
    /* the reply buffer is destructed on success as well as on failure */
    OBJ_DESTRUCT(&answer);
unlock:
    /* the mutex taken on entry must be dropped on every path; leaving it
     * held (as the unpack/print paths and the success path used to) blocks
     * the next caller that locks orte_ns_proxy.mutex
     */
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
/**
 * Print the name service replica's datatype tracker, then the local one.
 *
 * Sends an ORTE_NS_DUMP_DATATYPES_CMD request to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS and passes the reply to orte_ns_base_print_dump(); then
 * walks orte_ns_proxy.dts and prints every non-NULL entry to
 * mca_ns_base_output. orte_ns_proxy.mutex is held for the duration of the
 * call (including the walk over the local datatype array) and released on
 * every exit path.
 *
 * @retval ORTE_SUCCESS          Both dumps were printed.
 * @retval ORTE_ERR_COMM_FAILURE The send or receive failed, or the reply
 *                               carried a different command.
 * @retval other                 Error code returned by orte_dss pack/unpack
 *                               or by orte_ns_base_print_dump().
 */
int orte_ns_proxy_dump_datatypes(void)
{
    orte_buffer_t cmd;
    orte_buffer_t answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t i, j;
    orte_std_cntr_t count;
    orte_ns_proxy_dti_t **ptr;
    int rc;

    command = ORTE_NS_DUMP_DATATYPES_CMD;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* dump name service replica datatype tracker */
    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        goto unlock;
    }
    OBJ_DESTRUCT(&cmd);

    OBJ_CONSTRUCT(&answer, orte_buffer_t);
    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_DUMP_DATATYPES_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
        ORTE_ERROR_LOG(rc);
        goto release_answer;
    }

    /* dump local datatype tracker
     *
     * ORTE_NAME_ARGS casts each field to (long), and the wildcard/invalid
     * ids are negative, so the name is printed with %ld rather than %lu.
     */
    opal_output(mca_ns_base_output, "\n\n[%ld,%ld,%ld] Dump of Local Datatype Tracker\n",
                ORTE_NAME_ARGS(orte_process_info.my_name));
    ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
    /* i walks the slots of the pointer array, j counts the entries found;
     * stop as soon as all num_dts entries have been printed
     */
    for (i = 0, j = 0; j < orte_ns_proxy.num_dts &&
                       i < (orte_ns_proxy.dts)->size; i++) {
        if (NULL != ptr[i]) {
            j++;
            opal_output(mca_ns_base_output, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n",
                        (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name);
        }
    }

    rc = ORTE_SUCCESS;

release_answer:
    /* the reply buffer is destructed on success as well as on failure */
    OBJ_DESTRUCT(&answer);
unlock:
    /* the mutex taken on entry must be dropped on every path; leaving it
     * held (as the unpack/print paths and the success path used to) blocks
     * the next caller that locks orte_ns_proxy.mutex
     */
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}

495
orte/mca/ns/proxy/ns_proxy_general_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,495 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <string.h>
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_proxy.h"
/*
* PEER functions
*/
/**
 * Get the process names of the peers selected by an attribute list.
 *
 * If attrs carries no ORTE_NS_USE_JOBID attribute, the answer is built
 * locally from orte_process_info: num_procs names sharing this process's
 * cellid and jobid, with vpids starting at vpid_start. Otherwise an
 * ORTE_NS_GET_PEERS_CMD request carrying the attribute list is sent to
 * ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS and the names are unpacked from the
 * reply.
 *
 * @param procs     (OUT) Receives a malloc'ed array of process names, or
 *                  NULL if none were returned or an error occurred. The
 *                  caller owns the array and must free() it.
 * @param num_procs (OUT) Receives the number of names; 0 on error.
 * @param attrs     Attribute list; ORTE_NS_USE_CELL and ORTE_NS_USE_JOBID
 *                  are examined here.
 *
 * @retval ORTE_SUCCESS             The peers were returned.
 * @retval ORTE_ERR_NOT_IMPLEMENTED ORTE_NS_USE_CELL names a cell that is
 *                                  neither this process's cell nor the
 *                                  wildcard.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer or the name array could not be
 *                                  allocated.
 * @retval ORTE_ERR_COMM_FAILURE    The send or receive failed, or the reply
 *                                  carried a different command.
 * @retval other                    Error code returned by orte_dss.
 */
int orte_ns_proxy_get_peers(orte_process_name_t **procs,
                            orte_std_cntr_t *num_procs, opal_list_t *attrs)
{
    orte_buffer_t *cmd;
    orte_buffer_t *answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count, nprocs, i;
    orte_cellid_t *cptr;
    orte_attribute_t *attr;
    int rc;

    OPAL_TRACE(1);

    /* NOTE: orte_ns_proxy.mutex is never acquired in this function, so none
     * of the exit paths may release it (the previous code unlocked it on
     * every return).
     */

    /* set default value */
    *procs = NULL;
    *num_procs = 0;

    /* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is
     * a request for my own job peers - process that one locally
     */

    /* if the cell is given AND it matches my own, then we can process this
     * quickly. Otherwise, we have to do some more work.
     *
     * RHC: when we go multi-cell, we need a way to find all the cells upon
     * which a job is executing so we can make this work!
     */
    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
            return ORTE_ERR_NOT_IMPLEMENTED;
        }
    }

    if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
        /* get my own job peers, assuming all are on this cell - process here
         *
         * RHC: This is a bad assumption. When we go multi-cell, we are going to have to process
         * get peer requests solely on the HNP since we won't know the cellid otherwise
         */
        *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
        if (NULL == *procs) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        for (i = 0; i < orte_process_info.num_procs; i++) {
            (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
            (*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
            (*procs)[i].vpid = orte_process_info.vpid_start + i;
        }

        *num_procs = orte_process_info.num_procs;
        return ORTE_SUCCESS;
    }

    /* non-local request for peers in another job - send to replica for processing */
    if (NULL == (cmd = OBJ_NEW(orte_buffer_t))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_GET_PEERS_CMD;
    /* pack the command */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    /* pack the attributes */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    if (NULL == (answer = OBJ_NEW(orte_buffer_t))) {
        /* nothing to release here: answer is NULL, and handing a NULL
         * object to OBJ_RELEASE (as the previous code did) is invalid
         */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    if (ORTE_NS_GET_PEERS_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &nprocs, &count, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* allocate space for array of proc names */
    if (0 < nprocs) {
        *procs = (orte_process_name_t*)malloc((nprocs) * sizeof(orte_process_name_t));
        if (NULL == *procs) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, *procs, &nprocs, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            /* restore the documented default so the caller is not handed a
             * partially filled array together with an error code
             */
            free(*procs);
            *procs = NULL;
            goto cleanup;
        }
    }

    *num_procs = nprocs;
    rc = ORTE_SUCCESS;

cleanup:
    OBJ_RELEASE(answer);
    return rc;
}
/**
 * Obtain an RML tag from the name service replica, optionally bound to a name.
 *
 * If a name is supplied and already appears on the local tag list, the cached
 * tag is returned without contacting the replica. Otherwise an
 * ORTE_NS_ASSIGN_OOB_TAG_CMD request is sent to ORTE_NS_MY_REPLICA, the tag is
 * unpacked from the reply, and the result is recorded on the local list.
 *
 * @param tag   (OUT) receives the tag. It is initialised to ORTE_RML_TAG_MAX
 *              before the replica is contacted.
 * @param name  (IN) name to associate with the tag, or NULL for an unnamed tag.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
                                 char *name)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_ns_proxy_tagitem_t* tagitem, **tags;
    orte_std_cntr_t count, i;
    orte_rml_tag_t j;
    char *wire_name;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    if (NULL != name) {
        /* see if this name is already in list - if so, return tag */
        tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr;
        for (i=0, j=0; j < orte_ns_proxy.num_tags &&
                       i < (orte_ns_proxy.tags)->size; i++) {
            if (NULL != tags[i]) {
                j++;
                if (tags[i]->name != NULL &&
                    0 == strcmp(name, tags[i]->name)) { /* found name on list */
                    *tag = tags[i]->tag;
                    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
    }

    /* okay, not on local list - so go get one from tag server */
    command = ORTE_NS_ASSIGN_OOB_TAG_CMD;
    *tag = ORTE_RML_TAG_MAX;  /* set the default error value */

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    /* An unnamed request is sent as the literal string "NULL". A separate
     * variable is used so the caller's NULL is still visible below when the
     * tag is recorded locally.
     */
    wire_name = (NULL == name) ? "NULL" : name;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &wire_name, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, tag, &count, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }
    OBJ_RELEASE(answer);

    /* add the new tag to the local list so we don't have to get it again */
    tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t);
    if (NULL == tagitem) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_proxy.tags, tagitem))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(tagitem);  /* not stored anywhere - don't leak it */
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }
    tagitem->tag = *tag;
    (orte_ns_proxy.num_tags)++;
    if (NULL != name) {  /* provided - can look it up later */
        tagitem->name = strdup(name);
    } else {
        tagitem->name = NULL;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);

    /* all done */
    return ORTE_SUCCESS;
}
/**
 * Obtain a data type id for the given name from the name service replica.
 *
 * If the name is already on the local list its cached id is returned.
 * Otherwise an ORTE_NS_DEFINE_DATA_TYPE_CMD request is sent to
 * ORTE_NS_MY_REPLICA, the id is unpacked from the reply, and the name/id pair
 * is recorded on the local list.
 *
 * @param name  (IN) name of the data type; must not be NULL.
 * @param type  (IN/OUT) must not be NULL and must not hold a positive value on
 *              entry. Receives the id; it is set to ORTE_DSS_ID_MAX before the
 *              replica is contacted.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for invalid arguments, or another
 *         ORTE error code on failure.
 */
int orte_ns_proxy_define_data_type(const char *name,
                                   orte_data_type_t *type)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_ns_proxy_dti_t **dti, *dtip;
    orte_std_cntr_t count, i, j;
    int rc=ORTE_SUCCESS;

    if (NULL == name || NULL == type || 0 < *type) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* first, check to see if name is already on local list
     * if so, return id, ensure registered with dss
     */
    dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr;
    for (i=0, j=0; j < orte_ns_proxy.num_dts &&
                   i < orte_ns_proxy.dts->size; i++) {
        if (NULL != dti[i]) {
            j++;
            if (dti[i]->name != NULL &&
                0 == strcmp(name, dti[i]->name)) { /* found name on list */
                *type = dti[i]->id;
                OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
                return ORTE_SUCCESS;
            }
        }
    }

    /* okay, not on local list - so go get one from the replica */
    command = ORTE_NS_DEFINE_DATA_TYPE_CMD;
    *type = ORTE_DSS_ID_MAX;  /* set the default error value */

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&name, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    /* The reply is checked against the command that was sent, as the other
     * proxy request functions in this file do for their own commands.
     */
    if (ORTE_NS_DEFINE_DATA_TYPE_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, type, &count, ORTE_DATA_TYPE))) {
        ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE);
        OBJ_RELEASE(answer);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_UNPACK_FAILURE;
    }
    OBJ_RELEASE(answer);

    /* add the new id to the local list so we don't have to get it again */
    dtip = OBJ_NEW(orte_ns_proxy_dti_t);
    if (NULL == dtip) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    dtip->name = strdup(name);
    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_proxy.dts, dtip))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(dtip);  /* not stored anywhere - don't leak it */
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }
    dtip->id = *type;
    (orte_ns_proxy.num_dts)++;

    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);

    /* all done */
    return rc;
}
/*
 * Ask the replica for a name by simply sending it a message.
 *
 * The RML indexes its communicator table by process name, so it needs a name
 * for us as soon as it receives anything from us. It therefore assigns one on
 * receipt and reports that assignment back automatically. All this function
 * has to do is send an ORTE_NS_CREATE_MY_NAME_CMD message; no reply from the
 * replica is read here.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the buffer cannot be
 * created, the pack error code, or ORTE_ERR_COMM_FAILURE if the send fails.
 */
int orte_ns_proxy_create_my_name(void)
{
    orte_ns_cmd_flag_t request = ORTE_NS_CREATE_MY_NAME_CMD;
    orte_buffer_t *msg;
    int status;

    msg = OBJ_NEW(orte_buffer_t);
    if (NULL == msg) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    status = orte_dss.pack(msg, &request, 1, ORTE_NS_CMD);
    if (ORTE_SUCCESS == status &&
        0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, msg, ORTE_RML_TAG_NS, 0)) {
        status = ORTE_ERR_COMM_FAILURE;
    }

    /* single exit: report any failure, then drop the buffer */
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    OBJ_RELEASE(msg);
    return status;
}

526
orte/mca/ns/proxy/ns_proxy_job_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,526 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdlib.h>
#include <string.h>
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_proxy.h"
/**** CREATE JOBID ****/
/**
 * Request a new jobid from the name service replica.
 *
 * Sends ORTE_NS_CREATE_JOBID_CMD together with the attribute list to
 * ORTE_NS_MY_REPLICA and unpacks the jobid from the reply.
 *
 * @param job    (OUT) receives the new jobid; set to ORTE_JOBID_INVALID on
 *               entry so it holds that value if the request fails early.
 * @param attrs  (IN) attribute list packed into the request as ORTE_ATTR_LIST.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_create_jobid(orte_jobid_t *job, opal_list_t *attrs)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* set default value */
    *job = ORTE_JOBID_INVALID;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_CREATE_JOBID_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* answer is NULL on failure, so there is nothing to release here */
    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    if (ORTE_NS_CREATE_JOBID_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, job, &count, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}
/**** GET JOB DESCENDANTS ****/
/**
 * Ask the name service replica for the descendants of a job.
 *
 * Sends ORTE_NS_GET_JOB_DESC_CMD with the jobid to ORTE_NS_MY_REPLICA, then
 * unpacks a count followed by that many jobids from the reply.
 *
 * @param descendants  (OUT) receives a malloc'd array of jobids that the
 *                     caller must free, or NULL when none were returned or on
 *                     error.
 * @param num_desc     (OUT) number of entries in *descendants; 0 when none
 *                     were returned or on error.
 * @param job          (IN) jobid sent in the request.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count, ndesc=0;
    orte_jobid_t *jobs=NULL;
    int rc;

    OPAL_TRACE(1);

    /* set default response */
    *descendants = NULL;
    *num_desc = 0;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_GET_JOB_DESC_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* answer is NULL on failure, so there is nothing to release here */
    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    if (ORTE_NS_GET_JOB_DESC_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ndesc, &count, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* if there are any descendants, allocate space for them and unpack */
    if (0 < ndesc) {
        jobs = (orte_jobid_t*)malloc(ndesc * sizeof(orte_jobid_t));
        if (NULL == jobs) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_RELEASE(answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        count = ndesc;
        if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, jobs, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            free(jobs);  /* never handed to the caller - don't leak it */
            OBJ_RELEASE(answer);
            return rc;
        }
    } else {
        /* count still holds 1 from the previous unpack; report zero entries */
        count = 0;
    }

    OBJ_RELEASE(answer);

    *descendants = jobs;
    *num_desc = count;

    return ORTE_SUCCESS;
}
/**** GET JOB CHILDREN ****/
/**
 * Ask the name service replica for the children of a job.
 *
 * Sends ORTE_NS_GET_JOB_CHILD_CMD with the jobid to ORTE_NS_MY_REPLICA, then
 * unpacks a count followed by that many jobids from the reply.
 *
 * @param descendants  (OUT) receives a malloc'd array of jobids that the
 *                     caller must free, or NULL when none were returned or on
 *                     error.
 * @param num_desc     (OUT) number of entries in *descendants; 0 when none
 *                     were returned or on error.
 * @param job          (IN) jobid sent in the request.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_get_job_children(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count, ndesc=0;
    orte_jobid_t *jobs=NULL;
    int rc;

    OPAL_TRACE(1);

    /* set default response */
    *descendants = NULL;
    *num_desc = 0;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_GET_JOB_CHILD_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* answer is NULL on failure, so there is nothing to release here */
    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* The reply is checked against the command that was sent, as the other
     * proxy request functions in this file do for their own commands.
     */
    if (ORTE_NS_GET_JOB_CHILD_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &ndesc, &count, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* if there are any children, allocate space for them and unpack */
    if (0 < ndesc) {
        jobs = (orte_jobid_t*)malloc(ndesc * sizeof(orte_jobid_t));
        if (NULL == jobs) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_RELEASE(answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        count = ndesc;
        if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, jobs, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            free(jobs);  /* never handed to the caller - don't leak it */
            OBJ_RELEASE(answer);
            return rc;
        }
    } else {
        /* count still holds 1 from the previous unpack; report zero entries */
        count = 0;
    }

    OBJ_RELEASE(answer);

    *descendants = jobs;
    *num_desc = count;

    return ORTE_SUCCESS;
}
/**
 * Ask the name service replica for the root job of the given job.
 *
 * Sends ORTE_NS_GET_ROOT_JOB_CMD with the jobid to ORTE_NS_MY_REPLICA and
 * unpacks the root jobid from the reply.
 *
 * @param root_job  (OUT) receives the root jobid; set to ORTE_JOBID_INVALID
 *                  on entry so it holds that value if the request fails early.
 * @param job       (IN) jobid sent in the request.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_get_root_job(orte_jobid_t *root_job, orte_jobid_t job)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* set default value */
    *root_job = ORTE_JOBID_INVALID;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_GET_ROOT_JOB_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* answer is NULL on failure, so there is nothing to release here */
    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    if (ORTE_NS_GET_ROOT_JOB_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, root_job, &count, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}
/**
 * Ask the name service replica for the parent job of the given job.
 *
 * Sends ORTE_NS_GET_PARENT_JOB_CMD with the jobid to ORTE_NS_MY_REPLICA and
 * unpacks the parent jobid from the reply.
 *
 * @param parent  (OUT) receives the parent jobid; set to ORTE_JOBID_INVALID
 *                on entry so it holds that value if the request fails early.
 * @param job     (IN) jobid sent in the request.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_get_parent_job(orte_jobid_t *parent, orte_jobid_t job)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* set default value */
    *parent = ORTE_JOBID_INVALID;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_GET_PARENT_JOB_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* answer is NULL on failure, so there is nothing to release here */
    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    if (ORTE_NS_GET_PARENT_JOB_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, parent, &count, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}
/**
 * Ask the name service replica to reserve a range of vpids within a job.
 *
 * Sends ORTE_NS_RESERVE_RANGE_CMD with the jobid and range to
 * ORTE_NS_MY_REPLICA and unpacks the starting vpid from the reply.
 *
 * @param job            (IN) jobid sent in the request.
 * @param range          (IN) number of vpids requested.
 * @param starting_vpid  (OUT) receives the first vpid of the range; set to
 *                       ORTE_VPID_INVALID on entry so it holds that value if
 *                       the request fails early.
 *
 * @return ORTE_SUCCESS, or an ORTE error code on failure.
 */
int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *starting_vpid)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    /* set default return value */
    *starting_vpid = ORTE_VPID_INVALID;

    if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    command = ORTE_NS_RESERVE_RANGE_CMD;
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&job, 1, ORTE_JOBID))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&range, 1, ORTE_VPID))) { /* got a problem */
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* A reply carrying the wrong command is a failure in its own right. It is
     * checked separately so that a successful unpack (rc == ORTE_SUCCESS)
     * cannot be returned to the caller as the result.
     */
    if (ORTE_NS_RESERVE_RANGE_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, starting_vpid, &count, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,23 +0,0 @@
# -*- makefile -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources += \
src/ns_proxy.h \
src/ns_proxy.c \
src/ns_proxy_component.c

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -16,37 +16,37 @@
# $HEADER$
#
# Use the top-level Makefile.options
sources =
include src/Makefile.extra
sources = \
ns_replica.h \
ns_replica_class_instances.h \
ns_replica_cell_fns.c \
ns_replica_diag_fns.c \
ns_replica_general_fns.c \
ns_replica_job_fns.c \
ns_replica_recv.c \
ns_replica_support_fns.c \
ns_replica_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_ns_replica_DSO
lib =
lib_sources =
component = mca_ns_replica.la
component_sources = $(sources)
component_noinst =
component_install = mca_ns_replica.la
else
lib = libmca_ns_replica.la
lib_sources = $(sources)
component =
component_sources =
component_noinst = libmca_ns_replica.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component)
mca_ns_replica_la_SOURCES = $(component_sources)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ns_replica_la_SOURCES = $(sources)
mca_ns_replica_la_LDFLAGS = -module -avoid-version
mca_ns_replica_la_LIBADD = \
$(top_ompi_builddir)/orte/liborte.la \
$(top_ompi_builddir)/opal/libopal.la
noinst_LTLIBRARIES = $(lib)
libmca_ns_replica_la_SOURCES = $(lib_sources)
noinst_LTLIBRARIES = $(component_noinst)
libmca_ns_replica_la_SOURCES =$(sources)
libmca_ns_replica_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -19,5 +19,6 @@
# Specific to this module
PARAM_INIT_FILE=src/ns_replica.c
PARAM_INIT_FILE=ns_replica.c
PARAM_CONFIG_HEADER_FILE="ns_replica.h"
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -38,227 +38,6 @@
*/
#define NS_REPLICA_MAX_STRING_SIZE 256
/*
* functions
*/
/**
 * Create a new cell record and return its cellid.
 *
 * The new cell takes the current value of orte_ns_replica.num_cells as its
 * id, stores copies of the site and resource strings, and is added to the
 * replica's cell array.
 *
 * @param cellid    (OUT) receives the new cellid; set to ORTE_CELLID_MAX on
 *                  entry so it holds that value on failure.
 * @param site      (IN) site name, copied with strdup.
 * @param resource  (IN) resource name, copied with strdup.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if no cellid or memory is
 *         available, or the error returned by orte_pointer_array_add.
 */
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_ns_replica_cell_tracker_t *new_cell;
    int rc;
    orte_std_cntr_t index;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    *cellid = ORTE_CELLID_MAX;

    /* check if cellid is available. NOTE: need to reserve
     * ORTE_CELLID_MAX as an invalid value, so can't allow
     * num_cells to get there
     */
    if (ORTE_CELLID_MAX-2 < orte_ns_replica.num_cells) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
    if (NULL == new_cell) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
                                orte_ns_replica.cells, new_cell))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(new_cell);  /* not stored anywhere - don't leak it */
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    new_cell->site = strdup(site);
    new_cell->resource = strdup(resource);
    new_cell->cell = orte_ns_replica.num_cells;
    *cellid = new_cell->cell;
    (orte_ns_replica.num_cells)++;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/**
 * Look up the site and resource names recorded for a cellid.
 *
 * Scans the replica's cell array for an entry whose cell matches. On a match,
 * *site and *resource receive strdup'd copies of the stored strings.
 *
 * @return ORTE_SUCCESS when the cell is found, ORTE_ERR_NOT_FOUND otherwise
 *         (in which case the output arguments are left untouched).
 */
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
                                  char **site, char **resource)
{
    orte_ns_replica_cell_tracker_t **table, *entry;
    orte_std_cntr_t slot = 0;
    orte_cellid_t seen = 0;
    int status = ORTE_ERR_NOT_FOUND;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    table = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;

    /* stop once every occupied slot has been visited or the array ends */
    while (seen < orte_ns_replica.num_cells &&
           slot < (orte_ns_replica.cells)->size) {
        entry = table[slot];
        slot++;
        if (NULL == entry) {
            continue;
        }
        seen++;
        if (cellid != entry->cell) {
            continue;
        }
        *site = strdup(entry->site);
        *resource = strdup(entry->resource);
        status = ORTE_SUCCESS;
        break;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return status;
}
/*
* JOBID functions
*/
/**
 * Create a new jobid tracker and hand back its jobid.
 *
 * The new job takes the current value of orte_ns_replica.num_jobids as its
 * id and is added to the replica's jobid array. *jobid is set to
 * ORTE_JOBID_MAX first, so it holds that value on any failure.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if no jobid or memory is
 *         available, or the error returned by orte_pointer_array_add.
 */
int orte_ns_replica_create_jobid(orte_jobid_t *jobid)
{
    orte_ns_replica_jobid_tracker_t *tracker;
    orte_std_cntr_t slot;
    int status;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    *jobid = ORTE_JOBID_MAX;

    /* ORTE_JOBID_MAX is reserved as an invalid value, so num_jobids
     * must never be allowed to reach it
     */
    if (ORTE_JOBID_MAX-2 < orte_ns_replica.num_jobids) {
        status = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(status);
        goto unlock;
    }

    tracker = OBJ_NEW(orte_ns_replica_jobid_tracker_t);
    if (NULL == tracker) {
        status = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(status);
        goto unlock;
    }

    status = orte_pointer_array_add(&slot, orte_ns_replica.jobids, tracker);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(tracker);
        goto unlock;
    }

    tracker->jobid = orte_ns_replica.num_jobids;
    *jobid = tracker->jobid;
    (orte_ns_replica.num_jobids)++;
    status = ORTE_SUCCESS;

unlock:
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return status;
}
/**
 * Reserve a contiguous range of vpids for a job.
 *
 * Finds the tracker for the job in the replica's jobid array. If the range
 * fits, *start receives the tracker's current next_vpid and next_vpid is
 * advanced by range.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the job has no tracker, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the range is not available.
 */
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
                                  orte_vpid_t *start)
{
    orte_ns_replica_jobid_tracker_t **table, *match = NULL;
    orte_std_cntr_t slot;
    orte_jobid_t seen;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* locate the tracker for this jobid */
    table = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
    for (slot=0, seen=0; seen < orte_ns_replica.num_jobids &&
                         slot < (orte_ns_replica.jobids)->size; slot++) {
        if (NULL == table[slot]) {
            continue;
        }
        seen++;
        if (job == table[slot]->jobid) {
            match = table[slot];
            break;
        }
    }

    if (NULL == match) {
        /* no record of the specified jobid - error */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    if ((ORTE_VPID_MAX-range-(match->next_vpid)) > 0) {
        *start = match->next_vpid;
        match->next_vpid += range;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    /* the requested range isn't available */
    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/**
 * Build the list of process names belonging to a job.
 *
 * Finds the tracker for the job in the replica's jobid array and creates one
 * process name (cellid 0, the given jobid) for every vpid from 0 up to, but
 * not including, the tracker's next_vpid.
 *
 * @param procs      (OUT) receives a malloc'd array of names that the caller
 *                   must free, or NULL when the job has no vpids assigned.
 * @param num_procs  (OUT) number of entries in *procs.
 * @param job        (IN) jobid to look up.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the job has no tracker, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the array cannot be allocated.
 */
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
                                  orte_std_cntr_t *num_procs, orte_jobid_t job)
{
    orte_ns_replica_jobid_tracker_t **ptr, *match = NULL;
    orte_process_name_t *nptr;
    orte_std_cntr_t j;
    orte_jobid_t k;
    orte_vpid_t v, nvpids;

    OPAL_TRACE_ARG1(1, job);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* find the jobid */
    ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
    for (j=0, k=0; k < orte_ns_replica.num_jobids &&
                   j < (orte_ns_replica.jobids)->size; j++) {
        if (NULL != ptr[j]) {
            k++;
            if (job == ptr[j]->jobid) {
                match = ptr[j];
                break;
            }
        }
    }

    if (NULL == match) {
        /* didn't find the specified jobid - error */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    /* the field next_vpid contains the value of the next unassigned
     * vpid, so the job extends from vpid=0 to that value. create
     * an array of process names containing those values
     */
    nvpids = match->next_vpid;

    /* malloc(0) may legitimately return NULL, which must not be reported
     * as an out-of-resource error - a job with no vpids has no peers
     */
    if (0 == nvpids) {
        *procs = NULL;
        *num_procs = 0;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    *procs = (orte_process_name_t*)malloc(nvpids * sizeof(orte_process_name_t));
    if (NULL == *procs) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    nptr = *procs;
    for (v=0; v < nvpids; v++) {
        nptr->cellid = 0;
        nptr->jobid = job;
        nptr->vpid = v;
        nptr++;
    }
    *num_procs = (orte_std_cntr_t)nvpids;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC functions
*/

Просмотреть файл

@ -34,31 +34,54 @@
extern "C" {
#endif
/* list class for tracking cellid's
/*
* globals
*/
#define NS_REPLICA_MAX_STRING_SIZE 256
/* class for tracking cellid's */
struct orte_ns_replica_cell_tracker_t {
opal_object_t super;
orte_cellid_t cell;
char *site;
char *resource;
orte_nodeid_t next_nodeid;
orte_pointer_array_t *nodeids;
};
typedef struct orte_ns_replica_cell_tracker_t orte_ns_replica_cell_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t);
/*
* object for tracking vpids/jobids
* This structure is used to track jobid-max vpid pairs. Basically, we
* are tracking the max used vpid for each jobid that has been created.
*/
struct orte_ns_replica_jobid_tracker_t {
/* object for tracking nodeid's */
struct orte_ns_replica_nodeid_tracker_t {
opal_object_t super;
orte_jobid_t jobid; /**< Job id */
orte_vpid_t next_vpid;
char *nodename;
orte_nodeid_t nodeid;
};
typedef struct orte_ns_replica_jobid_tracker_t orte_ns_replica_jobid_tracker_t;
typedef struct orte_ns_replica_nodeid_tracker_t orte_ns_replica_nodeid_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_nodeid_tracker_t);
/*
* object for tracking vpids and jobids for job families
* This structure is used to track the parent-child relationship between
* jobs. The "root" of the family is the initial parent - each child has
* a record under that parent. Any child that subsequently spawns its own
* children will form a list of jobids beneath them.
*
* each object records the jobid of the job it represents, and the next vpid
* that will be assigned when a range is requested.
*/
typedef struct {
opal_list_item_t super;
orte_jobid_t jobid;
orte_vpid_t next_vpid;
opal_list_t children;
} orte_ns_replica_jobitem_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_jobitem_t);
OBJ_CLASS_DECLARATION(orte_ns_replica_jobid_tracker_t);
struct orte_ns_replica_tagitem_t {
opal_object_t super;
@ -85,12 +108,8 @@ typedef struct {
size_t max_size, block_size;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
#if 0
orte_jobgrp_t num_jobgrps;
orte_pointer_array_t *jobgrps;
#endif
orte_jobid_t num_jobids;
orte_pointer_array_t *jobids;
opal_list_t jobs;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
@ -124,38 +143,53 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);
/*
* Implementation of create_cellid().
* CELL FUNCTIONS
*/
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource);
/*
* Implementation of get_cell_info()
*/
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
char **site, char **resource);
/*
* Implementation of create_jobid().
*/
int orte_ns_replica_create_jobid(orte_jobid_t *jobid);
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames);
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
/*
* Implementation of reserve_range()
* JOB FUNCTIONS
*/
int orte_ns_replica_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs);
int orte_ns_replica_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job);
int orte_ns_replica_get_job_children(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job);
int orte_ns_replica_get_root_job(orte_jobid_t *root_job, orte_jobid_t job);
int orte_ns_replica_get_parent_job(orte_jobid_t *parent, orte_jobid_t job);
int orte_ns_replica_reserve_range(orte_jobid_t job,
orte_vpid_t range,
orte_vpid_t *startvpid);
/*
* Peer functions
* GENERAL FUNCTIONS
*/
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, orte_jobid_t job);
int orte_ns_replica_get_peers(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, opal_list_t *attrs);
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
char *name);
int orte_ns_replica_define_data_type(const char *name,
orte_data_type_t *type);
int orte_ns_replica_create_my_name(void);
/*
* Diagnostic functions
* DIAGNOSTIC FUNCTIONS
*/
int orte_ns_replica_dump_cells(void);
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer);
@ -171,20 +205,46 @@ int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer);
/*
* Implementation of assign rml tag
* INTERNAL SUPPORT FUNCTIONS
*/
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
char *name);
int orte_ns_replica_define_data_type(const char *name,
orte_data_type_t *type);
int orte_ns_replica_create_my_name(void);
/*
*
/* find a job's record, wherever it may be located on the list of job families.
* this function searches the entire list of job families, traversing the list
* of all jobs in each family, until it finds the specified job. It then returns
* a pointer to that job's info structure. It returns
* NULL (without error_logging an error) if no record is found
*/
orte_ns_replica_jobitem_t* orte_ns_replica_find_job(orte_jobid_t job);
/* find the root job for the specified job.
* this function searches the entire list of job families, traversing the list
* of all jobs in each family, until it finds the specified job. It then returns
* a pointer to the root job's info structure for that job family. It returns
* NULL (without error_logging an error) if no record is found
*/
orte_ns_replica_jobitem_t* orte_ns_replica_find_root_job(orte_jobid_t job);
/* find a job's record on a specified root's family tree.
* this function finds the family record for the specified root job. It then
* traverses the children of that root until it finds the specified job, and then
* returns a pointer to that job's info structure. If root=jobid, then it will
* return a pointer to the root job's info structure. It returns
* NULL (without error_logging an error) if no record is found
*/
orte_ns_replica_jobitem_t* orte_ns_replica_search_job_family_tree(orte_jobid_t root, orte_jobid_t jobid);
/* given a job's record, create a flattened list of descendants below it */
void orte_ns_replica_construct_flattened_tree(opal_list_t *tree, orte_ns_replica_jobitem_t *ptr);
/* search down a tree, following all the children's branches, to find the specified
* job. Return a pointer to that object, and a pointer to the parent object
* This function is called recursively, so it passes into it the ptr to the
* current object being looked at
*/
orte_ns_replica_jobitem_t *down_search(orte_ns_replica_jobitem_t *ptr,
orte_ns_replica_jobitem_t **parent_ptr,
orte_jobid_t job);
ORTE_MODULE_DECLSPEC extern mca_ns_base_component_t mca_ns_replica_component;
#if defined(c_plusplus) || defined(__cplusplus)

297
orte/mca/ns/replica/ns_replica_cell_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,297 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/threads/mutex.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "ns_replica.h"
/*
* functions
*/
/**
 * Return the cellid for a (site, resource) pair, creating a new cell
 * record if the pair is not already known.
 *
 * @param cellid   (OUT) the matching or newly assigned cellid; left at
 *                 ORTE_CELLID_INVALID on any failure
 * @param site     site name; must not be NULL
 * @param resource resource name; must not be NULL
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM when site or resource is NULL,
 *         ORTE_ERR_OUT_OF_RESOURCE when no cellid or memory is available,
 *         or the error code returned by orte_pointer_array_add()
 */
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_ns_replica_cell_tracker_t *new_cell, **cell;
    int rc;
    orte_std_cntr_t i, j, index;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    *cellid = ORTE_CELLID_INVALID;

    /* check for error */
    if (NULL == site || NULL == resource) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_BAD_PARAM;
    }

    /* is this a known cellid? j counts the non-NULL entries seen so the
     * scan can stop once every recorded cell has been examined
     */
    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL != cell[i]) {
            j++;
            if (0 == strcmp(site, cell[i]->site) &&
                0 == strcmp(resource, cell[i]->resource)) {
                *cellid = cell[i]->cell;
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return ORTE_SUCCESS;
            }
        }
    }

    /* new cell - check if cellid is available */
    if (ORTE_CELLID_MAX-1 < orte_ns_replica.num_cells) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
    if (NULL == new_cell) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* fill in the record before storing it; the lookup loop above
     * passes site/resource straight to strcmp, so neither may be NULL
     */
    new_cell->site = strdup(site);
    new_cell->resource = strdup(resource);
    if (NULL == new_cell->site || NULL == new_cell->resource) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(new_cell);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    new_cell->cell = orte_ns_replica.num_cells;

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
                                orte_ns_replica.cells, new_cell))) {
        ORTE_ERROR_LOG(rc);
        /* the array did not take the record - don't leak it */
        OBJ_RELEASE(new_cell);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    *cellid = new_cell->cell;
    (orte_ns_replica.num_cells)++;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/**
 * Look up the site and resource names recorded for a cellid.
 *
 * @param cellid   cell to look up
 * @param site     (OUT) strdup'd copy of the cell's site name
 * @param resource (OUT) strdup'd copy of the cell's resource name
 *
 * @return ORTE_SUCCESS when the cell is found, ORTE_ERR_NOT_FOUND otherwise.
 *         A missing cell is not treated as an error, so it is NOT reported
 *         via ORTE_ERROR_LOG. The outputs are written only on success.
 */
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
                                  char **site, char **resource)
{
    orte_ns_replica_cell_tracker_t **trackers;
    orte_std_cntr_t slot = 0;
    orte_cellid_t seen = 0;
    int status = ORTE_ERR_NOT_FOUND;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    trackers = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;

    /* walk the slots until every recorded cell has been examined */
    while (seen < orte_ns_replica.num_cells &&
           slot < (orte_ns_replica.cells)->size) {
        orte_ns_replica_cell_tracker_t *candidate = trackers[slot];

        slot++;
        if (NULL == candidate) {
            continue;
        }
        seen++;
        if (cellid != candidate->cell) {
            continue;
        }

        *site = strdup(candidate->site);
        *resource = strdup(candidate->resource);
        status = ORTE_SUCCESS;
        break;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return status;
}
/*
* NODEID
*/
/**
 * Translate a NULL-terminated list of node names on a cell into nodeids,
 * assigning a new nodeid to every name not yet known on that cell.
 *
 * @param nodeids   (OUT) malloc'd array with one nodeid per name, in the
 *                  same order as nodenames; NULL when no names were given
 *                  or on a failure after the array was allocated. The
 *                  caller frees it.
 * @param nnodes    (OUT) number of entries in *nodeids (written on success)
 * @param cellid    cell on which the nodes reside
 * @param nodenames NULL-terminated argv-style array of node names
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the cell is unknown,
 *         ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or the error
 *         returned by orte_pointer_array_add()
 */
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
                                   orte_cellid_t cellid, char **nodenames)
{
    orte_ns_replica_cell_tracker_t **cell, *cptr;
    orte_ns_replica_nodeid_tracker_t **nodes, *node;
    orte_nodeid_t *nds;
    orte_std_cntr_t i, j, k, m, n, num_nodes, index;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    num_nodes = opal_argv_count(nodenames);
    if (0 == num_nodes) { /** no nodenames provided - just return */
        *nodeids = NULL;
        *nnodes = 0;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    nds = (orte_nodeid_t*)malloc(num_nodes * sizeof(orte_nodeid_t));
    if (NULL == nds) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /** find the cell */
    cptr = NULL;
    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL != cell[i]) {
            j++;
            if (cellid == cell[i]->cell) {
                cptr = cell[i];
                break;
            }
        }
    }

    if (NULL == cptr) {
        /** get here if we didn't find the cell */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        free(nds);
        *nodeids = NULL;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    for (n=0; n < num_nodes; n++) {
        /* re-read the storage address on every pass in case adding an
         * entry below changed where the array keeps its pointers
         */
        nodes = (orte_ns_replica_nodeid_tracker_t**)(cptr->nodeids->addr);

        /* check whether this nodename has already been defined on the
         * cell; if so, reuse its nodeid
         */
        node = NULL;
        for (k=0, m=0; m < cptr->next_nodeid &&
                       k < (cptr->nodeids)->size; k++) {
            if (NULL != nodes[k]) {
                m++;
                if (strcmp(nodenames[n], nodes[k]->nodename) == 0) { /** found same name */
                    node = nodes[k];
                    break;
                }
            }
        }
        if (NULL != node) {
            nds[n] = node->nodeid;
            continue;
        }

        /** get here if we don't find this nodename - add one */
        node = OBJ_NEW(orte_ns_replica_nodeid_tracker_t);
        if (NULL == node) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(nds);
            *nodeids = NULL;
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        node->nodename = strdup(nodenames[n]);
        if (NULL == node->nodename) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_RELEASE(node);
            free(nds);
            *nodeids = NULL;
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        node->nodeid = cptr->next_nodeid;

        /* record the tracker on the cell so later lookups (including
         * duplicates later in this same list) find it and so the cell's
         * destructor can release it
         */
        if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
                                    cptr->nodeids, node))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(node);
            free(nds);
            *nodeids = NULL;
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
        cptr->next_nodeid++;
        nds[n] = node->nodeid;
    } /** for n */

    *nodeids = nds;
    *nnodes = num_nodes;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/**
 * Translate an array of nodeids on a cell back into node names.
 *
 * @param nodenames (OUT) malloc'd, NULL-terminated array of strdup'd names,
 *                  one per requested nodeid and in the same order; NULL
 *                  when num_nodes is 0 or the cell is unknown. A nodeid
 *                  that is not found yields the string "NODE_NOT_FOUND"
 *                  (NULL cannot be used since it terminates the list).
 * @param cellid    cell on which the nodes reside
 * @param num_nodes number of entries in nodeids
 * @param nodeids   nodeids to translate
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the cell is unknown, or
 *         ORTE_ERR_OUT_OF_RESOURCE on allocation failure
 */
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid,
                                  orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
{
    char **names, *nm;
    orte_ns_replica_cell_tracker_t **cell, *cptr;
    orte_ns_replica_nodeid_tracker_t **nodes;
    orte_std_cntr_t i, j, k, m, n;
    char *err_name = "NODE_NOT_FOUND";

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    if (0 == num_nodes) {
        *nodenames = NULL;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    /** allocate an extra space for the NULL termination */
    names = (char**)malloc((num_nodes+1) * sizeof(char*));
    if (NULL == names) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    names[num_nodes] = NULL; /** NULL-terminate the list */

    /** find the cell */
    cptr = NULL;
    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL != cell[i]) {
            j++;
            if (cellid == cell[i]->cell) {
                cptr = cell[i];
                break;
            }
        }
    }

    if (NULL == cptr) {
        /** get here if we didn't find the cell */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        free(names);
        *nodenames = NULL;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    /** found the cell - look up the name recorded for each nodeid */
    nodes = (orte_ns_replica_nodeid_tracker_t**)(cptr->nodeids->addr);
    for (n=0; n < num_nodes; n++) {
        /** default to the error name. Can't use NULL since
         * the list is a NULL-terminated one
         */
        nm = err_name;
        for (k=0, m=0; m < cptr->next_nodeid &&
                       k < (cptr->nodeids)->size; k++) {
            if (NULL != nodes[k]) {
                m++;
                if (nodeids[n] == nodes[k]->nodeid) { /** found it */
                    nm = nodes[k]->nodename;
                    break;
                }
            }
        }
        names[n] = strdup(nm);
    }

    *nodenames = names;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,172 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef NS_REPLICA_CLASS_INSTANCES_H
#define NS_REPLICA_CLASS_INSTANCES_H
#include "orte_config.h"
#include "orte/orte_types.h"
#include "orte/orte_constants.h"
#include "opal/threads/mutex.h"
#include "opal/class/opal_object.h"
#include "orte/class/orte_pointer_array.h"
#include "orte/dss/dss.h"
#include "orte/mca/oob/oob_types.h"
#include "orte/mca/ns/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*** CELLID ***/
/* constructor - used to initialize state of cell_tracker instance.
 * The nodeids array is created here; if that fails the field is left NULL
 * so the destructor can tell there is nothing to release.
 */
static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    cell_tracker->cell = ORTE_CELLID_INVALID;
    cell_tracker->site = NULL;
    cell_tracker->resource = NULL;
    cell_tracker->next_nodeid = 0;
    cell_tracker->nodeids = NULL;

    if (ORTE_SUCCESS != orte_pointer_array_init(&(cell_tracker->nodeids),
                                                orte_ns_replica.block_size,
                                                orte_ns_replica.max_size,
                                                orte_ns_replica.block_size)) {
        cell_tracker->nodeids = NULL;
    }
}

/* destructor - used to free any resources held by instance: the site and
 * resource strings, every nodeid tracker stored in the nodeids array, and
 * the array itself
 */
static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    orte_std_cntr_t i, j;
    orte_ns_replica_nodeid_tracker_t **nodeid;

    if (NULL != cell_tracker->site) free(cell_tracker->site);
    if (NULL != cell_tracker->resource) free(cell_tracker->resource);

    /* nothing more to do if the constructor could not create the array */
    if (NULL == cell_tracker->nodeids) {
        return;
    }

    nodeid = (orte_ns_replica_nodeid_tracker_t**)(cell_tracker->nodeids)->addr;
    for (i=0, j=0; j < cell_tracker->next_nodeid &&
                   i < (cell_tracker->nodeids)->size; i++) {
        if (NULL != nodeid[i]) {
            j++;
            OBJ_RELEASE(nodeid[i]);
        }
    }
    OBJ_RELEASE(cell_tracker->nodeids);
}

/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_cell_tracker_t, /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_cell_tracker_construct, /* constructor */
                   orte_ns_replica_cell_tracker_destructor); /* destructor */
/** NODEID */
/* constructor - a new nodeid tracker starts with nodeid set to
 * ORTE_NODEID_INVALID and no nodename
 */
static void orte_ns_replica_nodeid_tracker_construct(orte_ns_replica_nodeid_tracker_t *tracker)
{
    tracker->nodename = NULL;
    tracker->nodeid = ORTE_NODEID_INVALID;
}

/* destructor - releases the nodename string, if one was set */
static void orte_ns_replica_nodeid_tracker_destructor(orte_ns_replica_nodeid_tracker_t *tracker)
{
    /* free() accepts NULL, so no guard is required */
    free(tracker->nodename);
}

/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_nodeid_tracker_t, /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_nodeid_tracker_construct, /* constructor */
                   orte_ns_replica_nodeid_tracker_destructor); /* destructor */
/*** JOBITEM ***/
/* constructor - a new jobitem has jobid ORTE_JOBID_INVALID, next_vpid 0
 * and an empty list of children
 */
static void orte_ns_replica_jobitem_construct(orte_ns_replica_jobitem_t *jobitem)
{
    OBJ_CONSTRUCT(&jobitem->children, opal_list_t);
    jobitem->next_vpid = 0;
    jobitem->jobid = ORTE_JOBID_INVALID;
}

/* destructor - releases every item on the children list, then the list */
static void orte_ns_replica_jobitem_destructor(orte_ns_replica_jobitem_t *jobitem)
{
    opal_list_item_t *child;

    for (;;) {
        child = opal_list_remove_first(&jobitem->children);
        if (NULL == child) {
            break;
        }
        OBJ_RELEASE(child);
    }
    OBJ_DESTRUCT(&jobitem->children);
}

/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_jobitem_t, /* type name */
                   opal_list_item_t, /* parent "class" name */
                   orte_ns_replica_jobitem_construct, /* constructor */
                   orte_ns_replica_jobitem_destructor); /* destructor */
/*** RML TAG ***/
/* constructor - a new tagitem has tag ORTE_RML_TAG_MAX and no name */
static void orte_ns_replica_tagitem_construct(orte_ns_replica_tagitem_t* tagitem)
{
    tagitem->name = NULL;
    tagitem->tag = ORTE_RML_TAG_MAX;
}

/* destructor - releases the name string, if one was set */
static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagitem)
{
    /* free() accepts NULL, so no guard is required */
    free(tagitem->name);
}

/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_tagitem_t, /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_tagitem_construct, /* constructor */
                   orte_ns_replica_tagitem_destructor); /* destructor */
/*** DATA TYPE ***/
/* constructor - a new data type item has id ORTE_DSS_ID_MAX and no name */
static void orte_ns_replica_dti_construct(orte_ns_replica_dti_t* dti)
{
    dti->name = NULL;
    dti->id = ORTE_DSS_ID_MAX;
}

/* destructor - releases the name string, if one was set */
static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti)
{
    /* free() accepts NULL, so no guard is required */
    free(dti->name);
}

/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_dti_t, /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_dti_construct, /* constructor */
                   orte_ns_replica_dti_destructor); /* destructor */
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

309
orte/mca/ns/replica/ns_replica_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,309 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI Name Server
*
* The Open MPI Name Server provides unique name ranges for processes
* within the universe. Each universe will have one name server
* running within the seed daemon. This is done to prevent the
* inadvertent duplication of names.
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"
/*
 * Component descriptor for the "replica" name service component.
 * The initializer is positional: version information and the component
 * open/close functions come first, then the checkpoint/restart flag,
 * then the init and finalize functions.
 */
mca_ns_base_component_t mca_ns_replica_component = {
{
MCA_NS_BASE_VERSION_2_0_0,
"replica", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_ns_replica_open, /* module open */
orte_ns_replica_close /* module close */
},
{
false /* checkpoint / restart */
},
orte_ns_replica_init, /* module init */
orte_ns_replica_finalize /* module shutdown */
};
/*
 * setup the function pointers for the module
 *
 * This is the table handed back by orte_ns_replica_init(). The initializer
 * is positional, so each entry binds to the field at the same position in
 * mca_ns_base_module_t - entries must not be reordered. Functions named
 * orte_ns_replica_* are implemented by this component; those named
 * orte_ns_base_* come from the ns base.
 */
static mca_ns_base_module_t orte_ns_replica_module = {
/* init */
orte_ns_replica_module_init,
/* cell functions */
orte_ns_replica_create_cellid,
orte_ns_replica_get_cell_info,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/** node functions */
orte_ns_replica_create_nodeids,
orte_ns_replica_get_node_info,
orte_ns_base_convert_nodeid_to_string,
orte_ns_base_convert_string_to_nodeid,
/* jobid functions */
orte_ns_replica_create_jobid,
orte_ns_replica_get_job_descendants,
orte_ns_replica_get_job_children,
orte_ns_replica_get_root_job,
orte_ns_replica_get_parent_job,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_replica_reserve_range,
/* vpid functions */
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_replica_create_my_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_compare_fields,
/* peer functions */
orte_ns_replica_get_peers,
/* tag server functions */
orte_ns_replica_assign_rml_tag,
/* data type functions */
orte_ns_replica_define_data_type,
/* diagnostic functions */
orte_ns_replica_dump_cells,
orte_ns_replica_dump_jobs,
orte_ns_replica_dump_tags,
orte_ns_replica_dump_datatypes
};
/*
 * Whether or not this component's tracking structures have been set up.
 * Set to true by orte_ns_replica_init() when it returns the module, and
 * checked (then cleared) by orte_ns_replica_finalize() so storage is only
 * freed if it was actually created.
 */
static bool initialized = false;
/*
 * class instantiations
 * (the header contains the constructors, destructors and
 * OBJ_CLASS_INSTANCE definitions for the replica's tracker classes)
 */
#include "ns_replica_class_instances.h"
/*
 * globals needed within replica component
 */
orte_ns_replica_globals_t orte_ns_replica;
/*
 * Component open: register the "ns replica" MCA parameters (debug,
 * isolate, maxsize, blocksize) and store the looked-up values in the
 * orte_ns_replica globals. Always returns ORTE_SUCCESS.
 */
int orte_ns_replica_open(void)
{
    int param_id, value;

    /* debug flag is read straight into the globals */
    param_id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false);
    mca_base_param_lookup_int(param_id, &orte_ns_replica.debug);

    /* isolate is stored as a boolean */
    param_id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false);
    mca_base_param_lookup_int(param_id, &value);
    orte_ns_replica.isolate = value ? true : false;

    /* sizing of the tracking arrays */
    param_id = mca_base_param_register_int("ns", "replica", "maxsize", NULL,
                                           ORTE_NS_ARRAY_MAX_SIZE);
    mca_base_param_lookup_int(param_id, &value);
    orte_ns_replica.max_size = (size_t)value;

    param_id = mca_base_param_register_int("ns", "replica", "blocksize", NULL,
                                           ORTE_NS_ARRAY_BLOCK_SIZE);
    mca_base_param_lookup_int(param_id, &value);
    orte_ns_replica.block_size = (size_t)value;

    return ORTE_SUCCESS;
}
/*
 * Component close. There is nothing to do here - the function only holds
 * the place in the component structure and always returns ORTE_SUCCESS.
 */
int orte_ns_replica_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component init: decide whether this process hosts the name service
 * replica and, if so, set up the tracking structures.
 *
 * If orte_process_info.ns_replica_uri is set we do not want to be
 * selected, so NULL is returned and *priority is left untouched.
 * Otherwise the cell, job, tag and data type trackers and the mutex are
 * initialized, *priority is set, and the module is returned.
 *
 * Returns NULL (after logging the error) if any tracker cannot be
 * created; anything set up before the failure is released again.
 */
mca_ns_base_module_t* orte_ns_replica_init(int *priority)
{
    int rc;

    /* only host a replica when no replica URI has been provided */
    if (NULL != orte_process_info.ns_replica_uri) {
        return NULL;
    }

    /* choose an arbitrary, positive priority -- it's only relevant
       compared to other ns components */
    *priority = 50;

    /* initialize the cell info tracker */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
                                (orte_std_cntr_t)orte_ns_replica.block_size,
                                (orte_std_cntr_t)orte_ns_replica.max_size,
                                (orte_std_cntr_t)orte_ns_replica.block_size))) {
        ORTE_ERROR_LOG(rc);
        return NULL;
    }
    orte_ns_replica.num_cells = 0;

    /* initialize the job tracking system */
    OBJ_CONSTRUCT(&orte_ns_replica.jobs, opal_list_t);
    orte_ns_replica.num_jobids = 0;

    /* initialize the taglist */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags),
                                (orte_std_cntr_t)orte_ns_replica.block_size,
                                (orte_std_cntr_t)orte_ns_replica.max_size,
                                (orte_std_cntr_t)orte_ns_replica.block_size))) {
        ORTE_ERROR_LOG(rc);
        goto release_jobs_and_cells;
    }
    orte_ns_replica.num_tags = 0;

    /* initialize the dtlist */
    if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts),
                                (orte_std_cntr_t)orte_ns_replica.block_size,
                                (orte_std_cntr_t)orte_ns_replica.max_size,
                                (orte_std_cntr_t)orte_ns_replica.block_size))) {
        ORTE_ERROR_LOG(rc);
        goto release_tags;
    }
    orte_ns_replica.num_dts = 0;

    /* setup the thread lock */
    OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t);

    /* Return the module */
    initialized = true;
    return &orte_ns_replica_module;

    /* failure paths: undo, in reverse order, whatever was set up.
     * "initialized" stays false, so finalize will not free these again.
     */
release_tags:
    OBJ_RELEASE(orte_ns_replica.tags);
release_jobs_and_cells:
    OBJ_DESTRUCT(&orte_ns_replica.jobs);
    OBJ_RELEASE(orte_ns_replica.cells);
    return NULL;
}
/*
 * Module init: unless the replica is isolated, post the persistent
 * non-blocking receive on ORTE_RML_TAG_NS that delivers incoming
 * messages to orte_ns_replica_recv.
 *
 * Returns ORTE_SUCCESS, or the (negative) error code from the receive
 * call, which is also logged.
 */
int orte_ns_replica_module_init(void)
{
    int status = ORTE_SUCCESS;

    if (!orte_ns_replica.isolate) {
        /* issue non-blocking receive for call_back function */
        status = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS,
                                         ORTE_RML_PERSISTENT, orte_ns_replica_recv, NULL);
        if (status < 0) {
            ORTE_ERROR_LOG(status);
        } else {
            /* any non-negative result counts as success */
            status = ORTE_SUCCESS;
        }
    }

    return status;
}
/*
 * Component finalize: release the cell, job, tag and data type tracking
 * storage (only if this component was initialized), then - unless the
 * replica is isolated - cancel the receive on ORTE_RML_TAG_NS.
 * Always returns ORTE_SUCCESS.
 */
int orte_ns_replica_finalize(void)
{
    orte_ns_replica_cell_tracker_t **cells;
    orte_ns_replica_tagitem_t **tags;
    orte_ns_replica_dti_t **dtypes;
    opal_list_item_t *job;
    orte_std_cntr_t idx;

    if (initialized) {
        /* cell trackers */
        cells = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
        idx = 0;
        while (idx < (orte_ns_replica.cells)->size) {
            if (NULL != cells[idx]) {
                OBJ_RELEASE(cells[idx]);
            }
            idx++;
        }
        OBJ_RELEASE(orte_ns_replica.cells);

        /* job families */
        for (job = opal_list_remove_first(&orte_ns_replica.jobs);
             NULL != job;
             job = opal_list_remove_first(&orte_ns_replica.jobs)) {
            OBJ_RELEASE(job);
        }
        OBJ_DESTRUCT(&orte_ns_replica.jobs);

        /* RML tags */
        tags = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
        idx = 0;
        while (idx < (orte_ns_replica.tags)->size) {
            if (NULL != tags[idx]) {
                OBJ_RELEASE(tags[idx]);
            }
            idx++;
        }
        OBJ_RELEASE(orte_ns_replica.tags);

        /* data types */
        dtypes = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
        idx = 0;
        while (idx < (orte_ns_replica.dts)->size) {
            if (NULL != dtypes[idx]) {
                OBJ_RELEASE(dtypes[idx]);
            }
            idx++;
        }
        OBJ_RELEASE(orte_ns_replica.dts);

        initialized = false;
    }

    /* All done - an isolated replica never posted a receive */
    if (!orte_ns_replica.isolate) {
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NS);
    }

    return ORTE_SUCCESS;
}

322
orte/mca/ns/replica/ns_replica_diag_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,322 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"
/*
* DIAGNOSTIC functions
*/
/**
 * Print the contents of the cell tracker: pack the dump into a local
 * buffer with orte_ns_replica_dump_cells_fn() and hand that buffer to
 * orte_ns_base_print_dump().
 *
 * @return ORTE_SUCCESS, or the (logged) error code of whichever step
 *         failed. The buffer is destructed on every path.
 */
int orte_ns_replica_dump_cells(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
    }

    OBJ_DESTRUCT(&buffer);
    return rc;
}
/**
 * Pack a textual dump of the cell tracker into the given buffer: a
 * heading, then for each recorded cell one string with its running number
 * and cellid and one string with its site and resource.
 *
 * @param buffer buffer that receives the packed strings
 * @return ORTE_SUCCESS, or the (logged) error code from orte_dss.pack
 */
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
    orte_ns_replica_cell_tracker_t **trackers;
    orte_std_cntr_t slot;
    orte_cellid_t count;
    char line[NS_REPLICA_MAX_STRING_SIZE], *text;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack is given the address of a char*, so keep one pointing
     * at the scratch line
     */
    text = line;

    snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
    rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto failed;
    }

    trackers = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    slot = 0;
    count = 0;
    while (count < orte_ns_replica.num_cells &&
           slot < (orte_ns_replica.cells)->size) {
        if (NULL == trackers[slot]) {
            slot++;
            continue;
        }
        count++;

        snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
                 (unsigned long)count, (unsigned long)trackers[slot]->cell);
        rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto failed;
        }

        snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
                 trackers[slot]->site, trackers[slot]->resource);
        rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto failed;
        }

        slot++;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;

failed:
    ORTE_ERROR_LOG(rc);
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Print the contents of the jobid tracker: pack the dump into a local
 * buffer with orte_ns_replica_dump_jobs_fn() and hand that buffer to
 * orte_ns_base_print_dump(). The replica mutex is held for the duration.
 *
 * @return ORTE_SUCCESS, or the (logged) error code of whichever step
 *         failed. The buffer is destructed and the mutex released on
 *         every path.
 */
int orte_ns_replica_dump_jobs(void)
{
    orte_buffer_t buffer;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
    }

    OBJ_DESTRUCT(&buffer);

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Recursively pack a description of every child of the given job into the
 * buffer. Each level of recursion extends the prefix, so deeper
 * descendants are indented further.
 *
 * @param ptr    job whose children are to be dumped
 * @param prefix text placed in front of each line at the parent's level
 * @param buffer buffer that receives the packed strings
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if a string cannot be
 *         allocated, or the (logged) error code from orte_dss.pack
 */
static int dump_child_jobs(orte_ns_replica_jobitem_t *ptr, char *prefix, orte_buffer_t *buffer)
{
    opal_list_item_t *item;
    orte_ns_replica_jobitem_t *child;
    char *tmp;
    char *pfx;
    int rc = ORTE_SUCCESS;

    if (0 > asprintf(&pfx, "%s ", prefix)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* print out the children's info */
    for (item = opal_list_get_first(&ptr->children);
         item != opal_list_get_end(&ptr->children);
         item = opal_list_get_next(item)) {
        child = (orte_ns_replica_jobitem_t*)item;

        if (0 > asprintf(&tmp, "%sChild jobid: %ld Next vpid: %ld Num direct children: %ld\n",
                         pfx, (long)child->jobid, (long)child->next_vpid,
                         (long)opal_list_get_size(&child->children))) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            break;
        }

        rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING);
        free(tmp);  /* released whether or not the pack succeeded */
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }

        if (ORTE_SUCCESS != (rc = dump_child_jobs(child, pfx, buffer))) {
            ORTE_ERROR_LOG(rc);
            break;
        }
    }

    /* single exit so the prefix is freed on error paths too */
    free(pfx);
    return rc;
}
/**
 * Pack a textual dump of the jobid tracker into the given buffer: a
 * heading, then for every job family its root jobid, the root's next vpid
 * and number of direct children, followed by the family's descendants
 * (via dump_child_jobs).
 *
 * This function does not take the replica mutex itself.
 *
 * @param buffer buffer that receives the packed strings
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if a string cannot be
 *         allocated, or the (logged) error code from orte_dss.pack or
 *         dump_child_jobs
 */
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer)
{
    orte_ns_replica_jobitem_t *root;
    opal_list_item_t *item;
    char *tmp;
    int rc;
    char *prefix = " ";

    if (0 > asprintf(&tmp, "Dump of Name Service Jobid Tracker\n")) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING);
    free(tmp);  /* released whether or not the pack succeeded */
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    for (item = opal_list_get_first(&orte_ns_replica.jobs);
         item != opal_list_get_end(&orte_ns_replica.jobs);
         item = opal_list_get_next(item)) {
        root = (orte_ns_replica_jobitem_t*)item;

        if (0 > asprintf(&tmp, " Data for job family with root %ld\n", (long)root->jobid)) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING);
        free(tmp);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        if (0 > asprintf(&tmp, "%sNext vpid: %ld Num direct children: %ld\n",
                         prefix, (long)root->next_vpid,
                         (long)opal_list_get_size(&root->children))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING);
        free(tmp);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        if (ORTE_SUCCESS != (rc = dump_child_jobs(root, prefix, buffer))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/**
 * Print the contents of the RML tag tracker: pack the dump into a local
 * buffer with orte_ns_replica_dump_tags_fn() and hand that buffer to
 * orte_ns_base_print_dump().
 *
 * @return ORTE_SUCCESS, or the (logged) error code of whichever step
 *         failed. The buffer is destructed on every path.
 */
int orte_ns_replica_dump_tags(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
    }

    OBJ_DESTRUCT(&buffer);
    return rc;
}
/**
 * Pack a textual dump of the RML tag tracker into the given buffer: a
 * heading, then one string per recorded tag giving its running number,
 * tag id and name.
 *
 * @param buffer buffer that receives the packed strings
 * @return ORTE_SUCCESS, or the (logged) error code from orte_dss.pack
 */
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer)
{
    orte_ns_replica_tagitem_t **items;
    orte_std_cntr_t slot;
    orte_rml_tag_t count;
    char line[NS_REPLICA_MAX_STRING_SIZE], *text;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack is given the address of a char*, so keep one pointing
     * at the scratch line
     */
    text = line;

    snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n");
    rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto failed;
    }

    items = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
    count = 0;
    for (slot = 0; count < orte_ns_replica.num_tags &&
                   slot < (orte_ns_replica.tags)->size; slot++) {
        if (NULL == items[slot]) {
            continue;
        }
        count++;

        snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n",
                 (unsigned long)count, (unsigned long)items[slot]->tag, items[slot]->name);
        rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto failed;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;

failed:
    ORTE_ERROR_LOG(rc);
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Print the contents of the data type tracker: pack the dump into a local
 * buffer with orte_ns_replica_dump_datatypes_fn() and hand that buffer to
 * orte_ns_base_print_dump().
 *
 * @return ORTE_SUCCESS, or the (logged) error code of whichever step
 *         failed. The buffer is destructed on every path.
 */
int orte_ns_replica_dump_datatypes(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
    }

    OBJ_DESTRUCT(&buffer);
    return rc;
}
/**
 * Pack a textual dump of the data type tracker into the given buffer: a
 * heading, then one string per recorded data type giving its running
 * number, id and name.
 *
 * @param buffer buffer that receives the packed strings
 * @return ORTE_SUCCESS, or the (logged) error code from orte_dss.pack
 */
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer)
{
    orte_ns_replica_dti_t **items;
    orte_std_cntr_t slot, count;
    char line[NS_REPLICA_MAX_STRING_SIZE], *text;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack is given the address of a char*, so keep one pointing
     * at the scratch line
     */
    text = line;

    snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n");
    rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto failed;
    }

    items = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
    count = 0;
    for (slot = 0; count < orte_ns_replica.num_dts &&
                   slot < (orte_ns_replica.dts)->size; slot++) {
        if (NULL == items[slot]) {
            continue;
        }
        count++;

        snprintf(text, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n",
                 (unsigned long)count, (unsigned long)items[slot]->id, items[slot]->name);
        rc = orte_dss.pack(buffer, &text, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto failed;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;

failed:
    ORTE_ERROR_LOG(rc);
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}

403
orte/mca/ns/replica/ns_replica_general_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,403 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"
/*** GET PEERS ***/
/**
 * Build an array of process names ("peers") selected by the given attributes.
 *
 * Attribute handling, in the order it is evaluated:
 *  - ORTE_NS_USE_CELL: only this process's own cell or the wildcard cell is
 *    accepted; any other cell yields ORTE_ERR_NOT_IMPLEMENTED.
 *  - no ORTE_NS_USE_JOBID: return the peers of the caller's own job, built
 *    from orte_process_info (num_procs names starting at vpid_start).
 *  - ORTE_NS_USE_JOBID + ORTE_NS_INCLUDE_DESCENDANTS: the job and every job
 *    below it in the job tree.
 *  - ORTE_NS_USE_JOBID + ORTE_NS_INCLUDE_CHILDREN: the job and its immediate
 *    children only.
 *  - ORTE_NS_USE_JOBID alone: just that job.
 *
 * For tracked jobs, vpids 0 .. next_vpid-1 are reported for each job.
 *
 * @param procs      (OUT) malloc'd array of names, or NULL when there are no
 *                   peers or an error occurred. The caller frees it.
 * @param num_procs  (OUT) number of entries in *procs.
 * @param attrs      (IN)  attribute list controlling the selection.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_IMPLEMENTED, ORTE_ERR_NOT_FOUND,
 *         ORTE_ERR_OUT_OF_RESOURCE, or the error returned by orte_dss.get.
 */
int orte_ns_replica_get_peers(orte_process_name_t **procs,
                              orte_std_cntr_t *num_procs, opal_list_t *attrs)
{
    orte_std_cntr_t i, isave, npeers;
    orte_jobid_t *jptr;
    orte_cellid_t *cptr;
    orte_attribute_t *attr;
    orte_ns_replica_jobitem_t *job_info, *child;
    opal_list_item_t *item;
    opal_list_t peerlist;
    int rc;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* set default value */
    *procs = NULL;
    *num_procs = 0;

    /* if the cell is given AND it matches my own, then we can process this
     * quickly. Otherwise, we have to do some more work.
     *
     * RHC: when we go multi-cell, we need a way to find all the cells upon
     * which a job is executing so we can make this work!
     */
    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
        if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_NOT_IMPLEMENTED;
        }
    }

    if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
        /* no job given: get my own job peers, assuming all are on this cell */
        *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
        if (NULL == *procs) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        for (i=0; i < orte_process_info.num_procs; i++) {
            (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
            (*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
            (*procs)[i].vpid = orte_process_info.vpid_start + i;
        }
        *num_procs = orte_process_info.num_procs;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    /* we get here if the job attribute was passed to us - use that jobid */
    if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    /* look up this job's record on the tracking database */
    if (NULL == (job_info = orte_ns_replica_find_job(*jptr))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS))) {
        /* we want the peers from this job AND ALL of its descendants - start by
         * constructing a flattened list of the job plus its descendants. The list
         * holds private copies, so every entry must be released before we leave.
         */
        rc = ORTE_SUCCESS;
        npeers = 0;
        OBJ_CONSTRUCT(&peerlist, opal_list_t);

        child = OBJ_NEW(orte_ns_replica_jobitem_t);
        if (NULL == child) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto DESCENDANTS_DONE;
        }
        child->jobid = job_info->jobid;
        child->next_vpid = job_info->next_vpid;
        opal_list_append(&peerlist, &child->super);  /* add the current job to the list */
        orte_ns_replica_construct_flattened_tree(&peerlist, job_info);

        /* total number of names we are going to report */
        for (item = opal_list_get_first(&peerlist);
             item != opal_list_get_end(&peerlist);
             item = opal_list_get_next(item)) {
            child = (orte_ns_replica_jobitem_t*)item;
            npeers += child->next_vpid;
        }

        if (0 < npeers) {
            *procs = (orte_process_name_t*)malloc(npeers * sizeof(orte_process_name_t));
            if (NULL == *procs) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto DESCENDANTS_DONE;
            }

            /* populate it from the list, one job after the other */
            isave = 0;
            for (item = opal_list_get_first(&peerlist);
                 item != opal_list_get_end(&peerlist);
                 item = opal_list_get_next(item)) {
                child = (orte_ns_replica_jobitem_t*)item;
                for (i=0; i < child->next_vpid; i++) {
                    (*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
                    (*procs)[i+isave].jobid = child->jobid;
                    (*procs)[i+isave].vpid = i;
                }
                isave += child->next_vpid;
            }
        }
        *num_procs = npeers;

DESCENDANTS_DONE:
        /* release the temporary copies and the list itself on every path */
        while (NULL != (item = opal_list_remove_first(&peerlist))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&peerlist);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN))) {
        /* we want the peers from this job AND ONLY its immediate children */

        /* determine the number of peers we are going to have */
        npeers = job_info->next_vpid;
        for (item = opal_list_get_first(&job_info->children);
             item != opal_list_get_end(&job_info->children);
             item = opal_list_get_next(item)) {
            child = (orte_ns_replica_jobitem_t*)item;
            npeers += child->next_vpid;
        }

        /* create the array */
        if (0 < npeers) {
            *procs = (orte_process_name_t*)malloc(npeers * sizeof(orte_process_name_t));
            if (NULL == *procs) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }

            /* populate it, starting with the specified job followed by its children */
            for (i=0; i < job_info->next_vpid; i++) {
                (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
                (*procs)[i].jobid = *jptr;
                (*procs)[i].vpid = i;
            }
            isave = job_info->next_vpid;
            for (item = opal_list_get_first(&job_info->children);
                 item != opal_list_get_end(&job_info->children);
                 item = opal_list_get_next(item)) {
                child = (orte_ns_replica_jobitem_t*)item;
                for (i=0; i < child->next_vpid; i++) {
                    (*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
                    (*procs)[i+isave].jobid = child->jobid;
                    (*procs)[i+isave].vpid = i;
                }
                isave += child->next_vpid;
            }
        }
        *num_procs = npeers;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    /* get here if we want just the peers for the specified job */
    if (0 < job_info->next_vpid) {
        *procs = (orte_process_name_t*)malloc(job_info->next_vpid * sizeof(orte_process_name_t));
        if (NULL == *procs) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        for (i=0; i < job_info->next_vpid; i++) {
            (*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
            (*procs)[i].jobid = *jptr;
            (*procs)[i].vpid = i;
        }
    }
    *num_procs = job_info->next_vpid;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* TAG SERVER functions
*/
/**
 * Return the RML tag registered under @p name, allocating a new one if needed.
 *
 * When @p name is non-NULL and already present in the tag table, the existing
 * tag is returned. Otherwise a new entry is created whose tag value is
 * (current number of tags + ORTE_RML_TAG_DYNAMIC); the name, if given, is
 * copied so the tag can be looked up later.
 *
 * @param tag   (OUT) the assigned tag; set to ORTE_RML_TAG_MAX when a new tag
 *              was needed but could not be allocated.
 * @param name  (IN)  optional name for the tag, may be NULL.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE, or the error returned by
 *         orte_pointer_array_add.
 */
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
                                   char *name)
{
    orte_ns_replica_tagitem_t *tagitem, **tags;
    orte_std_cntr_t i;
    orte_rml_tag_t j;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    if (NULL != name) {
        /* see if this name is already in list - if so, return tag.
         * j counts the non-NULL entries seen so the scan can stop early.
         */
        tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr;
        for (i=0, j=0; j < orte_ns_replica.num_tags &&
                       i < (orte_ns_replica.tags)->size; i++) {
            if (NULL != tags[i]) {
                j++;
                if (tags[i]->name != NULL &&
                    0 == strcmp(name, tags[i]->name)) { /* found name on list */
                    *tag = tags[i]->tag;
                    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
    }

    /* not in list or not provided, so allocate next tag */
    *tag = ORTE_RML_TAG_MAX;

    /* check if tag is available - need to do this since the tag type
     * is probably not going to be a orte_std_cntr_t, so we cannot just rely
     * on the pointer_array's size limits to protect us. NOTE: need to
     * reserve ORTE_RML_TAG_MAX as an invalid value, so can't let
     * num_tags get there
     */
    if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
    if (NULL == tagitem) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_replica.tags, tagitem))) {
        ORTE_ERROR_LOG(rc);
        /* the table did not take the entry, so nobody else will free it */
        OBJ_RELEASE(tagitem);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    tagitem->tag = orte_ns_replica.num_tags + ORTE_RML_TAG_DYNAMIC;
    (orte_ns_replica.num_tags)++;

    if (NULL != name) {  /* provided - can look it up later */
        tagitem->name = strdup(name);
    } else {
        tagitem->name = NULL;
    }

    *tag = tagitem->tag;
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* DATA TYPE SERVER functions
*/
/**
 * Return the data type id registered under @p name, defining it if needed.
 *
 * If @p name is already in the data type table its id is returned; otherwise
 * a new entry is created whose id equals the current number of defined types.
 *
 * @param name  (IN)     name of the data type; must not be NULL.
 * @param type  (IN/OUT) on entry must not hold a positive value (a positive
 *              value is rejected with ORTE_ERR_BAD_PARAM); on success holds
 *              the id. Set to ORTE_DSS_ID_MAX when a new id was needed but
 *              could not be allocated.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM, ORTE_ERR_OUT_OF_RESOURCE, or the
 *         error returned by orte_pointer_array_add.
 */
int orte_ns_replica_define_data_type(const char *name,
                                     orte_data_type_t *type)
{
    orte_ns_replica_dti_t **dti, *dtip;
    orte_std_cntr_t i, j;
    int rc;

    if (NULL == name || 0 < *type) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* scan the table for the name; j counts the non-NULL entries seen so the
     * scan can stop once every defined type has been examined
     */
    dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr;
    for (i=0, j=0; j < orte_ns_replica.num_dts &&
                   i < orte_ns_replica.dts->size; i++) {
        if (NULL != dti[i]) {
            j++;
            if (dti[i]->name != NULL &&
                0 == strcmp(name, dti[i]->name)) { /* found name on list */
                *type = dti[i]->id;
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return ORTE_SUCCESS;
            }
        }
    }

    /* not in list, so allocate next id */
    *type = ORTE_DSS_ID_MAX;

    /* check if id is available - need to do this since the data type
     * is probably not going to be a orte_std_cntr_t, so we cannot just rely
     * on the pointer_array's size limits to protect us.
     */
    if (ORTE_DSS_ID_MAX-2 < orte_ns_replica.num_dts) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    dtip = OBJ_NEW(orte_ns_replica_dti_t);
    if (NULL == dtip) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_replica.dts, dtip))) {
        ORTE_ERROR_LOG(rc);
        /* the table did not take the entry, so nobody else will free it */
        OBJ_RELEASE(dtip);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    /* copy the name only once the entry is safely in the table, so the
     * failure path above has nothing but the bare object to give back
     */
    dtip->name = strdup(name);
    dtip->id = orte_ns_replica.num_dts;
    (orte_ns_replica.num_dts)++;

    *type = dtip->id;
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* NAME functions
*/
/**
 * Create this process's own name.
 *
 * Requests a new jobid (with an empty attribute list), reserves a single
 * vpid in that job, and builds orte_process_info.my_name from cell 0, the
 * new jobid and the reserved vpid.
 *
 * @return ORTE_SUCCESS, or the first error reported by the name service.
 */
int orte_ns_replica_create_my_name(void)
{
    orte_jobid_t my_job;
    orte_vpid_t my_vpid;
    opal_list_t no_attrs;
    int rc;

    /* the attribute list is only needed for the duration of the call */
    OBJ_CONSTRUCT(&no_attrs, opal_list_t);
    rc = orte_ns.create_jobid(&my_job, &no_attrs);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&no_attrs);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    rc = orte_ns.reserve_range(my_job, 1, &my_vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = orte_ns.create_process_name(&(orte_process_info.my_name),
                                     0, my_job, my_vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}

295
orte/mca/ns/replica/ns_replica_job_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,295 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "ns_replica.h"
/*
* JOBID functions
*/
/**
 * Allocate a new jobid and record it in the job tree.
 *
 * Attributes select where the new job is attached:
 *  - ORTE_NS_USE_PARENT: the given jobid becomes the parent of the new job.
 *  - ORTE_NS_USE_ROOT:   the root of the given jobid's family becomes the
 *                        parent (only consulted when USE_PARENT is absent).
 *  - neither:            the new job is the root of a new job family.
 *
 * @param jobid  (OUT) the new jobid, or ORTE_JOBID_INVALID on failure.
 * @param attrs  (IN)  attribute list.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND, ORTE_ERR_OUT_OF_RESOURCE, or the
 *         error returned by orte_dss.get. The name service mutex is released
 *         on every path.
 */
int orte_ns_replica_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs)
{
    orte_ns_replica_jobitem_t *newjob, *parent, *root;
    orte_jobid_t parent_job=ORTE_JOBID_INVALID, *jptr;
    orte_attribute_t *attr;
    opal_list_t *family;
    int rc = ORTE_SUCCESS;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    *jobid = ORTE_JOBID_INVALID;

    /* check for attributes */
    if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_PARENT))) {
        /* declares the specified jobid to be the parent of the new one */
        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto UNLOCK;
        }
        parent_job = *jptr;
    } else if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_ROOT))) {
        /* use the root of the specified job as the parent of the new one */
        if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto UNLOCK;
        }
        if (NULL == (root = orte_ns_replica_find_root_job(*jptr))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto UNLOCK;
        }
        parent_job = root->jobid;
    }

    if (ORTE_JOBID_INVALID == parent_job) {
        /* no parent: this is the root of a new job family, so it goes on
         * the top-level list of jobs
         */
        family = &orte_ns_replica.jobs;
    } else {
        /* the request is for a new child of this parent - find the
         * parent's record and attach to its list of children
         */
        if (NULL == (parent = orte_ns_replica_find_job(parent_job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto UNLOCK;
        }
        family = &parent->children;
    }

    newjob = OBJ_NEW(orte_ns_replica_jobitem_t);
    if (NULL == newjob) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto UNLOCK;
    }

    /* jobids are handed out sequentially from the global counter */
    newjob->jobid = orte_ns_replica.num_jobids;
    opal_list_append(family, &newjob->super);
    *jobid = newjob->jobid;
    (orte_ns_replica.num_jobids)++;

UNLOCK:
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Return the given job followed by every job that descends from it.
 *
 * @param descendants  (OUT) malloc'd array of jobids with @p job first, or
 *                     NULL on failure. The caller frees it.
 * @param num_desc     (OUT) number of entries in *descendants (0 on failure).
 * @param job          (IN)  jobid whose family subtree is wanted.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the job is not tracked, or
 *         ORTE_ERR_OUT_OF_RESOURCE.
 */
int orte_ns_replica_get_job_descendants(orte_jobid_t **descendants, orte_std_cntr_t *num_desc, orte_jobid_t job)
{
    orte_std_cntr_t i, num;
    orte_ns_replica_jobitem_t *ptr, *newptr;
    orte_jobid_t *descs;
    opal_list_t desc_list;
    opal_list_item_t *item;
    int rc = ORTE_SUCCESS;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* default values */
    *descendants = NULL;
    *num_desc = 0;

    /* find this job's record on the tree */
    if (NULL == (ptr = orte_ns_replica_find_job(job))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    /* construct a flattened list of its descendants - including ourself.
     * The list holds private copies that are released below on every path.
     */
    OBJ_CONSTRUCT(&desc_list, opal_list_t);
    newptr = OBJ_NEW(orte_ns_replica_jobitem_t);
    if (NULL == newptr) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    newptr->jobid = job;
    opal_list_append(&desc_list, &newptr->super);
    orte_ns_replica_construct_flattened_tree(&desc_list, ptr);

    /* count number of entries - at least one, since we added ourself */
    num = opal_list_get_size(&desc_list);

    /* allocate memory for the array */
    descs = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
    if (NULL == descs) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* now fill in the array, preserving list order */
    i = 0;
    for (item = opal_list_get_first(&desc_list);
         item != opal_list_get_end(&desc_list);
         item = opal_list_get_next(item)) {
        descs[i++] = ((orte_ns_replica_jobitem_t*)item)->jobid;
    }

    *descendants = descs;
    *num_desc = num;

CLEANUP:
    while (NULL != (item = opal_list_remove_first(&desc_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&desc_list);

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Return the given job followed by its immediate children.
 *
 * @param children    (OUT) malloc'd array of jobids with @p job in slot 0,
 *                    or NULL on failure. The caller frees it.
 * @param num_childs  (OUT) number of entries in *children (0 on failure).
 * @param job         (IN)  jobid whose direct children are wanted.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the job is not tracked, or
 *         ORTE_ERR_OUT_OF_RESOURCE.
 */
int orte_ns_replica_get_job_children(orte_jobid_t **children, orte_std_cntr_t *num_childs, orte_jobid_t job)
{
    orte_std_cntr_t slot, total;
    orte_ns_replica_jobitem_t *record;
    orte_jobid_t *ids;
    opal_list_item_t *cursor;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* nothing to report until proven otherwise */
    *children = NULL;
    *num_childs = 0;

    /* locate the job's record in the tree */
    record = orte_ns_replica_find_job(job);
    if (NULL == record) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    /* one slot for the job itself plus one per direct child */
    total = 1 + opal_list_get_size(&record->children);

    ids = (orte_jobid_t*)malloc(total * sizeof(orte_jobid_t));
    if (NULL == ids) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* the job itself goes first, then the children in list order */
    ids[0] = job;
    slot = 1;
    cursor = opal_list_get_first(&record->children);
    while (cursor != opal_list_get_end(&record->children)) {
        ids[slot] = ((orte_ns_replica_jobitem_t*)cursor)->jobid;
        slot++;
        cursor = opal_list_get_next(cursor);
    }

    *children = ids;
    *num_childs = total;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/**
 * Report the jobid at the root of the family that contains @p job.
 *
 * @param root_job  (OUT) jobid of the family root; untouched on failure.
 * @param job       (IN)  jobid to look up.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_NOT_FOUND if the job is not tracked.
 */
int orte_ns_replica_get_root_job(orte_jobid_t *root_job, orte_jobid_t job)
{
    orte_ns_replica_jobitem_t *top;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    top = orte_ns_replica_find_root_job(job);
    if (NULL != top) {
        *root_job = top->jobid;
        rc = ORTE_SUCCESS;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Report the jobid of the job that spawned @p job.
 *
 * Every job family is searched for @p job, and the record directly above the
 * match supplies the parent jobid.
 *
 * @param parent_job  (OUT) parent jobid. Set to ORTE_JOBID_INVALID when the
 *                    job is not tracked. When the job is itself the root of
 *                    its family (it has no parent), the job's own id is
 *                    returned.
 * @param job         (IN)  jobid to look up.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_NOT_FOUND if the job is not tracked
 *         (deliberately not logged as an error).
 */
int orte_ns_replica_get_parent_job(orte_jobid_t *parent_job, orte_jobid_t job)
{
    opal_list_item_t *item;
    orte_ns_replica_jobitem_t *root, *parent;
    int rc = ORTE_ERR_NOT_FOUND;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* don't report an error if not found, just return invalid */
    *parent_job = ORTE_JOBID_INVALID;

    /* find this job's parent object */
    for (item = opal_list_get_first(&orte_ns_replica.jobs);
         item != opal_list_get_end(&orte_ns_replica.jobs);
         item = opal_list_get_next(item)) {
        root = (orte_ns_replica_jobitem_t*)item;

        /* down_search leaves the parent untouched when the root itself is
         * the match, and may leave a stale value behind after an unsuccessful
         * walk of an earlier family - so reset it for every tree
         */
        parent = NULL;
        if (NULL != down_search(root, &parent, job)) {
            /* NOTE(review): for a root job we report the job itself as its
             * own parent rather than dereferencing an unset pointer - confirm
             * this matches the contract documented for the NS API.
             */
            *parent_job = (NULL != parent) ? parent->jobid : job;
            rc = ORTE_SUCCESS;
            break;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/**
 * Reserve a contiguous block of vpids in a job.
 *
 * @param job    (IN)  jobid in which to reserve.
 * @param range  (IN)  number of vpids wanted.
 * @param start  (OUT) first vpid of the reserved block; untouched on failure.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_NOT_FOUND if the job is not tracked, or
 *         ORTE_ERR_OUT_OF_RESOURCE if the block does not fit below
 *         ORTE_VPID_MAX.
 */
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
                                  orte_vpid_t *start)
{
    orte_ns_replica_jobitem_t *record;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* locate the job's record */
    record = orte_ns_replica_find_job(job);
    if (NULL == record) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_NOT_FOUND;
    }

    /* refuse the request when the range isn't available */
    if ((ORTE_VPID_MAX-range-(record->next_vpid)) <= 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* hand out the block starting at the next unused vpid */
    *start = record->next_vpid;
    record->next_vpid += range;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}

457
orte/mca/ns/replica/ns_replica_recv.c Обычный файл
Просмотреть файл

@ -0,0 +1,457 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI Name Server
*
*/
/*
* includes
*/
#include "orte_config.h"
#include <stdlib.h>
#include <string.h>
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"
/*
* handle message from proxies
* NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program.
* DO NOT RELEASE THIS BUFFER IN THIS CODE
*/
void orte_ns_replica_recv(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
orte_buffer_t answer, error_answer;
orte_ns_cmd_flag_t command;
opal_list_t attrs;
orte_cellid_t cell;
orte_jobid_t job, root, *descendants;
orte_vpid_t startvpid, range;
char *tagname, *site, *resource;
orte_rml_tag_t oob_tag;
orte_data_type_t type;
orte_std_cntr_t count, nprocs, nret;
orte_process_name_t *procs;
int rc=ORTE_SUCCESS, ret;
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
switch (command) {
case ORTE_NS_CREATE_CELLID_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
rc = orte_ns_replica_create_cellid(&cell, site, resource);
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_CELL_INFO_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
site = NULL;
resource = NULL;
rc = orte_ns_replica_get_cell_info(cell, &site, &resource);
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_CREATE_NODEID_CMD:
case ORTE_NS_GET_NODE_INFO_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
goto RETURN_ERROR;
break;
case ORTE_NS_CREATE_JOBID_CMD:
/* get the list of attributes */
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_create_jobid(&job, &attrs))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&attrs);
goto RETURN_ERROR;
}
OBJ_DESTRUCT(&attrs);
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_JOB_DESC_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_descendants(&descendants, &nret, job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&nret, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 < nret) {
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)descendants, nret, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_JOB_CHILD_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_children(&descendants, &nret, job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&nret, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 < nret) {
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)descendants, nret, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_ROOT_JOB_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_root_job(&root, job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&root, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_PARENT_JOB_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_parent_job(&root, job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&root, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_RESERVE_RANGE_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&range, &count, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_reserve_range(job, range, &startvpid))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&startvpid, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_ASSIGN_OOB_TAG_CMD:
count = 1;
if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
rc = ORTE_ERR_UNPACK_FAILURE;
goto RETURN_ERROR;
}
if (0 == strncmp(tagname, "NULL", 4)) {
if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) {
goto RETURN_ERROR;
}
} else {
if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) {
goto RETURN_ERROR;
}
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&oob_tag, 1, ORTE_RML_TAG))) {
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DEFINE_DATA_TYPE_CMD:
count = 1;
if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
rc = ORTE_ERR_UNPACK_FAILURE;
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_ns_replica_define_data_type(tagname, &type))) {
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&type, 1, ORTE_DATA_TYPE))) {
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_CREATE_MY_NAME_CMD:
/* ignore this command */
break;
case ORTE_NS_GET_PEERS_CMD:
/* get the list of attributes */
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
if(ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
/* process the request */
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_peers(&procs, &nprocs, &attrs))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&attrs);
goto RETURN_ERROR;
}
OBJ_DESTRUCT(&attrs);
/* pack the answer */
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (nprocs > 0) {
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, procs, nprocs, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_CELLS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_JOBIDS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_TAGS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_DATATYPES_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
default:
goto RETURN_ERROR;
}
goto CLEANUP;
RETURN_ERROR:
OBJ_CONSTRUCT(&error_answer, orte_buffer_t);
orte_dss.pack(&error_answer, (void*)&command, 1, ORTE_NS_CMD);
orte_dss.pack(&error_answer, (void*)&rc, 1, ORTE_INT32);
orte_rml.send_buffer(sender, &error_answer, tag, 0);
OBJ_DESTRUCT(&error_answer);
CLEANUP:
/* cleanup */
OBJ_DESTRUCT(&answer);
}

120
orte/mca/ns/replica/ns_replica_support_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,120 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ns_replica.h"
/**
 * Depth-first search of the job tree rooted at @p ptr for jobid @p job.
 *
 * @param ptr         (IN)  record at which to start the search.
 * @param parent_ptr  (OUT) written with the record whose children are being
 *                    examined as the search descends. It is NOT written when
 *                    @p ptr itself is the match, so callers that need the
 *                    parent must initialize it themselves.
 * @param job         (IN)  jobid to look for.
 *
 * @return the matching record, or NULL if @p job is not in this subtree.
 */
orte_ns_replica_jobitem_t *down_search(orte_ns_replica_jobitem_t *ptr,
                                       orte_ns_replica_jobitem_t **parent_ptr,
                                       orte_jobid_t job)
{
    opal_list_item_t *cursor;
    orte_ns_replica_jobitem_t *match;

    /* is the starting record the one we want? */
    if (job == ptr->jobid) {
        return ptr;
    }

    /* no - recurse into each child in turn, recording ourselves as the
     * candidate parent before every descent
     */
    cursor = opal_list_get_first(&ptr->children);
    while (cursor != opal_list_get_end(&ptr->children)) {
        *parent_ptr = ptr;
        match = down_search((orte_ns_replica_jobitem_t*)cursor, parent_ptr, job);
        if (NULL != match) {
            return match;
        }
        cursor = opal_list_get_next(cursor);
    }

    return NULL;
}
/* Locate the record for a jobid anywhere in the job tree.
 * Every job family on the top-level list is searched in turn. Returns the
 * record, or NULL (without logging an error) when the jobid is not tracked.
 */
orte_ns_replica_jobitem_t* orte_ns_replica_find_job(orte_jobid_t job)
{
    opal_list_item_t *cursor;
    orte_ns_replica_jobitem_t *found, *ignored_parent;

    cursor = opal_list_get_first(&orte_ns_replica.jobs);
    while (cursor != opal_list_get_end(&orte_ns_replica.jobs)) {
        found = down_search((orte_ns_replica_jobitem_t*)cursor, &ignored_parent, job);
        if (NULL != found) {
            return found;
        }
        cursor = opal_list_get_next(cursor);
    }

    /* not tracked - that is not an error at this level */
    return NULL;
}
/* Given a jobid, locate the record of the root job of its family.
 * Every job family on the top-level list is searched in turn; the first
 * family that contains the jobid supplies the result. Returns NULL (without
 * logging an error) when the jobid is not tracked.
 */
orte_ns_replica_jobitem_t* orte_ns_replica_find_root_job(orte_jobid_t job)
{
    opal_list_item_t *cursor;
    orte_ns_replica_jobitem_t *family_root, *ignored_parent;

    cursor = opal_list_get_first(&orte_ns_replica.jobs);
    while (cursor != opal_list_get_end(&orte_ns_replica.jobs)) {
        family_root = (orte_ns_replica_jobitem_t*)cursor;
        if (NULL != down_search(family_root, &ignored_parent, job)) {
            return family_root;
        }
        cursor = opal_list_get_next(cursor);
    }

    /* not tracked - that is not an error at this level */
    return NULL;
}
/* Given a job's record, append a flattened, depth-first list of all the jobs
 * BELOW it to "tree". The starting record itself is NOT added - callers that
 * want it on the list must append it themselves before calling.
 *
 * Each entry appended is a freshly allocated copy carrying only the jobid and
 * next_vpid of the original; the caller owns these copies and must release
 * them. If an allocation fails, the error is logged and the list is left with
 * whatever entries could be added.
 */
void orte_ns_replica_construct_flattened_tree(opal_list_t *tree, orte_ns_replica_jobitem_t *ptr)
{
    orte_ns_replica_jobitem_t *job, *newjob;
    opal_list_item_t *item;

    for (item = opal_list_get_first(&ptr->children);
         item != opal_list_get_end(&ptr->children);
         item = opal_list_get_next(item)) {
        job = (orte_ns_replica_jobitem_t*)item;

        newjob = OBJ_NEW(orte_ns_replica_jobitem_t);
        if (NULL == newjob) {
            /* no way to report failure to the caller - log it and stop
             * rather than dereference a NULL pointer
             */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return;
        }
        newjob->jobid = job->jobid;
        newjob->next_vpid = job->next_vpid;
        opal_list_append(tree, &newjob->super);

        orte_ns_replica_construct_flattened_tree(tree, job);  /* get anyone below this one */
    }
}

Просмотреть файл

@ -1,24 +0,0 @@
# -*- makefile -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources += \
src/ns_replica.h \
src/ns_replica.c \
src/ns_replica_component.c

Просмотреть файл

@ -1,702 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI Name Server
*
* The Open MPI Name Server provides unique name ranges for processes
* within the universe. Each universe will have one name server
* running within the seed daemon. This is done to prevent the
* inadvertent duplication of names.
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/threads/mutex.h"
#include "orte/util/proc_info.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_replica.h"
/*
 * Component descriptor for the "replica" name-service component.
 *
 * This is a positional initializer: the first nested brace group carries the
 * version macro, the component name and version numbers, and the open/close
 * entry points; the second carries the checkpoint/restart flag; the last two
 * entries are the init and finalize entry points defined in this file.
 */
mca_ns_base_component_t mca_ns_replica_component = {
{
MCA_NS_BASE_VERSION_1_0_0,
"replica", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
orte_ns_replica_open, /* module open */
orte_ns_replica_close /* module close */
},
{
false /* checkpoint / restart */
},
orte_ns_replica_init, /* module init */
orte_ns_replica_finalize /* module shutdown */
};
/*
 * Function table returned by orte_ns_replica_init() when this component is
 * selected.
 *
 * This is a positional initializer, so the entries must stay in the order
 * in which mca_ns_base_module_t declares its fields. Some slots are filled
 * with orte_ns_replica_* functions and the others with orte_ns_base_*
 * functions; the group comments below mark which part of the API each run
 * of entries belongs to.
 */
static mca_ns_base_module_t orte_ns_replica_module = {
/* init */
orte_ns_replica_module_init,
/* cell functions */
orte_ns_replica_create_cellid,
orte_ns_base_get_cellid,
orte_ns_replica_get_cell_info,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_replica_create_jobid,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_replica_reserve_range,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_replica_create_my_name,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_compare,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_replica_get_job_peers,
/* tag server functions */
orte_ns_replica_assign_rml_tag,
/* data type functions */
orte_ns_replica_define_data_type,
/* diagnostic functions */
orte_ns_replica_dump_cells,
orte_ns_replica_dump_jobs,
orte_ns_replica_dump_tags,
orte_ns_replica_dump_datatypes
};
/*
 * Set to true by orte_ns_replica_init() once the tracking arrays and mutex
 * have been set up and the module is being returned; checked by
 * orte_ns_replica_finalize() before it releases that storage, and reset to
 * false there afterwards.
 */
static bool initialized = false;
/* Constructor: a fresh cell tracker has cell 0 and no site/resource strings. */
static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    cell_tracker->resource = NULL;
    cell_tracker->site = NULL;
    cell_tracker->cell = 0;
}

/*
 * Destructor: give back the site and resource strings. free(NULL) does
 * nothing, so fields that were never set need no special handling.
 */
static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    free(cell_tracker->site);
    free(cell_tracker->resource);
}

/* Class instance: derives from opal_object_t and uses the two functions above. */
OBJ_CLASS_INSTANCE(orte_ns_replica_cell_tracker_t,
                   opal_object_t,
                   orte_ns_replica_cell_tracker_construct,
                   orte_ns_replica_cell_tracker_destructor);
/*
 * Constructor: a fresh jobid tracker carries ORTE_JOBID_MAX as its jobid
 * and starts handing out vpids from 0.
 */
static void orte_ns_replica_jobid_tracker_construct(orte_ns_replica_jobid_tracker_t* jobid_tracker)
{
    jobid_tracker->next_vpid = 0;
    jobid_tracker->jobid = ORTE_JOBID_MAX;
}

/* Destructor: intentionally empty - the tracker owns no separately allocated storage. */
static void orte_ns_replica_jobid_tracker_destructor(orte_ns_replica_jobid_tracker_t* jobid_tracker)
{
}

/* Class instance: derives from opal_object_t and uses the two functions above. */
OBJ_CLASS_INSTANCE(orte_ns_replica_jobid_tracker_t,
                   opal_object_t,
                   orte_ns_replica_jobid_tracker_construct,
                   orte_ns_replica_jobid_tracker_destructor);
/* Constructor: a fresh tag item carries ORTE_RML_TAG_MAX as its tag and has no name. */
static void orte_ns_replica_tagitem_construct(orte_ns_replica_tagitem_t* tagitem)
{
    tagitem->name = NULL;
    tagitem->tag = ORTE_RML_TAG_MAX;
}

/* Destructor: give back the name string; free(NULL) does nothing if none was set. */
static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagitem)
{
    free(tagitem->name);
}

/* Class instance: derives from opal_object_t and uses the two functions above. */
OBJ_CLASS_INSTANCE(orte_ns_replica_tagitem_t,
                   opal_object_t,
                   orte_ns_replica_tagitem_construct,
                   orte_ns_replica_tagitem_destructor);
/* Constructor: a fresh data-type item carries ORTE_DSS_ID_MAX as its id and has no name. */
static void orte_ns_replica_dti_construct(orte_ns_replica_dti_t* dti)
{
    dti->name = NULL;
    dti->id = ORTE_DSS_ID_MAX;
}

/* Destructor: give back the name string; free(NULL) does nothing if none was set. */
static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti)
{
    free(dti->name);
}

/* Class instance: derives from opal_object_t and uses the two functions above. */
OBJ_CLASS_INSTANCE(orte_ns_replica_dti_t,
                   opal_object_t,
                   orte_ns_replica_dti_construct,
                   orte_ns_replica_dti_destructor);
/*
 * Global state of the replica component. The functions in this file store
 * the MCA parameter values (debug, isolate, max_size, block_size), the
 * cell/jobid/tag/data-type pointer arrays with their counters, and the
 * mutex in this structure.
 */
orte_ns_replica_globals_t orte_ns_replica;
/*
 * Component open: register the "ns"/"replica" integer MCA parameters and
 * copy their looked-up values into the orte_ns_replica globals.
 *
 *   debug     -> orte_ns_replica.debug       (default: false)
 *   isolate   -> orte_ns_replica.isolate     (default: false; any non-zero value means true)
 *   maxsize   -> orte_ns_replica.max_size    (default: ORTE_NS_ARRAY_MAX_SIZE)
 *   blocksize -> orte_ns_replica.block_size  (default: ORTE_NS_ARRAY_BLOCK_SIZE)
 *
 * Always returns ORTE_SUCCESS; the results of the register/lookup calls are
 * not examined.
 */
int orte_ns_replica_open(void)
{
    int index, value;

    /* debug flag is stored directly */
    index = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false);
    mca_base_param_lookup_int(index, &orte_ns_replica.debug);

    /* isolate is collapsed to a strict true/false */
    index = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false);
    mca_base_param_lookup_int(index, &value);
    orte_ns_replica.isolate = value ? true : false;

    /* upper bound used when the tracking arrays are created */
    index = mca_base_param_register_int("ns", "replica", "maxsize", NULL,
                                        ORTE_NS_ARRAY_MAX_SIZE);
    mca_base_param_lookup_int(index, &value);
    orte_ns_replica.max_size = (size_t)value;

    /* initial size and growth increment of the tracking arrays */
    index = mca_base_param_register_int("ns", "replica", "blocksize", NULL,
                                        ORTE_NS_ARRAY_BLOCK_SIZE);
    mca_base_param_lookup_int(index, &value);
    orte_ns_replica.block_size = (size_t)value;

    return ORTE_SUCCESS;
}
/*
 * Component close: there is nothing to undo here, so this only reports
 * success. It is the entry installed in the "module close" slot of
 * mca_ns_replica_component.
 */
int orte_ns_replica_close(void)
{
return ORTE_SUCCESS;
}
mca_ns_base_module_t* orte_ns_replica_init(int *priority)
{
int rc;
/* If we are to host a replica, then we want to be selected, so do all the
setup and return the module */
if (NULL == orte_process_info.ns_replica_uri) {
/* Return a module (choose an arbitrary, positive priority --
it's only relevant compared to other ns components). If
we're not the seed, then we don't want to be selected, so
return NULL. */
*priority = 50;
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
(orte_std_cntr_t)orte_ns_replica.block_size,
(orte_std_cntr_t)orte_ns_replica.max_size,
(orte_std_cntr_t)orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_cells = 0;
/* initialize the job id tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.jobids),
(orte_std_cntr_t)orte_ns_replica.block_size,
(orte_std_cntr_t)orte_ns_replica.max_size,
(orte_std_cntr_t)orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_jobids = 0;
/* initialize the taglist */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags),
(orte_std_cntr_t)orte_ns_replica.block_size,
(orte_std_cntr_t)orte_ns_replica.max_size,
(orte_std_cntr_t)orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_tags = 0;
/* initialize the dtlist */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts),
(orte_std_cntr_t)orte_ns_replica.block_size,
(orte_std_cntr_t)orte_ns_replica.max_size,
(orte_std_cntr_t)orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_dts = 0;
/* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t);
/* Return the module */
initialized = true;
return &orte_ns_replica_module;
} else {
return NULL;
}
}
/*
 * Module init: unless the component runs isolated, post a persistent
 * non-blocking receive on ORTE_RML_TAG_NS from any sender, with
 * orte_ns_replica_recv as the callback.
 *
 * Returns ORTE_SUCCESS, or the negative code from recv_buffer_nb (after
 * logging it) when posting the receive fails.
 */
int orte_ns_replica_module_init(void)
{
    int rc;

    if (!orte_ns_replica.isolate) {
        rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY, ORTE_RML_TAG_NS,
                                     ORTE_RML_PERSISTENT, orte_ns_replica_recv, NULL);
        if (rc < 0) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Component finalize.
 *
 * If the component was initialized, every non-NULL entry of the cell,
 * jobid, tag and data-type arrays is released, followed by the array
 * itself, and the initialized flag is cleared. Afterwards, unless the
 * component runs isolated, the receive posted on ORTE_RML_TAG_NS is
 * cancelled. Always returns ORTE_SUCCESS.
 *
 * NOTE(review): the cancel is issued whenever isolate is false, even when
 * the component was never initialized.
 */
int orte_ns_replica_finalize(void)
{
    orte_ns_replica_cell_tracker_t **cells;
    orte_ns_replica_jobid_tracker_t **jobs;
    orte_ns_replica_tagitem_t **tags;
    orte_ns_replica_dti_t **types;
    orte_std_cntr_t idx;

    if (initialized) {
        /* cell trackers */
        cells = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
        for (idx = 0; idx < (orte_ns_replica.cells)->size; idx++) {
            if (NULL == cells[idx]) {
                continue;
            }
            OBJ_RELEASE(cells[idx]);
        }
        OBJ_RELEASE(orte_ns_replica.cells);

        /* jobid trackers */
        jobs = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
        for (idx = 0; idx < (orte_ns_replica.jobids)->size; idx++) {
            if (NULL == jobs[idx]) {
                continue;
            }
            OBJ_RELEASE(jobs[idx]);
        }
        OBJ_RELEASE(orte_ns_replica.jobids);

        /* tag items */
        tags = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
        for (idx = 0; idx < (orte_ns_replica.tags)->size; idx++) {
            if (NULL == tags[idx]) {
                continue;
            }
            OBJ_RELEASE(tags[idx]);
        }
        OBJ_RELEASE(orte_ns_replica.tags);

        /* data type items */
        types = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
        for (idx = 0; idx < (orte_ns_replica.dts)->size; idx++) {
            if (NULL == types[idx]) {
                continue;
            }
            OBJ_RELEASE(types[idx]);
        }
        OBJ_RELEASE(orte_ns_replica.dts);

        initialized = false;
    }

    /* an isolated replica never posted a receive, so there is nothing to cancel */
    if (!orte_ns_replica.isolate) {
        orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_NS);
    }

    return ORTE_SUCCESS;
}
/*
 * Handle a name-service request message from a proxy.
 *
 * This is the callback posted by orte_ns_replica_module_init() on
 * ORTE_RML_TAG_NS. It unpacks the command flag from "buffer", performs the
 * matching replica operation, packs the result into a reply buffer and, for
 * most commands, sends that reply back to "sender" on "tag". On any failure
 * a separate error reply holding the command flag and the error code is
 * sent instead.
 *
 * @param status  Not examined by this function.
 * @param sender  Process the request came from; replies are sent to it.
 * @param buffer  Incoming request. It is released by the calling program -
 *                DO NOT RELEASE THIS BUFFER IN THIS CODE.
 * @param tag     Tag the request arrived on; replies reuse it.
 * @param cbdata  Not examined by this function.
 *
 * NOTE(review): strings obtained from orte_dss.unpack (site, resource,
 * tagname) and the procs array from orte_ns_replica_get_job_peers are never
 * freed here - confirm whether the callees take ownership.
 * NOTE(review): if the very first unpack fails, "command" is packed into
 * the error reply without ever having been set.
 */
void orte_ns_replica_recv(int status, orte_process_name_t* sender,
                          orte_buffer_t* buffer, orte_rml_tag_t tag,
                          void* cbdata)
{
    orte_buffer_t answer, error_answer;
    orte_ns_cmd_flag_t command;
    orte_cellid_t cell;
    orte_jobid_t job;
    orte_vpid_t startvpid, range;
    char *tagname, *site, *resource;
    orte_rml_tag_t oob_tag;
    orte_data_type_t type;
    orte_std_cntr_t count, nprocs;
    orte_process_name_t *procs;
    int rc=ORTE_SUCCESS, ret;

    /* Construct the reply buffer before anything can fail: every exit path,
     * including the error path, runs OBJ_DESTRUCT(&answer) at CLEANUP, so
     * the object must already be constructed when the first unpack bails out.
     */
    OBJ_CONSTRUCT(&answer, orte_buffer_t);

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        rc = ORTE_ERR_BAD_PARAM;
        goto RETURN_ERROR;
    }

    switch (command) {
        case ORTE_NS_CREATE_CELLID_CMD:
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) {
                ORTE_ERROR_LOG(rc);
                rc = ORTE_ERR_BAD_PARAM;
                goto RETURN_ERROR;
            }
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) {
                ORTE_ERROR_LOG(rc);
                rc = ORTE_ERR_BAD_PARAM;
                goto RETURN_ERROR;
            }
            /* the result code of the create is kept in rc; pack errors use ret */
            rc = orte_ns_replica_create_cellid(&cell, site, resource);
            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) {
                ORTE_ERROR_LOG(ret);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_GET_CELL_INFO_CMD:
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &command, 1, ORTE_NS_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
                ORTE_ERROR_LOG(rc);
                rc = ORTE_ERR_BAD_PARAM;
                goto RETURN_ERROR;
            }
            site = NULL;
            resource = NULL;
            /* the lookup's result code is shipped back as part of the reply */
            rc = orte_ns_replica_get_cell_info(cell, &site, &resource);
            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) {
                ORTE_ERROR_LOG(ret);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) {
                ORTE_ERROR_LOG(ret);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &rc, 1, ORTE_INT))) {
                ORTE_ERROR_LOG(ret);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_CREATE_JOBID_CMD:
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_ns_replica_create_jobid(&job))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_RESERVE_RANGE_CMD:
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&job, &count, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, (void*)&range, &count, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_ns_replica_reserve_range(job, range, &startvpid))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&startvpid, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > (rc = orte_rml.send_buffer(sender, &answer, tag, 0))) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_ASSIGN_OOB_TAG_CMD:
            count = 1;
            if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
                rc = ORTE_ERR_UNPACK_FAILURE;
                goto RETURN_ERROR;
            }
            /* a name starting with the literal text "NULL" requests an unnamed tag */
            if (0 == strncmp(tagname, "NULL", 4)) {
                if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, NULL))) {
                    goto RETURN_ERROR;
                }
            } else {
                if (ORTE_SUCCESS != (rc = orte_ns_replica_assign_rml_tag(&oob_tag, tagname))) {
                    goto RETURN_ERROR;
                }
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) {
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&oob_tag, 1, ORTE_RML_TAG))) {
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_DEFINE_DATA_TYPE_CMD:
            count = 1;
            if (0 > orte_dss.unpack(buffer, &tagname, &count, ORTE_STRING)) {
                rc = ORTE_ERR_UNPACK_FAILURE;
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_ns_replica_define_data_type(tagname, &type))) {
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&command, 1, ORTE_NS_CMD))) {
                goto RETURN_ERROR;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, (void*)&type, 1, ORTE_DATA_TYPE))) {
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_CREATE_MY_NAME_CMD:
            /* ignore this command */
            break;

        case ORTE_NS_GET_JOB_PEERS_CMD:
            /* unpack the jobid */
            count = 1;
            if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            /* process the request */
            if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_peers(&procs, &nprocs, job))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            /* pack the answer */
            if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (nprocs > 0) {
                /* NOTE(review): this passes the address of the pointer rather
                 * than the array it points to - confirm what orte_dss.pack
                 * expects for ORTE_NAME. */
                if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &procs, nprocs, ORTE_NAME))) {
                    ORTE_ERROR_LOG(rc);
                    goto RETURN_ERROR;
                }
            }
            /* NOTE(review): unlike the other commands, the packed answer is
             * never sent from this branch - confirm whether a send_buffer
             * call is missing. */
            break;

        case ORTE_NS_DUMP_CELLS_CMD:
            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_DUMP_JOBIDS_CMD:
            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_DUMP_TAGS_CMD:
            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        case ORTE_NS_DUMP_DATATYPES_CMD:
            if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) {
                ORTE_ERROR_LOG(rc);
                goto RETURN_ERROR;
            }
            if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                goto RETURN_ERROR;
            }
            break;

        default:
            /* unknown command: make sure the error reply does not carry a
             * success code */
            rc = ORTE_ERR_BAD_PARAM;
            goto RETURN_ERROR;
    }
    goto CLEANUP;

RETURN_ERROR:
    /* best-effort error reply: command flag followed by the error code */
    OBJ_CONSTRUCT(&error_answer, orte_buffer_t);
    orte_dss.pack(&error_answer, (void*)&command, 1, ORTE_NS_CMD);
    orte_dss.pack(&error_answer, (void*)&rc, 1, ORTE_INT32);
    orte_rml.send_buffer(sender, &error_answer, tag, 0);
    OBJ_DESTRUCT(&error_answer);

CLEANUP:
    /* cleanup */
    OBJ_DESTRUCT(&answer);
}

Просмотреть файл

@ -119,7 +119,7 @@ static char *
return NULL;
}
rc = orte_ns_base_convert_jobid_to_string(&job, jobid);
rc = orte_ns.convert_jobid_to_string(&job, jobid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return NULL;
@ -634,7 +634,7 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data, char **base_env
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
rc = mca_oob_send_packed_nb(ORTE_RML_NAME_SEED, ack, ORTE_RML_TAG_BPROC, 0,
rc = mca_oob_send_packed_nb(ORTE_PROC_MY_HNP, ack, ORTE_RML_TAG_BPROC, 0,
odls_bproc_send_cb, NULL);
if (0 > rc) {
ORTE_ERROR_LOG(rc);
@ -666,7 +666,7 @@ int orte_odls_bproc_kill_local_procs(orte_jobid_t job, bool set_state)
* @param signal The signal to send
* @retval ORTE_SUCCESS
*/
int orte_odls_bproc_signal_local_procs(orte_process_name_t* proc, int32_t signal)
int orte_odls_bproc_signal_local_procs(const orte_process_name_t* proc, int32_t signal)
{
orte_iof.iof_flush();
return ORTE_SUCCESS;

Просмотреть файл

@ -372,6 +372,8 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata)
struct stat buf;
int rc;
opal_output(orte_odls_globals.output, "odls: child process terminated");
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
* by other threads. This will also be used to protect us
@ -384,6 +386,8 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata)
item != opal_list_get_end(&orte_odls_default.children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
opal_output(orte_odls_globals.output, "odls: checking child [%ld,%ld,%ld] alive %s",
ORTE_NAME_ARGS(child->name), (child->alive ? "true" : "dead"));
if (child->alive && pid == child->pid) { /* found it */
goto GOTCHILD;
}
@ -398,8 +402,14 @@ static void odls_default_wait_local_proc(pid_t pid, int status, void* cbdata)
return;
GOTCHILD:
opal_output(orte_odls_globals.output, "odls: flushing output for [%ld,%ld,%ld]",
ORTE_NAME_ARGS(child->name));
orte_iof.iof_flush();
opal_output(orte_odls_globals.output, "odls: output for [%ld,%ld,%ld] flushed",
ORTE_NAME_ARGS(child->name));
/* determine the state of this process */
aborted = false;
if(WIFEXITED(status)) {
@ -426,6 +436,9 @@ GOTCHILD:
job, vpid, "abort", NULL );
free(job);
free(vpid);
opal_output(orte_odls_globals.output, "odls: stat'ing file %s for [%ld,%ld,%ld]",
abort_file, ORTE_NAME_ARGS(child->name));
if (0 == stat(abort_file, &buf)) {
/* the abort file must exist - there is nothing in it we need. Its
* mere existence indicates that an abnormal termination occurred
@ -435,14 +448,14 @@ GOTCHILD:
aborted = true;
free(abort_file);
} else {
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died naturally",
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
ORTE_NAME_ARGS(child->name));
}
} else {
/* the process was terminated with a signal! That's definitely
* abnormal, so indicate that condition
*/
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by signal",
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
ORTE_NAME_ARGS(child->name));
aborted = true;
}

Просмотреть файл

@ -48,13 +48,6 @@ extern "C" {
*/
ORTE_DECLSPEC extern bool orte_oob_base_timing;
/*
* Well known address
*/
ORTE_DECLSPEC extern orte_process_name_t mca_oob_name_any;
ORTE_DECLSPEC extern orte_process_name_t mca_oob_name_seed;
/*
* OOB API
*/
@ -187,7 +180,7 @@ ORTE_DECLSPEC int mca_oob_send_packed(
/**
* Similar to unix readv(2)
*
* @param peer (IN/OUT) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive. In the
* @param peer (IN/OUT) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive. In the
* case of a wildcard receive, will be modified to return the matched peer name.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
@ -223,7 +216,7 @@ ORTE_DECLSPEC int mca_oob_recv(
/**
* Similar to unix read(2)
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param buf (OUT) Array of iovecs describing user buffers and lengths.
* @param tag (IN/OUT) User defined tag for matching send/recv.
* @return OMPI error code (<0) on error or number of bytes actually received.
@ -338,7 +331,7 @@ ORTE_DECLSPEC int mca_oob_send_packed_nb(
/**
* Non-blocking version of mca_oob_recv().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.
@ -363,7 +356,7 @@ ORTE_DECLSPEC int mca_oob_recv_nb(
/**
* Routine to cancel pending non-blocking recvs.
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param tag (IN) User defined tag for matching send/recv.
* @return OMPI error code (<0) on error or number of bytes actually received.
*/
@ -375,7 +368,7 @@ ORTE_DECLSPEC int mca_oob_recv_cancel(
/**
* Non-blocking version of mca_oob_recv_packed().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param buffer (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.

Просмотреть файл

@ -48,9 +48,6 @@ OBJ_CLASS_INSTANCE(
NULL
);
orte_process_name_t mca_oob_name_seed = { 0, 0, 0 };
orte_process_name_t mca_oob_name_any = { ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_MAX };
/**
* Parse contact info string into process name and list of uri strings.
*/

Просмотреть файл

@ -34,7 +34,7 @@
/*
* Similar to unix recv(2)
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param types (IN) Parallel array to iovecs describing data type of each iovec element.
* @param count (IN) Number of elements in iovec array.
@ -51,7 +51,7 @@ int mca_oob_recv(orte_process_name_t* peer, struct iovec *msg, int count, int ta
/*
* Similar to unix recv(2)
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param buffer (OUT) Buffer that the OOB creates to recv this message...
* @param tag (IN) User defined tag for matching send/recv.
* iovec array without removing the message from the queue.

Просмотреть файл

@ -53,7 +53,7 @@ static void mca_oob_recv_callback(
/*
* Non-blocking version of mca_oob_recv_nb().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param flags (IN) May be MCA_OOB_PEEK to return up to size bytes of msg w/out removing it from the queue,
@ -71,7 +71,7 @@ int mca_oob_recv_nb(orte_process_name_t* peer, struct iovec* msg, int count, int
/*
* Cancel non-blocking recv.
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param tag (IN) User defined tag for message matching.
* @return OMPI success or error code (<0) on error.
*/
@ -84,7 +84,7 @@ int mca_oob_recv_cancel(orte_process_name_t* peer, int tag)
/**
* Non-blocking version of mca_oob_recv_packed().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param buffer (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.

Просмотреть файл

@ -149,7 +149,7 @@ int mca_oob_xcast(
orte_gpr_notify_message_t *msg;
OBJ_CONSTRUCT(&rbuf, orte_buffer_t);
rc = mca_oob_recv_packed(MCA_OOB_NAME_ANY, &rbuf, tag);
rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag);
if(rc < 0) {
OBJ_DESTRUCT(&rbuf);
return rc;

Просмотреть файл

@ -110,7 +110,7 @@ typedef int (*mca_oob_base_module_send_fn_t)(
/**
* Implementation of mca_oob_recv().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param types (IN) Parallel array to iovecs describing data type of each iovec element.
* @param count (IN) Number of elements in iovec array.
@ -153,7 +153,7 @@ typedef int (*mca_oob_base_module_send_nb_fn_t)(
/**
* Implementation of mca_oob_recv_nb().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.
@ -175,7 +175,7 @@ typedef int (*mca_oob_base_module_recv_nb_fn_t)(
/**
* Implementation of mca_oob_recv_cancel().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param tag (IN) User defined tag for matching send/recv.
* @return OMPI error code (<0) on error or number of bytes actually received.
*/

Просмотреть файл

@ -25,24 +25,5 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <limits.h>
#include "orte/mca/rml/rml_types.h"
/*
* Other constants
*/
/**
* The wildcard for receives from any peer.
*/
#define MCA_OOB_NAME_ANY &mca_oob_name_any
/**
* Process name of self
*/
#define MCA_OOB_NAME_SELF orte_process_info.my_name
/**
* Process name of seed
*/
#define MCA_OOB_NAME_SEED &mca_oob_name_seed
#endif /* MCA_OOB_TYPES_H */

Просмотреть файл

@ -690,20 +690,18 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
}
}
/* check for wildcard name - if this is true - we allocate a name from the name server
/* check for invalid name - if this is true - we allocate a name from the name server
* and return to the peer
*/
cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, &hdr->msg_src, MCA_OOB_NAME_ANY);
if (cmpval == 0) {
if (ORTE_SUCCESS != orte_ns.create_jobid(&hdr->msg_src.jobid)) {
cmpval = orte_ns.compare_fields(ORTE_NS_CMP_ALL, &hdr->msg_src, ORTE_NAME_INVALID);
if (cmpval == ORTE_EQUAL) {
if (ORTE_SUCCESS != orte_ns.create_jobid(&hdr->msg_src.jobid, NULL)) {
return;
}
if (ORTE_SUCCESS != orte_ns.reserve_range(hdr->msg_src.jobid, 1, &hdr->msg_src.vpid)) {
return;
}
if (ORTE_SUCCESS != orte_ns.assign_cellid_to_process(&hdr->msg_src)) {
return;
}
hdr->msg_src.cellid = ORTE_PROC_MY_NAME->cellid;
}
/* lookup the corresponding process */
@ -1049,12 +1047,8 @@ int mca_oob_tcp_init(void)
#endif
/* get my jobid */
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid,
orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
jobid = ORTE_PROC_MY_NAME->jobid;
/* create a listen socket */
if (OOB_TCP_EVENT == mca_oob_tcp_component.tcp_listen_type) {
if(mca_oob_tcp_create_listen() != ORTE_SUCCESS) {
@ -1286,12 +1280,17 @@ int mca_oob_tcp_fini(void)
* Note that the definition of < or > is somewhat arbitrary -
* just needs to be consistently applied to maintain an ordering
* when process names are used as indices.
*
* Currently, this function is ONLY used in one place - in oob_tcp_send.c to
* determine if the recipient of the message-to-be-sent is ourselves. Hence,
* this comparison is okay to be LITERAL and can/should use the ns.compare_fields
* function
*/
int mca_oob_tcp_process_name_compare(const orte_process_name_t* n1, const orte_process_name_t* n2)
{
return orte_ns.compare(ORTE_NS_CMP_ALL, n1, n2);
return orte_ns.compare_fields(ORTE_NS_CMP_ALL, n1, n2);
}

Просмотреть файл

@ -119,7 +119,7 @@ int mca_oob_tcp_send(
/**
* Similar to unix readv(2)
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.
@ -166,7 +166,7 @@ int mca_oob_tcp_send_nb(
/**
* Non-blocking version of mca_oob_recv().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User defined tag for matching send/recv.
@ -188,7 +188,7 @@ int mca_oob_tcp_recv_nb(
/**
* Cancel non-blocking receive.
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param tag (IN) User defined tag for matching send/recv.
* @return OMPI error code (<0) on error or number of bytes actually received.
*/

Просмотреть файл

@ -32,6 +32,11 @@
#include <string.h>
#include "orte/orte_constants.h"
#include "opal/util/if.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/util/proc_info.h"
#include "orte/dss/dss.h"
#include "oob_tcp.h"
#include "oob_tcp_addr.h"
@ -65,7 +70,7 @@ int mca_oob_tcp_addr_pack(orte_buffer_t* buffer)
int i;
int rc;
rc = orte_dss.pack(buffer, orte_process_info.my_name, 1, ORTE_NAME);
rc = orte_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME);
if(rc != ORTE_SUCCESS)
return rc;

Просмотреть файл

@ -404,7 +404,9 @@ void mca_oob_tcp_msg_recv_complete(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* p
}
/**
* Process an ident message.
* Process an ident message. In this case, we insist that the two process names
* exactly match - hence, we use the orte_ns.compare_fields function, which
* checks each field in a literal manner (i.e., no wildcards).
*/
static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* peer)
@ -412,7 +414,7 @@ static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pe
orte_process_name_t src = msg->msg_hdr.msg_src;
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
if (orte_ns.compare(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != 0) {
if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != ORTE_EQUAL) {
orte_hash_table_remove_proc(&mca_oob_tcp_component.tcp_peers, &peer->peer_name);
peer->peer_name = src;
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peers, &peer->peer_name, peer);
@ -558,9 +560,7 @@ mca_oob_tcp_msg_t* mca_oob_tcp_msg_match_recv(orte_process_name_t* name, int tag
msg != (mca_oob_tcp_msg_t*) opal_list_get_end(&mca_oob_tcp_component.tcp_msg_recv);
msg = (mca_oob_tcp_msg_t*) opal_list_get_next(msg)) {
int cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, name, MCA_OOB_NAME_ANY);
int cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, name, &msg->msg_peer);
if((0 == cmpval1) || (0 == cmpval2)) {
if(ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) {
if (tag == msg->msg_hdr.msg_tag) {
return msg;
}
@ -585,10 +585,7 @@ mca_oob_tcp_msg_t* mca_oob_tcp_msg_match_post(orte_process_name_t* name, int tag
msg != (mca_oob_tcp_msg_t*) opal_list_get_end(&mca_oob_tcp_component.tcp_msg_post);
msg = (mca_oob_tcp_msg_t*) opal_list_get_next(msg)) {
int cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, &msg->msg_peer, MCA_OOB_NAME_ANY);
int cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, name, &msg->msg_peer);
if((0 == cmpval1) || (0 == cmpval2)) {
if(ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) {
if (msg->msg_hdr.msg_tag == tag) {
if((msg->msg_flags & MCA_OOB_PERSISTENT) == 0) {
opal_list_remove_item(&mca_oob_tcp_component.tcp_msg_post, &msg->super.super);

Просмотреть файл

@ -503,7 +503,7 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
}
/* if we lose the connection to the seed - abort */
if(memcmp(&peer->peer_name,&mca_oob_name_seed,sizeof(mca_oob_name_seed)) == 0) {
if(memcmp(&peer->peer_name,ORTE_PROC_MY_HNP,sizeof(orte_process_name_t)) == 0) {
/* If we are not already inside orte_finalize, then call abort */
if (ORTE_UNIVERSE_STATE_FINALIZE > orte_universe_info.state) {
/* Should free the peer lock before we abort so we don't
@ -554,7 +554,7 @@ static int mca_oob_tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer)
mca_oob_tcp_hdr_t hdr;
memset(&hdr,0,sizeof(hdr));
if (NULL == orte_process_info.my_name) { /* my name isn't defined yet */
hdr.msg_src = *MCA_OOB_NAME_ANY;
hdr.msg_src = *ORTE_NAME_INVALID;
} else {
hdr.msg_src = *(orte_process_info.my_name);
}
@ -597,11 +597,13 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer)
return ORTE_ERR_UNREACH;
}
/* if we have a wildcard name - use the name returned by the peer */
/* if we have an invalid name or do not have one assigned at all - use the name returned by the peer.
* This needs to be a LITERAL comparison - we do NOT want wildcard values to return EQUAL
*/
if(orte_process_info.my_name == NULL) {
orte_ns.create_process_name(&orte_process_info.my_name,
hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
} else if(orte_ns.compare(ORTE_NS_CMP_ALL, orte_process_info.my_name, &mca_oob_name_any) == 0) {
} else if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, ORTE_NAME_INVALID) == ORTE_EQUAL) {
*orte_process_info.my_name = hdr.msg_dst;
}
@ -876,18 +878,24 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
/*
* Accept incoming connection - if not already connected.
* Accept incoming connection - if not already connected. We compare the name of the
* peer to our own name using the ns.compare_fields function as we want this to be
* a LITERAL comparison - i.e., there is no occasion when the peer's name should
* be a wildcard value.
*
* To avoid competing reciprocal connection attempts, we only accept connections from
* processes whose names are "greater" than our own.
*/
bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd)
{
int cmpval;
OPAL_THREAD_LOCK(&peer->peer_lock);
cmpval = orte_ns.compare(ORTE_NS_CMP_ALL, &peer->peer_name, orte_process_info.my_name);
cmpval = orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, orte_process_info.my_name);
if ((peer->peer_state == MCA_OOB_TCP_CLOSED) ||
(peer->peer_state == MCA_OOB_TCP_RESOLVE) ||
(peer->peer_state != MCA_OOB_TCP_CONNECTED &&
cmpval < 0)) {
cmpval == ORTE_VALUE1_GREATER)) {
if(peer->peer_state != MCA_OOB_TCP_CLOSED) {
mca_oob_tcp_peer_close(peer);

Просмотреть файл

@ -47,7 +47,10 @@
#include <signal.h>
#endif
#include "opal/event/event.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/util/proc_info.h"
#include "orte/mca/oob/tcp/oob_tcp.h"
/*
@ -148,7 +151,7 @@ int mca_oob_tcp_ping(
if(orte_process_info.my_name != NULL) {
hdr.msg_src = *orte_process_info.my_name;
} else {
hdr.msg_src = mca_oob_name_any;
hdr.msg_src = *ORTE_NAME_INVALID;
}
hdr.msg_dst = *name;
hdr.msg_type = MCA_OOB_TCP_PROBE;

Просмотреть файл

@ -24,7 +24,7 @@
/*
* Similar to unix readv(2)
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param types (IN) Parallel array to iovecs describing data type of each iovec element.
* @param count (IN) Number of elements in iovec array.
@ -114,7 +114,7 @@ int mca_oob_tcp_recv(
msg->msg_hdr.msg_type = MCA_OOB_TCP_DATA;
msg->msg_hdr.msg_src = *peer;
if (NULL == orte_process_info.my_name) {
msg->msg_hdr.msg_dst = *MCA_OOB_NAME_ANY;
msg->msg_hdr.msg_dst = *ORTE_NAME_INVALID;
} else {
msg->msg_hdr.msg_dst = *orte_process_info.my_name;
}
@ -202,7 +202,7 @@ static void mca_oob_tcp_msg_matched(mca_oob_tcp_msg_t* msg, mca_oob_tcp_msg_t* m
/*
* Non-blocking version of mca_oob_recv().
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param msg (IN) Array of iovecs describing user buffers and lengths.
* @param count (IN) Number of elements in iovec array.
* @param tag (IN) User supplied tag for matching send/recv.
@ -242,7 +242,11 @@ int mca_oob_tcp_recv_nb(
}
/* fill in the header */
msg->msg_hdr.msg_src = *orte_process_info.my_name;
if (NULL == orte_process_info.my_name) {
msg->msg_hdr.msg_src = *ORTE_NAME_INVALID;
} else {
msg->msg_hdr.msg_src = *orte_process_info.my_name;
}
msg->msg_hdr.msg_dst = *peer;
msg->msg_hdr.msg_size = size;
msg->msg_hdr.msg_tag = tag;
@ -286,7 +290,7 @@ int mca_oob_tcp_recv_nb(
/*
* Cancel non-blocking recv.
*
* @param peer (IN) Opaque name of peer process or MCA_OOB_NAME_ANY for wildcard receive.
* @param peer (IN) Opaque name of peer process or ORTE_NAME_WILDCARD for wildcard receive.
* @param tag (IN) User supplied tag for matching send/recv.
* @return OMPI error code (<0) on error or number of bytes actually received.
*/
@ -295,7 +299,7 @@ int mca_oob_tcp_recv_cancel(
orte_process_name_t* name,
int tag)
{
int matched = 0, cmpval1, cmpval2;
int matched = 0;
opal_list_item_t *item, *next;
/* wait for any previously matched messages to be processed */
@ -317,9 +321,7 @@ int mca_oob_tcp_recv_cancel(
mca_oob_tcp_msg_t* msg = (mca_oob_tcp_msg_t*)item;
next = opal_list_get_next(item);
cmpval1 = orte_ns.compare(ORTE_NS_CMP_ALL, name, MCA_OOB_NAME_ANY);
cmpval2 = orte_ns.compare(ORTE_NS_CMP_ALL, &msg->msg_peer, name);
if ((0 == cmpval1) || (0 == cmpval2)) {
if (ORTE_EQUAL == orte_dss.compare(name, &msg->msg_peer, ORTE_NAME)) {
if (msg->msg_hdr.msg_tag == tag) {
opal_list_remove_item(&mca_oob_tcp_component.tcp_msg_post, &msg->super.super);
MCA_OOB_TCP_MSG_RETURN(msg);

Просмотреть файл

@ -18,6 +18,7 @@
#include "orte_config.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/util/proc_info.h"
#include "orte/mca/oob/tcp/oob_tcp.h"
@ -119,7 +120,7 @@ int mca_oob_tcp_send(
msg->msg_hdr.msg_size = size;
msg->msg_hdr.msg_tag = tag;
if (NULL == orte_process_info.my_name) {
msg->msg_hdr.msg_src = *MCA_OOB_NAME_ANY;
msg->msg_hdr.msg_src = *ORTE_NAME_INVALID;
} else {
msg->msg_hdr.msg_src = *orte_process_info.my_name;
}
@ -144,7 +145,7 @@ int mca_oob_tcp_send(
msg->msg_peer = peer->peer_name;
if (NULL != name && NULL != orte_process_info.my_name &&
0 == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */
ORTE_EQUAL == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */
return mca_oob_tcp_send_self(peer,msg,iov,count);
}
@ -206,7 +207,11 @@ int mca_oob_tcp_send_nb(
msg->msg_hdr.msg_type = MCA_OOB_TCP_DATA;
msg->msg_hdr.msg_size = size;
msg->msg_hdr.msg_tag = tag;
msg->msg_hdr.msg_src = *orte_process_info.my_name;
if (NULL == orte_process_info.my_name) {
msg->msg_hdr.msg_src = *ORTE_NAME_INVALID;
} else {
msg->msg_hdr.msg_src = *orte_process_info.my_name;
}
msg->msg_hdr.msg_dst = *name;
/* create one additional iovec that will hold the size of the message */
@ -227,7 +232,7 @@ int mca_oob_tcp_send_nb(
msg->msg_complete = false;
msg->msg_peer = peer->peer_name;
if (0 == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */
if (ORTE_EQUAL == mca_oob_tcp_process_name_compare(name, orte_process_info.my_name)) { /* local delivery */
return mca_oob_tcp_send_self(peer,msg,iov,count);
}

Просмотреть файл

@ -28,6 +28,7 @@
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/pls/base/pls_private.h"
@ -128,10 +129,7 @@ CLEANUP:
return rc;
}
/*
* Retrieve a list of the active daemons for a job
*/
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
{
orte_gpr_value_t **values;
orte_gpr_keyval_t *kv;
@ -149,7 +147,7 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
orte_pls_daemon_info_t *dmn;
bool found_name, found_node, found_cell;
int rc;
/* setup the key */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
ORTE_ERROR_LOG(rc);
@ -172,11 +170,11 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
/* loop through the answers and construct the list */
for (i=0; i < cnt; i++) {
/* for systems such as bproc, the node segment holds containers
* for nodes that we may not have launched upon. Each container
* will send us back a value object, so we have to ensure here
* that we only create daemon objects on the list for those nodes
* that DO provide a valid object
*/
* for nodes that we may not have launched upon. Each container
* will send us back a value object, so we have to ensure here
* that we only create daemon objects on the list for those nodes
* that DO provide a valid object
*/
found_name = found_node = found_cell = false;
for (j=0; j < values[i]->cnt; j++) {
kv = values[i]->keyvals[j];
@ -204,10 +202,10 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
found_cell = true;
continue;
}
}
}
/* if we found everything, then this is a valid entry - create
* it and add it to the list
*/
* it and add it to the list
*/
if (found_name && found_node && found_cell) {
dmn = OBJ_NEW(orte_pls_daemon_info_t);
if (NULL == dmn) {
@ -230,14 +228,59 @@ int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job)
}
OBJ_RELEASE(values[i]);
}
CLEANUP:
for (i=0; i < cnt; i++) {
if (NULL != values[i]) OBJ_RELEASE(values[i]);
}
if (NULL != values) free(values);
free(keys[0]);
return rc;
}
/*
 * Retrieve a list of the active daemons for a job.
 *
 * The attribute list selects which jobs are searched:
 *   - ORTE_NS_INCLUDE_DESCENDANTS: the jobs returned by
 *     orte_ns.get_job_descendants (tested first, so it takes
 *     precedence if both attributes are present);
 *   - ORTE_NS_INCLUDE_CHILDREN: the jobs returned by
 *     orte_ns.get_job_children;
 *   - neither: only the given job.
 *
 * For each selected job, get_daemons() is called with the caller's
 * "daemons" list. Returns the status of the last call made; processing
 * stops at the first call that does not return ORTE_SUCCESS.
 */
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs)
{
    orte_jobid_t *joblist;
    orte_std_cntr_t num_jobs, n;
    bool free_joblist = false;
    int rc;

    if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS)) {
        /* caller asked for the job's descendants */
        rc = orte_ns.get_job_descendants(&joblist, &num_jobs, job);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* the name service handed us an array that we must release */
        free_joblist = true;
    } else if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN)) {
        /* caller asked only for the job's direct children */
        rc = orte_ns.get_job_children(&joblist, &num_jobs, job);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        free_joblist = true;
    } else {
        /* no family attribute: operate on the single job, using the
         * by-value parameter itself as a one-element array
         */
        joblist = &job;
        num_jobs = 1;
    }

    /* collect the daemon info for every selected job */
    for (n = 0; n < num_jobs; n++) {
        rc = get_daemons(daemons, joblist[n]);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }
    }

    /* only free what the name service allocated, never &job */
    if (free_joblist) {
        free(joblist);
    }
    return rc;
}
@ -246,19 +289,9 @@ CLEANUP:
*/
int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
{
opal_list_t daemons;
int rc;
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* We actually don't want to do this - instead, we need to do a registry
* delete function call targeting this entry
/* We need to do a registry
* delete function call targeting the entry
*/
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, info->active_job))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* find this item in the list */
return ORTE_SUCCESS;
}

Просмотреть файл

@ -51,7 +51,7 @@ int orte_pls_base_comm_start(void)
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_RML_NAME_ANY,
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_PLS,
ORTE_RML_PERSISTENT,
orte_pls_base_recv,
@ -72,7 +72,7 @@ int orte_pls_base_comm_stop(void)
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_RML_NAME_ANY, ORTE_RML_TAG_PLS))) {
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS))) {
ORTE_ERROR_LOG(rc);
}
recv_issued = false;
@ -97,6 +97,8 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
orte_jobid_t job;
orte_process_name_t *name;
int32_t signal;
opal_list_t attrs;
opal_list_item_t *item;
int rc;
count = 1;
@ -130,10 +132,21 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job))) {
if (ORTE_SUCCESS != (rc = orte_pls.terminate_job(job, &attrs))) {
ORTE_ERROR_LOG(rc);
}
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
break;
case ORTE_PLS_TERMINATE_ORTEDS_CMD:
@ -143,9 +156,19 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
goto SEND_ANSWER;
}
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job))) {
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &attrs))) {
ORTE_ERROR_LOG(rc);
}
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
break;
case ORTE_PLS_SIGNAL_JOB_CMD:
@ -161,9 +184,19 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
goto SEND_ANSWER;
}
if (ORTE_SUCCESS != (rc = orte_pls.signal_job(job, signal))) {
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &attrs, &count, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
if (ORTE_SUCCESS != (rc = orte_pls.signal_job(job, signal, &attrs))) {
ORTE_ERROR_LOG(rc);
}
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);
break;
case ORTE_PLS_TERMINATE_PROC_CMD:

Просмотреть файл

@ -79,7 +79,7 @@ extern "C" {
int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal);
int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat);
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job);
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs);
int orte_pls_base_store_active_daemons(opal_list_t *daemons);
int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info);

Просмотреть файл

@ -42,6 +42,7 @@
#endif /* HAVE_STRING_H */
#include "opal/install_dirs.h"
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
@ -342,7 +343,7 @@ static void orte_pls_bproc_waitpid_daemon_cb(pid_t wpid, int status, void *data)
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
rc = mca_oob_send_packed(ORTE_RML_NAME_SELF, &ack, ORTE_RML_TAG_BPROC, 0);
rc = mca_oob_send_packed(ORTE_PROC_MY_NAME, &ack, ORTE_RML_TAG_BPROC, 0);
if(0 > rc) {
ORTE_ERROR_LOG(rc);
}
@ -429,8 +430,7 @@ static void orte_pls_bproc_setup_env(char *** env)
/* ns replica contact info */
if(NULL == orte_process_info.ns_replica) {
orte_ns.copy_process_name(&orte_process_info.ns_replica,
orte_process_info.my_name);
orte_dss.copy((void**)&orte_process_info.ns_replica, orte_process_info.my_name, ORTE_NAME);
orte_process_info.ns_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("ns","replica","uri");
@ -451,8 +451,7 @@ static void orte_pls_bproc_setup_env(char *** env)
/* gpr replica contact info */
if(NULL == orte_process_info.gpr_replica) {
orte_ns.copy_process_name(&orte_process_info.gpr_replica,
orte_process_info.my_name);
orte_dss.copy((void**)&orte_process_info.gpr_replica, orte_process_info.my_name, ORTE_NAME);
orte_process_info.gpr_replica_uri = orte_rml.get_uri();
}
var = mca_base_param_environ_variable("gpr","replica","uri");
@ -832,13 +831,13 @@ orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
printf("killing jobid %d\n", jobid);
if(jobid != 0)
orte_pls_bproc_terminate_job(jobid);
orte_pls_bproc_terminate_job(jobid, NULL);
}
/*
* and kill everyone else
*/
printf("and go bye-bye...\n");
orte_pls_bproc_terminate_job(0);
orte_pls_bproc_terminate_job(0, NULL);
/* shouldn't ever get here.. */
exit(1);
}
@ -1240,7 +1239,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
for(j = 0; j < num_daemons; j++) {
orte_buffer_t ack;
OBJ_CONSTRUCT(&ack, orte_buffer_t);
rc = mca_oob_recv_packed(ORTE_RML_NAME_ANY, &ack, ORTE_RML_TAG_BPROC);
rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &ack, ORTE_RML_TAG_BPROC);
if(0 > rc) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&ack);
@ -1265,7 +1264,7 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
}
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
orte_pls_bproc_terminate_job(jobid);
orte_pls_bproc_terminate_job(jobid, NULL);
goto cleanup;
}
}
@ -1307,7 +1306,7 @@ cleanup:
/**
* Terminate all processes associated with this job */
int orte_pls_bproc_terminate_job(orte_jobid_t jobid) {
int orte_pls_bproc_terminate_job(orte_jobid_t jobid, opal_list_t *attrs) {
pid_t* pids;
orte_std_cntr_t i, num_pids;
int rc;
@ -1319,7 +1318,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) {
}
/* kill application process */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids)))
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
return rc;
for(i=0; i<num_pids; i++) {
if(mca_pls_bproc_component.debug) {
@ -1337,7 +1336,7 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid) {
/**
* Terminate the orteds for a given job
*/
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid)
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, opal_list_t *attrs)
{
int rc;
opal_list_t daemons;
@ -1347,7 +1346,7 @@ int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid)
/* construct the list of active daemons on this job */
OBJ_CONSTRUCT(&daemons, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid))) {
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(&daemons, jobid, attrs))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
@ -1394,7 +1393,7 @@ int orte_pls_bproc_terminate_proc(const orte_process_name_t* proc_name) {
/**
* Signal all processes associated with this job
*/
int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal) {
int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs) {
pid_t* pids;
orte_std_cntr_t i, num_pids;
int rc;
@ -1402,7 +1401,7 @@ int orte_pls_bproc_signal_job(orte_jobid_t jobid, int32_t signal) {
OPAL_TRACE(1);
/* signal application process */
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids)))
if(ORTE_SUCCESS != (rc = orte_pls_bproc_get_proc_pids(jobid, &pids, &num_pids, attrs)))
return rc;
for(i=0; i<num_pids; i++) {
if(mca_pls_bproc_component.debug) {

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше