From b269e4da9be84c12543c1e7326388efe7fe1565f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 2 Oct 2006 19:44:35 +0000 Subject: [PATCH] Add missing functionality to the ns replica to support remote get_job_peers requests. Add trace commands to help track down the remaining problem with comm_spawn. This commit was SVN r11939. --- orte/mca/ns/proxy/src/ns_proxy.c | 32 ++++++++++++++++--- orte/mca/ns/replica/src/ns_replica.c | 16 ++++++++-- .../mca/ns/replica/src/ns_replica_component.c | 30 ++++++++++++++++- orte/mca/rmgr/base/rmgr_base_stage_gate.c | 2 ++ 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/orte/mca/ns/proxy/src/ns_proxy.c b/orte/mca/ns/proxy/src/ns_proxy.c index d365b76934..2cdbabfd2e 100644 --- a/orte/mca/ns/proxy/src/ns_proxy.c +++ b/orte/mca/ns/proxy/src/ns_proxy.c @@ -25,8 +25,11 @@ #include "orte/orte_constants.h" #include "orte/orte_types.h" + #include "opal/mca/mca.h" #include "opal/util/output.h" +#include "opal/util/trace.h" + #include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" @@ -50,6 +53,8 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc int rc; orte_ns_proxy_cell_info_t *new_cell; + OPAL_TRACE(1); + /* set the default value of error */ *cellid = ORTE_CELLID_MAX; @@ -160,6 +165,8 @@ int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, orte_ns_proxy_cell_info_t **cell, *new_cell; int rc, ret=ORTE_SUCCESS; + OPAL_TRACE(1); + /* see if we already have the info locally */ OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); @@ -299,6 +306,8 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job) orte_std_cntr_t count; int rc; + OPAL_TRACE(1); + /* set default value */ *job = ORTE_JOBID_MAX; @@ -365,6 +374,8 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t orte_std_cntr_t count; int rc; + OPAL_TRACE(1); + /* set default return value */ *starting_vpid = ORTE_VPID_MAX; @@ -439,9 +450,11 @@ int 
orte_ns_proxy_get_job_peers(orte_process_name_t **procs, orte_buffer_t* cmd; orte_buffer_t* answer; orte_ns_cmd_flag_t command; - orte_std_cntr_t count; + orte_std_cntr_t count, nprocs; int rc; + OPAL_TRACE_ARG1(1, job); + /* set default value */ *procs = NULL; *num_procs = 0; @@ -452,12 +465,20 @@ int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, } command = ORTE_NS_GET_JOB_PEERS_CMD; + /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ ORTE_ERROR_LOG(rc); OBJ_RELEASE(cmd); return rc; } + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); @@ -491,27 +512,28 @@ int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, } count = 1; - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &num_procs, &count, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &nprocs, &count, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); return rc; } /* allocate space for array of proc names */ - if (0 < *num_procs) { - *procs = (orte_process_name_t*)malloc((*num_procs) * sizeof(orte_process_name_t)); + if (0 < nprocs) { + *procs = (orte_process_name_t*)malloc((nprocs) * sizeof(orte_process_name_t)); if (NULL == *procs) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); OBJ_RELEASE(answer); return ORTE_ERR_OUT_OF_RESOURCE; } - if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, procs, num_procs, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, procs, &nprocs, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); return rc; } } + *num_procs = nprocs; OBJ_RELEASE(answer); return ORTE_SUCCESS; diff --git a/orte/mca/ns/replica/src/ns_replica.c b/orte/mca/ns/replica/src/ns_replica.c index e3e8b92071..a5845c58a8 100644 --- 
a/orte/mca/ns/replica/src/ns_replica.c +++ b/orte/mca/ns/replica/src/ns_replica.c @@ -23,11 +23,13 @@ #include #include -#include "orte/dss/dss.h" #include "opal/threads/mutex.h" - #include "opal/util/output.h" +#include "opal/util/trace.h" + +#include "orte/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" + #include "orte/mca/ns/base/base.h" #include "ns_replica.h" @@ -46,6 +48,8 @@ int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resou int rc; orte_std_cntr_t index; + OPAL_TRACE(1); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); *cellid = ORTE_CELLID_MAX; @@ -90,6 +94,8 @@ int orte_ns_replica_get_cell_info(orte_cellid_t cellid, orte_cellid_t j; orte_ns_replica_cell_tracker_t **cell; + OPAL_TRACE(1); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; @@ -120,6 +126,8 @@ int orte_ns_replica_create_jobid(orte_jobid_t *jobid) int rc; orte_std_cntr_t index; + OPAL_TRACE(1); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); *jobid = ORTE_JOBID_MAX; @@ -164,6 +172,8 @@ int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_std_cntr_t j; orte_jobid_t k; + OPAL_TRACE(1); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); /* find the jobid */ @@ -204,6 +214,8 @@ int orte_ns_replica_get_job_peers(orte_process_name_t **procs, orte_std_cntr_t j; orte_jobid_t k; + OPAL_TRACE_ARG1(1, job); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); /* find the jobid */ diff --git a/orte/mca/ns/replica/src/ns_replica_component.c b/orte/mca/ns/replica/src/ns_replica_component.c index 70f1321bfb..b66bbb2a76 100644 --- a/orte/mca/ns/replica/src/ns_replica_component.c +++ b/orte/mca/ns/replica/src/ns_replica_component.c @@ -412,7 +412,8 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender, char *tagname, *site, *resource; orte_rml_tag_t oob_tag; orte_data_type_t type; - orte_std_cntr_t count; + orte_std_cntr_t count, nprocs; + orte_process_name_t *procs; int rc=ORTE_SUCCESS, ret; count = 1; 
@@ -611,6 +612,33 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender, /* ignore this command */ break; + case ORTE_NS_GET_JOB_PEERS_CMD: + /* unpack the jobid */ + count = 1; + if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + /* process the request */ + if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_peers(&procs, &nprocs, job))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + + /* pack the answer */ + if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + + if (nprocs > 0) { + if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &procs, nprocs, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + } + break; + case ORTE_NS_DUMP_CELLS_CMD: if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/rmgr/base/rmgr_base_stage_gate.c b/orte/mca/rmgr/base/rmgr_base_stage_gate.c index baecee4aa0..5654ffd62d 100644 --- a/orte/mca/rmgr/base/rmgr_base_stage_gate.c +++ b/orte/mca/rmgr/base/rmgr_base_stage_gate.c @@ -85,6 +85,8 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg) return rc; } + OPAL_TRACE_ARG1(1, job); + /* need the list of peers for this job so we can send them the xcast. * obtain this list from the name service's get_job_peers function */