1
1

Add missing functionality to the ns replica to support remote get_job_peers requests. Add trace commands to help track down the remaining problem with comm_spawn.

This commit was SVN r11939.
Этот коммит содержится в:
Ralph Castain 2006-10-02 19:44:35 +00:00
родитель 9eb14425b7
Коммит b269e4da9b
4 изменённых файлов: 72 добавлений и 8 удалений

Просмотреть файл

@ -25,8 +25,11 @@
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
@ -50,6 +53,8 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
int rc;
orte_ns_proxy_cell_info_t *new_cell;
OPAL_TRACE(1);
/* set the default value of error */
*cellid = ORTE_CELLID_MAX;
@ -160,6 +165,8 @@ int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
orte_ns_proxy_cell_info_t **cell, *new_cell;
int rc, ret=ORTE_SUCCESS;
OPAL_TRACE(1);
/* see if we already have the info locally */
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
@ -299,6 +306,8 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
orte_std_cntr_t count;
int rc;
OPAL_TRACE(1);
/* set default value */
*job = ORTE_JOBID_MAX;
@ -365,6 +374,8 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
orte_std_cntr_t count;
int rc;
OPAL_TRACE(1);
/* set default return value */
*starting_vpid = ORTE_VPID_MAX;
@ -439,9 +450,11 @@ int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
orte_std_cntr_t count;
orte_std_cntr_t count, nprocs;
int rc;
OPAL_TRACE_ARG1(1, job);
/* set default value */
*procs = NULL;
*num_procs = 0;
@ -452,12 +465,20 @@ int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
}
command = ORTE_NS_GET_JOB_PEERS_CMD;
/* pack the command */
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
/* pack the jobid */
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
@ -491,27 +512,28 @@ int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &num_procs, &count, ORTE_STD_CNTR))) {
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &nprocs, &count, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
/* allocate space for array of proc names */
if (0 < *num_procs) {
*procs = (orte_process_name_t*)malloc((*num_procs) * sizeof(orte_process_name_t));
if (0 < nprocs) {
*procs = (orte_process_name_t*)malloc((nprocs) * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, procs, num_procs, ORTE_NAME))) {
if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, procs, &nprocs, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
}
*num_procs = nprocs;
OBJ_RELEASE(answer);
return ORTE_SUCCESS;

Просмотреть файл

@ -23,11 +23,13 @@
#include <stdio.h>
#include <string.h>
#include "orte/dss/dss.h"
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "ns_replica.h"
@ -46,6 +48,8 @@ int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resou
int rc;
orte_std_cntr_t index;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
*cellid = ORTE_CELLID_MAX;
@ -90,6 +94,8 @@ int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
orte_cellid_t j;
orte_ns_replica_cell_tracker_t **cell;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
@ -120,6 +126,8 @@ int orte_ns_replica_create_jobid(orte_jobid_t *jobid)
int rc;
orte_std_cntr_t index;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
*jobid = ORTE_JOBID_MAX;
@ -164,6 +172,8 @@ int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_std_cntr_t j;
orte_jobid_t k;
OPAL_TRACE(1);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */
@ -204,6 +214,8 @@ int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
orte_std_cntr_t j;
orte_jobid_t k;
OPAL_TRACE_ARG1(1, job);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */

Просмотреть файл

@ -412,7 +412,8 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
char *tagname, *site, *resource;
orte_rml_tag_t oob_tag;
orte_data_type_t type;
orte_std_cntr_t count;
orte_std_cntr_t count, nprocs;
orte_process_name_t *procs;
int rc=ORTE_SUCCESS, ret;
count = 1;
@ -611,6 +612,33 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
/* ignore this command */
break;
case ORTE_NS_GET_JOB_PEERS_CMD:
/* unpack the jobid */
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
/* process the request */
if (ORTE_SUCCESS != (rc = orte_ns_replica_get_job_peers(&procs, &nprocs, job))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
/* pack the answer */
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &nprocs, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (nprocs > 0) {
if (ORTE_SUCCESS != (rc = orte_dss.pack(&answer, &procs, nprocs, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
}
break;
case ORTE_NS_DUMP_CELLS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -85,6 +85,8 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
return rc;
}
OPAL_TRACE_ARG1(1, job);
/* need the list of peers for this job so we can send them the xcast.
* obtain this list from the name service's get_job_peers function
*/