1
1
openmpi/orte/mca/schema/base/schema_base_fns.c
Ralph Castain 4e79a51395 Add a job_info segment to the system that holds a container for each job. Within each container is a keyval indicating the job state (i.e., all procs at stage1, finalized, etc.). This provides a rough state-of-health for the job.
This required a little fiddling with a number of areas. Biggest problem was that it uncovered a potential for an infinite loop to be created in the registry. If a callback function modified the registry, the registry checked the triggers to see if anything had fired. Well, if the original callback was due to a trigger firing, that condition hadn't changed - so the trigger fired again....which caused the callback to be called, which modified the registry, which checked the triggers, etc. etc.

Triggers are now checked and then "flagged" as being "in process" so that the registry will NOT recheck that trigger until all callbacks have been processed. Tried doing this with subscriptions as well, but that caused a problem - when we release processes from a stagegate, they (at the moment) immediately place data on the registry that should cause a subscription to fire. Unfortunately, the system will just hang if that subscription doesn't get processed. So, I have left the subscription system alone - any callback function that modifies the registry in a fashion that will fire a subscription will indeed fire that subscription. We'll have to see if this causes problems - it shouldn't, but a careless user could lock things up if the callback generates a callback to itself.

Also fixed the code that placed a process' RML contact info on the registry to eliminate the leading '/' from the string.

This commit was SVN r6684.
2005-07-29 14:11:19 +00:00

324 строки
8.4 KiB
C

/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* Convenience functions for accessing the General Purpose Registry
*
*/
/*
* includes
*/
#include "orte_config.h"
#include <string.h>
#include "include/orte_constants.h"
#include "opal/util/output.h"
#include "util/proc_info.h"
#include "util/sys_info.h"
#include "mca/ns/ns.h"
#include "mca/gpr/gpr.h"
#include "mca/errmgr/errmgr.h"
#include "mca/schema/base/base.h"
/**
 * Build the registry token list that identifies a process.
 *
 * On success *proc_tokens receives a newly allocated, NULL-terminated array
 * holding two strings: the string produced by
 * orte_ns.get_proc_name_string() and "<ORTE_VPID_KEY>-<vpid string>".
 * The caller owns the array and both strings and must free() them.
 *
 * @param proc_tokens  Output: the token array (left untouched on failure).
 * @param num_tokens   Optional output (may be NULL): set to 2 on success.
 * @param proc         Process name handed to the name service.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or
 *         the error code returned by the name service.
 */
int orte_schema_base_get_proc_tokens(char ***proc_tokens, size_t* num_tokens, orte_process_name_t *proc)
{
    int rc;
    char **tokens;
    char *vpid_string = NULL;

    /* Zero the array so that every slot is NULL until it is filled in:
     * the CLEANUP path frees the slots and must never see garbage. */
    tokens = (char**)calloc(3, sizeof(char*));
    if (NULL == tokens) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&tokens[0], proc))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&vpid_string, proc))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (0 > asprintf(&tokens[1], "%s-%s", ORTE_VPID_KEY, vpid_string)) {
        /* the pointer is unspecified after a failed asprintf; do not free it */
        tokens[1] = NULL;
        free(vpid_string);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    free(vpid_string);

    tokens[2] = NULL;
    *proc_tokens = tokens;
    if (NULL != num_tokens) {
        *num_tokens = 2;
    }
    return ORTE_SUCCESS;

CLEANUP:
    /* free(NULL) is a no-op, so unfilled slots are harmless here */
    free(tokens[0]);
    free(tokens[1]);
    free(tokens);
    return rc;
}
/**
 * Build the registry token list that identifies a node.
 *
 * On success *node_tokens receives a newly allocated, NULL-terminated array
 * holding two strings: "<ORTE_CELLID_KEY>-<cellid string>" and a copy of
 * nodename. The caller owns the array and both strings and must free() them.
 *
 * @param node_tokens  Output: the token array (left untouched on failure).
 * @param num_tokens   Optional output (may be NULL): set to 2 on success.
 * @param cellid       Cell id, converted to text by the name service.
 * @param nodename     Node name to copy into the second token; must not be NULL.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if nodename is NULL,
 *         ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or the error code
 *         returned by the name service.
 */
int orte_schema_base_get_node_tokens(char ***node_tokens, size_t* num_tokens, orte_cellid_t cellid, char *nodename)
{
    int rc;
    char **tokens;
    char *cellid_string = NULL;

    /* strdup(NULL) is undefined, so reject a missing node name up front */
    if (NULL == nodename) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* Zero the array so that every slot is NULL until it is filled in:
     * the CLEANUP path frees the slots and must never see garbage. */
    tokens = (char**)calloc(3, sizeof(char*));
    if (NULL == tokens) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.convert_cellid_to_string(&cellid_string, cellid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (0 > asprintf(&tokens[0], "%s-%s", ORTE_CELLID_KEY, cellid_string)) {
        /* the pointer is unspecified after a failed asprintf; do not free it */
        tokens[0] = NULL;
        free(cellid_string);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    free(cellid_string);

    tokens[1] = strdup(nodename);
    if (NULL == tokens[1]) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    tokens[2] = NULL;
    *node_tokens = tokens;
    if (NULL != num_tokens) {
        *num_tokens = 2;
    }
    return ORTE_SUCCESS;

CLEANUP:
    /* free(NULL) is a no-op, so unfilled slots are harmless here */
    free(tokens[0]);
    free(tokens[1]);
    free(tokens);
    return rc;
}
/**
 * Build the registry token list that identifies a job.
 *
 * On success *job_tokens receives a newly allocated, NULL-terminated array
 * holding one string: "<ORTE_JOBID_KEY>-<jobid string>". The caller owns
 * the array and the string and must free() them.
 *
 * @param job_tokens  Output: the token array (left untouched on failure).
 * @param num_tokens  Optional output (may be NULL): set to 1 on success.
 * @param jobid       Job id, converted to text by the name service.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or
 *         the error code returned by the name service.
 */
int orte_schema_base_get_job_tokens(char ***job_tokens, size_t* num_tokens, orte_jobid_t jobid)
{
    int rc;
    char **tokens;
    char *jobid_string = NULL;

    /* Zero the array so that the slot is NULL until it is filled in:
     * the CLEANUP path frees it and must never see garbage. */
    tokens = (char**)calloc(2, sizeof(char*));
    if (NULL == tokens) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (0 > asprintf(&tokens[0], "%s-%s", ORTE_JOBID_KEY, jobid_string)) {
        /* the pointer is unspecified after a failed asprintf; do not free it */
        tokens[0] = NULL;
        free(jobid_string);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    free(jobid_string);

    tokens[1] = NULL;
    *job_tokens = tokens;
    if (NULL != num_tokens) {
        *num_tokens = 1;
    }
    return ORTE_SUCCESS;

CLEANUP:
    /* free(NULL) is a no-op, so an unfilled slot is harmless here */
    free(tokens[0]);
    free(tokens);
    return rc;
}
/**
 * Build the registry token list that identifies a cell.
 *
 * On success *cell_tokens receives a newly allocated array of three strings:
 * slot 0 is the cell id converted to text by the name service, slots 1 and 2
 * are the two strings returned by orte_ns.get_cell_info(). A trailing NULL
 * entry follows the three tokens, matching the other token builders in this
 * file. The caller owns the array and the strings and must free() them.
 *
 * @param cell_tokens  Output: the token array (left untouched on failure).
 * @param num_tokens   Optional output (may be NULL): set to 0 on entry and
 *                     to 3 on success.
 * @param cellid       Cell to look up.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or
 *         the error code returned by the name service.
 */
int orte_schema_base_get_cell_tokens(char ***cell_tokens, size_t* num_tokens, orte_cellid_t cellid)
{
    int rc;
    char **tokens;

    if (NULL != num_tokens) {
        *num_tokens = 0;
    }

    /* Three tokens plus a NULL terminator. Zeroing the array keeps every
     * slot NULL until it is filled in, so CLEANUP can free them safely. */
    tokens = (char**)calloc(4, sizeof(char*));
    if (NULL == tokens) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.get_cell_info(cellid, &tokens[1], &tokens[2]))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.convert_cellid_to_string(&tokens[0], cellid))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (NULL != num_tokens) {
        *num_tokens = 3;
    }
    *cell_tokens = tokens;
    return ORTE_SUCCESS;

CLEANUP:
    /* Release whatever was obtained before the failure, then the array
     * itself; free(NULL) is a no-op for slots that were never filled. */
    free(tokens[0]);
    free(tokens[1]);
    free(tokens[2]);
    free(tokens);
    return rc;
}
/**
 * Compose the registry segment name for a job.
 *
 * The result has the form "<ORTE_JOB_SEGMENT>-<jobid string>" and is
 * allocated by asprintf(); the caller must free() it.
 *
 * @param name   Output: the newly allocated segment name.
 * @param jobid  Job id, converted to text by the name service.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the name could not be
 *         formatted, or the error code returned by the name service.
 */
int orte_schema_base_get_job_segment_name(char **name, orte_jobid_t jobid)
{
    char *job_text;
    int status;
    int written;

    status = orte_ns.convert_jobid_to_string(&job_text, jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* the converted id is only needed for formatting; drop it right after */
    written = asprintf(name, "%s-%s", ORTE_JOB_SEGMENT, job_text);
    free(job_text);

    if (written < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/**
 * Recover a job id from a segment name.
 *
 * The text following the last '-' in name is handed to
 * orte_ns.convert_string_to_jobid(); *jobid is written only on success.
 *
 * @param jobid  Output: the parsed job id.
 * @param name   Segment name; must contain at least one '-'.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if name contains no '-', or the
 *         error code returned by the name service.
 */
int orte_schema_base_extract_jobid_from_segment_name(orte_jobid_t *jobid, char *name)
{
    orte_jobid_t parsed;
    int status;
    char *suffix;
    char *last_dash = strrchr(name, '-');

    if (NULL == last_dash) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* the job id text starts right after the separator */
    suffix = last_dash + 1;

    status = orte_ns.convert_string_to_jobid(&parsed, suffix);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        opal_output(0, "[%lu,%lu,%lu] %s\n", ORTE_NAME_ARGS(orte_process_info.my_name), suffix);
        return status;
    }

    *jobid = parsed;
    return ORTE_SUCCESS;
}
/**
 * Store this process' local pid and node name in the registry.
 *
 * Two keyvals (ORTE_PROC_LOCAL_PID_KEY and ORTE_NODE_NAME_KEY) are put, in
 * ORTE_GPR_OVERWRITE mode, on the job segment of the calling process using
 * the process tokens built by orte_schema_base_get_proc_tokens().
 *
 * Every temporary allocated here (token strings, token array, segment name,
 * node name copy) is released before returning, on success and on failure.
 * NOTE(review): this relies on orte_gpr.put() not retaining the pointers it
 * is given; the original code already freed the tokens and segment straight
 * after the call, so the same is assumed for the node name string.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the node name cannot be
 *         copied, or the first error reported by the name service, the
 *         schema helpers, or orte_gpr.put().
 */
int orte_schema_base_store_my_info(void)
{
    int rc = ORTE_SUCCESS;
    orte_gpr_value_t value, *values;
    orte_gpr_keyval_t local_pid = { {OBJ_CLASS(opal_object_t),0}, ORTE_PROC_LOCAL_PID_KEY, ORTE_PID };
    orte_gpr_keyval_t nodename = { {OBJ_CLASS(opal_object_t),0}, ORTE_NODE_NAME_KEY, ORTE_STRING };
    orte_gpr_keyval_t* keyvals[2];
    size_t i;
    orte_jobid_t jobid;

    /* NOTE: cannot destruct the value object since the keyval's are statically
     * defined, so don't construct it either
     */
    keyvals[0] = &local_pid;
    keyvals[1] = &nodename;
    value.addr_mode = ORTE_GPR_OVERWRITE;

    /* Start from a known-empty state so that the single CLEANUP path below
     * can run no matter how far we got. */
    value.tokens = NULL;
    value.num_tokens = 0;
    value.segment = NULL;
    nodename.value.strptr = NULL;

    if (ORTE_SUCCESS != (rc = orte_schema_base_get_proc_tokens(&value.tokens,
                                &value.num_tokens, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_ns.get_jobid(&jobid, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_schema_base_get_job_segment_name(&value.segment, jobid))) {
        ORTE_ERROR_LOG(rc);
        /* the helper may leave the pointer unspecified when formatting
         * fails, so make sure CLEANUP does not try to free it */
        value.segment = NULL;
        goto CLEANUP;
    }

    value.keyvals = keyvals;
    value.cnt = 2;
    values = &value;

    local_pid.value.pid = orte_process_info.pid;
    nodename.value.strptr = strdup(orte_system_info.nodename);
    if (NULL == nodename.value.strptr) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* insert values into registry */
    if (ORTE_SUCCESS != (rc = orte_gpr.put(1, &values))) {
        ORTE_ERROR_LOG(rc);
    }

CLEANUP:
    /* cleanup memory */
    if (NULL != value.tokens) {
        for (i=0; i < value.num_tokens; i++) {
            free(value.tokens[i]);
        }
        free(value.tokens);
    }
    free(value.segment);
    free(nodename.value.strptr);
    return rc;
}
/**
 * Compose the standard name of a trigger for a job.
 *
 * The result has the form "<trigger>-<jobid string>" and is allocated by
 * asprintf(); the caller must free() it.
 *
 * @param name     Output: the newly allocated trigger name.
 * @param trigger  Trigger name used as the prefix.
 * @param jobid    Job id, converted to text by the name service.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the name could not be
 *         formatted, or the error code returned by the name service.
 */
int orte_schema_base_get_std_trigger_name(char **name,
                                          char *trigger,
                                          orte_jobid_t jobid)
{
    char *job_text;
    int status;
    int written;

    status = orte_ns.convert_jobid_to_string(&job_text, jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* the converted id is only needed for formatting; drop it right after */
    written = asprintf(name, "%s-%s", trigger, job_text);
    free(job_text);

    if (written < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/**
 * Compose the standard name of a subscription for a job.
 *
 * The result has the form "<subscription>-<jobid string>" and is allocated
 * by asprintf(); the caller must free() it.
 *
 * @param name          Output: the newly allocated subscription name.
 * @param subscription  Subscription name used as the prefix.
 * @param jobid         Job id, converted to text by the name service.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if the name could not be
 *         formatted, or the error code returned by the name service.
 */
int orte_schema_base_get_std_subscription_name(char **name,
                                               char *subscription,
                                               orte_jobid_t jobid)
{
    char *job_text;
    int status;
    int written;

    status = orte_ns.convert_jobid_to_string(&job_text, jobid);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* the converted id is only needed for formatting; drop it right after */
    written = asprintf(name, "%s-%s", subscription, job_text);
    free(job_text);

    if (written < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}