Ensure that standard triggers are defined for all job/process states so that users can subscribe to those they want to use. Modify the way subscriptions are set up to avoid over-burdening the standard launch sequence, since it doesn't need alerts from all of those triggers.
This commit was SVN r8938.
Этот коммит содержится в:
родитель
18bbb049d1
Коммит
892b396d70
@ -26,28 +26,30 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "orte/dss/dss.h"
|
#include "opal/util/printf.h"
|
||||||
|
#include "opal/util/convert.h"
|
||||||
|
#include "opal/threads/mutex.h"
|
||||||
|
#include "opal/util/bit_ops.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
|
||||||
#include "ompi/communicator/communicator.h"
|
#include "ompi/communicator/communicator.h"
|
||||||
#include "ompi/request/request.h"
|
#include "ompi/request/request.h"
|
||||||
#include "errhandler/errhandler.h"
|
#include "errhandler/errhandler.h"
|
||||||
#include "proc/proc.h"
|
#include "proc/proc.h"
|
||||||
#include "info/info.h"
|
#include "info/info.h"
|
||||||
#include "opal/util/convert.h"
|
|
||||||
#include "opal/threads/mutex.h"
|
|
||||||
#include "util/proc_info.h"
|
|
||||||
#include "opal/util/bit_ops.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "ompi/include/constants.h"
|
#include "ompi/include/constants.h"
|
||||||
#include "mca/pml/pml.h"
|
#include "ompi/mca/pml/pml.h"
|
||||||
#include "mca/ns/ns.h"
|
|
||||||
#include "mca/gpr/gpr.h"
|
|
||||||
#include "mca/errmgr/errmgr.h"
|
|
||||||
#include "mca/rmgr/rmgr.h"
|
|
||||||
|
|
||||||
#include "mca/rml/rml.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/dss/dss.h"
|
||||||
|
#include "orte/mca/ns/ns.h"
|
||||||
|
#include "orte/mca/gpr/gpr.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/mca/rmgr/rmgr.h"
|
||||||
|
#include "orte/mca/soh/soh_types.h"
|
||||||
|
#include "orte/mca/rml/rml.h"
|
||||||
|
|
||||||
#include "runtime/runtime.h"
|
#include "runtime/runtime.h"
|
||||||
#include "opal/util/printf.h"
|
|
||||||
|
|
||||||
extern char **environ;
|
extern char **environ;
|
||||||
|
|
||||||
@ -446,8 +448,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
|
|||||||
|
|
||||||
|
|
||||||
/* spawn procs */
|
/* spawn procs */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn(apps, count, &new_jobid,
|
if (ORTE_SUCCESS != (rc = orte_rmgr.spawn(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
|
||||||
NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
opal_progress_event_decrement();
|
opal_progress_event_decrement();
|
||||||
return MPI_ERR_SPAWN;
|
return MPI_ERR_SPAWN;
|
||||||
|
@ -116,7 +116,7 @@ int orte_rmgr_base_launch_not_available(orte_jobid_t);
|
|||||||
int orte_rmgr_base_terminate_job_not_available(orte_jobid_t);
|
int orte_rmgr_base_terminate_job_not_available(orte_jobid_t);
|
||||||
int orte_rmgr_base_terminate_proc_not_available(const orte_process_name_t*);
|
int orte_rmgr_base_terminate_proc_not_available(const orte_process_name_t*);
|
||||||
int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job);
|
int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job);
|
||||||
int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_cb_fn_t, void*, int);
|
int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_cb_fn_t, void*, orte_proc_state_t);
|
||||||
int orte_rmgr_base_proc_stage_gate_mgr(
|
int orte_rmgr_base_proc_stage_gate_mgr(
|
||||||
orte_gpr_notify_message_t *msg);
|
orte_gpr_notify_message_t *msg);
|
||||||
int orte_rmgr_base_proc_stage_gate_mgr_abort(
|
int orte_rmgr_base_proc_stage_gate_mgr_abort(
|
||||||
|
@ -43,12 +43,19 @@
|
|||||||
|
|
||||||
int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
|
int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
|
||||||
{
|
{
|
||||||
size_t i, num_counters=6, num_named_trigs=5;
|
size_t i, num_counters, num_named_trigs;
|
||||||
size_t zero=0;
|
size_t zero=0;
|
||||||
int rc;
|
int rc;
|
||||||
orte_gpr_value_t *value;
|
orte_gpr_value_t *value;
|
||||||
char* keys[] = {
|
char* keys[] = {
|
||||||
/* changes to this ordering need to be reflected in code below */
|
/* changes to this ordering need to be reflected in code below */
|
||||||
|
/* We need to set up counters for all the defined ORTE process states, even though
|
||||||
|
* the launch system doesn't actually use them all. This must be done so that
|
||||||
|
* user-defined callbacks can be generated - otherwise, they won't happen!
|
||||||
|
*/
|
||||||
|
ORTE_PROC_NUM_AT_INIT,
|
||||||
|
ORTE_PROC_NUM_LAUNCHED,
|
||||||
|
ORTE_PROC_NUM_RUNNING,
|
||||||
ORTE_PROC_NUM_AT_STG1,
|
ORTE_PROC_NUM_AT_STG1,
|
||||||
ORTE_PROC_NUM_AT_STG2,
|
ORTE_PROC_NUM_AT_STG2,
|
||||||
ORTE_PROC_NUM_AT_STG3,
|
ORTE_PROC_NUM_AT_STG3,
|
||||||
@ -58,6 +65,9 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
|
|||||||
};
|
};
|
||||||
char* trig_names[] = {
|
char* trig_names[] = {
|
||||||
/* changes to this ordering need to be reflected in code below */
|
/* changes to this ordering need to be reflected in code below */
|
||||||
|
ORTE_ALL_INIT_TRIGGER,
|
||||||
|
ORTE_ALL_LAUNCHED_TRIGGER,
|
||||||
|
ORTE_ALL_RUNNING_TRIGGER,
|
||||||
ORTE_STG1_TRIGGER,
|
ORTE_STG1_TRIGGER,
|
||||||
ORTE_STG2_TRIGGER,
|
ORTE_STG2_TRIGGER,
|
||||||
ORTE_STG3_TRIGGER,
|
ORTE_STG3_TRIGGER,
|
||||||
@ -70,6 +80,9 @@ int orte_rmgr_base_proc_stage_gate_init(orte_jobid_t job)
|
|||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
|
num_counters = sizeof(keys)/sizeof(keys[0]);
|
||||||
|
num_named_trigs= sizeof(trig_names)/sizeof(trig_names[0]);
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
|
if (ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, job))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
@ -192,11 +205,15 @@ int orte_rmgr_base_proc_stage_gate_mgr(orte_gpr_notify_message_t *msg)
|
|||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
/* check to see if this came from terminate. If so, we ignore it because
|
/* check to see if this came from a trigger that we ignore because
|
||||||
* that stage gate does NOT set an xcast barrier - processes simply
|
* that stage gate does NOT set an xcast barrier - processes simply
|
||||||
* record their state and continue processing
|
* record their state and continue processing. The only triggers that
|
||||||
|
* involve a xcast barrier are the STGx and FINALIZED ones - ignore the rest.
|
||||||
*/
|
*/
|
||||||
if (orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_TERMINATED_TRIGGER)) {
|
if (!orte_schema.check_std_trigger_name(msg->target, ORTE_STG1_TRIGGER) &&
|
||||||
|
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG2_TRIGGER) &&
|
||||||
|
!orte_schema.check_std_trigger_name(msg->target, ORTE_STG3_TRIGGER) &&
|
||||||
|
!orte_schema.check_std_trigger_name(msg->target, ORTE_NUM_FINALIZED_TRIGGER)) {
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -297,14 +314,28 @@ int orte_rmgr_base_proc_stage_gate_mgr_abort(orte_gpr_notify_message_t *msg)
|
|||||||
* to events on all counters.
|
* to events on all counters.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc, void* cbdata, int type)
|
int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_cb_fn_t cbfunc, void* cbdata, orte_proc_state_t cb_conditions)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
int rc;
|
int rc;
|
||||||
char *segment, *trig_name, *tokens[2];
|
char *segment, *trig_name, *tokens[2];
|
||||||
orte_gpr_subscription_id_t id;
|
orte_gpr_subscription_id_t id;
|
||||||
|
/** the order of the next three definitions MUST match */
|
||||||
|
orte_proc_state_t state[] = {
|
||||||
|
ORTE_PROC_STATE_INIT,
|
||||||
|
ORTE_PROC_STATE_LAUNCHED,
|
||||||
|
ORTE_PROC_STATE_RUNNING,
|
||||||
|
ORTE_PROC_STATE_AT_STG1,
|
||||||
|
ORTE_PROC_STATE_AT_STG2,
|
||||||
|
ORTE_PROC_STATE_AT_STG3,
|
||||||
|
ORTE_PROC_STATE_FINALIZED,
|
||||||
|
ORTE_PROC_STATE_TERMINATED,
|
||||||
|
ORTE_PROC_STATE_ABORTED
|
||||||
|
};
|
||||||
char* keys[] = {
|
char* keys[] = {
|
||||||
/* changes to this ordering need to be reflected in code below */
|
ORTE_PROC_NUM_AT_INIT,
|
||||||
|
ORTE_PROC_NUM_LAUNCHED,
|
||||||
|
ORTE_PROC_NUM_RUNNING,
|
||||||
ORTE_PROC_NUM_AT_STG1,
|
ORTE_PROC_NUM_AT_STG1,
|
||||||
ORTE_PROC_NUM_AT_STG2,
|
ORTE_PROC_NUM_AT_STG2,
|
||||||
ORTE_PROC_NUM_AT_STG3,
|
ORTE_PROC_NUM_AT_STG3,
|
||||||
@ -313,9 +344,9 @@ int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_c
|
|||||||
ORTE_PROC_NUM_ABORTED
|
ORTE_PROC_NUM_ABORTED
|
||||||
};
|
};
|
||||||
char* trig_names[] = {
|
char* trig_names[] = {
|
||||||
/* changes to this ordering need to be reflected in code below
|
ORTE_ALL_INIT_TRIGGER,
|
||||||
* number of entries MUST match those above
|
ORTE_ALL_LAUNCHED_TRIGGER,
|
||||||
*/
|
ORTE_ALL_RUNNING_TRIGGER,
|
||||||
ORTE_STG1_TRIGGER,
|
ORTE_STG1_TRIGGER,
|
||||||
ORTE_STG2_TRIGGER,
|
ORTE_STG2_TRIGGER,
|
||||||
ORTE_STG3_TRIGGER,
|
ORTE_STG3_TRIGGER,
|
||||||
@ -338,25 +369,8 @@ int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_c
|
|||||||
tokens[1]=NULL;
|
tokens[1]=NULL;
|
||||||
|
|
||||||
for (i=0; i < num_counters; i++) {
|
for (i=0; i < num_counters; i++) {
|
||||||
if (ORTE_STAGE_GATE_TERMINATION == type) {
|
if (state[i] & cb_conditions) {
|
||||||
if ( ORTE_PROC_NUM_TERMINATED != keys[i] &&
|
/** want this one - attach ourselves to the appropriate standard trigger */
|
||||||
ORTE_PROC_NUM_ABORTED != keys[i])
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (ORTE_STAGE_GATE_STAGES == type) {
|
|
||||||
if (ORTE_PROC_NUM_AT_STG1 != keys[i] &&
|
|
||||||
ORTE_PROC_NUM_AT_STG2 != keys[i] &&
|
|
||||||
ORTE_PROC_NUM_AT_STG3 != keys[i] &&
|
|
||||||
ORTE_PROC_NUM_FINALIZED != keys[i] )
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (ORTE_STAGE_GATE_ALL != type) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
||||||
printf("Invalid argument (%d)\n", type);
|
|
||||||
return ORTE_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* attach ourselves to the appropriate standard trigger */
|
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
(rc = orte_schema.get_std_trigger_name(&trig_name, trig_names[i], job))) {
|
(rc = orte_schema.get_std_trigger_name(&trig_name, trig_names[i], job))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -376,6 +390,7 @@ int orte_rmgr_base_proc_stage_gate_subscribe(orte_jobid_t job, orte_gpr_notify_c
|
|||||||
}
|
}
|
||||||
free(trig_name);
|
free(trig_name);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
free(segment);
|
free(segment);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -63,7 +63,8 @@ static int orte_rmgr_proxy_spawn(
|
|||||||
orte_app_context_t** app_context,
|
orte_app_context_t** app_context,
|
||||||
size_t num_context,
|
size_t num_context,
|
||||||
orte_jobid_t* jobid,
|
orte_jobid_t* jobid,
|
||||||
orte_rmgr_cb_fn_t cbfn);
|
orte_rmgr_cb_fn_t cbfn,
|
||||||
|
orte_proc_state_t cb_conditions);
|
||||||
|
|
||||||
orte_rmgr_base_module_t orte_rmgr_proxy_module = {
|
orte_rmgr_base_module_t orte_rmgr_proxy_module = {
|
||||||
orte_rmgr_proxy_query,
|
orte_rmgr_proxy_query,
|
||||||
@ -291,6 +292,18 @@ static void orte_rmgr_proxy_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
|||||||
keyvals = value->keyvals;
|
keyvals = value->keyvals;
|
||||||
for(j=0; j<value->cnt; j++) {
|
for(j=0; j<value->cnt; j++) {
|
||||||
orte_gpr_keyval_t* keyval = keyvals[j];
|
orte_gpr_keyval_t* keyval = keyvals[j];
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_INIT) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_INIT);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_LAUNCHED) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_LAUNCHED);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_RUNNING) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_RUNNING);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1) == 0) {
|
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1) == 0) {
|
||||||
(*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG1);
|
(*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG1);
|
||||||
continue;
|
continue;
|
||||||
@ -330,7 +343,8 @@ static int orte_rmgr_proxy_spawn(
|
|||||||
orte_app_context_t** app_context,
|
orte_app_context_t** app_context,
|
||||||
size_t num_context,
|
size_t num_context,
|
||||||
orte_jobid_t* jobid,
|
orte_jobid_t* jobid,
|
||||||
orte_rmgr_cb_fn_t cbfunc)
|
orte_rmgr_cb_fn_t cbfunc,
|
||||||
|
orte_proc_state_t cb_conditions)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_process_name_t* name;
|
orte_process_name_t* name;
|
||||||
@ -384,7 +398,7 @@ static int orte_rmgr_proxy_spawn(
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
if(NULL != cbfunc) {
|
if(NULL != cbfunc) {
|
||||||
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_proxy_callback, (void*)cbfunc, ORTE_STAGE_GATE_ALL);
|
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_proxy_callback, (void*)cbfunc, cb_conditions);
|
||||||
if(ORTE_SUCCESS != rc) {
|
if(ORTE_SUCCESS != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -139,19 +139,21 @@ typedef void (*orte_rmgr_cb_fn_t)(orte_jobid_t jobid, orte_proc_state_t state);
|
|||||||
* (2) Allocated resources to the job.
|
* (2) Allocated resources to the job.
|
||||||
* (3) Map processes to allocated resources
|
* (3) Map processes to allocated resources
|
||||||
* (4) Launch the job.
|
* (4) Launch the job.
|
||||||
* (5) Callback function - gets called when job completes (if NULL, then no callback done)
|
* (5) Callback function - gets called when all procs reach the specified conditions (if NULL, then no callback done)
|
||||||
|
* (6) callback conditions - flag indicating which triggers are to generate callbacks to the specified function
|
||||||
*
|
*
|
||||||
* @code
|
* @code
|
||||||
* orte_jobid_t jobid;
|
* orte_jobid_t jobid;
|
||||||
*
|
*
|
||||||
* return_value = orte_rmgr.spawn(app_context, num_context, &jobid, NULL);
|
* return_value = orte_rmgr.spawn(app_context, num_context, &jobid, NULL, 0);
|
||||||
* @endcode
|
* @endcode
|
||||||
*/
|
*/
|
||||||
typedef int (*orte_rmgr_base_module_spawn_fn_t)(
|
typedef int (*orte_rmgr_base_module_spawn_fn_t)(
|
||||||
orte_app_context_t** app_context,
|
orte_app_context_t** app_context,
|
||||||
size_t num_context,
|
size_t num_context,
|
||||||
orte_jobid_t *jobid,
|
orte_jobid_t *jobid,
|
||||||
orte_rmgr_cb_fn_t cbfn);
|
orte_rmgr_cb_fn_t cbfn,
|
||||||
|
orte_proc_state_t cb_conditions);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Init the proc stage gate process
|
* Init the proc stage gate process
|
||||||
|
@ -28,14 +28,6 @@ extern "C" {
|
|||||||
*/
|
*/
|
||||||
#define ORTE_RMGR_LAUNCHER "orte-rmgr-launcher"
|
#define ORTE_RMGR_LAUNCHER "orte-rmgr-launcher"
|
||||||
|
|
||||||
/*
|
|
||||||
* Stage Gate flags used to specify which state changes
|
|
||||||
* one desires to be notified of.
|
|
||||||
*/
|
|
||||||
#define ORTE_STAGE_GATE_ALL 1
|
|
||||||
#define ORTE_STAGE_GATE_STAGES 2
|
|
||||||
#define ORTE_STAGE_GATE_TERMINATION 3
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Constants for command values
|
* Constants for command values
|
||||||
*/
|
*/
|
||||||
|
@ -74,7 +74,8 @@ static int orte_rmgr_urm_spawn(
|
|||||||
orte_app_context_t** app_context,
|
orte_app_context_t** app_context,
|
||||||
size_t num_context,
|
size_t num_context,
|
||||||
orte_jobid_t* jobid,
|
orte_jobid_t* jobid,
|
||||||
orte_rmgr_cb_fn_t cbfn);
|
orte_rmgr_cb_fn_t cbfn,
|
||||||
|
orte_proc_state_t cb_conditions);
|
||||||
|
|
||||||
static int orte_rmgr_urm_finalize(void);
|
static int orte_rmgr_urm_finalize(void);
|
||||||
|
|
||||||
@ -255,11 +256,20 @@ static void orte_rmgr_urm_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
|||||||
keyvals = value->keyvals;
|
keyvals = value->keyvals;
|
||||||
for(j=0; j<value->cnt; j++) {
|
for(j=0; j<value->cnt; j++) {
|
||||||
orte_gpr_keyval_t* keyval = keyvals[j];
|
orte_gpr_keyval_t* keyval = keyvals[j];
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_INIT) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_INIT);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_LAUNCHED) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_LAUNCHED);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if(strcmp(keyval->key, ORTE_PROC_NUM_RUNNING) == 0) {
|
||||||
|
(*cbfunc)(jobid,ORTE_PROC_STATE_RUNNING);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1) == 0) {
|
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG1) == 0) {
|
||||||
(*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG1);
|
(*cbfunc)(jobid,ORTE_PROC_STATE_AT_STG1);
|
||||||
/* BWB - XXX - FIX ME: this needs to happen when all
|
|
||||||
are LAUNCHED, before STG1 */
|
|
||||||
orte_rmgr_urm_wireup_stdin(jobid);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG2) == 0) {
|
if(strcmp(keyval->key, ORTE_PROC_NUM_AT_STG2) == 0) {
|
||||||
@ -288,6 +298,29 @@ static void orte_rmgr_urm_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* define a callback point for completing the wireup of the stdin for io forwarding
|
||||||
|
*/
|
||||||
|
static void orte_rmgr_urm_wireup_callback(orte_gpr_notify_data_t *data, void *cbdata)
|
||||||
|
{
|
||||||
|
orte_gpr_value_t **values;
|
||||||
|
orte_jobid_t jobid;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
|
/* we made sure in the subscriptions that at least one
|
||||||
|
* value is always returned
|
||||||
|
* get the jobid from the segment name in the first value
|
||||||
|
*/
|
||||||
|
values = (orte_gpr_value_t**)(data->values)->addr;
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_schema.extract_jobid_from_segment_name(&jobid, values[0]->segment))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
orte_rmgr_urm_wireup_stdin(jobid);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Shortcut for the multiple steps involved in spawning a new job.
|
* Shortcut for the multiple steps involved in spawning a new job.
|
||||||
*/
|
*/
|
||||||
@ -297,7 +330,8 @@ static int orte_rmgr_urm_spawn(
|
|||||||
orte_app_context_t** app_context,
|
orte_app_context_t** app_context,
|
||||||
size_t num_context,
|
size_t num_context,
|
||||||
orte_jobid_t* jobid,
|
orte_jobid_t* jobid,
|
||||||
orte_rmgr_cb_fn_t cbfunc)
|
orte_rmgr_cb_fn_t cbfunc,
|
||||||
|
orte_proc_state_t cb_conditions)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_process_name_t* name;
|
orte_process_name_t* name;
|
||||||
@ -350,18 +384,24 @@ static int orte_rmgr_urm_spawn(
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
/** setup the subscription so we can complete the wireup when all processes reach LAUNCHED */
|
||||||
|
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_urm_wireup_callback, NULL, ORTE_PROC_STATE_LAUNCHED);
|
||||||
|
if(ORTE_SUCCESS != rc) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* setup callback
|
* setup callback
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if(NULL != cbfunc) {
|
if(NULL != cbfunc) {
|
||||||
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_urm_callback, (void*)cbfunc, ORTE_STAGE_GATE_ALL);
|
rc = orte_rmgr_base_proc_stage_gate_subscribe(*jobid, orte_rmgr_urm_callback, (void*)cbfunc, cb_conditions);
|
||||||
if(ORTE_SUCCESS != rc) {
|
if(ORTE_SUCCESS != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
cbfunc(*jobid, ORTE_PROC_STATE_INIT);
|
/* cbfunc(*jobid, ORTE_PROC_STATE_INIT); RHC - not sure why this was here, but it doesn't seem required */
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -76,6 +76,9 @@
|
|||||||
#define ORTE_PROC_EXIT_CODE_KEY "orte-proc-exit-code"
|
#define ORTE_PROC_EXIT_CODE_KEY "orte-proc-exit-code"
|
||||||
#define ORTE_PROC_NUM_ALIVE "orte-proc-num-alive"
|
#define ORTE_PROC_NUM_ALIVE "orte-proc-num-alive"
|
||||||
#define ORTE_PROC_NUM_ABORTED "orte-proc-num-aborted"
|
#define ORTE_PROC_NUM_ABORTED "orte-proc-num-aborted"
|
||||||
|
#define ORTE_PROC_NUM_AT_INIT "orte-proc-num-init"
|
||||||
|
#define ORTE_PROC_NUM_LAUNCHED "orte-proc-num-launched"
|
||||||
|
#define ORTE_PROC_NUM_RUNNING "orte-proc-num-running"
|
||||||
#define ORTE_PROC_NUM_AT_STG1 "orte-proc-num-stg1"
|
#define ORTE_PROC_NUM_AT_STG1 "orte-proc-num-stg1"
|
||||||
#define ORTE_PROC_NUM_AT_STG2 "orte-proc-num-stg2"
|
#define ORTE_PROC_NUM_AT_STG2 "orte-proc-num-stg2"
|
||||||
#define ORTE_PROC_NUM_AT_STG3 "orte-proc-num-stg3"
|
#define ORTE_PROC_NUM_AT_STG3 "orte-proc-num-stg3"
|
||||||
@ -85,6 +88,9 @@
|
|||||||
/*
|
/*
|
||||||
* ORTE-wide names for specific system triggers and subscriptions
|
* ORTE-wide names for specific system triggers and subscriptions
|
||||||
*/
|
*/
|
||||||
|
#define ORTE_ALL_INIT_TRIGGER "orte-init-trig"
|
||||||
|
#define ORTE_ALL_LAUNCHED_TRIGGER "orte-launch-trig"
|
||||||
|
#define ORTE_ALL_RUNNING_TRIGGER "orte-running-trig"
|
||||||
#define ORTE_STG1_TRIGGER "orte-stage1"
|
#define ORTE_STG1_TRIGGER "orte-stage1"
|
||||||
#define ORTE_STG2_TRIGGER "orte-stage2"
|
#define ORTE_STG2_TRIGGER "orte-stage2"
|
||||||
#define ORTE_STG3_TRIGGER "orte-stage3"
|
#define ORTE_STG3_TRIGGER "orte-stage3"
|
||||||
|
@ -66,7 +66,7 @@ int orte_soh_base_pack_proc_state(orte_buffer_t *buffer, void *src,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_INT8))) {
|
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_PROC_STATE_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +81,7 @@ int orte_soh_base_pack_job_state(orte_buffer_t *buffer, void *src,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_INT8))) {
|
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_JOB_STATE_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ int orte_soh_base_unpack_proc_state(orte_buffer_t *buffer, void *dest,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_INT8))) {
|
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_PROC_STATE_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +81,7 @@ int orte_soh_base_unpack_job_state(orte_buffer_t *buffer, void *dest,
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_INT8))) {
|
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_JOB_STATE_T))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,33 +31,41 @@ typedef int orte_exit_code_t;
|
|||||||
* Process state codes
|
* Process state codes
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef int8_t orte_proc_state_t;
|
typedef uint16_t orte_proc_state_t;
|
||||||
|
#define ORTE_PROC_STATE_T ORTE_UINT16
|
||||||
|
|
||||||
#define ORTE_PROC_STATE_INIT 0x01 /* process entry has been created by rmaps */
|
#define ORTE_PROC_STATE_INIT 0x0001 /* process entry has been created by rmaps */
|
||||||
#define ORTE_PROC_STATE_LAUNCHED 0x02 /* process has been launched by pls */
|
#define ORTE_PROC_STATE_LAUNCHED 0x0002 /* process has been launched by pls */
|
||||||
#define ORTE_PROC_STATE_AT_STG1 0x03 /* process is at Stage Gate 1 barrier in orte_init */
|
#define ORTE_PROC_STATE_AT_STG1 0x0004 /* process is at Stage Gate 1 barrier in orte_init */
|
||||||
#define ORTE_PROC_STATE_AT_STG2 0x04 /* process is at Stage Gate 2 barrier in orte_init */
|
#define ORTE_PROC_STATE_AT_STG2 0x0008 /* process is at Stage Gate 2 barrier in orte_init */
|
||||||
#define ORTE_PROC_STATE_RUNNING 0x06 /* process has exited orte_init and is running */
|
#define ORTE_PROC_STATE_RUNNING 0x0010 /* process has exited orte_init and is running */
|
||||||
#define ORTE_PROC_STATE_AT_STG3 0x07 /* process is at Stage Gate 3 barrier in orte_finalize */
|
#define ORTE_PROC_STATE_AT_STG3 0x0020 /* process is at Stage Gate 3 barrier in orte_finalize */
|
||||||
#define ORTE_PROC_STATE_FINALIZED 0x08 /* process has completed orte_finalize and is running */
|
#define ORTE_PROC_STATE_FINALIZED 0x0040 /* process has completed orte_finalize and is running */
|
||||||
#define ORTE_PROC_STATE_TERMINATED 0x09 /* process has terminated and is no longer running */
|
#define ORTE_PROC_STATE_TERMINATED 0x0080 /* process has terminated and is no longer running */
|
||||||
#define ORTE_PROC_STATE_ABORTED 0x0A /* process aborted */
|
#define ORTE_PROC_STATE_ABORTED 0x0100 /* process aborted */
|
||||||
|
|
||||||
|
/** define some common shorthands for when we want to be alerted */
|
||||||
|
#define ORTE_PROC_STATE_ALL 0xffff /* alert on ALL triggers */
|
||||||
|
#define ORTE_PROC_STAGE_GATES_ONLY (ORTE_PROC_STATE_AT_STG1 | ORTE_PROC_STATE_AT_STG2 | ORTE_PROC_STATE_AT_STG3 | ORTE_PROC_STATE_FINALIZED) /* alert on the stage gate triggers only; parenthesized so the mask stays a single operand in expressions such as (state & mask) */
|
||||||
|
#define ORTE_PROC_STATE_TERMINATION (ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_ABORTED) /* alert on terminated or aborted; parenthesized so the mask stays a single operand in expressions such as (state & mask) */
|
||||||
|
#define ORTE_PROC_STATE_NONE 0x0000 /* don't alert on any triggers */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Job state codes
|
* Job state codes
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef int8_t orte_job_state_t;
|
typedef uint16_t orte_job_state_t;
|
||||||
|
#define ORTE_JOB_STATE_T ORTE_UINT16
|
||||||
|
|
||||||
#define ORTE_JOB_STATE_INIT 0x01 /* job entry has been created by rmaps */
|
#define ORTE_JOB_STATE_INIT 0x0001 /* job entry has been created by rmaps */
|
||||||
#define ORTE_JOB_STATE_LAUNCHED 0x02 /* job has been launched by pls */
|
#define ORTE_JOB_STATE_LAUNCHED 0x0002 /* job has been launched by pls */
|
||||||
#define ORTE_JOB_STATE_AT_STG1 0x03 /* all processes are at Stage Gate 1 barrier in orte_init */
|
#define ORTE_JOB_STATE_AT_STG1 0x0004 /* all processes are at Stage Gate 1 barrier in orte_init */
|
||||||
#define ORTE_JOB_STATE_AT_STG2 0x04 /* all processes are at Stage Gate 2 barrier in orte_init */
|
#define ORTE_JOB_STATE_AT_STG2 0x0008 /* all processes are at Stage Gate 2 barrier in orte_init */
|
||||||
#define ORTE_JOB_STATE_RUNNING 0x06 /* all processes have exited orte_init and is running */
|
#define ORTE_JOB_STATE_RUNNING 0x0010 /* all processes have exited orte_init and is running */
|
||||||
#define ORTE_JOB_STATE_AT_STG3 0x07 /* all processes are at Stage Gate 3 barrier in orte_finalize */
|
#define ORTE_JOB_STATE_AT_STG3 0x0020 /* all processes are at Stage Gate 3 barrier in orte_finalize */
|
||||||
#define ORTE_JOB_STATE_FINALIZED 0x08 /* all processes have completed orte_finalize and is running */
|
#define ORTE_JOB_STATE_FINALIZED 0x0040 /* all processes have completed orte_finalize and is running */
|
||||||
#define ORTE_JOB_STATE_TERMINATED 0x09 /* all processes have terminated and is no longer running */
|
#define ORTE_JOB_STATE_TERMINATED 0x0080 /* all processes have terminated and is no longer running */
|
||||||
#define ORTE_JOB_STATE_ABORTED 0x0A /* at least one process aborted, causing job to abort */
|
#define ORTE_JOB_STATE_ABORTED 0x0100 /* at least one process aborted, causing job to abort */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
* Node State, corresponding to the ORTE_NODE_STATE_* #defines,
|
||||||
|
@ -352,7 +352,7 @@ int main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Setup callback on jobid */
|
/* Setup callback on jobid */
|
||||||
ret = orte_rmgr_base_proc_stage_gate_subscribe(orted_globals.bootproxy, job_state_callback, NULL, ORTE_STAGE_GATE_TERMINATION);
|
ret = orte_rmgr_base_proc_stage_gate_subscribe(orted_globals.bootproxy, job_state_callback, NULL, ORTE_PROC_STATE_TERMINATION);
|
||||||
if(ORTE_SUCCESS != ret) {
|
if(ORTE_SUCCESS != ret) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -253,6 +253,7 @@ int orterun(int argc, char *argv[])
|
|||||||
orte_app_context_t **apps;
|
orte_app_context_t **apps;
|
||||||
int rc, i, num_apps, array_size, j;
|
int rc, i, num_apps, array_size, j;
|
||||||
int id, iparam;
|
int id, iparam;
|
||||||
|
orte_proc_state_t cb_states;
|
||||||
|
|
||||||
/* Setup MCA params */
|
/* Setup MCA params */
|
||||||
|
|
||||||
@ -392,7 +393,8 @@ int orterun(int argc, char *argv[])
|
|||||||
|
|
||||||
/* Spawn the job */
|
/* Spawn the job */
|
||||||
|
|
||||||
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
|
cb_states = ORTE_PROC_STATE_ABORTED | ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_AT_STG1;
|
||||||
|
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback, cb_states);
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
/* JMS show_help */
|
/* JMS show_help */
|
||||||
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
|
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
|
||||||
@ -620,6 +622,10 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
|||||||
case ORTE_PROC_STATE_AT_STG1:
|
case ORTE_PROC_STATE_AT_STG1:
|
||||||
orte_totalview_init_after_spawn(jobid);
|
orte_totalview_init_after_spawn(jobid);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
opal_output(0, "orterun: job state callback in unexpected state - jobid %lu, state 0x%04x\n", jobid, state);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user