Merge pull request #3659 from rhc54/topic/threads
Update OPAL and ORTE for thread safety
Этот коммит содержится в:
Коммит
21fba8b7f3
1
.gitignore
поставляемый
1
.gitignore
поставляемый
@ -475,6 +475,7 @@ orte/test/system/opal_db
|
||||
orte/test/system/ulfm
|
||||
orte/test/system/pmixtool
|
||||
orte/test/system/orte_notify
|
||||
orte/test/system/threads
|
||||
|
||||
orte/tools/orte-checkpoint/orte-checkpoint
|
||||
orte/tools/orte-checkpoint/orte-checkpoint.1
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/threads/threads.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -164,6 +165,7 @@ static void return_local_event_hdlr(int status, opal_list_t *results,
|
||||
pmix_status_t pstatus;
|
||||
size_t n;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
if (NULL != cd->pmixcbfunc) {
|
||||
op = OBJ_NEW(pmix2x_opcaddy_t);
|
||||
|
||||
@ -203,6 +205,8 @@ static void _event_hdlr(int sd, short args, void *cbdata)
|
||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||
opal_pmix2x_event_t *event;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s _EVENT_HDLR RECEIVED NOTIFICATION FOR HANDLER %d OF STATUS %d",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (int)cd->id, cd->status);
|
||||
@ -312,6 +316,7 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id,
|
||||
/* now push it into the local thread */
|
||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||
-1, EV_WRITE, _event_hdlr, cd);
|
||||
OPAL_POST_OBJECT(cd);
|
||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||
}
|
||||
|
||||
@ -986,6 +991,7 @@ static void errreg_cbfunc (pmix_status_t status,
|
||||
{
|
||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
op->event->index = errhandler_ref;
|
||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||
"PMIX2x errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||
@ -1003,6 +1009,7 @@ static void _reg_hdlr(int sd, short args, void *cbdata)
|
||||
opal_value_t *kv;
|
||||
size_t n;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s REGISTER HANDLER CODES %s",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||
@ -1067,6 +1074,7 @@ static void _dereg_hdlr(int sd, short args, void *cbdata)
|
||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||
opal_pmix2x_event_t *event;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
/* look for this event */
|
||||
OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) {
|
||||
if (cd->handler == event->index) {
|
||||
@ -1116,6 +1124,8 @@ static void _notify(int sd, short args, void *cbdata)
|
||||
pmix_data_range_t prange;
|
||||
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
|
||||
op = OBJ_NEW(pmix2x_opcaddy_t);
|
||||
|
||||
/* convert the status */
|
||||
@ -1204,6 +1214,8 @@ static void infocbfunc(pmix_status_t status,
|
||||
opal_value_t *iptr;
|
||||
size_t n;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* convert the array of pmix_info_t to the list of info */
|
||||
if (NULL != info) {
|
||||
results = OBJ_NEW(opal_list_t);
|
||||
@ -1294,6 +1306,8 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
|
||||
if (NULL != op->opcbfunc) {
|
||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||
}
|
||||
|
@ -156,6 +156,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
||||
_cd->cbdata = (cd); \
|
||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||
-1, EV_WRITE, (fn), (_cd)); \
|
||||
OPAL_POST_OBJECT(_cd); \
|
||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||
} while(0)
|
||||
|
||||
@ -170,6 +171,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
||||
_cd->cbdata = (cd); \
|
||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||
-1, EV_WRITE, (fn), (_cd)); \
|
||||
OPAL_POST_OBJECT(_cd); \
|
||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||
} while(0)
|
||||
|
||||
@ -185,6 +187,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
||||
_cd->cbdata = (cd); \
|
||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||
-1, EV_WRITE, (fn), (_cd)); \
|
||||
OPAL_POST_OBJECT(_cd); \
|
||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||
} while(0)
|
||||
|
||||
|
@ -27,6 +27,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/hash_string.h"
|
||||
#include "opal/threads/threads.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
@ -44,6 +45,7 @@ static bool initialized = false;
|
||||
while ((a)) { \
|
||||
usleep(10); \
|
||||
} \
|
||||
OPAL_ACQUIRE_OBJECT(a); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@ -53,11 +55,14 @@ static void errreg_cbfunc (pmix_status_t status,
|
||||
{
|
||||
opal_pmix2x_event_t *event = (opal_pmix2x_event_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(event);
|
||||
|
||||
event->index = errhandler_ref;
|
||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||
"PMIX client errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||
status, (unsigned long)errhandler_ref);
|
||||
regactive = false;
|
||||
OPAL_POST_OBJECT(regactive);
|
||||
}
|
||||
|
||||
int pmix2x_client_init(opal_list_t *ilist)
|
||||
@ -272,6 +277,7 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
if (NULL != op->opcbfunc) {
|
||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||
}
|
||||
@ -521,6 +527,8 @@ static void val_cbfunc(pmix_status_t status,
|
||||
int rc;
|
||||
opal_value_t val, *v=NULL;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
|
||||
rc = pmix2x_convert_opalrc(status);
|
||||
if (PMIX_SUCCESS == status && NULL != kv) {
|
||||
rc = pmix2x_value_unload(&val, kv);
|
||||
@ -768,6 +776,8 @@ static void lk_cbfunc(pmix_status_t status,
|
||||
size_t n;
|
||||
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
|
||||
/* this is in the PMIx local thread - need to threadshift to
|
||||
* our own thread as we will be accessing framework-global
|
||||
* lists and objects */
|
||||
@ -817,7 +827,7 @@ static void lk_cbfunc(pmix_status_t status,
|
||||
}
|
||||
r = &results;
|
||||
}
|
||||
release:
|
||||
release:
|
||||
/* execute the callback */
|
||||
op->lkcbfunc(rc, r, op->cbdata);
|
||||
|
||||
@ -994,6 +1004,8 @@ static void spcbfunc(pmix_status_t status,
|
||||
opal_jobid_t jobid=OPAL_JOBID_INVALID;
|
||||
opal_pmix2x_jobid_trkr_t *job;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
|
||||
/* this is in the PMIx local thread - need to threadshift to
|
||||
* our own thread as we will be accessing framework-global
|
||||
* lists and objects */
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/threads/threads.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -142,6 +143,7 @@ static void opal_opcbfunc(int status, void *cbdata)
|
||||
{
|
||||
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(opalcaddy);
|
||||
if (NULL != opalcaddy->opcbfunc) {
|
||||
opalcaddy->opcbfunc(pmix2x_convert_opalrc(status), opalcaddy->cbdata);
|
||||
}
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/threads/threads.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
@ -58,6 +59,7 @@ static size_t errhdler_ref = 0;
|
||||
while ((a)) { \
|
||||
usleep(10); \
|
||||
} \
|
||||
OPAL_ACQUIRE_OBJECT(a); \
|
||||
} while (0)
|
||||
|
||||
static void errreg_cbfunc (pmix_status_t status,
|
||||
@ -66,10 +68,12 @@ static void errreg_cbfunc (pmix_status_t status,
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(active);
|
||||
errhdler_ref = errhandler_ref;
|
||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||
"PMIX server errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||
status, (unsigned long)errhandler_ref);
|
||||
OPAL_POST_OBJECT(active);
|
||||
*active = false;
|
||||
}
|
||||
|
||||
@ -77,11 +81,14 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(op);
|
||||
|
||||
if (NULL != op->opcbfunc) {
|
||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||
}
|
||||
if (op->active) {
|
||||
op->status = status;
|
||||
OPAL_POST_OBJECT(op);
|
||||
op->active = false;
|
||||
} else {
|
||||
OBJ_RELEASE(op);
|
||||
@ -92,6 +99,7 @@ static void op2cbfunc(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
|
||||
OPAL_POST_OBJECT(active);
|
||||
*active = false;
|
||||
}
|
||||
|
||||
@ -165,6 +173,7 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
|
||||
/*
 * Completion callback that clears a caller-supplied flag.
 *
 * status: not used by this function.
 * cbdata: treated as a pointer to a volatile bool, which is set to false.
 *
 * NOTE(review): presumably another thread polls this flag while waiting
 * for the operation to finish - confirm against the caller.
 */
static void fincb(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
/* issue the post barrier before the flag store that follows */
OPAL_POST_OBJECT(active);
|
||||
*active = false;
|
||||
}
|
||||
|
||||
@ -211,6 +220,8 @@ static void _reg_nspace(int sd, short args, void *cbdata)
|
||||
opal_pmix2x_jobid_trkr_t *job;
|
||||
pmix2x_opcaddy_t op;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* we must threadshift this request as we might not be in an event
|
||||
* and we are going to access framework-global lists/objects */
|
||||
|
||||
@ -301,6 +312,7 @@ int pmix2x_server_register_nspace(opal_jobid_t jobid,
|
||||
} else {
|
||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||
-1, EV_WRITE, _reg_nspace, cd);
|
||||
OPAL_POST_OBJECT(cd);
|
||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||
}
|
||||
|
||||
@ -311,10 +323,12 @@ static void tdcbfunc(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
if (NULL != cd->opcbfunc) {
|
||||
cd->opcbfunc(pmix2x_convert_rc(status), cd->cbdata);
|
||||
}
|
||||
if (cd->active) {
|
||||
OPAL_POST_OBJECT(cd);
|
||||
cd->active = false;
|
||||
} else {
|
||||
OBJ_RELEASE(cd);
|
||||
@ -326,6 +340,7 @@ static void _dereg_nspace(int sd, short args, void *cbdata)
|
||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||
opal_pmix2x_jobid_trkr_t *jptr;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
/* if we don't already have it, we can ignore this */
|
||||
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
||||
if (jptr->jobid == cd->jobid) {
|
||||
@ -361,6 +376,7 @@ void pmix2x_server_deregister_nspace(opal_jobid_t jobid,
|
||||
} else {
|
||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||
-1, EV_WRITE, _dereg_nspace, cd);
|
||||
OPAL_POST_OBJECT(cd);
|
||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||
}
|
||||
}
|
||||
@ -397,6 +413,7 @@ static void _dereg_client(int sd, short args, void *cbdata)
|
||||
opal_pmix2x_jobid_trkr_t *jptr;
|
||||
pmix_proc_t p;
|
||||
|
||||
OPAL_ACQUIRE_OBJECT(cd);
|
||||
/* if we don't already have it, we can ignore this */
|
||||
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
||||
if (jptr->jobid == cd->source->jobid) {
|
||||
@ -431,6 +448,7 @@ void pmix2x_server_deregister_client(const opal_process_name_t *proc,
|
||||
} else {
|
||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||
-1, EV_WRITE, _dereg_client, cd);
|
||||
OPAL_POST_OBJECT(cd);
|
||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||
}
|
||||
}
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -114,6 +115,19 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_thread_t);
|
||||
opal_condition_broadcast((cnd)); \
|
||||
} while(0);
|
||||
|
||||
/* provide a macro for forward-proofing the shifting
|
||||
* of objects between libevent threads - at some point, we
|
||||
* may revamp that threading model */
|
||||
|
||||
/* Post an object to another thread. For now this expands to a bare
 * opal_atomic_wmb() call; the argument (o) is accepted but not used
 * by the expansion. */
|
||||
#define OPAL_POST_OBJECT(o) opal_atomic_wmb()
|
||||
|
||||
/* Acquire an object handed over from another thread. For now this
 * expands to a bare opal_atomic_rmb() call; the argument (o) is
 * accepted but not used by the expansion. */
|
||||
#define OPAL_ACQUIRE_OBJECT(o) opal_atomic_rmb()
|
||||
|
||||
|
||||
|
||||
OPAL_DECLSPEC int opal_thread_start(opal_thread_t *);
|
||||
OPAL_DECLSPEC int opal_thread_join(opal_thread_t *, void **thread_return);
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -33,6 +33,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -507,6 +508,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
||||
opal_list_t lt;
|
||||
opal_namelist_t *nm;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* get the scheme to determine if we can process locally or not */
|
||||
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
@ -661,7 +664,7 @@ static void dfs_open(char *uri,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_close(int fd, short args, void *cbdata)
|
||||
@ -672,6 +675,8 @@ static void process_close(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s closing fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -757,7 +762,7 @@ static void dfs_close(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_sizes(int fd, short args, void *cbdata)
|
||||
@ -769,6 +774,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
struct stat buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing get_size on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -881,7 +888,7 @@ static void dfs_get_file_size(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
|
||||
@ -895,6 +902,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
struct stat buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing seek on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1035,7 +1044,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_reads(int fd, short args, void *cbdata)
|
||||
@ -1048,6 +1057,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
||||
int64_t i64;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||
|
||||
/* look in our local records for this fd */
|
||||
trk = NULL;
|
||||
for (item = opal_list_get_first(&active_files);
|
||||
@ -1145,7 +1156,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_posts(int fd, short args, void *cbdata)
|
||||
@ -1154,6 +1165,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* we will get confirmation in our receive function, so
|
||||
* add this request to our list */
|
||||
dfs->id = req_id++;
|
||||
@ -1212,7 +1225,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_getfm(int fd, short args, void *cbdata)
|
||||
@ -1221,6 +1234,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* we will get confirmation in our receive function, so
|
||||
* add this request to our list */
|
||||
dfs->id = req_id++;
|
||||
@ -1275,7 +1290,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void dfs_load_file_maps(orte_jobid_t jobid,
|
||||
@ -1298,4 +1313,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
||||
cbfunc(cbdata);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -76,14 +77,6 @@ typedef struct {
|
||||
} orte_dfs_request_t;
|
||||
OBJ_CLASS_DECLARATION(orte_dfs_request_t);
|
||||
|
||||
/*
 * Queue a DFS request for processing on orte_event_base.
 *
 * d  - pointer to a request object that has an `ev` event member
 * cb - callback handed to opal_event_set(); (d) is passed as its argument
 *
 * The event is set up with fd -1 and OPAL_EV_WRITE, given priority
 * ORTE_SYS_PRI, and then activated immediately.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be
 * used safely in an unbraced if/else.
 */
#define ORTE_DFS_POST_REQUEST(d, cb)                            \
    do {                                                        \
        opal_event_set(orte_event_base, &((d)->ev),             \
                       -1, OPAL_EV_WRITE, (cb), (d));           \
        opal_event_set_priority(&((d)->ev), ORTE_SYS_PRI);      \
        opal_event_active(&((d)->ev), OPAL_EV_WRITE, 1);        \
    } while(0)
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -35,6 +35,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -304,6 +305,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
||||
int v;
|
||||
orte_node_t *node, *nptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* get the scheme to determine if we can process locally or not */
|
||||
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
||||
OBJ_RELEASE(dfs);
|
||||
@ -465,7 +468,7 @@ static void dfs_open(char *uri,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_close(int fd, short args, void *cbdata)
|
||||
@ -476,6 +479,8 @@ static void process_close(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s closing fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -561,7 +566,7 @@ static void dfs_close(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_sizes(int fd, short args, void *cbdata)
|
||||
@ -573,6 +578,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
struct stat buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing get_size on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -665,7 +672,7 @@ static void dfs_get_file_size(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
|
||||
@ -679,6 +686,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
struct stat buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing seek on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -814,7 +823,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_reads(int fd, short args, void *cbdata)
|
||||
@ -827,6 +836,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
||||
int64_t i64;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||
|
||||
/* look in our local records for this fd */
|
||||
trk = NULL;
|
||||
for (item = opal_list_get_first(&active_files);
|
||||
@ -924,7 +935,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_posts(int fd, short args, void *cbdata)
|
||||
@ -935,6 +946,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s posting file map containing %d bytes for target %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1009,7 +1022,7 @@ static void dfs_post_file_map(opal_buffer_t *buffer,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static int get_job_maps(orte_dfs_jobfm_t *jfm,
|
||||
@ -1057,6 +1070,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
||||
int32_t n, ntotal;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* if the target job is WILDCARD, then process
|
||||
* data for all jobids - else, find the one
|
||||
*/
|
||||
@ -1120,7 +1135,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_load(int fd, short args, void *cbdata)
|
||||
@ -1135,6 +1150,8 @@ static void process_load(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
opal_buffer_t *xfer;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* see if we already have a tracker for this job */
|
||||
jfm = NULL;
|
||||
for (item = opal_list_get_first(&file_maps);
|
||||
@ -1233,7 +1250,7 @@ static void dfs_load_file_maps(orte_jobid_t jobid,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_load);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_load, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_purge(int fd, short args, void *cbdata)
|
||||
@ -1242,6 +1259,8 @@ static void process_purge(int fd, short args, void *cbdata)
|
||||
opal_list_item_t *item;
|
||||
orte_dfs_jobfm_t *jfm, *jptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* find the job tracker */
|
||||
jfm = NULL;
|
||||
for (item = opal_list_get_first(&file_maps);
|
||||
@ -1288,7 +1307,7 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_purge);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_purge, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
|
||||
@ -2368,4 +2387,3 @@ static void remote_read(int fd, short args, void *cbdata)
|
||||
}
|
||||
OBJ_RELEASE(req);
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -32,6 +32,7 @@
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -449,6 +450,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
||||
opal_list_t lt;
|
||||
opal_namelist_t *nm;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s PROCESSING OPEN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
@ -583,7 +586,7 @@ static void dfs_open(char *uri,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_close(int fd, short args, void *cbdata)
|
||||
@ -594,6 +597,8 @@ static void process_close(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s closing fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -673,7 +678,7 @@ static void dfs_close(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_sizes(int fd, short args, void *cbdata)
|
||||
@ -684,6 +689,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing get_size on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -775,7 +782,7 @@ static void dfs_get_file_size(int fd,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
|
||||
@ -788,6 +795,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
||||
int64_t i64;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||
|
||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||
"%s processing seek on fd %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -885,7 +894,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_reads(int fd, short args, void *cbdata)
|
||||
@ -897,6 +906,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
||||
int64_t i64;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||
|
||||
/* look in our local records for this fd */
|
||||
trk = NULL;
|
||||
for (item = opal_list_get_first(&active_files);
|
||||
@ -979,7 +990,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_posts(int fd, short args, void *cbdata)
|
||||
@ -988,6 +999,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* we will get confirmation in our receive function, so
|
||||
* add this request to our list */
|
||||
dfs->id = req_id++;
|
||||
@ -1046,7 +1059,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void process_getfm(int fd, short args, void *cbdata)
|
||||
@ -1055,6 +1068,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *buffer;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(dfs);
|
||||
|
||||
/* we will get confirmation in our receive function, so
|
||||
* add this request to our list */
|
||||
dfs->id = req_id++;
|
||||
@ -1109,7 +1124,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
||||
dfs->cbdata = cbdata;
|
||||
|
||||
/* post it for processing */
|
||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
||||
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
static void dfs_load_file_maps(orte_jobid_t jobid,
|
||||
@ -1132,4 +1147,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
||||
cbfunc(cbdata);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -26,5 +27,4 @@ headers += \
|
||||
libmca_errmgr_la_SOURCES += \
|
||||
base/errmgr_base_select.c \
|
||||
base/errmgr_base_frame.c \
|
||||
base/errmgr_base_fns.c \
|
||||
base/errmgr_base_tool.c
|
||||
base/errmgr_base_fns.c
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -82,99 +82,6 @@
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
/*
|
||||
* Object stuff
|
||||
*/
|
||||
/*
 * orte_errmgr_predicted_proc_t class: a list item (parent class
 * opal_list_item_t) that carries a single process name.
 */
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item);
void orte_errmgr_predicted_proc_destruct(orte_errmgr_predicted_proc_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_proc_t,
                   opal_list_item_t,
                   orte_errmgr_predicted_proc_construct,
                   orte_errmgr_predicted_proc_destruct);

/* Constructor: the process name starts out invalid (unset). */
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
}

/* Destructor: no resources are owned; just reset the name to invalid. */
void orte_errmgr_predicted_proc_destruct(orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
}
|
||||
|
||||
/*
 * orte_errmgr_predicted_node_t class: a list item (parent class
 * opal_list_item_t) that carries a heap-allocated node name.
 */
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item);
void orte_errmgr_predicted_node_destruct(orte_errmgr_predicted_node_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_node_t,
                   opal_list_item_t,
                   orte_errmgr_predicted_node_construct,
                   orte_errmgr_predicted_node_destruct);

/* Constructor: no node name yet. */
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item)
{
    item->node_name = NULL;
}

/* Destructor: the object owns node_name, so release it here. */
void orte_errmgr_predicted_node_destruct(orte_errmgr_predicted_node_t *item)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(item->node_name);
    item->node_name = NULL;
}
|
||||
|
||||
/*
 * orte_errmgr_predicted_map_t class: a list item (parent class
 * opal_list_item_t) holding two process names, three heap-allocated
 * node-name strings and the off_current_node flag.
 */
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item);
void orte_errmgr_predicted_map_destruct(orte_errmgr_predicted_map_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
                   opal_list_item_t,
                   orte_errmgr_predicted_map_construct,
                   orte_errmgr_predicted_map_destruct);

/* Constructor: invalid names, no strings, flag cleared. */
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
    /* process identities */
    item->proc_name.jobid     = ORTE_JOBID_INVALID;
    item->proc_name.vpid      = ORTE_VPID_INVALID;
    item->map_proc_name.jobid = ORTE_JOBID_INVALID;
    item->map_proc_name.vpid  = ORTE_VPID_INVALID;

    /* owned strings */
    item->node_name          = NULL;
    item->map_node_name      = NULL;
    item->pre_map_fixed_node = NULL;

    item->off_current_node = false;
}

/* Destructor: release the owned strings and return to the constructed state. */
void orte_errmgr_predicted_map_destruct(orte_errmgr_predicted_map_t *item)
{
    /* free(NULL) is a no-op, so the strings are released unconditionally */
    free(item->node_name);
    item->node_name = NULL;

    free(item->map_node_name);
    item->map_node_name = NULL;

    free(item->pre_map_fixed_node);
    item->pre_map_fixed_node = NULL;

    item->proc_name.jobid     = ORTE_JOBID_INVALID;
    item->proc_name.vpid      = ORTE_VPID_INVALID;
    item->map_proc_name.jobid = ORTE_JOBID_INVALID;
    item->map_proc_name.vpid  = ORTE_VPID_INVALID;

    item->off_current_node = false;
}
|
||||
|
||||
/*
|
||||
* Public interfaces
|
||||
*/
|
||||
@ -231,12 +138,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
/* No way to reach here */
|
||||
}
|
||||
|
||||
/*
 * Base implementation of register_migration_warning: a deliberate no-op.
 * The argument is neither read nor modified.
 */
void orte_errmgr_base_register_migration_warning(struct timeval *tv)
{
    (void)tv;   /* stub function - ignore */
}
|
||||
|
||||
int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code)
|
||||
@ -244,195 +145,6 @@ int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
|
||||
orte_errmgr_error_order_t order)
|
||||
{
|
||||
orte_errmgr_cback_t *cb, *cbcur;
|
||||
|
||||
/* check the order to see what to do */
|
||||
switch(order) {
|
||||
case ORTE_ERRMGR_CALLBACK_FIRST:
|
||||
/* only one can be so designated */
|
||||
if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks))) {
|
||||
if (ORTE_ERRMGR_CALLBACK_FIRST == cb->order) {
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
}
|
||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
||||
cb->order = order;
|
||||
cb->callback =cbfunc;
|
||||
opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
|
||||
break;
|
||||
case ORTE_ERRMGR_CALLBACK_LAST:
|
||||
/* only one can be so designated */
|
||||
if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks))) {
|
||||
if (ORTE_ERRMGR_CALLBACK_LAST == cb->order) {
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
}
|
||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
||||
cb->order = order;
|
||||
cb->callback = cbfunc;
|
||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
||||
break;
|
||||
case ORTE_ERRMGR_CALLBACK_PREPEND:
|
||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
||||
cb->order = order;
|
||||
cb->callback =cbfunc;
|
||||
if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks)) &&
|
||||
ORTE_ERRMGR_CALLBACK_FIRST == cbcur->order) {
|
||||
opal_list_insert(&orte_errmgr_base.error_cbacks, &cb->super, 1);
|
||||
} else {
|
||||
opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
|
||||
}
|
||||
break;
|
||||
case ORTE_ERRMGR_CALLBACK_APPEND:
|
||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
||||
cb->order = order;
|
||||
cb->callback =cbfunc;
|
||||
if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks)) &&
|
||||
ORTE_ERRMGR_CALLBACK_LAST == cbcur->order) {
|
||||
opal_list_insert_pos(&orte_errmgr_base.error_cbacks, &cbcur->super, &cb->super);
|
||||
} else {
|
||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
||||
}
|
||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
||||
break;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Dispatch a set of errors to the registered error callbacks.
 *
 * If no callback is registered, abort through orte_errmgr.abort using the
 * error code of the first entry of @errors (or
 * ORTE_ERROR_DEFAULT_EXIT_CODE when there is none). Otherwise call each
 * callback in list order until one returns ORTE_SUCCESS.
 *
 * @param errors  array of orte_error_t pointers; may be NULL
 */
void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors)
{
    orte_errmgr_cback_t *cb;
    orte_error_t *err;
    char *errstring = NULL;
    int errcode = ORTE_ERROR_DEFAULT_EXIT_CODE;

    /* if no callbacks have been provided, then we abort */
    if (0 == opal_list_get_size(&orte_errmgr_base.error_cbacks)) {
        /* take the first entry, if available */
        if (NULL != errors &&
            NULL != (err = (orte_error_t*)opal_pointer_array_get_item(errors, 0))) {
            errstring = (char*)ORTE_ERROR_NAME(err->errcode);
            errcode = err->errcode;
        }
        if (NULL == errstring) {
            /* if the error is silent, say nothing */
            orte_errmgr.abort(errcode, NULL);
        } else {
            orte_errmgr.abort(errcode, "Executing default error callback: %s", errstring);
        }
        /* An abort implementation may return to its caller; the two calls
         * are therefore mutually exclusive so a NULL string is never
         * handed to the "%s" conversion. Nothing is registered, so there
         * is nothing further to do. */
        return;
    }

    /* cycle across the provided callbacks until we complete the list
     * or one reports that no further action is required
     */
    OPAL_LIST_FOREACH(cb, &orte_errmgr_base.error_cbacks, orte_errmgr_cback_t) {
        if (ORTE_SUCCESS == cb->callback(errors)) {
            break;
        }
    }
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
|
||||
void orte_errmgr_base_migrate_state_notify(int state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
|
||||
opal_output(0, "%d: Migration failed for process %s.",
|
||||
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
|
||||
opal_output(0, "%d: Migration successful for process %s.",
|
||||
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
||||
break;
|
||||
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_NONE:
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_REQUEST:
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_RUNNING:
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT:
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_STARTUP:
|
||||
case ORTE_ERRMGR_MIGRATE_MAX:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Report a process state change through opal_output().
 *
 * @param state  the new state of the process
 * @param proc   name of the affected process; nothing is reported when NULL
 *
 * Exactly one message is printed for each reportable state. States that
 * are not listed (including ORTE_PROC_STATE_MIGRATING) are ignored.
 */
void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc)
{
    if (NULL == proc) {
        return;
    }

    switch (state) {
    case ORTE_PROC_STATE_ABORTED:
    case ORTE_PROC_STATE_ABORTED_BY_SIG:
    case ORTE_PROC_STATE_TERM_WO_SYNC:
    case ORTE_PROC_STATE_TERMINATED:
    case ORTE_PROC_STATE_KILLED_BY_CMD:
    case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
        opal_output(0, "%d: Process %s is dead.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        opal_output(0, "%d: Process %s is unreachable.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        /* this break was missing, so a heartbeat failure also printed the
         * "Failed to communicate" message below */
        break;

    case ORTE_PROC_STATE_COMM_FAILED:
        opal_output(0, "%d: Failed to communicate with process %s.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
    case ORTE_PROC_STATE_FAILED_TO_START:
        /* NOTE(review): FAILED_TO_START shares the "called abort" text -
         * confirm that this grouping is intended */
        opal_output(0, "%d: Process %s has called abort.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_MIGRATING:
    default:
        break;
    }
}
|
||||
|
||||
int orte_errmgr_base_migrate_state_str(char ** state_str, int state)
|
||||
{
|
||||
switch(state) {
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_NONE:
|
||||
*state_str = strdup(" -- ");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_REQUEST:
|
||||
*state_str = strdup("Requested");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_RUNNING:
|
||||
*state_str = strdup("Running");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT:
|
||||
*state_str = strdup("Checkpointing");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_STARTUP:
|
||||
*state_str = strdup("Restarting");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
|
||||
*state_str = strdup("Finished");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
|
||||
*state_str = strdup("Error");
|
||||
break;
|
||||
case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
|
||||
*state_str = strdup("Error: Migration in progress");
|
||||
break;
|
||||
default:
|
||||
asprintf(state_str, "Unknown %d", state);
|
||||
break;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -53,17 +53,11 @@ orte_errmgr_base_t orte_errmgr_base = {{{0}}};
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
||||
NULL, /* init */
|
||||
NULL, /* finalize */
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
NULL, /* predicted_fault */
|
||||
NULL, /* suggest_map_targets */
|
||||
NULL, /* ft_event */
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
orte_errmgr_base_register_error_callback,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init = NULL, /* init */
|
||||
.finalize = NULL, /* finalize */
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orte_errmgr_base_abort,
|
||||
.abort_peers = orte_errmgr_base_abort_peers
|
||||
};
|
||||
/* NOTE: ABSOLUTELY MUST initialize this
|
||||
* struct to include the log function as it
|
||||
@ -71,16 +65,7 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
||||
* opened yet due to error
|
||||
*/
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_log,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
.logfn = orte_errmgr_base_log
|
||||
};
|
||||
|
||||
static int orte_errmgr_base_close(void)
|
||||
@ -118,7 +103,3 @@ static int orte_errmgr_base_open(mca_base_open_flag_t flags)
|
||||
MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
|
||||
orte_errmgr_base_open, orte_errmgr_base_close,
|
||||
mca_errmgr_base_static_components, 0);
|
||||
|
||||
OBJ_CLASS_INSTANCE(orte_errmgr_cback_t,
|
||||
opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
@ -1,441 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <string.h>
|
||||
#if HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#if HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif /* HAVE_SYS_TYPES_H */
|
||||
#if HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif /* HAVE_SYS_STAT_H */
|
||||
#ifdef HAVE_DIRENT_H
|
||||
#include <dirent.h>
|
||||
#endif /* HAVE_DIRENT_H */
|
||||
#include <time.h>
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
/**
|
||||
* This file contains functions for the HNP to communicate with the
|
||||
* orte-migrate command.
|
||||
*/
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
|
||||
/******************
|
||||
* Local Functions
|
||||
******************/
|
||||
static int errmgr_base_tool_start_cmdline_listener(void);
|
||||
static int errmgr_base_tool_stop_cmdline_listener(void);
|
||||
|
||||
static void errmgr_base_tool_cmdline_recv(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
/******************
|
||||
* Object stuff
|
||||
******************/
|
||||
/* Name of the tool whose request is currently being handled. It holds the
 * invalid name while idle: the listener start routine resets it, and the
 * receive callback stores the sender of an accepted request here. */
static orte_process_name_t errmgr_cmdline_sender = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
||||
/* True while a receive for ORTE_RML_TAG_MIGRATE is posted; cleared by the
 * receive callback and by the listener stop routine. */
static bool errmgr_cmdline_recv_issued = false;
|
||||
/* Used as a counter despite the `false` initializer: incremented by
 * orte_errmgr_base_tool_init() and decremented by
 * orte_errmgr_base_tool_finalize(). */
static int errmgr_tool_initialized = false;
|
||||
|
||||
/********************
|
||||
* Module Functions
|
||||
********************/
|
||||
int orte_errmgr_base_tool_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if( (++errmgr_tool_initialized) != 1 ) {
|
||||
if( errmgr_tool_initialized < 1 ) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Only HNP communicates with tools */
|
||||
if (! ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup command line migrate tool request listener
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_start_cmdline_listener()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_tool_finalize(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if( (--errmgr_tool_initialized) != 0 ) {
|
||||
if( errmgr_tool_initialized < 0 ) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Only HNP communicates with tools */
|
||||
if (! ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean up listeners
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_stop_cmdline_listener()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_base_migrate_update(int status)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_buffer_t *loc_buffer = NULL;
|
||||
orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD;
|
||||
|
||||
/* Only HNP communicates with tools */
|
||||
if (! ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is an invalid state, then return an error
|
||||
*/
|
||||
if( ORTE_ERRMGR_MIGRATE_MAX < status ) {
|
||||
opal_output(orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Error: Invalid state %d < (Max %d)",
|
||||
status, ORTE_ERRMGR_MIGRATE_MAX);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Report the status over the notifier interface
|
||||
*/
|
||||
orte_errmgr_base_migrate_state_notify(status);
|
||||
|
||||
/*
|
||||
* If the caller is indicating that they are finished and ready for another
|
||||
* command, then repost the RML listener.
|
||||
*/
|
||||
if( ORTE_ERRMGR_MIGRATE_STATE_NONE == status ) {
|
||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_start_cmdline_listener()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Noop if invalid peer, or peer not specified
|
||||
*/
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not send to self, as that is silly.
|
||||
*/
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, &errmgr_cmdline_sender) ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Warning: Do not send to self!\n"));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Sending update command <status %d>\n",
|
||||
status));
|
||||
|
||||
/********************
|
||||
* Send over the status of the checkpoint
|
||||
* - migration state
|
||||
********************/
|
||||
if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
|
||||
exit_status = ORTE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) {
|
||||
opal_output(orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Error: DSS Pack (cmd) Failure (ret = %d)\n",
|
||||
ret);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &status, 1, OPAL_INT))) {
|
||||
opal_output(orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Error: DSS Pack (status) Failure (ret = %d)\n",
|
||||
ret);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&errmgr_cmdline_sender,
|
||||
loc_buffer, ORTE_RML_TAG_MIGRATE,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
opal_output(orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:update() Error: Send (status) Failure (ret = %d)\n",
|
||||
ret);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
if(NULL != loc_buffer) {
|
||||
OBJ_RELEASE(loc_buffer);
|
||||
loc_buffer = NULL;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
|
||||
/********************
|
||||
* Local Functions
|
||||
********************/
|
||||
static int errmgr_base_tool_start_cmdline_listener(void)
|
||||
{
|
||||
if (errmgr_cmdline_recv_issued && ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool: Startup Command Line Channel"));
|
||||
|
||||
/*
|
||||
* Coordinator command listener
|
||||
*/
|
||||
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
|
||||
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE,
|
||||
0, errmgr_base_tool_cmdline_recv, NULL);
|
||||
|
||||
errmgr_cmdline_recv_issued = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int errmgr_base_tool_stop_cmdline_listener(void)
|
||||
{
|
||||
int exit_status = ORTE_SUCCESS;
|
||||
|
||||
if (!errmgr_cmdline_recv_issued && ORTE_PROC_IS_HNP) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool: Shutdown Command Line Channel"));
|
||||
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
|
||||
|
||||
errmgr_cmdline_recv_issued = false;
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Listener Callbacks
|
||||
*****************/
|
||||
static void errmgr_base_tool_cmdline_recv(int status,
|
||||
orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
int ret;
|
||||
orte_process_name_t swap_dest;
|
||||
orte_errmgr_tool_cmd_flag_t command;
|
||||
orte_std_cntr_t count = 1;
|
||||
char *off_nodes = NULL;
|
||||
char *off_procs = NULL;
|
||||
char *onto_nodes = NULL;
|
||||
char **split_off_nodes = NULL;
|
||||
char **split_off_procs = NULL;
|
||||
char **split_onto_nodes = NULL;
|
||||
opal_list_t *proc_list = NULL;
|
||||
opal_list_t *node_list = NULL;
|
||||
opal_list_t *suggested_map_list = NULL;
|
||||
orte_errmgr_predicted_proc_t *off_proc = NULL;
|
||||
orte_errmgr_predicted_node_t *off_node = NULL;
|
||||
orte_errmgr_predicted_map_t *onto_map = NULL;
|
||||
int cnt = 0, i;
|
||||
|
||||
|
||||
if( ORTE_RML_TAG_MIGRATE != tag ) {
|
||||
opal_output(orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:recv() Error: Unknown tag: Received a command message from %s (tag = %d).",
|
||||
ORTE_NAME_PRINT(sender), tag);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:recv() Command Line: Start a migration operation [Sender = %s]",
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
|
||||
errmgr_cmdline_recv_issued = false; /* Not a persistent RML message */
|
||||
|
||||
/*
|
||||
* If we are already interacting with a command line tool then reject this
|
||||
* request. Since we only allow the processing of one tool command at a
|
||||
* time.
|
||||
*/
|
||||
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
||||
swap_dest.jobid = errmgr_cmdline_sender.jobid;
|
||||
swap_dest.vpid = errmgr_cmdline_sender.vpid;
|
||||
|
||||
errmgr_cmdline_sender = *sender;
|
||||
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
|
||||
|
||||
errmgr_cmdline_sender.jobid = swap_dest.jobid;
|
||||
errmgr_cmdline_sender.vpid = swap_dest.vpid;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
errmgr_cmdline_sender = *sender;
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* orte-migrate has requested that a checkpoint be taken
|
||||
*/
|
||||
if (ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD == command) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:recv() Command line requested process migration [command %d]\n",
|
||||
command));
|
||||
|
||||
/*
|
||||
* Unpack the buffer from the orte-migrate command
|
||||
*/
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_procs), &count, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_nodes), &count, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(onto_nodes), &count, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse the comma separated list
|
||||
*/
|
||||
proc_list = OBJ_NEW(opal_list_t);
|
||||
node_list = OBJ_NEW(opal_list_t);
|
||||
suggested_map_list = OBJ_NEW(opal_list_t);
|
||||
|
||||
split_off_procs = opal_argv_split(off_procs, ',');
|
||||
cnt = opal_argv_count(split_off_procs);
|
||||
if( cnt > 0 ) {
|
||||
for(i = 0; i < cnt; ++i) {
|
||||
off_proc = OBJ_NEW(orte_errmgr_predicted_proc_t);
|
||||
off_proc->proc_name.vpid = atoi(split_off_procs[i]);
|
||||
opal_list_append(proc_list, &(off_proc->super));
|
||||
}
|
||||
}
|
||||
|
||||
split_off_nodes = opal_argv_split(off_nodes, ',');
|
||||
cnt = opal_argv_count(split_off_nodes);
|
||||
if( cnt > 0 ) {
|
||||
for(i = 0; i < cnt; ++i) {
|
||||
off_node = OBJ_NEW(orte_errmgr_predicted_node_t);
|
||||
off_node->node_name = strdup(split_off_nodes[i]);
|
||||
opal_list_append(node_list, &(off_node->super));
|
||||
}
|
||||
}
|
||||
|
||||
split_onto_nodes = opal_argv_split(onto_nodes, ',');
|
||||
cnt = opal_argv_count(split_onto_nodes);
|
||||
if( cnt > 0 ) {
|
||||
for(i = 0; i < cnt; ++i) {
|
||||
onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
|
||||
onto_map->map_node_name = strdup(split_onto_nodes[i]);
|
||||
opal_list_append(suggested_map_list, &(onto_map->super));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Pass to the predicted fault function to see how they would like to progress
|
||||
*/
|
||||
orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list);
|
||||
}
|
||||
/*
|
||||
* Unknown command
|
||||
*/
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
||||
"errmgr:base:tool:recv() Command line sent an unknown command (command %d)\n",
|
||||
command));
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,14 +54,6 @@ typedef struct {
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
/* define a struct to hold registered error callbacks */
|
||||
typedef struct {
|
||||
/* list linkage: instances are kept on orte_errmgr_base.error_cbacks */
opal_list_item_t super;
|
||||
/* placement requested when the callback was registered */
orte_errmgr_error_order_t order;
|
||||
/* function invoked with the array of errors being reported */
orte_errmgr_error_callback_fn_t *callback;
|
||||
} orte_errmgr_cback_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_cback_t);
|
||||
|
||||
/* declare the base default module */
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
|
||||
|
||||
@ -75,12 +68,5 @@ ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
|
||||
orte_errmgr_error_order_t order);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -56,17 +56,11 @@
|
||||
* HNP module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
abort_peers,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
orte_errmgr_base_register_error_callback,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orte_errmgr_base_abort,
|
||||
.abort_peers = abort_peers
|
||||
};
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
@ -77,6 +71,7 @@ static void register_cbfunc(int status, size_t errhndler, void *cbdata)
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
myerrhandle = errhndler;
|
||||
ORTE_POST_OBJECT(active);
|
||||
*active = false;
|
||||
}
|
||||
|
||||
@ -112,7 +107,7 @@ static void notify_cbfunc(int status,
|
||||
}
|
||||
|
||||
/* push it into our event base */
|
||||
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, state);
|
||||
ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
|
||||
}
|
||||
|
||||
/************************
|
||||
@ -154,8 +149,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
char *nodename;
|
||||
orte_error_t err;
|
||||
opal_pointer_array_t errors;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_app: proc %s state %s",
|
||||
@ -171,14 +166,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* pass the error to the error_callbacks for processing */
|
||||
OBJ_CONSTRUCT(&errors, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&errors, 1, INT_MAX, 1);
|
||||
err.errcode = caddy->proc_state;
|
||||
err.proc = caddy->name;
|
||||
opal_pointer_array_add(&errors, &err);
|
||||
|
||||
|
||||
if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
|
||||
/* we can't send a message - print a message */
|
||||
nodename = orte_get_proc_hostname(&caddy->name);
|
||||
@ -197,9 +184,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
|
||||
orte_errmgr_base_execute_error_callbacks(&errors);
|
||||
OBJ_DESTRUCT(&errors);
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
@ -66,32 +67,15 @@ static int init(void);
|
||||
static int finalize(void);
|
||||
static void hnp_abort(int error_code, char *fmt, ...);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
/******************
|
||||
* default_hnp module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
hnp_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = hnp_abort,
|
||||
.abort_peers = orte_errmgr_base_abort_peers
|
||||
};
|
||||
|
||||
|
||||
@ -129,6 +113,7 @@ static int finalize(void)
|
||||
static void wakeup(int sd, short args, void *cbdata)
|
||||
{
|
||||
/* nothing more we can do */
|
||||
ORTE_ACQUIRE_OBJECT(cbdata);
|
||||
orte_quit(0, 0, NULL);
|
||||
}
|
||||
|
||||
@ -187,6 +172,7 @@ static void hnp_abort(int error_code, char *fmt, ...)
|
||||
timer->tv.tv_usec = 0;
|
||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||
ORTE_POST_OBJECT(timer);
|
||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||
}
|
||||
|
||||
@ -202,6 +188,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
int32_t rc, ret;
|
||||
int room, *rmptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -363,6 +351,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
int32_t i32, *i32ptr;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_hnp: for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -497,7 +487,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
keep_going:
|
||||
/* if this is a continuously operating job, then there is nothing more
|
||||
* to do - we let the job continue to run */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
|
||||
@ -798,25 +788,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -60,32 +61,16 @@
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
static void orted_abort(int error_code, char *fmt, ...);
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
/******************
|
||||
* default_orted module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orted_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orted_abort,
|
||||
.abort_peers = orte_errmgr_base_abort_peers
|
||||
};
|
||||
|
||||
/* Local functions */
|
||||
@ -125,6 +110,7 @@ static int finalize(void)
|
||||
static void wakeup(int sd, short args, void *cbdata)
|
||||
{
|
||||
/* nothing more we can do */
|
||||
ORTE_ACQUIRE_OBJECT(cbdata);
|
||||
orte_quit(0, 0, NULL);
|
||||
}
|
||||
|
||||
@ -231,6 +217,7 @@ static void orted_abort(int error_code, char *fmt, ...)
|
||||
timer->tv.tv_usec = 0;
|
||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||
ORTE_POST_OBJECT(timer);
|
||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||
|
||||
}
|
||||
@ -244,6 +231,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
orte_plm_cmd_flag_t cmd;
|
||||
opal_buffer_t *alert;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -330,6 +319,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
int rc=ORTE_SUCCESS;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_orted:proc_errors process %s error state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -720,30 +711,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
@ -54,17 +55,11 @@ static int abort_peers(orte_process_name_t *procs,
|
||||
* default_tool module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
abort_peers,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
orte_errmgr_base_register_error_callback,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init= init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orte_errmgr_base_abort,
|
||||
.abort_peers = abort_peers
|
||||
};
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
@ -89,6 +84,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_tool: proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -106,6 +103,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* if we lost our lifeline, then just stop the event loop
|
||||
* so the main program can cleanly terminate */
|
||||
if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||
ORTE_POST_OBJECT(caddy);
|
||||
orte_event_base_active = false;
|
||||
} else {
|
||||
/* all other errors require abort */
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
@ -65,32 +66,15 @@
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
/******************
|
||||
* dvm module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.logfn = orte_errmgr_base_log,
|
||||
.abort = orte_errmgr_base_abort,
|
||||
.abort_peers = orte_errmgr_base_abort_peers
|
||||
};
|
||||
|
||||
|
||||
@ -146,6 +130,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
int32_t rc, ret;
|
||||
int room, *rmptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
@ -248,6 +234,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
int32_t i32, *i32ptr;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -386,7 +374,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
keep_going:
|
||||
/* ensure we record the failed proc properly so we can report
|
||||
* the error once we terminate
|
||||
*/
|
||||
@ -643,22 +631,3 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -63,70 +63,6 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Structure to describe a predicted process fault.
|
||||
*
|
||||
* This can be expanded in the future to support assurance levels, and
|
||||
* additional information that may wish to be conveyed.
|
||||
*/
|
||||
struct orte_errmgr_predicted_proc_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** Process Name */
|
||||
orte_process_name_t proc_name;
|
||||
};
|
||||
typedef struct orte_errmgr_predicted_proc_t orte_errmgr_predicted_proc_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_proc_t);
|
||||
|
||||
/*
|
||||
* Structure to describe a predicted node fault.
|
||||
*
|
||||
* This can be expanded in the future to support assurance levels, and
|
||||
* additional information that may wish to be conveyed.
|
||||
*/
|
||||
struct orte_errmgr_predicted_node_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** Node Name */
|
||||
char * node_name;
|
||||
};
|
||||
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
||||
|
||||
/*
|
||||
* Structure to describe a suggested remapping element for a predicted fault.
|
||||
*
|
||||
* This can be expanded in the future to support weights, and
|
||||
* additional information that may wish to be conveyed.
|
||||
*/
|
||||
struct orte_errmgr_predicted_map_t {
|
||||
/** This is an object, so must have a super */
|
||||
opal_list_item_t super;
|
||||
|
||||
/** Process Name (predicted to fail) */
|
||||
orte_process_name_t proc_name;
|
||||
|
||||
/** Node Name (predicted to fail) */
|
||||
char * node_name;
|
||||
|
||||
/** Process Name (Map to) */
|
||||
orte_process_name_t map_proc_name;
|
||||
|
||||
/** Node Name (Map to) */
|
||||
char * map_node_name;
|
||||
|
||||
/** Just off current node */
|
||||
bool off_current_node;
|
||||
|
||||
/** Pre-map fixed node assignment */
|
||||
char * pre_map_fixed_node;
|
||||
};
|
||||
typedef struct orte_errmgr_predicted_map_t orte_errmgr_predicted_map_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
||||
|
||||
|
||||
/*
|
||||
* Macro definitions
|
||||
*/
|
||||
@ -183,84 +119,6 @@ typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *pro
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code);
|
||||
|
||||
/**
|
||||
* Predicted process/node failure notification
|
||||
*
|
||||
* @param[in] proc_list List of processes (or NULL if none)
|
||||
* @param[in] node_list List of nodes (or NULL if none)
|
||||
* @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
/**
|
||||
* Suggest a node to map a restarting process onto
|
||||
*
|
||||
* @param[in] proc Process that is being mapped
|
||||
* @param[in] oldnode Previous node where this process resided
|
||||
* @param[in,out] node_list List of nodes to select from
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
/**
|
||||
* Handle fault tolerance updates
|
||||
*
|
||||
* @param[in] state Fault tolerance state update
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Function to perform actions that require the rest of the ORTE layer to be up
|
||||
* and running.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
||||
|
||||
typedef enum {
|
||||
ORTE_ERRMGR_CALLBACK_FIRST,
|
||||
ORTE_ERRMGR_CALLBACK_LAST,
|
||||
ORTE_ERRMGR_CALLBACK_PREPEND,
|
||||
ORTE_ERRMGR_CALLBACK_APPEND
|
||||
} orte_errmgr_error_order_t;
|
||||
|
||||
/**
|
||||
* Register a callback function for faults.
|
||||
*
|
||||
* This callback function will be used anytime (other than during finalize) the
|
||||
* runtime detects and handles a critical failure. The runtime will complete all
|
||||
* its stabilization before cycling thru all registered callbacks. The order of
|
||||
* the callbacks will proceed in the indicated order with which they were registered.
|
||||
*
|
||||
* The parameter to the callback function will be the orte_process_name_t
|
||||
* of the process involved in the error.
|
||||
*
|
||||
* @param[in] cbfunc The callback function.
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
orte_process_name_t proc;
|
||||
int errcode;
|
||||
} orte_error_t;
|
||||
|
||||
typedef int (orte_errmgr_error_callback_fn_t)(opal_pointer_array_t *errors);
|
||||
typedef int (*orte_errmgr_base_module_register_error_callback_fn_t)(orte_errmgr_error_callback_fn_t *cbfunc,
|
||||
orte_errmgr_error_order_t order);
|
||||
typedef void (*orte_errmgr_base_module_execute_error_callbacks_fn_t)(opal_pointer_array_t *errors);
|
||||
|
||||
/*
|
||||
* Module Structure
|
||||
*/
|
||||
@ -273,21 +131,6 @@ struct orte_errmgr_base_module_2_3_0_t {
|
||||
orte_errmgr_base_module_log_fn_t logfn;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
|
||||
/** Predicted process/node failure notification */
|
||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
||||
/** Suggest a node to map a restarting process onto */
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
|
||||
/* Register to be warned of impending migration */
|
||||
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
||||
|
||||
/* Register a callback function */
|
||||
orte_errmgr_base_module_register_error_callback_fn_t register_error_callback;
|
||||
orte_errmgr_base_module_execute_error_callbacks_fn_t execute_error_callbacks;
|
||||
};
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
|
@ -9,6 +9,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -32,18 +33,18 @@
|
||||
* Globals
|
||||
*/
|
||||
ORTE_DECLSPEC orte_filem_base_module_t orte_filem = {
|
||||
orte_filem_base_module_init,
|
||||
orte_filem_base_module_finalize,
|
||||
orte_filem_base_none_put,
|
||||
orte_filem_base_none_put_nb,
|
||||
orte_filem_base_none_get,
|
||||
orte_filem_base_none_get_nb,
|
||||
orte_filem_base_none_rm,
|
||||
orte_filem_base_none_rm_nb,
|
||||
orte_filem_base_none_wait,
|
||||
orte_filem_base_none_wait_all,
|
||||
orte_filem_base_none_preposition_files,
|
||||
orte_filem_base_none_link_local_files
|
||||
.filem_init = orte_filem_base_module_init,
|
||||
.filem_finalize = orte_filem_base_module_finalize,
|
||||
.put = orte_filem_base_none_put,
|
||||
.put_nb = orte_filem_base_none_put_nb,
|
||||
.get = orte_filem_base_none_get,
|
||||
.get_nb = orte_filem_base_none_get_nb,
|
||||
.rm = orte_filem_base_none_rm,
|
||||
.rm_nb = orte_filem_base_none_rm_nb,
|
||||
.wait = orte_filem_base_none_wait,
|
||||
.wait_all = orte_filem_base_none_wait_all,
|
||||
.preposition_files = orte_filem_base_none_preposition_files,
|
||||
.link_local_files = orte_filem_base_none_link_local_files
|
||||
};
|
||||
bool orte_filem_base_is_active = false;
|
||||
|
||||
@ -69,4 +70,3 @@ static int orte_filem_base_open(mca_base_open_flag_t flags)
|
||||
|
||||
MCA_BASE_FRAMEWORK_DECLARE(orte, filem, NULL, NULL, orte_filem_base_open, orte_filem_base_close,
|
||||
mca_filem_base_static_components, 0);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -49,6 +49,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
@ -61,14 +62,6 @@
|
||||
|
||||
static int raw_init(void);
|
||||
static int raw_finalize(void);
|
||||
static int raw_put(orte_filem_base_request_t *req);
|
||||
static int raw_put_nb(orte_filem_base_request_t *req);
|
||||
static int raw_get(orte_filem_base_request_t *req);
|
||||
static int raw_get_nb(orte_filem_base_request_t *req);
|
||||
static int raw_rm(orte_filem_base_request_t *req);
|
||||
static int raw_rm_nb(orte_filem_base_request_t *req);
|
||||
static int raw_wait(orte_filem_base_request_t *req);
|
||||
static int raw_wait_all(opal_list_t *reqs);
|
||||
static int raw_preposition_files(orte_job_t *jdata,
|
||||
orte_filem_completion_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
@ -76,20 +69,20 @@ static int raw_link_local_files(orte_job_t *jdata,
|
||||
orte_app_context_t *app);
|
||||
|
||||
orte_filem_base_module_t mca_filem_raw_module = {
|
||||
raw_init,
|
||||
raw_finalize,
|
||||
.filem_init = raw_init,
|
||||
.filem_finalize = raw_finalize,
|
||||
/* we don't use any of the following */
|
||||
raw_put,
|
||||
raw_put_nb,
|
||||
raw_get,
|
||||
raw_get_nb,
|
||||
raw_rm,
|
||||
raw_rm_nb,
|
||||
raw_wait,
|
||||
raw_wait_all,
|
||||
.put = orte_filem_base_none_put,
|
||||
.put_nb = orte_filem_base_none_put_nb,
|
||||
.get = orte_filem_base_none_get,
|
||||
.get_nb = orte_filem_base_none_get_nb,
|
||||
.rm = orte_filem_base_none_rm,
|
||||
.rm_nb = orte_filem_base_none_rm_nb,
|
||||
.wait = orte_filem_base_none_wait,
|
||||
.wait_all = orte_filem_base_none_wait_all,
|
||||
/* now the APIs we *do* use */
|
||||
raw_preposition_files,
|
||||
raw_link_local_files
|
||||
.preposition_files = raw_preposition_files,
|
||||
.link_local_files = raw_link_local_files
|
||||
};
|
||||
|
||||
static opal_list_t outbound_files;
|
||||
@ -164,46 +157,6 @@ static int raw_finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_put(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_put_nb(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_get(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_get_nb(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_rm(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_rm_nb(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_wait(orte_filem_base_request_t *req)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int raw_wait_all(opal_list_t *reqs)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void xfer_complete(int status, orte_filem_raw_xfer_t *xfer)
|
||||
{
|
||||
orte_filem_raw_outbound_t *outbound = xfer->outbound;
|
||||
@ -586,8 +539,9 @@ static int raw_preposition_files(orte_job_t *jdata,
|
||||
opal_list_append(&outbound->xfers, &xfer->super);
|
||||
opal_event_set(orte_event_base, &xfer->ev, fd, OPAL_EV_READ, send_chunk, xfer);
|
||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
||||
opal_event_add(&xfer->ev, 0);
|
||||
xfer->pending = true;
|
||||
ORTE_POST_OBJECT(xfer);
|
||||
opal_event_add(&xfer->ev, 0);
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&fsets);
|
||||
@ -804,6 +758,8 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
||||
opal_buffer_t chunk;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(rev);
|
||||
|
||||
/* flag that event has fired */
|
||||
rev->pending = false;
|
||||
|
||||
@ -815,6 +771,7 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
||||
|
||||
/* non-blocking, retry */
|
||||
if (EAGAIN == errno || EINTR == errno) {
|
||||
ORTE_POST_OBJECT(rev);
|
||||
opal_event_add(&rev->ev, 0);
|
||||
return;
|
||||
}
|
||||
@ -891,8 +848,9 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
||||
return;
|
||||
} else {
|
||||
/* restart the read event */
|
||||
opal_event_add(&rev->ev, 0);
|
||||
rev->pending = true;
|
||||
ORTE_POST_OBJECT(rev);
|
||||
opal_event_add(&rev->ev, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1116,7 +1074,8 @@ static void recv_files(int status, orte_process_name_t* sender,
|
||||
}
|
||||
}
|
||||
free(tmp);
|
||||
opal_event_set(orte_event_base, &incoming->ev, incoming->fd, OPAL_EV_WRITE, write_handler, incoming);
|
||||
opal_event_set(orte_event_base, &incoming->ev, incoming->fd,
|
||||
OPAL_EV_WRITE, write_handler, incoming);
|
||||
opal_event_set_priority(&incoming->ev, ORTE_MSG_PRI);
|
||||
}
|
||||
/* create an output object for this data */
|
||||
@ -1135,8 +1094,9 @@ static void recv_files(int status, orte_process_name_t* sender,
|
||||
|
||||
if (!incoming->pending) {
|
||||
/* add the event */
|
||||
opal_event_add(&incoming->ev, 0);
|
||||
incoming->pending = true;
|
||||
ORTE_POST_OBJECT(incoming);
|
||||
opal_event_add(&incoming->ev, 0);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
@ -1154,6 +1114,8 @@ static void write_handler(int fd, short event, void *cbdata)
|
||||
char homedir[MAXPATHLEN];
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(sink);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
|
||||
"%s write:handler writing data to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1226,8 +1188,9 @@ static void write_handler(int fd, short event, void *cbdata)
|
||||
/* leave the write event running so it will call us again
|
||||
* when the fd is ready.
|
||||
*/
|
||||
opal_event_add(&sink->ev, 0);
|
||||
sink->pending = true;
|
||||
ORTE_POST_OBJECT(sink);
|
||||
opal_event_add(&sink->ev, 0);
|
||||
return;
|
||||
}
|
||||
/* otherwise, something bad happened so all we can do is abort
|
||||
@ -1250,8 +1213,9 @@ static void write_handler(int fd, short event, void *cbdata)
|
||||
/* leave the write event running so it will call us again
|
||||
* when the fd is ready
|
||||
*/
|
||||
opal_event_add(&sink->ev, 0);
|
||||
sink->pending = true;
|
||||
ORTE_POST_OBJECT(sink);
|
||||
opal_event_add(&sink->ev, 0);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(output);
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
@ -144,6 +145,8 @@ static void allgather_stub(int fd, short args, void *cbdata)
|
||||
orte_grpcomm_coll_t *coll;
|
||||
uint32_t *seq_number;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
||||
"%s grpcomm:base:allgather stub",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -212,6 +215,7 @@ int orte_grpcomm_API_allgather(orte_grpcomm_signature_t *sig,
|
||||
cd->cbdata = cbdata;
|
||||
opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, allgather_stub, cd);
|
||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(cd);
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -52,6 +52,7 @@
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -163,6 +164,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
opal_event_set_priority(ep->wev->ev, ORTE_MSG_PRI); \
|
||||
} \
|
||||
*(snk) = ep; \
|
||||
ORTE_POST_OBJECT(ep); \
|
||||
} while(0);
|
||||
|
||||
/* add list of structs that has name of proc + orte_iof_tag_t - when
|
||||
@ -192,6 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
opal_event_set_priority(rev->ev, ORTE_MSG_PRI); \
|
||||
if ((actv)) { \
|
||||
rev->active = true; \
|
||||
ORTE_POST_OBJECT(rev); \
|
||||
opal_event_add(rev->ev, 0); \
|
||||
} \
|
||||
} while(0);
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -147,7 +148,7 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
|
||||
output->numbytes = numbytes;
|
||||
goto process;
|
||||
|
||||
construct:
|
||||
construct:
|
||||
starttaglen = strlen(starttag);
|
||||
endtaglen = strlen(endtag);
|
||||
endtagged = false;
|
||||
@ -249,7 +250,7 @@ construct:
|
||||
}
|
||||
output->numbytes = k;
|
||||
|
||||
process:
|
||||
process:
|
||||
/* add this data to the write list for this fd */
|
||||
opal_list_append(&channel->outputs, &output->super);
|
||||
|
||||
@ -262,8 +263,9 @@ process:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||
"%s write:output adding write event",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
opal_event_add(channel->ev, 0);
|
||||
channel->pending = true;
|
||||
ORTE_POST_OBJECT(channel);
|
||||
opal_event_add(channel->ev, 0);
|
||||
}
|
||||
|
||||
return num_buffered;
|
||||
@ -303,6 +305,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
||||
orte_iof_write_output_t *output;
|
||||
int num_written;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(sink);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||
"%s write:handler writing data to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -356,8 +360,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
||||
}
|
||||
OBJ_RELEASE(output);
|
||||
}
|
||||
ABORT:
|
||||
ABORT:
|
||||
opal_event_del(wev->ev);
|
||||
wev->pending = false;
|
||||
|
||||
ORTE_POST_OBJECT(wev);
|
||||
}
|
||||
|
@ -47,6 +47,7 @@
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
@ -214,10 +215,13 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
}
|
||||
}
|
||||
proct->revstdout->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstdout);
|
||||
opal_event_add(proct->revstdout->ev, 0);
|
||||
proct->revstderr->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstderr);
|
||||
opal_event_add(proct->revstderr->ev, 0);
|
||||
proct->revstddiag->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstddiag);
|
||||
opal_event_add(proct->revstddiag->ev, 0);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
@ -299,6 +303,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
*/
|
||||
if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
|
||||
mca_iof_hnp_component.stdinev->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstdout);
|
||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||
}
|
||||
} else {
|
||||
@ -515,6 +520,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
orte_iof_write_output_t *output;
|
||||
int num_written;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(sink);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||
"%s hnp:stdin:write:handler writing data to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -558,6 +565,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
* when the fd is ready.
|
||||
*/
|
||||
wev->pending = true;
|
||||
ORTE_POST_OBJECT(wev);
|
||||
opal_event_add(wev->ev, 0);
|
||||
goto CHECK;
|
||||
}
|
||||
@ -583,13 +591,14 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
* when the fd is ready.
|
||||
*/
|
||||
wev->pending = true;
|
||||
ORTE_POST_OBJECT(wev);
|
||||
opal_event_add(wev->ev, 0);
|
||||
goto CHECK;
|
||||
}
|
||||
OBJ_RELEASE(output);
|
||||
}
|
||||
|
||||
CHECK:
|
||||
CHECK:
|
||||
if (NULL != mca_iof_hnp_component.stdinev &&
|
||||
!orte_abnormal_term_ordered &&
|
||||
!mca_iof_hnp_component.stdinev->active) {
|
||||
@ -610,6 +619,7 @@ CHECK:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||
"restarting read event"));
|
||||
mca_iof_hnp_component.stdinev->active = true;
|
||||
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||
}
|
||||
}
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -48,10 +49,13 @@ static void restart_stdin(int fd, short event, void *cbdata)
|
||||
{
|
||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(tm);
|
||||
|
||||
if (NULL != mca_iof_hnp_component.stdinev &&
|
||||
!orte_job_term_ordered &&
|
||||
!mca_iof_hnp_component.stdinev->active) {
|
||||
mca_iof_hnp_component.stdinev->active = true;
|
||||
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||
}
|
||||
|
||||
@ -74,7 +78,11 @@ bool orte_iof_hnp_stdin_check(int fd)
|
||||
|
||||
void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
bool should_process = orte_iof_hnp_stdin_check(0);
|
||||
bool should_process;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(mca_iof_hnp_component.stdinev);
|
||||
|
||||
should_process = orte_iof_hnp_stdin_check(0);
|
||||
|
||||
if (should_process) {
|
||||
mca_iof_hnp_component.stdinev->active = true;
|
||||
@ -99,6 +107,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
bool exclusive;
|
||||
orte_iof_sink_t *sink;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(rev);
|
||||
|
||||
/* read up to the fragment size */
|
||||
numbytes = read(fd, data, sizeof(data));
|
||||
|
||||
@ -293,6 +303,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
}
|
||||
|
||||
/* re-add the event */
|
||||
ORTE_POST_OBJECT(rev);
|
||||
opal_event_add(rev->ev, 0);
|
||||
|
||||
return;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,6 +41,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/iof/iof.h"
|
||||
@ -81,6 +82,7 @@ void orte_iof_hnp_recv(int status, orte_process_name_t* sender,
|
||||
!orte_job_term_ordered &&
|
||||
!mca_iof_hnp_component.stdinev->active) {
|
||||
mca_iof_hnp_component.stdinev->active = true;
|
||||
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||
}
|
||||
goto CLEAN_RETURN;
|
||||
|
@ -42,6 +42,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -190,10 +191,13 @@ SETUP:
|
||||
*/
|
||||
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
|
||||
proct->revstdout->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstdout);
|
||||
opal_event_add(proct->revstdout->ev, 0);
|
||||
proct->revstderr->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstderr);
|
||||
opal_event_add(proct->revstderr->ev, 0);
|
||||
proct->revstddiag->active = true;
|
||||
ORTE_POST_OBJECT(proct->revstddiag);
|
||||
opal_event_add(proct->revstddiag->ev, 0);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
@ -367,6 +371,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
orte_iof_write_output_t *output;
|
||||
int num_written;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(sink);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||
"%s orted:stdin:write:handler writing data to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -400,6 +406,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
* when the fd is ready.
|
||||
*/
|
||||
wev->pending = true;
|
||||
ORTE_POST_OBJECT(wev);
|
||||
opal_event_add(wev->ev, 0);
|
||||
goto CHECK;
|
||||
}
|
||||
@ -430,6 +437,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
||||
* when the fd is ready.
|
||||
*/
|
||||
wev->pending = true;
|
||||
ORTE_POST_OBJECT(wev);
|
||||
opal_event_add(wev->ev, 0);
|
||||
goto CHECK;
|
||||
}
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
@ -52,6 +53,8 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
int32_t numbytes;
|
||||
orte_iof_proc_t *proct = (orte_iof_proc_t*)rev->proc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(rev);
|
||||
|
||||
/* read up to the fragment size */
|
||||
#if !defined(__WINDOWS__)
|
||||
numbytes = read(fd, data, sizeof(data));
|
||||
@ -100,6 +103,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
}
|
||||
if (!proct->copy) {
|
||||
/* re-add the event */
|
||||
ORTE_POST_OBJECT(rev);
|
||||
opal_event_add(rev->ev, 0);
|
||||
return;
|
||||
}
|
||||
@ -137,6 +141,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
orte_rml_send_callback, NULL);
|
||||
|
||||
/* re-add the event */
|
||||
ORTE_POST_OBJECT(rev);
|
||||
opal_event_add(rev->ev, 0);
|
||||
|
||||
return;
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -25,6 +25,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/util/attr.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
|
||||
|
||||
@ -38,6 +39,8 @@ void orte_notifier_base_log(int sd, short args, void *cbdata)
|
||||
orte_notifier_active_module_t *imod;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
/* if no modules are active, then there is nothing to do */
|
||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||
return;
|
||||
@ -74,6 +77,8 @@ void orte_notifier_base_event(int sd, short args, void *cbdata)
|
||||
orte_notifier_active_module_t *imod;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
/* if no modules are active, then there is nothing to do */
|
||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||
return;
|
||||
@ -110,6 +115,8 @@ void orte_notifier_base_report(int sd, short args, void *cbdata)
|
||||
orte_notifier_active_module_t *imod;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
/* if no modules are active, then there is nothing to do */
|
||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||
return;
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -51,6 +51,7 @@
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -63,7 +64,7 @@ ORTE_DECLSPEC extern int orte_notifier_debug_output;
|
||||
* The code has NOT been audited for use of malloc, so this still
|
||||
* may fail to get the "OUT_OF_RESOURCE" message out. Oh Well.
|
||||
*/
|
||||
#define ORTE_NOTIFIER_MAX_BUF 512
|
||||
#define ORTE_NOTIFIER_MAX_BUF 512
|
||||
|
||||
/* Severities */
|
||||
typedef enum {
|
||||
@ -136,6 +137,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||
OPAL_EV_WRITE, orte_notifier_base_log, (_n)); \
|
||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||
ORTE_POST_OBJECT(_n); \
|
||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
@ -160,6 +162,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||
OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
|
||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||
ORTE_POST_OBJECT(_n); \
|
||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
@ -183,6 +186,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||
OPAL_EV_WRITE, orte_notifier_base_event, (_n)); \
|
||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||
ORTE_POST_OBJECT(_n); \
|
||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -50,18 +50,10 @@
|
||||
/* Static API's */
|
||||
static void mylog(orte_notifier_base_severity_t severity, int errcode,
|
||||
const char *msg, va_list ap);
|
||||
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
|
||||
const char *filename,
|
||||
const char *topic, va_list ap);
|
||||
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
|
||||
orte_process_name_t *peer_proc,
|
||||
const char *msg, va_list ap);
|
||||
|
||||
/* Module */
|
||||
orte_notifier_base_module_t orte_notifier_smtp_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
mylog,
|
||||
.log = mylog
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -48,11 +48,11 @@ static void myreport(orte_notifier_request_t *req);
|
||||
|
||||
/* Module def */
|
||||
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
||||
init,
|
||||
finalize,
|
||||
mylog,
|
||||
myevent,
|
||||
myreport
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.log = mylog,
|
||||
.event = myevent,
|
||||
.report = myreport
|
||||
};
|
||||
|
||||
|
||||
@ -130,4 +130,3 @@ static void myreport(orte_notifier_request_t *req)
|
||||
orte_job_state_to_str(req->state),
|
||||
(NULL == req->msg) ? "<N/A>" : req->msg);
|
||||
}
|
||||
|
||||
|
@ -81,6 +81,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/orted/orted.h"
|
||||
@ -582,6 +583,8 @@ static void timer_cb(int fd, short event, void *cbdata)
|
||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||
orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(tm);
|
||||
|
||||
/* increment the number of retries */
|
||||
ll->retries++;
|
||||
|
||||
@ -629,6 +632,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
||||
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
||||
char *full_search;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* thread-protect common values */
|
||||
cd->env = opal_argv_copy(app->env);
|
||||
|
||||
@ -820,6 +825,8 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
opal_event_base_t *evb;
|
||||
char *effective_dir = NULL;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
"%s local:launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
@ -127,6 +127,7 @@
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/rtc/rtc.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
@ -157,11 +158,11 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
||||
* Module
|
||||
*/
|
||||
orte_odls_base_module_t orte_odls_default_module = {
|
||||
orte_odls_base_default_get_add_procs_data,
|
||||
orte_odls_default_launch_local_procs,
|
||||
orte_odls_default_kill_local_procs,
|
||||
orte_odls_default_signal_local_procs,
|
||||
orte_odls_default_restart_proc
|
||||
.get_add_procs_data = orte_odls_base_default_get_add_procs_data,
|
||||
.launch_local_procs = orte_odls_default_launch_local_procs,
|
||||
.kill_local_procs = orte_odls_default_kill_local_procs,
|
||||
.signal_local_procs = orte_odls_default_signal_local_procs,
|
||||
.restart_proc = orte_odls_default_restart_proc
|
||||
};
|
||||
|
||||
|
||||
|
@ -42,9 +42,11 @@
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/util/timings.h"
|
||||
#include "orte/mca/mca.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/mca/mca.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/oob/oob.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -119,11 +121,8 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
|
||||
__FILE__, __LINE__); \
|
||||
cd = OBJ_NEW(orte_oob_send_t); \
|
||||
cd->msg = (m); \
|
||||
opal_event_set(orte_oob_base.ev_base, &cd->ev, -1, \
|
||||
OPAL_EV_WRITE, \
|
||||
orte_oob_base_send_nb, cd); \
|
||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
||||
ORTE_THREADSHIFT(cd, orte_oob_base.ev_base, \
|
||||
orte_oob_base_send_nb, ORTE_MSG_PRI); \
|
||||
}while(0)
|
||||
|
||||
/* Our contact info is actually subject to change as transports
|
||||
@ -168,11 +167,11 @@ typedef struct {
|
||||
} mca_oob_uri_req_t;
|
||||
OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
|
||||
|
||||
#define ORTE_OOB_SET_URI(u) \
|
||||
do { \
|
||||
mca_oob_uri_req_t *rq; \
|
||||
rq = OBJ_NEW(mca_oob_uri_req_t); \
|
||||
rq->uri = strdup((u)); \
|
||||
#define ORTE_OOB_SET_URI(u) \
|
||||
do { \
|
||||
mca_oob_uri_req_t *rq; \
|
||||
rq = OBJ_NEW(mca_oob_uri_req_t); \
|
||||
rq->uri = strdup((u)); \
|
||||
orte_oob_base_set_addr(0, 0, (void*)rq); \
|
||||
}while(0)
|
||||
|
||||
|
@ -21,7 +21,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/state/base/base.h"
|
||||
@ -32,7 +32,7 @@ static void process_uri(char *uri);
|
||||
void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
|
||||
orte_rml_send_t *msg = cd->msg;
|
||||
orte_rml_send_t *msg;
|
||||
mca_base_component_list_item_t *cli;
|
||||
orte_oob_base_peer_t *pr;
|
||||
int rc;
|
||||
@ -42,7 +42,10 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
bool reachable;
|
||||
char *uri;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* done with this. release it now */
|
||||
msg = cd->msg;
|
||||
OBJ_RELEASE(cd);
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
@ -276,7 +279,7 @@ void orte_oob_base_get_addr(char **uri)
|
||||
}
|
||||
}
|
||||
|
||||
unblock:
|
||||
unblock:
|
||||
*uri = final;
|
||||
}
|
||||
|
||||
@ -303,7 +306,10 @@ OBJ_CLASS_INSTANCE(mca_oob_uri_req_t,
|
||||
void orte_oob_base_set_addr(int fd, short args, void *cbdata)
|
||||
{
|
||||
mca_oob_uri_req_t *req = (mca_oob_uri_req_t*)cbdata;
|
||||
char *uri = req->uri;
|
||||
char *uri;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
uri = req->uri;
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s: set_addr to uri %s",
|
||||
|
@ -62,6 +62,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||
@ -253,6 +254,8 @@ static void recv_handler(int sd, short flg, void *cbdata)
|
||||
mca_oob_tcp_hdr_t hdr;
|
||||
mca_oob_tcp_peer_t *peer;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(op);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp:recv:handler called",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
@ -74,6 +74,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -698,6 +699,9 @@ static void cleanup(int sd, short args, void *cbdata)
|
||||
{
|
||||
opal_list_item_t * item;
|
||||
bool *active = (bool*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(active);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
@ -756,6 +760,7 @@ static void component_shutdown(void)
|
||||
opal_event_set(orte_event_base, &ev, -1,
|
||||
OPAL_EV_WRITE, cleanup, &active);
|
||||
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
||||
ORTE_POST_OBJECT(active);
|
||||
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_WAIT_FOR_COMPLETION(active);
|
||||
} else {
|
||||
@ -1062,6 +1067,8 @@ void mca_oob_tcp_component_set_module(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
orte_oob_base_peer_t *bpr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(pop);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:set_module called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1093,6 +1100,8 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
|
||||
orte_oob_base_peer_t *bpr;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(pop);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:lost connection called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1128,6 +1137,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
||||
int rc;
|
||||
orte_oob_base_peer_t *bpr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(mop);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:no route called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1162,6 +1173,8 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
||||
orte_rml_send_t *snd;
|
||||
orte_oob_base_peer_t *bpr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(mop);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:unknown hop called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -1235,6 +1248,8 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
||||
{
|
||||
mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(pop);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:failed_to_connect called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -63,6 +63,7 @@
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -152,7 +153,7 @@ static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
|
||||
void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
{
|
||||
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
||||
mca_oob_tcp_peer_t *peer = op->peer;
|
||||
mca_oob_tcp_peer_t *peer;
|
||||
int rc;
|
||||
opal_socklen_t addrlen = 0;
|
||||
mca_oob_tcp_addr_t *addr;
|
||||
@ -160,6 +161,9 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||
mca_oob_tcp_send_t *snd;
|
||||
bool connected = false;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(op);
|
||||
peer = op->peer;
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s orte_tcp_peer_try_connect: "
|
||||
"attempting to connect to proc %s",
|
||||
@ -586,8 +590,9 @@ void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t *peer)
|
||||
ORTE_NAME_PRINT(&(peer->name)));
|
||||
|
||||
if (!peer->recv_ev_active) {
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
peer->recv_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
}
|
||||
} else {
|
||||
opal_output(0, "%s tcp_peer_complete_connect: unable to send connect ack to %s",
|
||||
@ -608,6 +613,8 @@ static int tcp_peer_send_blocking(int sd, void* data, size_t size)
|
||||
size_t cnt = 0;
|
||||
int retval;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(ptr);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s send blocking of %"PRIsize_t" bytes to socket %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -949,8 +956,9 @@ static void tcp_peer_connected(mca_oob_tcp_peer_t* peer)
|
||||
opal_list_remove_first(&peer->send_queue);
|
||||
}
|
||||
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
peer->send_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1214,8 +1222,9 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer)
|
||||
|
||||
tcp_peer_connected(peer);
|
||||
if (!peer->recv_ev_active) {
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
peer->recv_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
}
|
||||
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
||||
mca_oob_tcp_peer_dump(peer, "accepted");
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
|
||||
#include "orte/util/threads.h"
|
||||
#include "oob_tcp.h"
|
||||
#include "oob_tcp_peer.h"
|
||||
|
||||
@ -59,10 +60,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
||||
ORTE_NAME_PRINT((&(p)->name))); \
|
||||
cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \
|
||||
cop->peer = (p); \
|
||||
opal_event_set((p)->ev_base, &cop->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), cop); \
|
||||
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&cop->ev, OPAL_EV_WRITE, 1); \
|
||||
ORTE_THREADSHIFT(cop, (p)->ev_base, (cbfunc), ORTE_MSG_PRI); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_ACCEPT_STATE(s, a, cbfunc) \
|
||||
@ -72,6 +70,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
||||
opal_event_set(orte_oob_base.ev_base, &cop->ev, s, \
|
||||
OPAL_EV_READ, (cbfunc), cop); \
|
||||
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
|
||||
ORTE_POST_OBJECT(cop); \
|
||||
opal_event_add(&cop->ev, 0); \
|
||||
} while(0);
|
||||
|
||||
@ -88,6 +87,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
||||
opal_event_evtimer_set((p)->ev_base, \
|
||||
&cop->ev, \
|
||||
(cbfunc), cop); \
|
||||
ORTE_POST_OBJECT(cop); \
|
||||
opal_event_evtimer_add(&cop->ev, (tv)); \
|
||||
} while(0);
|
||||
|
||||
|
@ -66,6 +66,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||
@ -162,6 +163,7 @@ int orte_oob_tcp_start_listening(void)
|
||||
connection_event_handler,
|
||||
0);
|
||||
opal_event_set_priority(&listener->event, ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(listener);
|
||||
opal_event_add(&listener->event, 0);
|
||||
}
|
||||
|
||||
@ -816,6 +818,7 @@ static void* listen_thread(opal_object_t *obj)
|
||||
}
|
||||
|
||||
/* activate the event */
|
||||
ORTE_POST_OBJECT(pending_connection);
|
||||
opal_event_active(&pending_connection->ev, OPAL_EV_WRITE, 1);
|
||||
accepted_connections++;
|
||||
}
|
||||
@ -858,6 +861,8 @@ static void connection_handler(int sd, short flags, void* cbdata)
|
||||
|
||||
new_connection = (mca_oob_tcp_pending_connection_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(new_connection);
|
||||
|
||||
opal_output_verbose(4, orte_oob_base_framework.framework_output,
|
||||
"%s connection_handler: working connection "
|
||||
"(%d, %d) %s:%d\n",
|
||||
|
@ -27,6 +27,7 @@
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/threads.h"
|
||||
#include "oob_tcp.h"
|
||||
#include "oob_tcp_sendrecv.h"
|
||||
|
||||
@ -87,10 +88,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t);
|
||||
if (NULL != proxy) { \
|
||||
pop->rtmod = strdup(proxy); \
|
||||
} \
|
||||
opal_event_set(orte_oob_base.ev_base, &pop->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), pop); \
|
||||
opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \
|
||||
ORTE_THREADSHIFT(pop, orte_oob_base.ev_base, \
|
||||
(cbfunc), ORTE_MSG_PRI); \
|
||||
} while(0);
|
||||
|
||||
#endif /* _MCA_OOB_TCP_PEER_H_ */
|
||||
|
@ -64,6 +64,7 @@
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -82,7 +83,10 @@
|
||||
void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
||||
{
|
||||
mca_oob_tcp_send_t *snd = (mca_oob_tcp_send_t*)cbdata;
|
||||
mca_oob_tcp_peer_t *peer = (mca_oob_tcp_peer_t*)snd->peer;
|
||||
mca_oob_tcp_peer_t *peer;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(snd);
|
||||
peer = (mca_oob_tcp_peer_t*)snd->peer;
|
||||
|
||||
/* if there is no message on-deck, put this one there */
|
||||
if (NULL == peer->send_msg) {
|
||||
@ -99,8 +103,9 @@ void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
||||
} else {
|
||||
/* ensure the send event is active */
|
||||
if (!peer->send_ev_active) {
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
peer->send_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -196,9 +201,12 @@ static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
|
||||
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
|
||||
{
|
||||
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
|
||||
mca_oob_tcp_send_t* msg = peer->send_msg;
|
||||
mca_oob_tcp_send_t* msg;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(peer);
|
||||
msg = peer->send_msg;
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:send_handler called to send to peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -424,6 +432,8 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
||||
int rc;
|
||||
orte_rml_send_t *snd;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(peer);
|
||||
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp:recv:handler called for peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -437,8 +447,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
/* we connected! Start the send/recv events */
|
||||
if (!peer->recv_ev_active) {
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
peer->recv_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->recv_event, 0);
|
||||
}
|
||||
if (peer->timer_ev_active) {
|
||||
opal_event_del(&peer->timer_event);
|
||||
@ -449,8 +460,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
||||
peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
|
||||
}
|
||||
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
peer->send_ev_active = true;
|
||||
ORTE_POST_OBJECT(peer);
|
||||
opal_event_add(&peer->send_event, 0);
|
||||
}
|
||||
/* update our state */
|
||||
peer->state = MCA_OOB_TCP_CONNECTED;
|
||||
|
@ -28,7 +28,7 @@
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
|
||||
#include "orte/util/threads.h"
|
||||
#include "oob_tcp.h"
|
||||
#include "oob_tcp_hdr.h"
|
||||
|
||||
@ -82,10 +82,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
||||
do { \
|
||||
(s)->peer = (struct mca_oob_tcp_peer_t*)(p); \
|
||||
(s)->activate = (f); \
|
||||
opal_event_set((p)->ev_base, &(s)->ev, -1, \
|
||||
OPAL_EV_WRITE, mca_oob_tcp_queue_msg, (s)); \
|
||||
opal_event_set_priority(&(s)->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&(s)->ev, OPAL_EV_WRITE, 1); \
|
||||
ORTE_THREADSHIFT((s), (p)->ev_base, \
|
||||
mca_oob_tcp_queue_msg, ORTE_MSG_PRI); \
|
||||
} while(0)
|
||||
|
||||
/* queue a message to be sent by one of our modules - must
|
||||
@ -134,7 +132,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||
/* add to the msg queue for this peer */ \
|
||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
||||
}while(0);
|
||||
} while(0)
|
||||
|
||||
/* queue a message to be sent by one of our modules upon completing
|
||||
* the connection process - must provide the following params:
|
||||
@ -182,7 +180,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||
/* add to the msg queue for this peer */ \
|
||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, false); \
|
||||
}while(0);
|
||||
} while(0)
|
||||
|
||||
/* queue a message for relay by one of our modules - must
|
||||
* provide the following params:
|
||||
@ -217,7 +215,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||
/* add to the msg queue for this peer */ \
|
||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
||||
}while(0);
|
||||
} while(0)
|
||||
|
||||
/* State machine for processing message */
|
||||
typedef struct {
|
||||
@ -237,10 +235,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_op_t);
|
||||
ORTE_NAME_PRINT(&((ms)->dst))); \
|
||||
mop = OBJ_NEW(mca_oob_tcp_msg_op_t); \
|
||||
mop->msg = (ms); \
|
||||
opal_event_set((ms)->peer->ev_base, &mop->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), mop); \
|
||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
ORTE_THREADSHIFT(mop, (ms)->peer->ev_base, \
|
||||
(cbfunc), ORTE_MSG_PRI); \
|
||||
} while(0);
|
||||
|
||||
typedef struct {
|
||||
@ -285,11 +281,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
mop->hop.jobid = (h)->jobid; \
|
||||
mop->hop.vpid = (h)->vpid; \
|
||||
/* this goes to the OOB framework, so use that event base */ \
|
||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), mop); \
|
||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
|
||||
(cbfunc), ORTE_MSG_PRI); \
|
||||
} while(0)
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||
do { \
|
||||
@ -305,10 +299,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
mop->hop.vpid = (h)->vpid; \
|
||||
/* this goes to the component, so use the framework \
|
||||
* event base */ \
|
||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||
OPAL_EV_WRITE, (c), mop); \
|
||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
|
||||
(c), ORTE_MSG_PRI); \
|
||||
} while(0)
|
||||
|
||||
#endif /* _MCA_OOB_TCP_SENDRECV_H_ */
|
||||
|
@ -55,13 +55,14 @@
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/basename.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
@ -187,6 +188,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
char *ltmp;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
|
@ -74,6 +74,7 @@
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/util/hostfile/hostfile.h"
|
||||
@ -129,6 +130,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
orte_node_t *node;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* if we are not launching, then we just assume that all
|
||||
* daemons share our topology */
|
||||
if (orte_do_not_launch) {
|
||||
@ -182,6 +185,8 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* move the state machine along */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||
@ -194,6 +199,8 @@ void orte_plm_base_daemons_launched(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* do NOT increment the state - we wait for the
|
||||
* daemons to report that they have actually
|
||||
* started before moving to the right state
|
||||
@ -217,6 +224,8 @@ void orte_plm_base_vm_ready(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* progress the job */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
|
||||
|
||||
@ -233,6 +242,8 @@ void orte_plm_base_mapping_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* move the state machine along */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
|
||||
@ -252,6 +263,8 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
|
||||
orte_job_t *parent;
|
||||
orte_process_name_t name, *nptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:setup_job",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -357,6 +370,8 @@ void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* nothing to do here but move along */
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -372,6 +387,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
||||
int i, rc;
|
||||
char *serial_number;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
opal_output_verbose(5, orte_plm_base_framework.framework_output,
|
||||
"%s complete_setup on job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -465,6 +482,8 @@ static void timer_cb(int fd, short event, void *cbdata)
|
||||
orte_job_t *jdata = (orte_job_t*)cbdata;
|
||||
orte_timer_t *timer=NULL;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(jdata);
|
||||
|
||||
/* declare launch failed */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
|
||||
@ -486,6 +505,8 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
orte_timer_t *timer;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
|
||||
@ -587,6 +608,7 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
timer->tv.tv_sec = orte_startup_timeout;
|
||||
timer->tv.tv_usec = 0;
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
|
||||
ORTE_POST_OBJECT(timer);
|
||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||
}
|
||||
|
||||
@ -605,6 +627,8 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *answer;
|
||||
int room, *rmptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
|
||||
@ -720,6 +744,8 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
||||
opal_buffer_t *answer;
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
|
||||
@ -793,7 +819,7 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
|
||||
|
@ -14,7 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -38,6 +38,7 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -114,6 +115,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
/* there are no daemons to launch, so just trigger the
|
||||
* daemon-launch-complete state
|
||||
*/
|
||||
|
@ -66,6 +66,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
@ -171,7 +172,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_std_cntr_t nnode;
|
||||
orte_job_t *daemons;
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
jdata = state->jdata;
|
||||
|
||||
/* start by setting up the virtual machine */
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
@ -80,6 +80,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
@ -926,6 +927,8 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
||||
pid_t pid;
|
||||
orte_plm_rsh_caddy_t *caddy;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
|
||||
item = opal_list_remove_first(&launch_list);
|
||||
if (NULL == item) {
|
||||
@ -1021,6 +1024,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_namelist_t *child;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
@ -1285,6 +1290,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: activating launch event",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_POST_OBJECT(state);
|
||||
opal_event_active(&launch_event, EV_WRITE, 1);
|
||||
|
||||
/* now that we've launched the daemons, let the daemon callback
|
||||
|
@ -61,6 +61,7 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -108,7 +109,6 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
|
||||
*/
|
||||
static pid_t primary_srun_pid = 0;
|
||||
static bool primary_pid_set = false;
|
||||
static bool launching_daemons;
|
||||
static void launch_daemons(int fd, short args, void *cbdata);
|
||||
|
||||
/**
|
||||
@ -189,6 +189,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
orte_job_t *daemons;
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:slurm: LAUNCH DAEMONS CALLED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -545,27 +547,18 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* if we are in the launch phase, then any termination is bad */
|
||||
if (launching_daemons) {
|
||||
/* report that one or more daemons failed to launch so we can exit */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:slurm: daemon failed during launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* notify the error manager */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
} else {
|
||||
/* if this is after launch, then we need to abort only if the status
|
||||
* returned is non-zero - i.e., if the orteds exited with an error
|
||||
/* abort only if the status returned is non-zero - i.e., if
|
||||
* the orteds exited with an error
|
||||
*/
|
||||
if (0 != proc->exit_code) {
|
||||
/* an orted must have died unexpectedly - report
|
||||
* that the daemon has failed so we exit
|
||||
*/
|
||||
if (0 != proc->exit_code) {
|
||||
/* an orted must have died unexpectedly after launch - report
|
||||
* that the daemon has failed so we exit
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:slurm: daemon failed while running",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:slurm: daemon failed while running",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||
} else {
|
||||
/* otherwise, check to see if this is the primary pid */
|
||||
if (primary_srun_pid == proc->pid) {
|
||||
/* in this case, we just want to fire the proper trigger so
|
||||
@ -579,6 +572,7 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
/* done with this dummy */
|
||||
OBJ_RELEASE(proc);
|
||||
}
|
||||
@ -602,6 +596,13 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
free(exec_argv);
|
||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||
}
|
||||
/* if this is the primary launch - i.e., not a comm_spawn of a
|
||||
* child job - then save the pid
|
||||
*/
|
||||
if (0 < srun_pid && !primary_pid_set) {
|
||||
primary_srun_pid = srun_pid;
|
||||
primary_pid_set = true;
|
||||
}
|
||||
|
||||
/* setup a dummy proc object to track the srun */
|
||||
dummy = OBJ_NEW(orte_proc_t);
|
||||
@ -692,14 +693,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
||||
sides of the fork... */
|
||||
setpgid(srun_pid, srun_pid);
|
||||
|
||||
/* if this is the primary launch - i.e., not a comm_spawn of a
|
||||
* child job - then save the pid
|
||||
*/
|
||||
if (!primary_pid_set) {
|
||||
primary_srun_pid = srun_pid;
|
||||
primary_pid_set = true;
|
||||
}
|
||||
|
||||
free(exec_argv);
|
||||
}
|
||||
|
||||
|
@ -63,6 +63,7 @@
|
||||
#include "opal/util/basename.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -185,6 +186,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
int32_t launchid, *ldptr;
|
||||
char *prefix_dir = NULL;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
jdata = state->jdata;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
@ -403,7 +406,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
"%s plm:tm:launch: finished spawning orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(state);
|
||||
|
||||
@ -421,6 +424,8 @@ static void poll_spawns(int fd, short args, void *cbdata)
|
||||
int local_err;
|
||||
tm_event_t event;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(state);
|
||||
|
||||
/* TM poll for all the spawns */
|
||||
for (i = 0; i < launched; ++i) {
|
||||
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
||||
@ -435,7 +440,7 @@ static void poll_spawns(int fd, short args, void *cbdata)
|
||||
}
|
||||
failed_launch = false;
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(state);
|
||||
|
||||
|
@ -45,6 +45,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/comm/comm.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
@ -115,6 +116,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
char *hosts=NULL;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
||||
"%s ras:base:allocate",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
@ -45,7 +46,7 @@
|
||||
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_job_t *jdata;
|
||||
orte_node_t *node;
|
||||
int rc, i, ppx = 0;
|
||||
bool did_map, given, pernode = false;
|
||||
@ -54,6 +55,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
orte_vpid_t nprocs;
|
||||
orte_app_context_t *app;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_MAP;
|
||||
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
|
||||
@ -87,8 +88,10 @@ static void cleanup(int sd, short args, void *cbdata)
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(active);
|
||||
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
|
||||
if (NULL != active) {
|
||||
ORTE_POST_OBJECT(active);
|
||||
*active = false;
|
||||
}
|
||||
}
|
||||
@ -128,6 +131,7 @@ static int orte_rml_base_close(void)
|
||||
opal_event_set(orte_event_base, &ev, -1,
|
||||
OPAL_EV_WRITE, cleanup, (void*)&active);
|
||||
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
||||
ORTE_POST_OBJECT(ev);
|
||||
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_WAIT_FOR_COMPLETION(active);
|
||||
} else {
|
||||
@ -243,12 +247,14 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
|
||||
{
|
||||
orte_rml_recv_cb_t *blob = (orte_rml_recv_cb_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(blob);
|
||||
/* transfer the sender */
|
||||
blob->name.jobid = sender->jobid;
|
||||
blob->name.vpid = sender->vpid;
|
||||
/* just copy the payload to the buf */
|
||||
opal_dss.copy_payload(&blob->data, buffer);
|
||||
/* flag as complete */
|
||||
ORTE_POST_OBJECT(blob);
|
||||
blob->active = false;
|
||||
}
|
||||
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
@ -57,6 +58,8 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata)
|
||||
orte_rml_posted_recv_t *post, *recv;
|
||||
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
opal_output_verbose(5, orte_rml_base_framework.framework_output,
|
||||
"%s posting recv",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
@ -159,6 +162,8 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
|
||||
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
||||
opal_buffer_t buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(msg);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output,
|
||||
"%s message received from %s for tag %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
|
||||
@ -269,11 +270,7 @@ void orte_rml_API_recv_nb(orte_process_name_t* peer,
|
||||
req->post->persistent = persistent;
|
||||
req->post->cbfunc.iov = cbfunc;
|
||||
req->post->cbdata = cbdata;
|
||||
opal_event_set(orte_event_base, &req->ev, -1,
|
||||
OPAL_EV_WRITE,
|
||||
orte_rml_base_post_recv, req);
|
||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||
}
|
||||
|
||||
/** Receive non-blocking buffer message */
|
||||
@ -300,11 +297,7 @@ void orte_rml_API_recv_buffer_nb(orte_process_name_t* peer,
|
||||
req->post->persistent = persistent;
|
||||
req->post->cbfunc.buffer = cbfunc;
|
||||
req->post->cbdata = cbdata;
|
||||
opal_event_set(orte_event_base, &req->ev, -1,
|
||||
OPAL_EV_WRITE,
|
||||
orte_rml_base_post_recv, req);
|
||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||
}
|
||||
|
||||
/** Cancel posted non-blocking receive */
|
||||
@ -316,6 +309,8 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
|
||||
"%s rml_recv_cancel for peer %s tag %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(peer), tag);
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
if (!orte_event_base_active) {
|
||||
/* no event will be processed any more, so simply return. */
|
||||
return;
|
||||
@ -328,11 +323,7 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
|
||||
req->post->peer.jobid = peer->jobid;
|
||||
req->post->peer.vpid = peer->vpid;
|
||||
req->post->tag = tag;
|
||||
opal_event_set(orte_event_base, &req->ev, -1,
|
||||
OPAL_EV_WRITE,
|
||||
orte_rml_base_post_recv, req);
|
||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||
}
|
||||
|
||||
/** Purge information */
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
@ -39,6 +40,8 @@ static void send_self_exe(int fd, short args, void* data)
|
||||
{
|
||||
orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(xfer);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
|
||||
"%s rml_send_to_self callback executing for tag %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag));
|
||||
@ -130,9 +133,7 @@ int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod,
|
||||
xfer->tag = tag;
|
||||
xfer->cbdata = cbdata;
|
||||
/* setup the event for the send callback */
|
||||
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
|
||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
|
||||
|
||||
/* copy the message for the recv */
|
||||
rcv = OBJ_NEW(orte_rml_recv_t);
|
||||
@ -235,9 +236,7 @@ int orte_rml_oob_send_buffer_nb(struct orte_rml_base_module_t *mod,
|
||||
xfer->tag = tag;
|
||||
xfer->cbdata = cbdata;
|
||||
/* setup the event for the send callback */
|
||||
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
|
||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
||||
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
|
||||
|
||||
/* copy the message for the recv */
|
||||
rcv = OBJ_NEW(orte_rml_recv_t);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -39,11 +39,9 @@ static void set(orte_job_t *jdata,
|
||||
int write_fd);
|
||||
|
||||
orte_rtc_base_module_t orte_rtc_hwloc_module = {
|
||||
init,
|
||||
finalize,
|
||||
NULL,
|
||||
set,
|
||||
NULL
|
||||
.init = init,
|
||||
.finalize = finalize,
|
||||
.set = set
|
||||
};
|
||||
|
||||
static int init(void)
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/state/base/state_private.h"
|
||||
@ -78,9 +79,7 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
|
||||
caddy->job_state = state;
|
||||
OBJ_RETAIN(jdata);
|
||||
}
|
||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
||||
opal_event_set_priority(&caddy->ev, s->priority);
|
||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -107,14 +106,12 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
|
||||
caddy->job_state = state;
|
||||
OBJ_RETAIN(jdata);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
|
||||
"%s ACTIVATING JOB %s STATE %s PRI %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(state), s->priority));
|
||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
||||
opal_event_set_priority(&caddy->ev, s->priority);
|
||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
|
||||
"%s ACTIVATING JOB %s STATE %s PRI %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(state), s->priority));
|
||||
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||
}
|
||||
|
||||
|
||||
@ -262,9 +259,7 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
|
||||
caddy = OBJ_NEW(orte_state_caddy_t);
|
||||
caddy->name = *proc;
|
||||
caddy->proc_state = state;
|
||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
||||
opal_event_set_priority(&caddy->ev, s->priority);
|
||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -288,14 +283,12 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
|
||||
caddy = OBJ_NEW(orte_state_caddy_t);
|
||||
caddy->name = *proc;
|
||||
caddy->proc_state = state;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
|
||||
"%s ACTIVATING PROC %s STATE %s PRI %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), s->priority));
|
||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
||||
opal_event_set_priority(&caddy->ev, s->priority);
|
||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
|
||||
"%s ACTIVATING PROC %s STATE %s PRI %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), s->priority));
|
||||
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||
}
|
||||
|
||||
int orte_state_base_add_proc_state(orte_proc_state_t state,
|
||||
@ -443,7 +436,10 @@ void orte_state_base_local_launch_complete(int fd, short argc, void *cbdata)
|
||||
void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:cleanup on job %s",
|
||||
@ -460,9 +456,12 @@ void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
|
||||
void orte_state_base_report_progress(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -659,14 +658,18 @@ static void _send_notification(int status,
|
||||
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
orte_process_name_t *proc;
|
||||
orte_proc_state_t state;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pdata;
|
||||
int i;
|
||||
char *rtmod;
|
||||
orte_process_name_t parent, target, *npptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
proc = &caddy->name;
|
||||
state = caddy->proc_state;
|
||||
|
||||
opal_output_verbose(5, orte_state_base_framework.framework_output,
|
||||
"%s state:base:track_procs called for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -811,8 +814,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
orte_std_cntr_t j;
|
||||
@ -827,6 +829,9 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||
void *nptr;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_complete on job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -223,6 +224,8 @@ static void init_complete(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* nothing to do here but move along - if it is the
|
||||
* daemon job, then next step is allocate */
|
||||
if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
@ -249,6 +252,8 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
int32_t numbytes;
|
||||
char *nidmap;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* if this is my job, then we are done */
|
||||
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
||||
/* send the daemon map to every daemon in this DVM - we
|
||||
@ -353,8 +358,7 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
static void check_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
orte_node_t *node;
|
||||
@ -362,6 +366,9 @@ static void check_complete(int fd, short args, void *cbdata)
|
||||
orte_std_cntr_t index;
|
||||
char *rtmod;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||
"%s state:dvm:check_job_complete on job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -472,7 +479,10 @@ static void check_complete(int fd, short args, void *cbdata)
|
||||
static void cleanup_job(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = caddy->jdata;
|
||||
|
||||
/* remove this object from the job array */
|
||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -196,12 +197,15 @@ static int finalize(void)
|
||||
static void allocation_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
orte_job_t *jdata;
|
||||
orte_job_t *daemons;
|
||||
orte_topology_t *t;
|
||||
orte_node_t *node;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = state->jdata;
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
|
||||
/* get the daemon job object */
|
||||
@ -252,7 +256,10 @@ static void allocation_complete(int fd, short args, void *cbdata)
|
||||
static void map_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = state->jdata;
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
||||
/* move to the map stage */
|
||||
@ -265,7 +272,10 @@ static void map_complete(int fd, short args, void *cbdata)
|
||||
static void vm_ready(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
orte_job_t *jdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
jdata = state->jdata;
|
||||
|
||||
/* now that the daemons are launched, we are ready
|
||||
* to roll
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/orted/pmix/pmix_server_internal.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -165,6 +166,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
|
||||
orte_proc_t *child;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s state:orted:track_jobs sending local launch complete for job %s",
|
||||
@ -251,8 +254,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
|
||||
static void track_procs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
orte_process_name_t *proc;
|
||||
orte_proc_state_t state;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pdata, *pptr;
|
||||
opal_buffer_t *alert;
|
||||
@ -264,6 +267,10 @@ static void track_procs(int fd, short argc, void *cbdata)
|
||||
orte_node_t *node;
|
||||
orte_process_name_t target;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
proc = &caddy->name;
|
||||
state = caddy->proc_state;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||
"%s state:orted:track_procs called for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -77,6 +77,7 @@
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
#include "orte/util/compress.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -919,6 +920,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
/* ensure all local procs are dead */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
@ -68,6 +68,7 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "pmix_server.h"
|
||||
@ -350,6 +351,8 @@ static void _mdxresp(int sd, short args, void *cbdata)
|
||||
int rc;
|
||||
opal_buffer_t *reply;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
/* check us out of the hotel */
|
||||
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
|
||||
|
||||
@ -399,6 +402,8 @@ static void modex_resp(int status,
|
||||
pmix_server_req_t *req = (pmix_server_req_t*)cbdata;
|
||||
opal_buffer_t xfer;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
req->status = status;
|
||||
/* we need to preserve the data as the caller
|
||||
* will free it upon our return */
|
||||
@ -413,6 +418,7 @@ static void modex_resp(int status,
|
||||
opal_event_set(orte_event_base, &(req->ev),
|
||||
-1, OPAL_EV_WRITE, _mdxresp, req);
|
||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(req);
|
||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||
}
|
||||
static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
|
@ -44,6 +44,7 @@
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
|
||||
@ -103,6 +104,8 @@ static void spawn(int sd, short args, void *cbdata)
|
||||
opal_buffer_t *buf;
|
||||
orte_plm_cmd_flag_t command;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
/* add this request to our tracker hotel */
|
||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||
@ -351,6 +354,8 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
|
||||
orte_job_t *jdata;
|
||||
opal_buffer_t buf;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* if we failed to get the required data, then just inform
|
||||
* the embedded server that the connect cannot succeed */
|
||||
if (ORTE_SUCCESS != status || NULL == data) {
|
||||
@ -402,6 +407,8 @@ static void _cnct(int sd, short args, void *cbdata)
|
||||
orte_job_t *jdata;
|
||||
int rc = ORTE_SUCCESS;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* at some point, we need to add bookeeping to track which
|
||||
* procs are "connected" so we know who to notify upon
|
||||
* termination or failure. For now, we have to ensure
|
||||
@ -477,6 +484,8 @@ static void mdxcbfunc(int status,
|
||||
{
|
||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* ack the call */
|
||||
if (NULL != cd->cbfunc) {
|
||||
cd->cbfunc(status, cd->cbdata);
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -59,6 +60,8 @@ static void pmix_server_release(int status, opal_buffer_t *buf, void *cbdata)
|
||||
int32_t ndata = 0;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* unload the buffer */
|
||||
if (NULL != buf) {
|
||||
rc = opal_dss.unload(buf, (void**)&data, &ndata);
|
||||
@ -135,6 +138,8 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
||||
uint8_t *data=NULL;
|
||||
int32_t sz=0;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(rq);
|
||||
|
||||
/* a race condition exists here because of the thread-shift - it is
|
||||
* possible that data for the specified proc arrived while we were
|
||||
* waiting to be serviced. In that case, the tracker that would have
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include "orte/mca/schizo/schizo.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
@ -57,6 +58,8 @@ static void _client_conn(int sd, short args, void *cbdata)
|
||||
orte_proc_t *p, *ptr;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
if (NULL != cd->server_object) {
|
||||
/* we were passed back the orte_proc_t */
|
||||
p = (orte_proc_t*)cd->server_object;
|
||||
@ -106,6 +109,8 @@ static void _client_finalized(int sd, short args, void *cbdata)
|
||||
orte_proc_t *p, *ptr;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
if (NULL != cd->server_object) {
|
||||
/* we were passed back the orte_proc_t */
|
||||
p = (orte_proc_t*)cd->server_object;
|
||||
@ -164,6 +169,8 @@ static void _client_abort(int sd, short args, void *cbdata)
|
||||
orte_proc_t *p, *ptr;
|
||||
int i;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
if (NULL != cd->server_object) {
|
||||
p = (orte_proc_t*)cd->server_object;
|
||||
} else {
|
||||
@ -214,6 +221,8 @@ static void _register_events(int sd, short args, void *cbdata)
|
||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||
opal_value_t *info;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* the OPAL layer "owns" the list, but let's deconstruct it
|
||||
* here so we don't have to duplicate the data */
|
||||
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
||||
@ -246,6 +255,8 @@ static void _deregister_events(int sd, short args, void *cbdata)
|
||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||
opal_value_t *info, *iptr, *nptr;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
/* the OPAL layer "owns" the list, but let's deconstruct it
|
||||
* here for consistency */
|
||||
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
||||
@ -281,6 +292,8 @@ static void _notify_release(int status, void *cbdata)
|
||||
{
|
||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
if (NULL != cd->info) {
|
||||
OPAL_LIST_RELEASE(cd->info);
|
||||
}
|
||||
@ -465,6 +478,8 @@ static void _query(int sd, short args, void *cbdata)
|
||||
opal_pstats_t pstat;
|
||||
float pss;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||
"%s processing query",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
@ -654,6 +669,7 @@ int pmix_server_query_fn(opal_process_name_t *requestor,
|
||||
opal_event_set(orte_event_base, &(cd->ev), -1,
|
||||
OPAL_EV_WRITE, _query, cd);
|
||||
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(cd);
|
||||
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -669,6 +685,8 @@ static void _toolconn(int sd, short args, void *cbdata)
|
||||
orte_process_name_t tool;
|
||||
int rc;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(cd);
|
||||
|
||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||
"%s TOOL CONNECTION PROCESSING",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
@ -768,6 +786,7 @@ void pmix_tool_connected_fn(opal_list_t *info,
|
||||
opal_event_set(orte_event_base, &(cd->ev), -1,
|
||||
OPAL_EV_WRITE, _toolconn, cd);
|
||||
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(cd);
|
||||
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
||||
|
||||
}
|
||||
|
@ -43,9 +43,11 @@
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/sys/atomic.h"
|
||||
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -119,6 +121,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
opal_event_set(orte_event_base, &(_req->ev), \
|
||||
-1, OPAL_EV_WRITE, (cf), _req); \
|
||||
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
||||
ORTE_POST_OBJECT(_req); \
|
||||
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
@ -133,6 +136,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
opal_event_set(orte_event_base, &(_req->ev), \
|
||||
-1, OPAL_EV_WRITE, (cf), _req); \
|
||||
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
||||
ORTE_POST_OBJECT(_req); \
|
||||
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
@ -147,6 +151,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
||||
OPAL_EV_WRITE, (fn), _cd); \
|
||||
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
||||
ORTE_POST_OBJECT(_cd); \
|
||||
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
@ -165,6 +170,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
||||
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
||||
OPAL_EV_WRITE, (fn), _cd); \
|
||||
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
||||
ORTE_POST_OBJECT(_cd); \
|
||||
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -150,6 +151,8 @@ static void execute(int sd, short args, void *cbdata)
|
||||
opal_buffer_t *xfer;
|
||||
orte_process_name_t *target;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(req);
|
||||
|
||||
if (!orte_pmix_server_globals.pubsub_init) {
|
||||
/* we need to initialize our connection to the server */
|
||||
if (ORTE_SUCCESS != (rc = init_server())) {
|
||||
@ -298,6 +301,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
|
||||
opal_event_set(orte_event_base, &(req->ev),
|
||||
-1, OPAL_EV_WRITE, execute, req);
|
||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(req);
|
||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -395,6 +399,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
|
||||
opal_event_set(orte_event_base, &(req->ev),
|
||||
-1, OPAL_EV_WRITE, execute, req);
|
||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(req);
|
||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -483,6 +488,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
|
||||
opal_event_set(orte_event_base, &(req->ev),
|
||||
-1, OPAL_EV_WRITE, execute, req);
|
||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||
ORTE_POST_OBJECT(req);
|
||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
@ -54,6 +54,7 @@
|
||||
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -75,6 +76,8 @@ void orte_quit(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != caddy) {
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -135,6 +138,7 @@ void orte_quit(int fd, short args, void *cbdata)
|
||||
* so we will exit
|
||||
*/
|
||||
orte_event_base_active = false;
|
||||
ORTE_POST_OBJECT(orte_event_base_active);
|
||||
/* break out of the event loop */
|
||||
opal_event_base_loopbreak(orte_event_base);
|
||||
}
|
||||
|
@ -13,7 +13,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -60,6 +60,7 @@
|
||||
#include "orte/constants.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -188,6 +189,8 @@ static void cancel_callback(int fd, short args, void *cbdata)
|
||||
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
|
||||
orte_wait_tracker_t *t2;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(trk);
|
||||
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (t2->child == trk->child) {
|
||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||
@ -214,9 +217,7 @@ void orte_wait_cb_cancel(orte_proc_t *child)
|
||||
trk = OBJ_NEW(orte_wait_tracker_t);
|
||||
OBJ_RETAIN(child); // protect against race conditions
|
||||
trk->child = child;
|
||||
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, cancel_callback, trk);
|
||||
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
|
||||
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
|
||||
ORTE_THREADSHIFT(trk, orte_event_base, cancel_callback, ORTE_SYS_PRI);
|
||||
}
|
||||
|
||||
|
||||
@ -228,6 +229,8 @@ static void wait_signal_callback(int fd, short event, void *arg)
|
||||
pid_t pid;
|
||||
orte_wait_tracker_t *t2;
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(signal);
|
||||
|
||||
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) {
|
||||
return;
|
||||
}
|
||||
|
@ -13,7 +13,7 @@
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -48,6 +48,7 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -95,6 +96,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
||||
struct timespec tp = {0, 100000}; \
|
||||
nanosleep(&tp, NULL); \
|
||||
} \
|
||||
ORTE_ACQUIRE_OBJECT(flg); \
|
||||
}while(0);
|
||||
|
||||
/**
|
||||
@ -135,6 +137,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
||||
"defining timeout: %ld sec %ld usec at %s:%d", \
|
||||
(long)tmp->tv.tv_sec, (long)tmp->tv.tv_usec, \
|
||||
__FILE__, __LINE__)); \
|
||||
ORTE_POST_OBJECT(tmp); \
|
||||
opal_event_evtimer_add(tmp->ev, &tmp->tv); \
|
||||
}while(0); \
|
||||
|
||||
@ -161,6 +164,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
||||
"defining timer event: %ld sec %ld usec at %s:%d", \
|
||||
(long)tm->tv.tv_sec, (long)tm->tv.tv_usec, \
|
||||
__FILE__, __LINE__)); \
|
||||
ORTE_POST_OBJECT(tm); \
|
||||
opal_event_evtimer_add(tm->ev, &tm->tv); \
|
||||
}while(0); \
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
|
||||
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
|
||||
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
|
||||
mapper reducer opal_hotel orte_dfs ulfm pmixtool
|
||||
mapper reducer opal_hotel orte_dfs ulfm pmixtool threads
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
@ -19,3 +19,6 @@ oob_stress:
|
||||
|
||||
pmixtool:
|
||||
ortecc -o pmixtool pmixtool.c -lpmix
|
||||
|
||||
# Memory-consistency test for the thread-shift design.
# Libraries are listed after the source file so the linker resolves the
# symbols threads.c references (same ordering as the pmixtool rule).
threads:
	ortecc -O0 -g threads.c -o threads -lpthread -lhwloc
|
||||
|
335
orte/test/system/threads.c
Обычный файл
335
orte/test/system/threads.c
Обычный файл
@ -0,0 +1,335 @@
|
||||
/*
|
||||
* Test program for memory consistency in a thread shifting design
|
||||
*
|
||||
*
|
||||
* Run:
|
||||
* ./threads ITERATIONS [MODE]
|
||||
* ./threads 9000000 3
|
||||
*
|
||||
* Example:
|
||||
* ./threads 9000000 0 --> Will fail, no memory barriers
|
||||
* ./threads 9000000 1 --> Will fail, no WMB
|
||||
* ./threads 9000000 2 --> Will fail, no RMB
|
||||
* ./threads 9000000 3 --> Success
|
||||
* ./threads 9000000 4 --> Success
|
||||
* ./threads 9000000 5 --> N/A
|
||||
*/
|
||||
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#include <stdint.h>
#include <unistd.h>
#include <hwloc.h>
#include <sys/time.h>

#include "opal/sys/atomic.h"
|
||||
|
||||
|
||||
// Max value for an int16_t
// Used as the modulus for the sequence number stored in data.int16, so the
// stored value always fits in that member.
#define MAX_VAL 32767

// Payload handed from the support thread to the main thread.
// The support thread allocates one per iteration; the main thread checks it.
typedef struct {
    int type;           // tag: the support thread writes 999, main verifies it
    union {
        bool flag;
        int integer;
        int8_t int8;
        int16_t int16;  // per-iteration sequence number (the only member this test uses)
        int32_t int32;
        int64_t int64;
        //char padding[1];
    } data;
} my_value_t;
|
||||
|
||||
// Structure to handoff work to the peer thread
typedef struct {
    // Set to true by the main thread to request work; set back to false by
    // the peer thread once the value has been allocated and filled in.
    volatile bool working;
    // Address of the main thread's `my_value_t *`; the peer thread casts it
    // to `my_value_t **` and stores the new allocation through it.
    void *ptr; // Note that adding a volatile here has no effect
} thread_handoff_t;

// Shared object to handoff work
// (single instance: one main thread, one peer thread)
thread_handoff_t handoff;
|
||||
|
||||
// Indicates if the test has finished.
// Written once by the main thread after the work loop and polled by the
// support thread in its loop condition, so it is volatile like
// handoff.working to keep the compiler from caching the flag.
// A C11 atomic is deliberately not used: its implied ordering would add
// the very barriers this test is trying to toggle.
volatile bool time_to_stop = false;
|
||||
|
||||
// Progress reporting
// PERC_INC is the step (in percent) between progress lines;
// perc_report_after is the next threshold, perc_current the latest value.
#define PERC_INC 10.0
double perc_report_after = PERC_INC;
double perc_current = 0.0;

// Memory barrier modes
// Bit flags tested by both threads to decide which opal_atomic_*mb() calls
// to issue around the handoff. MB_MODE_XMB is selectable (mode 5) but no
// code in this file acts on it.
#define MB_MODE_NONE 0x0
#define MB_MODE_RMB 0x1
#define MB_MODE_WMB 0x2
#define MB_MODE_MB 0x4
#define MB_MODE_XMB 0x8
#define MB_MODE_ALL (MB_MODE_RMB | MB_MODE_WMB)
int mb_mode = MB_MODE_ALL;


// Shared hwloc topology (so we only have to read it once)
static hwloc_topology_t topo;
// Which object we are binding to
// 4 - sockets with 5 cores each
// 20 - cores with 8 PUs each
// NOTE(review): the counts above presumably describe the author's test
// machine; they are not used by the code.
//#define OBJ_TYPE HWLOC_OBJ_SOCKET
#define OBJ_TYPE HWLOC_OBJ_CORE
|
||||
|
||||
/*
 * Timing support: accumulated and per-iteration wall-clock bookkeeping
 * shared with the main work loop (all values in seconds, except delta,
 * which main stores in microseconds per iteration).
 */
double acc_time;
double start_time;
double stop_time;
double delta;

/*
 * Return the current gettimeofday() reading as fractional seconds.
 */
static double get_ts_gettimeofday(void) {
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);
    seconds = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
}
|
||||
|
||||
/*
 * Bind either the main or support thread far away from each other
 *
 * main_thread: true binds the calling thread to the first OBJ_TYPE object
 *              in the topology, false binds it to the last one.
 * Prints the resulting cpuset; exits the process on an invalid object.
 */
void bind_me_to(bool main_thread);

/*
 * Support thread to do the memory allocation and xfer
 *
 * pthread start routine; arg is not used. Runs until time_to_stop is set.
 */
void *value_xfer_thread(void *arg);
|
||||
|
||||
/*
|
||||
* Main thread
|
||||
*/
|
||||
int main(int argc, char **argv) {
|
||||
pthread_t support_thread;
|
||||
int rc, i, max_iters = 10, cur_iter;
|
||||
my_value_t *val = NULL;
|
||||
int mode;
|
||||
|
||||
/*
|
||||
* Parse command line arguments
|
||||
*/
|
||||
if( argc > 1 ) {
|
||||
max_iters = atoi(argv[1]);
|
||||
}
|
||||
if( argc > 2 ) {
|
||||
mode = atoi(argv[2]);
|
||||
if( 0 > mode || mode > 5 ) {
|
||||
printf("Error: Invalid mode %d\n"
|
||||
"\tNone = 0\n"
|
||||
"\tRMB = 1\n"
|
||||
"\tWMB = 2\n"
|
||||
"\tBoth = 3\n"
|
||||
"\tMB Only = 4\n",
|
||||
"\tXMB Only = 5\n",
|
||||
mode);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
mode = 3;
|
||||
}
|
||||
switch(mode) {
|
||||
case 0:
|
||||
mb_mode = MB_MODE_NONE;
|
||||
break;
|
||||
case 1:
|
||||
mb_mode = MB_MODE_RMB;
|
||||
break;
|
||||
case 2:
|
||||
mb_mode = MB_MODE_WMB;
|
||||
break;
|
||||
case 3:
|
||||
mb_mode = MB_MODE_ALL;
|
||||
break;
|
||||
case 4:
|
||||
mb_mode = MB_MODE_MB;
|
||||
break;
|
||||
case 5:
|
||||
mb_mode = MB_MODE_XMB;
|
||||
break;
|
||||
}
|
||||
|
||||
// Load hwloc topology
|
||||
hwloc_topology_init(&topo);
|
||||
hwloc_topology_load(topo);
|
||||
|
||||
// Display banner
|
||||
printf("---------------------------\n");
|
||||
printf("Iterations: %10d\n", max_iters);
|
||||
printf("Mode R MB : %10s\n", (mb_mode & MB_MODE_RMB ? "Enabled" : "Disabled") );
|
||||
printf("Mode W MB : %10s\n", (mb_mode & MB_MODE_WMB ? "Enabled" : "Disabled") );
|
||||
printf("Mode - MB : %10s\n", (mb_mode & MB_MODE_MB ? "Enabled" : "Disabled") );
|
||||
printf("Mode X MB : %10s\n", (mb_mode & MB_MODE_XMB ? "Enabled" : "Disabled") );
|
||||
printf("---------------------------\n");
|
||||
|
||||
bind_me_to(true);
|
||||
handoff.working = false;
|
||||
|
||||
/*
|
||||
* Launch supporting thread
|
||||
*/
|
||||
rc = pthread_create(&support_thread, NULL, value_xfer_thread, NULL);
|
||||
if( 0 != rc ) {
|
||||
printf("Error: Failed to create a thread! %d\n", rc);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Main work loop
|
||||
*/
|
||||
acc_time = 0.0;
|
||||
for(cur_iter = 0; cur_iter < max_iters; ++cur_iter) {
|
||||
perc_current = (cur_iter / ((double)max_iters)) * 100.0;
|
||||
if( perc_current > perc_report_after ) {
|
||||
delta = (acc_time / cur_iter) * 1000000;
|
||||
printf("%6.1f %% complete : Iteration %10d / %10d : %6.1f usec / iter\n",
|
||||
perc_current, cur_iter+1, max_iters, delta);
|
||||
perc_report_after += PERC_INC;
|
||||
}
|
||||
|
||||
start_time = get_ts_gettimeofday();
|
||||
// Initialize values
|
||||
val = NULL;
|
||||
handoff.ptr = &val;
|
||||
if( mb_mode & MB_MODE_RMB ) {
|
||||
opal_atomic_rmb();
|
||||
}
|
||||
if( mb_mode & MB_MODE_MB ) {
|
||||
opal_atomic_mb();
|
||||
}
|
||||
handoff.working = true;
|
||||
|
||||
// Wait for work to finish
|
||||
while( handoff.working ) {
|
||||
usleep(1);
|
||||
}
|
||||
if( mb_mode & MB_MODE_WMB ) {
|
||||
opal_atomic_wmb();
|
||||
}
|
||||
if( mb_mode & MB_MODE_MB ) {
|
||||
opal_atomic_mb();
|
||||
}
|
||||
|
||||
// Inspect values for correctness
|
||||
if( NULL == val ) {
|
||||
printf("[%10d / %10d] Error: val = %s\n", cur_iter+1, max_iters,
|
||||
(NULL == val ? "NULL" : "Valid") );
|
||||
exit(-1);
|
||||
}
|
||||
else if( 999 != val->type ) {
|
||||
printf("[%10d / %10d] Error: val->type = %d\n", cur_iter+1, max_iters, val->type);
|
||||
exit(-1);
|
||||
}
|
||||
else if( (cur_iter+1)%MAX_VAL != val->data.int16 ) {
|
||||
printf("[%10d / %10d] Error: val->data.int16 = %d\n", cur_iter+1, max_iters, val->data.int16);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
stop_time = get_ts_gettimeofday();
|
||||
acc_time += (stop_time - start_time);
|
||||
|
||||
// Yes, this is a memory leak!
|
||||
// I need to make sure that the supporting thread is not reusing a
|
||||
// previous storage location when it calls malloc. This is to emulate
|
||||
// a program that calls malloc after the value was acquired, possibly
|
||||
// reusing this memory location.
|
||||
//free(val);
|
||||
val = NULL;
|
||||
}
|
||||
delta = (acc_time / max_iters) * 1000000;
|
||||
|
||||
/*
|
||||
* All done - Cleanup
|
||||
*/
|
||||
time_to_stop = true;
|
||||
|
||||
rc = pthread_join(support_thread, NULL);
|
||||
if( 0 != rc ) {
|
||||
printf("Error: Failed to join a thread! %d\n", rc);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
hwloc_topology_destroy(topo);
|
||||
|
||||
printf("Success - %6.1f usec / iter\n", delta);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void *value_xfer_thread(void *arg) {
|
||||
my_value_t **val = NULL;
|
||||
static int var = 0;
|
||||
|
||||
// Bind this thread away from the main thread
|
||||
bind_me_to(false);
|
||||
|
||||
while( !time_to_stop ) {
|
||||
if( handoff.working ) {
|
||||
// Make sure I have the right pointer
|
||||
if( mb_mode & MB_MODE_WMB ) {
|
||||
opal_atomic_wmb();
|
||||
}
|
||||
if( mb_mode & MB_MODE_MB ) {
|
||||
opal_atomic_mb();
|
||||
}
|
||||
|
||||
// Allocate and set the value
|
||||
val = (my_value_t**)handoff.ptr;
|
||||
(*val) = malloc(sizeof(my_value_t));
|
||||
(*val)->type = 999;
|
||||
(*val)->data.int16 = (++var)%MAX_VAL;
|
||||
|
||||
// Make sure main thread can see the value
|
||||
// See 'Examples' -> 'Global thread flag' discussion here:
|
||||
// https://www.ibm.com/developerworks/systems/articles/powerpc.html
|
||||
if( mb_mode & MB_MODE_RMB ) {
|
||||
opal_atomic_rmb();
|
||||
}
|
||||
if( mb_mode & MB_MODE_MB ) {
|
||||
opal_atomic_mb();
|
||||
}
|
||||
// Release main thread
|
||||
handoff.working = false;
|
||||
}
|
||||
else {
|
||||
// wait for work
|
||||
usleep(1);
|
||||
}
|
||||
}
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
void bind_me_to(bool main_thread) {
|
||||
int num_objs;
|
||||
hwloc_cpuset_t set;
|
||||
char *buffer = NULL;
|
||||
hwloc_obj_t obj;
|
||||
|
||||
num_objs = hwloc_get_nbobjs_by_type(topo, OBJ_TYPE);
|
||||
|
||||
if( main_thread ) {
|
||||
obj = hwloc_get_obj_by_type(topo, OBJ_TYPE, 0);
|
||||
}
|
||||
else {
|
||||
obj = hwloc_get_obj_by_type(topo, OBJ_TYPE, num_objs-1);
|
||||
}
|
||||
|
||||
if( obj->type == OBJ_TYPE ) {
|
||||
hwloc_set_cpubind(topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
|
||||
}
|
||||
else {
|
||||
printf("Error: Invalid object\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
set = hwloc_bitmap_alloc();
|
||||
hwloc_get_cpubind(topo, set, HWLOC_CPUBIND_THREAD);
|
||||
hwloc_bitmap_asprintf(&buffer, set);
|
||||
printf("%s : [objs = %d] : cpuset is %s\n", (main_thread ? "Main" : "Peer"), num_objs, buffer);
|
||||
free(buffer);
|
||||
hwloc_bitmap_free(set);
|
||||
}
|
@ -84,6 +84,7 @@
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
@ -490,6 +491,7 @@ int main(int argc, char *argv[])
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
/* cleanup and leave */
|
||||
orte_finalize();
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -54,6 +54,7 @@
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/orted/orted.h"
|
||||
@ -283,6 +284,7 @@ int main(int argc, char *argv[])
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
/* should never get here, but if we do... */
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -58,6 +58,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/threads.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
@ -532,6 +533,7 @@ SEND:
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
/***************
|
||||
* Cleanup
|
||||
|
@ -87,6 +87,7 @@
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -198,6 +199,7 @@ int orterun(int argc, char *argv[])
|
||||
while (orte_event_base_active && launchst.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Job %s has launched",
|
||||
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
|
||||
@ -209,6 +211,7 @@ int orterun(int argc, char *argv[])
|
||||
while (orte_event_base_active && completest.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* ensure all local procs are dead */
|
||||
|
@ -43,14 +43,14 @@ AM_LFLAGS = -Porte_util_hostfile_
|
||||
LEX_OUTPUT_ROOT = lex.orte_util_hostfile_
|
||||
|
||||
headers += \
|
||||
util/name_fns.h \
|
||||
util/name_fns.h \
|
||||
util/proc_info.h \
|
||||
util/session_dir.h \
|
||||
util/show_help.h \
|
||||
util/error_strings.h \
|
||||
util/context_fns.h \
|
||||
util/parse_options.h \
|
||||
util/pre_condition_transports.h \
|
||||
util/context_fns.h \
|
||||
util/parse_options.h \
|
||||
util/pre_condition_transports.h \
|
||||
util/hnp_contact.h \
|
||||
util/hostfile/hostfile.h \
|
||||
util/hostfile/hostfile_lex.h \
|
||||
@ -60,7 +60,8 @@ headers += \
|
||||
util/regex.h \
|
||||
util/attr.h \
|
||||
util/listener.h \
|
||||
util/compress.h
|
||||
util/compress.h \
|
||||
util/threads.h
|
||||
|
||||
lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
|
||||
util/error_strings.c \
|
||||
@ -68,9 +69,9 @@ lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
|
||||
util/proc_info.c \
|
||||
util/session_dir.c \
|
||||
util/show_help.c \
|
||||
util/context_fns.c \
|
||||
util/parse_options.c \
|
||||
util/pre_condition_transports.c \
|
||||
util/context_fns.c \
|
||||
util/parse_options.c \
|
||||
util/pre_condition_transports.c \
|
||||
util/hnp_contact.c \
|
||||
util/hostfile/hostfile_lex.l \
|
||||
util/hostfile/hostfile.c \
|
||||
|
38
orte/util/threads.h
Обычный файл
38
orte/util/threads.h
Обычный файл
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_THREADS_H
|
||||
#define ORTE_THREADS_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/sys/atomic.h"
|
||||
|
||||
/* provide macros for forward-proofing the shifting
|
||||
* of objects between threads - at some point, we
|
||||
* may revamp our threading model */
|
||||
|
||||
/* post an object to another thread - for now, we
|
||||
* only have a memory barrier */
|
||||
#define ORTE_POST_OBJECT(o) opal_atomic_wmb()
|
||||
|
||||
/* acquire an object from another thread - for now,
|
||||
* we only have a memory barrier */
|
||||
#define ORTE_ACQUIRE_OBJECT(o) opal_atomic_rmb()
|
||||
|
||||
/* Threadshift macro: hand the object (x) over to event base (eb).
 *
 *   x  - pointer to an object that has an "ev" member; note that (x) is
 *        expanded several times, so it must be free of side effects
 *   eb - event base passed to opal_event_set()
 *   f  - callback passed to opal_event_set(); (x) is passed along with it
 *   p  - priority passed to opal_event_set_priority()
 *
 * The object is posted with ORTE_POST_OBJECT() after the event has been
 * configured and before it is activated.
 */
#define ORTE_THREADSHIFT(x, eb, f, p)                                   \
    do {                                                                \
        opal_event_set((eb), &((x)->ev), -1, OPAL_EV_WRITE, (f), (x));  \
        opal_event_set_priority(&((x)->ev), (p));                       \
        ORTE_POST_OBJECT((x));                                          \
        opal_event_active(&((x)->ev), OPAL_EV_WRITE, 1);                \
    } while(0)
|
||||
|
||||
#endif /* ORTE_THREADS_H */
|
Загрузка…
Ссылка в новой задаче
Block a user