1
1

Merge pull request #3659 from rhc54/topic/threads

Update OPAL and ORTE for thread safety
Этот коммит содержится в:
Ralph Castain 2017-06-06 14:52:40 -07:00 коммит произвёл GitHub
родитель 7be09f8143 93cf3c7203
Коммит 21fba8b7f3
84 изменённых файлов: 1080 добавлений и 1414 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -475,6 +475,7 @@ orte/test/system/opal_db
orte/test/system/ulfm
orte/test/system/pmixtool
orte/test/system/orte_notify
orte/test/system/threads
orte/tools/orte-checkpoint/orte-checkpoint
orte/tools/orte-checkpoint/orte-checkpoint.1

Просмотреть файл

@ -31,6 +31,7 @@
#include "opal/mca/hwloc/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/threads/threads.h"
#include "opal/util/argv.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
@ -164,6 +165,7 @@ static void return_local_event_hdlr(int status, opal_list_t *results,
pmix_status_t pstatus;
size_t n;
OPAL_ACQUIRE_OBJECT(cd);
if (NULL != cd->pmixcbfunc) {
op = OBJ_NEW(pmix2x_opcaddy_t);
@ -203,6 +205,8 @@ static void _event_hdlr(int sd, short args, void *cbdata)
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
opal_pmix2x_event_t *event;
OPAL_ACQUIRE_OBJECT(cd);
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s _EVENT_HDLR RECEIVED NOTIFICATION FOR HANDLER %d OF STATUS %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (int)cd->id, cd->status);
@ -312,6 +316,7 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id,
/* now push it into the local thread */
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
-1, EV_WRITE, _event_hdlr, cd);
OPAL_POST_OBJECT(cd);
opal_event_active(&cd->ev, EV_WRITE, 1);
}
@ -986,6 +991,7 @@ static void errreg_cbfunc (pmix_status_t status,
{
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
OPAL_ACQUIRE_OBJECT(op);
op->event->index = errhandler_ref;
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
"PMIX2x errreg_cbfunc - error handler registered status=%d, reference=%lu",
@ -1003,6 +1009,7 @@ static void _reg_hdlr(int sd, short args, void *cbdata)
opal_value_t *kv;
size_t n;
OPAL_ACQUIRE_OBJECT(cd);
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s REGISTER HANDLER CODES %s",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
@ -1067,6 +1074,7 @@ static void _dereg_hdlr(int sd, short args, void *cbdata)
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
opal_pmix2x_event_t *event;
OPAL_ACQUIRE_OBJECT(cd);
/* look for this event */
OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) {
if (cd->handler == event->index) {
@ -1116,6 +1124,8 @@ static void _notify(int sd, short args, void *cbdata)
pmix_data_range_t prange;
opal_pmix2x_jobid_trkr_t *job, *jptr;
OPAL_ACQUIRE_OBJECT(cd);
op = OBJ_NEW(pmix2x_opcaddy_t);
/* convert the status */
@ -1204,6 +1214,8 @@ static void infocbfunc(pmix_status_t status,
opal_value_t *iptr;
size_t n;
OPAL_ACQUIRE_OBJECT(cd);
/* convert the array of pmix_info_t to the list of info */
if (NULL != info) {
results = OBJ_NEW(opal_list_t);
@ -1294,6 +1306,8 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
{
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
OPAL_ACQUIRE_OBJECT(op);
if (NULL != op->opcbfunc) {
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
}

Просмотреть файл

@ -156,6 +156,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
_cd->cbdata = (cd); \
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
-1, EV_WRITE, (fn), (_cd)); \
OPAL_POST_OBJECT(_cd); \
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
} while(0)
@ -170,6 +171,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
_cd->cbdata = (cd); \
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
-1, EV_WRITE, (fn), (_cd)); \
OPAL_POST_OBJECT(_cd); \
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
} while(0)
@ -185,6 +187,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
_cd->cbdata = (cd); \
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
-1, EV_WRITE, (fn), (_cd)); \
OPAL_POST_OBJECT(_cd); \
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
} while(0)

Просмотреть файл

@ -27,6 +27,7 @@
#endif
#include "opal/hash_string.h"
#include "opal/threads/threads.h"
#include "opal/util/argv.h"
#include "opal/util/proc.h"
@ -44,6 +45,7 @@ static bool initialized = false;
while ((a)) { \
usleep(10); \
} \
OPAL_ACQUIRE_OBJECT(a); \
} while (0)
@ -53,11 +55,14 @@ static void errreg_cbfunc (pmix_status_t status,
{
opal_pmix2x_event_t *event = (opal_pmix2x_event_t*)cbdata;
OPAL_ACQUIRE_OBJECT(event);
event->index = errhandler_ref;
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
"PMIX client errreg_cbfunc - error handler registered status=%d, reference=%lu",
status, (unsigned long)errhandler_ref);
regactive = false;
OPAL_POST_OBJECT(regactive);
}
int pmix2x_client_init(opal_list_t *ilist)
@ -272,6 +277,7 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
{
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
OPAL_ACQUIRE_OBJECT(op);
if (NULL != op->opcbfunc) {
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
}
@ -521,6 +527,8 @@ static void val_cbfunc(pmix_status_t status,
int rc;
opal_value_t val, *v=NULL;
OPAL_ACQUIRE_OBJECT(op);
rc = pmix2x_convert_opalrc(status);
if (PMIX_SUCCESS == status && NULL != kv) {
rc = pmix2x_value_unload(&val, kv);
@ -768,6 +776,8 @@ static void lk_cbfunc(pmix_status_t status,
size_t n;
opal_pmix2x_jobid_trkr_t *job, *jptr;
OPAL_ACQUIRE_OBJECT(op);
/* this is in the PMIx local thread - need to threadshift to
* our own thread as we will be accessing framework-global
* lists and objects */
@ -817,7 +827,7 @@ static void lk_cbfunc(pmix_status_t status,
}
r = &results;
}
release:
release:
/* execute the callback */
op->lkcbfunc(rc, r, op->cbdata);
@ -994,6 +1004,8 @@ static void spcbfunc(pmix_status_t status,
opal_jobid_t jobid=OPAL_JOBID_INVALID;
opal_pmix2x_jobid_trkr_t *job;
OPAL_ACQUIRE_OBJECT(op);
/* this is in the PMIx local thread - need to threadshift to
* our own thread as we will be accessing framework-global
* lists and objects */

Просмотреть файл

@ -29,6 +29,7 @@
#include "opal/mca/hwloc/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/threads/threads.h"
#include "opal/util/argv.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
@ -142,6 +143,7 @@ static void opal_opcbfunc(int status, void *cbdata)
{
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
OPAL_ACQUIRE_OBJECT(opalcaddy);
if (NULL != opalcaddy->opcbfunc) {
opalcaddy->opcbfunc(pmix2x_convert_opalrc(status), opalcaddy->cbdata);
}

Просмотреть файл

@ -32,6 +32,7 @@
#include "opal/mca/hwloc/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/threads/threads.h"
#include "opal/util/argv.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
@ -58,6 +59,7 @@ static size_t errhdler_ref = 0;
while ((a)) { \
usleep(10); \
} \
OPAL_ACQUIRE_OBJECT(a); \
} while (0)
static void errreg_cbfunc (pmix_status_t status,
@ -66,10 +68,12 @@ static void errreg_cbfunc (pmix_status_t status,
{
volatile bool *active = (volatile bool*)cbdata;
OPAL_ACQUIRE_OBJECT(active);
errhdler_ref = errhandler_ref;
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
"PMIX server errreg_cbfunc - error handler registered status=%d, reference=%lu",
status, (unsigned long)errhandler_ref);
OPAL_POST_OBJECT(active);
*active = false;
}
@ -77,11 +81,14 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
{
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
OPAL_ACQUIRE_OBJECT(op);
if (NULL != op->opcbfunc) {
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
}
if (op->active) {
op->status = status;
OPAL_POST_OBJECT(op);
op->active = false;
} else {
OBJ_RELEASE(op);
@ -92,6 +99,7 @@ static void op2cbfunc(pmix_status_t status, void *cbdata)
{
volatile bool *active = (volatile bool*)cbdata;
OPAL_POST_OBJECT(active);
*active = false;
}
@ -165,6 +173,7 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
/* Completion callback: issues OPAL_POST_OBJECT on the flag whose address
 * arrives in cbdata and then sets that flag to false. The status argument
 * is not examined.
 * NOTE(review): cbdata is presumably the address of a flag that another
 * thread is waiting on - confirm against the code that passes this callback. */
static void fincb(pmix_status_t status, void *cbdata)
{
volatile bool *active = (volatile bool*)cbdata;
/* the barrier is issued before the flag is cleared */
OPAL_POST_OBJECT(active);
*active = false;
}
@ -211,6 +220,8 @@ static void _reg_nspace(int sd, short args, void *cbdata)
opal_pmix2x_jobid_trkr_t *job;
pmix2x_opcaddy_t op;
OPAL_ACQUIRE_OBJECT(cd);
/* we must threadshift this request as we might not be in an event
* and we are going to access framework-global lists/objects */
@ -301,6 +312,7 @@ int pmix2x_server_register_nspace(opal_jobid_t jobid,
} else {
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
-1, EV_WRITE, _reg_nspace, cd);
OPAL_POST_OBJECT(cd);
opal_event_active(&cd->ev, EV_WRITE, 1);
}
@ -311,10 +323,12 @@ static void tdcbfunc(pmix_status_t status, void *cbdata)
{
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
OPAL_ACQUIRE_OBJECT(cd);
if (NULL != cd->opcbfunc) {
cd->opcbfunc(pmix2x_convert_rc(status), cd->cbdata);
}
if (cd->active) {
OPAL_POST_OBJECT(cd);
cd->active = false;
} else {
OBJ_RELEASE(cd);
@ -326,6 +340,7 @@ static void _dereg_nspace(int sd, short args, void *cbdata)
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
opal_pmix2x_jobid_trkr_t *jptr;
OPAL_ACQUIRE_OBJECT(cd);
/* if we don't already have it, we can ignore this */
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
if (jptr->jobid == cd->jobid) {
@ -361,6 +376,7 @@ void pmix2x_server_deregister_nspace(opal_jobid_t jobid,
} else {
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
-1, EV_WRITE, _dereg_nspace, cd);
OPAL_POST_OBJECT(cd);
opal_event_active(&cd->ev, EV_WRITE, 1);
}
}
@ -397,6 +413,7 @@ static void _dereg_client(int sd, short args, void *cbdata)
opal_pmix2x_jobid_trkr_t *jptr;
pmix_proc_t p;
OPAL_ACQUIRE_OBJECT(cd);
/* if we don't already have it, we can ignore this */
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
if (jptr->jobid == cd->source->jobid) {
@ -431,6 +448,7 @@ void pmix2x_server_deregister_client(const opal_process_name_t *proc,
} else {
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
-1, EV_WRITE, _dereg_client, cd);
OPAL_POST_OBJECT(cd);
opal_event_active(&cd->ev, EV_WRITE, 1);
}
}

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -114,6 +115,19 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_thread_t);
opal_condition_broadcast((cnd)); \
} while(0);
/* Macros that mark the points where an object is handed from one
 * libevent thread to another. They exist so that call sites do not
 * have to change if the threading model is revamped later. */
/* Post an object to another thread. For now this expands to
 * opal_atomic_wmb() only; the argument (o) is not used by the
 * expansion. */
#define OPAL_POST_OBJECT(o) opal_atomic_wmb()
/* Acquire an object that was posted by another thread. For now this
 * expands to opal_atomic_rmb() only; the argument (o) is not used by
 * the expansion. */
#define OPAL_ACQUIRE_OBJECT(o) opal_atomic_rmb()
OPAL_DECLSPEC int opal_thread_start(opal_thread_t *);
OPAL_DECLSPEC int opal_thread_join(opal_thread_t *, void **thread_return);

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -33,6 +33,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
@ -507,6 +508,8 @@ static void process_opens(int fd, short args, void *cbdata)
opal_list_t lt;
opal_namelist_t *nm;
ORTE_ACQUIRE_OBJECT(dfs);
/* get the scheme to determine if we can process locally or not */
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
@ -661,7 +664,7 @@ static void dfs_open(char *uri,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_opens);
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
}
static void process_close(int fd, short args, void *cbdata)
@ -672,6 +675,8 @@ static void process_close(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(close_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s closing fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -757,7 +762,7 @@ static void dfs_close(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_close);
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
}
static void process_sizes(int fd, short args, void *cbdata)
@ -769,6 +774,8 @@ static void process_sizes(int fd, short args, void *cbdata)
int rc;
struct stat buf;
ORTE_ACQUIRE_OBJECT(size_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing get_size on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -881,7 +888,7 @@ static void dfs_get_file_size(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
}
@ -895,6 +902,8 @@ static void process_seeks(int fd, short args, void *cbdata)
int rc;
struct stat buf;
ORTE_ACQUIRE_OBJECT(seek_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing seek on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1035,7 +1044,7 @@ static void dfs_seek(int fd, long offset, int whence,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
}
static void process_reads(int fd, short args, void *cbdata)
@ -1048,6 +1057,8 @@ static void process_reads(int fd, short args, void *cbdata)
int64_t i64;
int rc;
ORTE_ACQUIRE_OBJECT(read_dfs);
/* look in our local records for this fd */
trk = NULL;
for (item = opal_list_get_first(&active_files);
@ -1145,7 +1156,7 @@ static void dfs_read(int fd, uint8_t *buffer,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_reads);
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
}
static void process_posts(int fd, short args, void *cbdata)
@ -1154,6 +1165,8 @@ static void process_posts(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
/* we will get confirmation in our receive function, so
* add this request to our list */
dfs->id = req_id++;
@ -1212,7 +1225,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_posts);
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
}
static void process_getfm(int fd, short args, void *cbdata)
@ -1221,6 +1234,8 @@ static void process_getfm(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
/* we will get confirmation in our receive function, so
* add this request to our list */
dfs->id = req_id++;
@ -1275,7 +1290,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
}
static void dfs_load_file_maps(orte_jobid_t jobid,
@ -1298,4 +1313,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
cbfunc(cbdata);
}
}

Просмотреть файл

@ -1,5 +1,6 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -76,14 +77,6 @@ typedef struct {
} orte_dfs_request_t;
OBJ_CLASS_DECLARATION(orte_dfs_request_t);
/* Hand a DFS request over for processing.
 *
 *   d  - pointer to the request; its embedded event (d)->ev is used
 *   cb - callback passed to opal_event_set together with (d)
 *
 * The event is set on orte_event_base, given priority ORTE_SYS_PRI and
 * then activated.
 *
 * The expansion deliberately does NOT end in a semicolon: the caller's
 * own ';' terminates the statement, which keeps the macro usable as the
 * body of an unbraced if/else. */
#define ORTE_DFS_POST_REQUEST(d, cb)                        \
    do {                                                    \
        opal_event_set(orte_event_base, &((d)->ev),         \
                       -1, OPAL_EV_WRITE, (cb), (d));       \
        opal_event_set_priority(&((d)->ev), ORTE_SYS_PRI);  \
        opal_event_active(&((d)->ev), OPAL_EV_WRITE, 1);    \
    } while(0)
END_C_DECLS
#endif

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -35,6 +35,7 @@
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/util/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
@ -304,6 +305,8 @@ static void process_opens(int fd, short args, void *cbdata)
int v;
orte_node_t *node, *nptr;
ORTE_ACQUIRE_OBJECT(dfs);
/* get the scheme to determine if we can process locally or not */
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
OBJ_RELEASE(dfs);
@ -465,7 +468,7 @@ static void dfs_open(char *uri,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_opens);
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
}
static void process_close(int fd, short args, void *cbdata)
@ -476,6 +479,8 @@ static void process_close(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(close_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s closing fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -561,7 +566,7 @@ static void dfs_close(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_close);
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
}
static void process_sizes(int fd, short args, void *cbdata)
@ -573,6 +578,8 @@ static void process_sizes(int fd, short args, void *cbdata)
int rc;
struct stat buf;
ORTE_ACQUIRE_OBJECT(size_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing get_size on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -665,7 +672,7 @@ static void dfs_get_file_size(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
}
@ -679,6 +686,8 @@ static void process_seeks(int fd, short args, void *cbdata)
int rc;
struct stat buf;
ORTE_ACQUIRE_OBJECT(seek_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing seek on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -814,7 +823,7 @@ static void dfs_seek(int fd, long offset, int whence,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
}
static void process_reads(int fd, short args, void *cbdata)
@ -827,6 +836,8 @@ static void process_reads(int fd, short args, void *cbdata)
int64_t i64;
int rc;
ORTE_ACQUIRE_OBJECT(read_dfs);
/* look in our local records for this fd */
trk = NULL;
for (item = opal_list_get_first(&active_files);
@ -924,7 +935,7 @@ static void dfs_read(int fd, uint8_t *buffer,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_reads);
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
}
static void process_posts(int fd, short args, void *cbdata)
@ -935,6 +946,8 @@ static void process_posts(int fd, short args, void *cbdata)
opal_list_item_t *item;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s posting file map containing %d bytes for target %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1009,7 +1022,7 @@ static void dfs_post_file_map(opal_buffer_t *buffer,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_posts);
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
}
static int get_job_maps(orte_dfs_jobfm_t *jfm,
@ -1057,6 +1070,8 @@ static void process_getfm(int fd, short args, void *cbdata)
int32_t n, ntotal;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
/* if the target job is WILDCARD, then process
* data for all jobids - else, find the one
*/
@ -1120,7 +1135,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
}
static void process_load(int fd, short args, void *cbdata)
@ -1135,6 +1150,8 @@ static void process_load(int fd, short args, void *cbdata)
int rc;
opal_buffer_t *xfer;
ORTE_ACQUIRE_OBJECT(dfs);
/* see if we already have a tracker for this job */
jfm = NULL;
for (item = opal_list_get_first(&file_maps);
@ -1233,7 +1250,7 @@ static void dfs_load_file_maps(orte_jobid_t jobid,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_load);
ORTE_THREADSHIFT(dfs, orte_event_base, process_load, ORTE_SYS_PRI);
}
static void process_purge(int fd, short args, void *cbdata)
@ -1242,6 +1259,8 @@ static void process_purge(int fd, short args, void *cbdata)
opal_list_item_t *item;
orte_dfs_jobfm_t *jfm, *jptr;
ORTE_ACQUIRE_OBJECT(dfs);
/* find the job tracker */
jfm = NULL;
for (item = opal_list_get_first(&file_maps);
@ -1288,7 +1307,7 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_purge);
ORTE_THREADSHIFT(dfs, orte_event_base, process_purge, ORTE_SYS_PRI);
}
@ -2368,4 +2387,3 @@ static void remote_read(int fd, short args, void *cbdata)
}
OBJ_RELEASE(req);
}

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -32,6 +32,7 @@
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
@ -449,6 +450,8 @@ static void process_opens(int fd, short args, void *cbdata)
opal_list_t lt;
opal_namelist_t *nm;
ORTE_ACQUIRE_OBJECT(dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s PROCESSING OPEN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -583,7 +586,7 @@ static void dfs_open(char *uri,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_opens);
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
}
static void process_close(int fd, short args, void *cbdata)
@ -594,6 +597,8 @@ static void process_close(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(close_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s closing fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -673,7 +678,7 @@ static void dfs_close(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_close);
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
}
static void process_sizes(int fd, short args, void *cbdata)
@ -684,6 +689,8 @@ static void process_sizes(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(size_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing get_size on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -775,7 +782,7 @@ static void dfs_get_file_size(int fd,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
}
@ -788,6 +795,8 @@ static void process_seeks(int fd, short args, void *cbdata)
int64_t i64;
int rc;
ORTE_ACQUIRE_OBJECT(seek_dfs);
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
"%s processing seek on fd %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -885,7 +894,7 @@ static void dfs_seek(int fd, long offset, int whence,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
}
static void process_reads(int fd, short args, void *cbdata)
@ -897,6 +906,8 @@ static void process_reads(int fd, short args, void *cbdata)
int64_t i64;
int rc;
ORTE_ACQUIRE_OBJECT(read_dfs);
/* look in our local records for this fd */
trk = NULL;
for (item = opal_list_get_first(&active_files);
@ -979,7 +990,7 @@ static void dfs_read(int fd, uint8_t *buffer,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_reads);
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
}
static void process_posts(int fd, short args, void *cbdata)
@ -988,6 +999,8 @@ static void process_posts(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
/* we will get confirmation in our receive function, so
* add this request to our list */
dfs->id = req_id++;
@ -1046,7 +1059,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_posts);
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
}
static void process_getfm(int fd, short args, void *cbdata)
@ -1055,6 +1068,8 @@ static void process_getfm(int fd, short args, void *cbdata)
opal_buffer_t *buffer;
int rc;
ORTE_ACQUIRE_OBJECT(dfs);
/* we will get confirmation in our receive function, so
* add this request to our list */
dfs->id = req_id++;
@ -1109,7 +1124,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
dfs->cbdata = cbdata;
/* post it for processing */
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
}
static void dfs_load_file_maps(orte_jobid_t jobid,
@ -1132,4 +1147,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
cbfunc(cbdata);
}
}

Просмотреть файл

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -26,5 +27,4 @@ headers += \
libmca_errmgr_la_SOURCES += \
base/errmgr_base_select.c \
base/errmgr_base_frame.c \
base/errmgr_base_fns.c \
base/errmgr_base_tool.c
base/errmgr_base_fns.c

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -82,99 +82,6 @@
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
* Object stuff
*/
/* orte_errmgr_predicted_proc_t: class instance plus its constructor
 * and destructor */
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item);
void orte_errmgr_predicted_proc_destruct(orte_errmgr_predicted_proc_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_proc_t, opal_list_item_t,
                   orte_errmgr_predicted_proc_construct,
                   orte_errmgr_predicted_proc_destruct);

/* Constructor: start out with an invalid process name */
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
}

/* Destructor: nothing to free, just invalidate the process name again */
void orte_errmgr_predicted_proc_destruct(orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
}
/* orte_errmgr_predicted_node_t: class instance plus its constructor
 * and destructor */
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item);
void orte_errmgr_predicted_node_destruct(orte_errmgr_predicted_node_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_node_t, opal_list_item_t,
                   orte_errmgr_predicted_node_construct,
                   orte_errmgr_predicted_node_destruct);

/* Constructor: no node name yet */
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item)
{
    item->node_name = NULL;
}

/* Destructor: give back the node name string, if one was set */
void orte_errmgr_predicted_node_destruct(orte_errmgr_predicted_node_t *item)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(item->node_name);
    item->node_name = NULL;
}
/* orte_errmgr_predicted_map_t: class instance plus its constructor
 * and destructor */
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item);
void orte_errmgr_predicted_map_destruct(orte_errmgr_predicted_map_t *item);

OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t, opal_list_item_t,
                   orte_errmgr_predicted_map_construct,
                   orte_errmgr_predicted_map_destruct);

/* Constructor: both process names invalid, no strings attached,
 * off_current_node cleared */
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
    /* current location */
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
    item->node_name       = NULL;

    /* mapped location */
    item->map_proc_name.jobid = ORTE_JOBID_INVALID;
    item->map_proc_name.vpid  = ORTE_VPID_INVALID;
    item->map_node_name       = NULL;

    item->pre_map_fixed_node = NULL;
    item->off_current_node   = false;
}

/* Destructor: release the three strings the object may hold and put
 * every field back to its constructed value */
void orte_errmgr_predicted_map_destruct(orte_errmgr_predicted_map_t *item)
{
    /* free(NULL) is a no-op, so the strings need no individual guards */
    free(item->node_name);
    item->node_name = NULL;

    free(item->map_node_name);
    item->map_node_name = NULL;

    free(item->pre_map_fixed_node);
    item->pre_map_fixed_node = NULL;

    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;

    item->map_proc_name.jobid = ORTE_JOBID_INVALID;
    item->map_proc_name.vpid  = ORTE_VPID_INVALID;

    item->off_current_node = false;
}
/*
* Public interfaces
*/
@ -231,12 +138,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
/* No way to reach here */
}
/* Base implementation of the migration-warning registration: a stub
 * that accepts the timeval and does nothing with it. */
void orte_errmgr_base_register_migration_warning(struct timeval *tv)
{
    /* stub function - the argument is intentionally ignored */
    (void)tv;
}
int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs,
int error_code)
@ -244,195 +145,6 @@ int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Register an error callback on orte_errmgr_base.error_cbacks.
 *
 * cbfunc - the callback to store in the new list entry
 * order  - where the entry goes:
 *            ORTE_ERRMGR_CALLBACK_FIRST    head of the list; refused if the
 *                                          current head is already FIRST
 *            ORTE_ERRMGR_CALLBACK_LAST     tail of the list; refused if the
 *                                          current tail is already LAST
 *            ORTE_ERRMGR_CALLBACK_PREPEND  head of the list, but behind an
 *                                          existing FIRST entry
 *            ORTE_ERRMGR_CALLBACK_APPEND   tail of the list, but relative to
 *                                          an existing LAST entry
 *
 * Returns ORTE_ERR_NOT_SUPPORTED when a second FIRST or LAST entry is
 * requested, ORTE_SUCCESS otherwise.
 */
int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
                                             orte_errmgr_error_order_t order)
{
    orte_errmgr_cback_t *cb, *cbcur;

    /* check the order to see what to do */
    switch(order) {
    case ORTE_ERRMGR_CALLBACK_FIRST:
        /* only one can be so designated */
        if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks))) {
            if (ORTE_ERRMGR_CALLBACK_FIRST == cb->order) {
                return ORTE_ERR_NOT_SUPPORTED;
            }
        }
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
        break;

    case ORTE_ERRMGR_CALLBACK_LAST:
        /* only one can be so designated */
        if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks))) {
            if (ORTE_ERRMGR_CALLBACK_LAST == cb->order) {
                return ORTE_ERR_NOT_SUPPORTED;
            }
        }
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
        break;

    case ORTE_ERRMGR_CALLBACK_PREPEND:
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks)) &&
            ORTE_ERRMGR_CALLBACK_FIRST == cbcur->order) {
            /* a FIRST entry holds the head - go in at index 1 instead */
            opal_list_insert(&orte_errmgr_base.error_cbacks, &cb->super, 1);
        } else {
            opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
        }
        break;

    case ORTE_ERRMGR_CALLBACK_APPEND:
        cb = OBJ_NEW(orte_errmgr_cback_t);
        cb->order = order;
        cb->callback = cbfunc;
        if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks)) &&
            ORTE_ERRMGR_CALLBACK_LAST == cbcur->order) {
            /* a LAST entry holds the tail - insert relative to it
             * NOTE(review): opal_list_insert_pos is expected to place cb
             * ahead of cbcur - confirm against opal_list.h */
            opal_list_insert_pos(&orte_errmgr_base.error_cbacks, &cbcur->super, &cb->super);
        } else {
            opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
        }
        /* cb is on the list exactly once at this point; it must not be
         * appended again, as that would link the same item in twice */
        break;

    default:
        /* unrecognized order: nothing is registered and, as before,
         * ORTE_SUCCESS is still returned */
        break;
    }

    return ORTE_SUCCESS;
}
/**
 * Run the registered error callbacks for a set of errors.
 *
 * When no callback has been registered, the active errmgr module's abort
 * function is invoked instead, using the error code (and name) of the first
 * entry in @p errors when one is available, and
 * ORTE_ERROR_DEFAULT_EXIT_CODE otherwise.
 *
 * Otherwise the callbacks are invoked in list order until the list is
 * exhausted or one of them returns ORTE_SUCCESS.
 *
 * @param[in] errors  Array of orte_error_t pointers (may be NULL)
 */
void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors)
{
    orte_errmgr_cback_t *cb;
    char *errstring = NULL;
    orte_error_t *err;
    int errcode = ORTE_ERROR_DEFAULT_EXIT_CODE;

    /* if no callbacks have been provided, then we abort */
    if (0 == opal_list_get_size(&orte_errmgr_base.error_cbacks)) {
        /* take the first entry, if available */
        if (NULL != errors &&
            (NULL != (err = (orte_error_t*)opal_pointer_array_get_item(errors, 0)))) {
            errstring = (char*)ORTE_ERROR_NAME(err->errcode);
            errcode = err->errcode;
        }
        /* abort is a module-provided function pointer and is not guaranteed
         * to never return, so issue exactly one call: the silent form when
         * there is no error string, the verbose form otherwise. This keeps
         * a NULL pointer from ever being formatted through "%s".
         */
        if (NULL == errstring) {
            /* if the error is silent, say nothing */
            orte_errmgr.abort(errcode, NULL);
        } else {
            orte_errmgr.abort(errcode, "Executing default error callback: %s", errstring);
        }
        /* the callback list is empty, so there is nothing left to run */
        return;
    }

    /* cycle across the provided callbacks until we complete the list
     * or one reports that no further action is required
     */
    OPAL_LIST_FOREACH(cb, &orte_errmgr_base.error_cbacks, orte_errmgr_cback_t) {
        if (ORTE_SUCCESS == cb->callback(errors)) {
            break;
        }
    }
}
/********************
* Utility functions
********************/
#if OPAL_ENABLE_FT_CR
/**
 * Emit a message for a terminal migration state.
 *
 * Only the failure states (ERROR, ERR_INPROGRESS) and the FINISH state
 * produce output; every other state value is ignored.
 *
 * @param[in] state  Migration state (ORTE_ERRMGR_MIGRATE_*)
 */
void orte_errmgr_base_migrate_state_notify(int state)
{
    if (ORTE_ERRMGR_MIGRATE_STATE_ERROR == state ||
        ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == state) {
        opal_output(0, "%d: Migration failed for process %s.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
        return;
    }

    if (ORTE_ERRMGR_MIGRATE_STATE_FINISH == state) {
        opal_output(0, "%d: Migration successful for process %s.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
        return;
    }

    /* NONE, REQUEST, RUNNING, RUN_CKPT, STARTUP, MAX and any unknown
     * value: nothing to report
     */
}
/**
 * Emit a message describing an abnormal process state.
 *
 * Nothing is printed when @p proc is NULL or when @p state is not one of
 * the states handled below.
 *
 * @param[in] state  Process state being reported
 * @param[in] proc   Name of the affected process (may be NULL)
 */
void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc)
{
    if (NULL == proc) {
        return;
    }

    switch(state) {
    case ORTE_PROC_STATE_ABORTED:
    case ORTE_PROC_STATE_ABORTED_BY_SIG:
    case ORTE_PROC_STATE_TERM_WO_SYNC:
    case ORTE_PROC_STATE_TERMINATED:
    case ORTE_PROC_STATE_KILLED_BY_CMD:
    case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
        opal_output(0, "%d: Process %s is dead.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        opal_output(0, "%d: Process %s is unreachable.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        /* stop here: without this break the "Failed to communicate"
         * message below was printed as well
         */
        break;

    case ORTE_PROC_STATE_COMM_FAILED:
        opal_output(0, "%d: Failed to communicate with process %s.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
    case ORTE_PROC_STATE_FAILED_TO_START:
        /* NOTE(review): FAILED_TO_START shares the "has called abort"
         * text - confirm whether it should have its own message
         */
        opal_output(0, "%d: Process %s has called abort.",
                    orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
        break;

    case ORTE_PROC_STATE_MIGRATING:
    default:
        break;
    }
}
/**
 * Produce a heap-allocated, human-readable name for a migration state.
 *
 * The caller owns the returned string and must free() it. If the
 * allocation fails, *state_str is NULL.
 *
 * @param[out] state_str  Receives the newly allocated string
 * @param[in]  state      Migration state (ORTE_ERRMGR_MIGRATE_STATE_*)
 *
 * @retval ORTE_SUCCESS  Always
 */
int orte_errmgr_base_migrate_state_str(char ** state_str, int state)
{
    switch(state) {
    case ORTE_ERRMGR_MIGRATE_STATE_NONE:
        *state_str = strdup(" -- ");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_REQUEST:
        *state_str = strdup("Requested");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_RUNNING:
        *state_str = strdup("Running");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT:
        *state_str = strdup("Checkpointing");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_STARTUP:
        *state_str = strdup("Restarting");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
        *state_str = strdup("Finished");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
        *state_str = strdup("Error");
        break;
    case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
        *state_str = strdup("Error: Migration in progress");
        break;
    default:
        /* asprintf leaves the output pointer unspecified when it fails,
         * so normalize it to NULL - the same result a failed strdup
         * gives in the cases above
         */
        if (0 > asprintf(state_str, "Unknown %d", state)) {
            *state_str = NULL;
        }
        break;
    }

    return ORTE_SUCCESS;
}
#endif
#if OPAL_ENABLE_FT_CR
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -53,17 +53,11 @@ orte_errmgr_base_t orte_errmgr_base = {{{0}}};
/* Public module provides a wrapper around previous functions */
orte_errmgr_base_module_t orte_errmgr_default_fns = {
NULL, /* init */
NULL, /* finalize */
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_base_abort_peers,
NULL, /* predicted_fault */
NULL, /* suggest_map_targets */
NULL, /* ft_event */
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
.init = NULL, /* init */
.finalize = NULL, /* finalize */
.logfn = orte_errmgr_base_log,
.abort = orte_errmgr_base_abort,
.abort_peers = orte_errmgr_base_abort_peers
};
/* NOTE: ABSOLUTELY MUST initialize this
* struct to include the log function as it
@ -71,16 +65,7 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = {
* opened yet due to error
*/
orte_errmgr_base_module_t orte_errmgr = {
NULL,
NULL,
orte_errmgr_base_log,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL
.logfn = orte_errmgr_base_log
};
static int orte_errmgr_base_close(void)
@ -118,7 +103,3 @@ static int orte_errmgr_base_open(mca_base_open_flag_t flags)
MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
orte_errmgr_base_open, orte_errmgr_base_close,
mca_errmgr_base_static_components, 0);
OBJ_CLASS_INSTANCE(orte_errmgr_cback_t,
opal_list_item_t,
NULL, NULL);

Просмотреть файл

@ -1,441 +0,0 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <string.h>
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include <time.h>
#include "opal/dss/dss.h"
#include "orte/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/**
* This file contains functions for the HNP to communicate with the
* orte-migrate command.
*/
#if OPAL_ENABLE_FT_CR
/******************
* Local Functions
******************/
static int errmgr_base_tool_start_cmdline_listener(void);
static int errmgr_base_tool_stop_cmdline_listener(void);
static void errmgr_base_tool_cmdline_recv(int status,
orte_process_name_t* sender,
opal_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata);
/******************
* Object stuff
******************/
/* Name of the command line tool whose request is currently being handled;
 * both fields are invalid while no tool request is in progress */
static orte_process_name_t errmgr_cmdline_sender = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
/* True while a non-blocking receive for ORTE_RML_TAG_MIGRATE is posted */
static bool errmgr_cmdline_recv_issued = false;
/* Nesting counter for init/finalize calls (used as an int, starts at 0) */
static int errmgr_tool_initialized = false;
/********************
* Module Functions
********************/
/**
 * Initialize the tool-communication support.
 *
 * Calls are counted; only the first call does any work. On the HNP that
 * work is posting the command line listener; other process types have
 * nothing to set up.
 *
 * @retval ORTE_SUCCESS / OPAL_SUCCESS  Initialized (or already initialized)
 * @retval OPAL_ERROR                   The call counter is below one after
 *                                      being incremented
 * @retval other                        Error returned when starting the
 *                                      listener
 */
int orte_errmgr_base_tool_init(void)
{
    int rc;

    errmgr_tool_initialized += 1;
    if (errmgr_tool_initialized < 1) {
        return OPAL_ERROR;
    }
    if (errmgr_tool_initialized > 1) {
        /* already initialized by an earlier call */
        return OPAL_SUCCESS;
    }

    /* Only HNP communicates with tools */
    if (!ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }

    /* Setup command line migrate tool request listener */
    rc = errmgr_base_tool_start_cmdline_listener();
    if (ORTE_SUCCESS == rc) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(rc);
    return rc;
}
/**
 * Finalize the tool-communication support.
 *
 * Mirrors orte_errmgr_base_tool_init(): the call counter is decremented
 * and only the call that brings it to zero does any work. On the HNP that
 * work is cancelling the command line listener.
 *
 * @retval ORTE_SUCCESS / OPAL_SUCCESS  Finalized (or still in use)
 * @retval OPAL_ERROR                   The call counter is negative after
 *                                      being decremented
 * @retval other                        Error returned when stopping the
 *                                      listener
 */
int orte_errmgr_base_tool_finalize(void)
{
    int rc;

    errmgr_tool_initialized -= 1;
    if (errmgr_tool_initialized < 0) {
        return OPAL_ERROR;
    }
    if (errmgr_tool_initialized > 0) {
        /* other init calls are still outstanding */
        return OPAL_SUCCESS;
    }

    /* Only HNP communicates with tools */
    if (!ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }

    /* Clean up listeners */
    rc = errmgr_base_tool_stop_cmdline_listener();
    if (ORTE_SUCCESS == rc) {
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(rc);
    return rc;
}
/**
 * Report a migration state change and forward it to the requesting tool.
 *
 * On a non-HNP process this is a no-op. Otherwise the state is passed to
 * orte_errmgr_base_migrate_state_notify(); a state of
 * ORTE_ERRMGR_MIGRATE_STATE_NONE re-posts the command line listener and
 * returns. For any other state, an update command carrying @p status is
 * sent to the recorded tool, unless no tool is recorded or the recorded
 * name is the HNP itself.
 *
 * @param[in] status  Migration state (ORTE_ERRMGR_MIGRATE_STATE_*)
 *
 * @retval ORTE_SUCCESS        Update handled (or nothing to do)
 * @retval ORTE_ERR_BAD_PARAM  @p status is greater than
 *                             ORTE_ERRMGR_MIGRATE_MAX
 * @retval ORTE_ERROR          The message buffer could not be allocated
 * @retval other               Error from packing or sending the message
 */
int orte_errmgr_base_migrate_update(int status)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *loc_buffer = NULL;
    orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD;

    /* Only HNP communicates with tools */
    if (! ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }

    /*
     * If this is an invalid state, then return an error
     */
    if( ORTE_ERRMGR_MIGRATE_MAX < status ) {
        opal_output(orte_errmgr_base_framework.framework_output,
                    "errmgr:base:tool:update() Error: Invalid state %d < (Max %d)",
                    status, ORTE_ERRMGR_MIGRATE_MAX);
        return ORTE_ERR_BAD_PARAM;
    }

    /*
     * Report the status over the notifier interface
     */
    orte_errmgr_base_migrate_state_notify(status);

    /*
     * If the caller is indicating that they are finished and ready for another
     * command, then repost the RML listener.
     */
    if( ORTE_ERRMGR_MIGRATE_STATE_NONE == status ) {
        if( ORTE_SUCCESS != (ret = errmgr_base_tool_start_cmdline_listener()) ) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
        return ORTE_SUCCESS;
    }

    /*
     * Noop if invalid peer, or peer not specified
     */
    if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
        return ORTE_SUCCESS;
    }

    /*
     * Do not send to self, as that is silly.
     */
    if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, &errmgr_cmdline_sender) ) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
                             "errmgr:base:tool:update() Warning: Do not send to self!\n"));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
                         "errmgr:base:tool:update() Sending update command <status %d>\n",
                         status));

    /********************
     * Send over the status of the migration
     * - migration state
     ********************/
    if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) {
        opal_output(orte_errmgr_base_framework.framework_output,
                    "errmgr:base:tool:update() Error: DSS Pack (cmd) Failure (ret = %d)\n",
                    ret);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &status, 1, OPAL_INT))) {
        opal_output(orte_errmgr_base_framework.framework_output,
                    "errmgr:base:tool:update() Error: DSS Pack (status) Failure (ret = %d)\n",
                    ret);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&errmgr_cmdline_sender,
                                                       loc_buffer, ORTE_RML_TAG_MIGRATE,
                                                       orte_rml_send_callback, NULL))) {
        opal_output(orte_errmgr_base_framework.framework_output,
                    "errmgr:base:tool:update() Error: Send (status) Failure (ret = %d)\n",
                    ret);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /* The non-blocking send was accepted: the buffer now belongs to the RML,
     * and orte_rml_send_callback releases it when the send completes.
     * Drop our reference so the cleanup below does not release it a second
     * time while the send may still be in flight.
     */
    loc_buffer = NULL;

 cleanup:
    /* only reached with a live buffer when packing or sending failed */
    if(NULL != loc_buffer) {
        OBJ_RELEASE(loc_buffer);
        loc_buffer = NULL;
    }

    return exit_status;
}
/********************
* Utility functions
********************/
/********************
* Local Functions
********************/
/**
 * Post the non-blocking receive for tool migration requests.
 *
 * Returns immediately if the receive is already posted on the HNP.
 * Otherwise the recorded tool name is reset to invalid and a receive for
 * ORTE_RML_TAG_MIGRATE from any sender is posted.
 *
 * @retval ORTE_SUCCESS  Always
 */
static int errmgr_base_tool_start_cmdline_listener(void)
{
    if (errmgr_cmdline_recv_issued) {
        if (ORTE_PROC_IS_HNP) {
            /* a receive is already outstanding */
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                         "errmgr:base:tool: Startup Command Line Channel"));

    /* forget any previously recorded tool before listening again */
    errmgr_cmdline_sender.vpid  = ORTE_VPID_INVALID;
    errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;

    /* Coordinator command listener */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE,
                            0, errmgr_base_tool_cmdline_recv, NULL);
    errmgr_cmdline_recv_issued = true;

    return ORTE_SUCCESS;
}
/**
 * Cancel the receive for tool migration requests.
 *
 * Returns immediately if no receive is posted on the HNP. Otherwise the
 * ORTE_RML_TAG_MIGRATE receive is cancelled and marked as not issued.
 *
 * @retval ORTE_SUCCESS  Always
 */
static int errmgr_base_tool_stop_cmdline_listener(void)
{
    if (!errmgr_cmdline_recv_issued) {
        if (ORTE_PROC_IS_HNP) {
            /* nothing is posted, so nothing to cancel */
            return ORTE_SUCCESS;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                         "errmgr:base:tool: Shutdown Command Line Channel"));

    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
    errmgr_cmdline_recv_issued = false;

    return ORTE_SUCCESS;
}
/*****************
* Listener Callbacks
*****************/
/**
 * RML callback: handle a migration request from the command line tool.
 *
 * The receive is not persistent, so it is marked as no longer issued. If a
 * tool request is already being handled, the new sender is told that a
 * migration is in progress and the request is dropped. Otherwise the
 * sender is recorded, the command is unpacked, and for an INIT command the
 * three comma-separated lists (processes to move, nodes to vacate, target
 * nodes) are unpacked, converted into lists and handed to the active
 * errmgr module's predicted_fault function.
 *
 * @param[in] status  Unused
 * @param[in] sender  Name of the process that sent the request
 * @param[in] buffer  Message payload
 * @param[in] tag     RML tag the message arrived on
 * @param[in] cbdata  Unused
 */
static void errmgr_base_tool_cmdline_recv(int status,
                                          orte_process_name_t* sender,
                                          opal_buffer_t* buffer,
                                          orte_rml_tag_t tag,
                                          void* cbdata)
{
    int ret;
    orte_process_name_t swap_dest;
    orte_errmgr_tool_cmd_flag_t command;
    orte_std_cntr_t count = 1;
    char *off_nodes  = NULL;
    char *off_procs  = NULL;
    char *onto_nodes = NULL;
    char **split_off_nodes  = NULL;
    char **split_off_procs  = NULL;
    char **split_onto_nodes = NULL;
    opal_list_t *proc_list = NULL;
    opal_list_t *node_list = NULL;
    opal_list_t *suggested_map_list = NULL;
    orte_errmgr_predicted_proc_t *off_proc = NULL;
    orte_errmgr_predicted_node_t *off_node = NULL;
    orte_errmgr_predicted_map_t  *onto_map = NULL;
    int cnt = 0, i;

    if( ORTE_RML_TAG_MIGRATE != tag ) {
        opal_output(orte_errmgr_base_framework.framework_output,
                    "errmgr:base:tool:recv() Error: Unknown tag: Received a command message from %s (tag = %d).",
                    ORTE_NAME_PRINT(sender), tag);
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
                         "errmgr:base:tool:recv() Command Line: Start a migration operation [Sender = %s]",
                         ORTE_NAME_PRINT(sender)));

    errmgr_cmdline_recv_issued = false; /* Not a persistent RML message */

    /*
     * If we are already interacting with a command line tool then reject this
     * request. Since we only allow the processing of one tool command at a
     * time.
     */
    if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
        /* temporarily address the update to the new sender, then restore
         * the tool that is actually being serviced
         */
        swap_dest.jobid = errmgr_cmdline_sender.jobid;
        swap_dest.vpid  = errmgr_cmdline_sender.vpid;

        errmgr_cmdline_sender = *sender;
        orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);

        errmgr_cmdline_sender.jobid = swap_dest.jobid;
        errmgr_cmdline_sender.vpid  = swap_dest.vpid;

        return;
    }

    errmgr_cmdline_sender = *sender;

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
        ORTE_ERROR_LOG(ret);
        return;
    }

    /*
     * orte-migrate has requested that a migration be started
     */
    if (ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD == command) {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
                             "errmgr:base:tool:recv() Command line requested process migration [command %d]\n",
                             command));

        /*
         * Unpack the buffer from the orte-migrate command
         */
        count = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_procs), &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_nodes), &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(onto_nodes), &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /*
         * Parse the comma separated list
         */
        proc_list = OBJ_NEW(opal_list_t);
        node_list = OBJ_NEW(opal_list_t);
        suggested_map_list = OBJ_NEW(opal_list_t);

        split_off_procs = opal_argv_split(off_procs, ',');
        cnt = opal_argv_count(split_off_procs);
        if( cnt > 0 ) {
            for(i = 0; i < cnt; ++i) {
                off_proc = OBJ_NEW(orte_errmgr_predicted_proc_t);
                off_proc->proc_name.vpid = atoi(split_off_procs[i]);
                opal_list_append(proc_list, &(off_proc->super));
            }
        }

        split_off_nodes = opal_argv_split(off_nodes, ',');
        cnt = opal_argv_count(split_off_nodes);
        if( cnt > 0 ) {
            for(i = 0; i < cnt; ++i) {
                off_node = OBJ_NEW(orte_errmgr_predicted_node_t);
                /* the list entry gets its own copy of the name */
                off_node->node_name = strdup(split_off_nodes[i]);
                opal_list_append(node_list, &(off_node->super));
            }
        }

        split_onto_nodes = opal_argv_split(onto_nodes, ',');
        cnt = opal_argv_count(split_onto_nodes);
        if( cnt > 0 ) {
            for(i = 0; i < cnt; ++i) {
                onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
                /* the list entry gets its own copy of the name */
                onto_map->map_node_name = strdup(split_onto_nodes[i]);
                opal_list_append(suggested_map_list, &(onto_map->super));
            }
        }

        /*
         * Pass to the predicted fault function to see how they would like to progress
         */
        orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list);
        /* NOTE(review): the three lists are not released here, as in the
         * original code - confirm whether predicted_fault takes ownership
         */
    }
    /*
     * Unknown command
     */
    else {
        OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
                             "errmgr:base:tool:recv() Command line sent an unknown command (command %d)\n",
                             command));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
    }

 cleanup:
    /* The unpacked strings and the split arrays are only scratch data: the
     * list entries hold either a parsed integer or their own strdup'ed
     * copy, so everything allocated here can be released on every path.
     * free(NULL) is a no-op and opal_argv_free() accepts NULL, which covers
     * the paths that never reached the corresponding allocation.
     */
    opal_argv_free(split_off_procs);
    opal_argv_free(split_off_nodes);
    opal_argv_free(split_onto_nodes);
    free(off_procs);
    free(off_nodes);
    free(onto_nodes);

    return;
}
#endif

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,14 +54,6 @@ typedef struct {
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
/* define a struct to hold registered error callbacks */
typedef struct {
opal_list_item_t super;
orte_errmgr_error_order_t order;
orte_errmgr_error_callback_fn_t *callback;
} orte_errmgr_cback_t;
OBJ_CLASS_DECLARATION(orte_errmgr_cback_t);
/* declare the base default module */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
@ -75,12 +68,5 @@ ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs,
int error_code);
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
ORTE_DECLSPEC int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
orte_errmgr_error_order_t order);
ORTE_DECLSPEC void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors);
END_C_DECLS
#endif

Просмотреть файл

@ -56,17 +56,11 @@
* HNP module
******************/
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
abort_peers,
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
.init = init,
.finalize = finalize,
.logfn = orte_errmgr_base_log,
.abort = orte_errmgr_base_abort,
.abort_peers = abort_peers
};
static void proc_errors(int fd, short args, void *cbdata);
@ -77,6 +71,7 @@ static void register_cbfunc(int status, size_t errhndler, void *cbdata)
{
volatile bool *active = (volatile bool*)cbdata;
myerrhandle = errhndler;
ORTE_POST_OBJECT(active);
*active = false;
}
@ -112,7 +107,7 @@ static void notify_cbfunc(int status,
}
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, state);
ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
}
/************************
@ -154,8 +149,8 @@ static void proc_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
char *nodename;
orte_error_t err;
opal_pointer_array_t errors;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_app: proc %s state %s",
@ -171,14 +166,6 @@ static void proc_errors(int fd, short args, void *cbdata)
return;
}
/* pass the error to the error_callbacks for processing */
OBJ_CONSTRUCT(&errors, opal_pointer_array_t);
opal_pointer_array_init(&errors, 1, INT_MAX, 1);
err.errcode = caddy->proc_state;
err.proc = caddy->name;
opal_pointer_array_add(&errors, &err);
if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
/* we can't send a message - print a message */
nodename = orte_get_proc_hostname(&caddy->name);
@ -197,9 +184,6 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_abnormal_term_ordered = true;
}
orte_errmgr_base_execute_error_callbacks(&errors);
OBJ_DESTRUCT(&errors);
OBJ_RELEASE(caddy);
}

Просмотреть файл

@ -50,6 +50,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
@ -66,32 +67,15 @@ static int init(void);
static int finalize(void);
static void hnp_abort(int error_code, char *fmt, ...);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
static int ft_event(int state);
/******************
* default_hnp module
******************/
orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
init,
finalize,
orte_errmgr_base_log,
hnp_abort,
orte_errmgr_base_abort_peers,
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL,
orte_errmgr_base_execute_error_callbacks
.init = init,
.finalize = finalize,
.logfn = orte_errmgr_base_log,
.abort = hnp_abort,
.abort_peers = orte_errmgr_base_abort_peers
};
@ -129,6 +113,7 @@ static int finalize(void)
/*
 * Event callback that terminates the process by calling orte_quit(0, 0, NULL).
 * The sd and args parameters are not used; cbdata is only passed to
 * ORTE_ACQUIRE_OBJECT before quitting.
 */
static void wakeup(int sd, short args, void *cbdata)
{
/* nothing more we can do */
ORTE_ACQUIRE_OBJECT(cbdata);
orte_quit(0, 0, NULL);
}
@ -187,6 +172,7 @@ static void hnp_abort(int error_code, char *fmt, ...)
timer->tv.tv_usec = 0;
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
ORTE_POST_OBJECT(timer);
opal_event_evtimer_add(timer->ev, &timer->tv);
}
@ -202,6 +188,8 @@ static void job_errors(int fd, short args, void *cbdata)
int32_t rc, ret;
int room, *rmptr;
ORTE_ACQUIRE_OBJECT(caddy);
/*
* if orte is trying to shutdown, just let it
*/
@ -363,6 +351,8 @@ static void proc_errors(int fd, short args, void *cbdata)
int32_t i32, *i32ptr;
char *rtmod;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_hnp: for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -497,7 +487,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
}
keep_going:
keep_going:
/* if this is a continuously operating job, then there is nothing more
* to do - we let the job continue to run */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
@ -798,25 +788,6 @@ static void proc_errors(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy);
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores the state and always returns ORTE_SUCCESS. */
static int ft_event(int state)
{
return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/

Просмотреть файл

@ -33,6 +33,7 @@
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/util/threads.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
@ -60,32 +61,16 @@
static int init(void);
static int finalize(void);
static void orted_abort(int error_code, char *fmt, ...);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
static int ft_event(int state);
/******************
* default_orted module
******************/
orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
init,
finalize,
orte_errmgr_base_log,
orted_abort,
orte_errmgr_base_abort_peers,
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL,
orte_errmgr_base_execute_error_callbacks
.init = init,
.finalize = finalize,
.logfn = orte_errmgr_base_log,
.abort = orted_abort,
.abort_peers = orte_errmgr_base_abort_peers
};
/* Local functions */
@ -125,6 +110,7 @@ static int finalize(void)
/*
 * Event callback that terminates the process by calling orte_quit(0, 0, NULL).
 * The sd and args parameters are not used; cbdata is only passed to
 * ORTE_ACQUIRE_OBJECT before quitting.
 */
static void wakeup(int sd, short args, void *cbdata)
{
/* nothing more we can do */
ORTE_ACQUIRE_OBJECT(cbdata);
orte_quit(0, 0, NULL);
}
@ -231,6 +217,7 @@ static void orted_abort(int error_code, char *fmt, ...)
timer->tv.tv_usec = 0;
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
ORTE_POST_OBJECT(timer);
opal_event_evtimer_add(timer->ev, &timer->tv);
}
@ -244,6 +231,8 @@ static void job_errors(int fd, short args, void *cbdata)
orte_plm_cmd_flag_t cmd;
opal_buffer_t *alert;
ORTE_ACQUIRE_OBJECT(caddy);
/*
* if orte is trying to shutdown, just let it
*/
@ -330,6 +319,8 @@ static void proc_errors(int fd, short args, void *cbdata)
int rc=ORTE_SUCCESS;
int i;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_orted:proc_errors process %s error state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -720,30 +711,10 @@ static void proc_errors(int fd, short args, void *cbdata)
return;
}
cleanup:
cleanup:
OBJ_RELEASE(caddy);
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores the state and always returns ORTE_SUCCESS. */
static int ft_event(int state)
{
return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/

Просмотреть файл

@ -31,6 +31,7 @@
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
@ -54,17 +55,11 @@ static int abort_peers(orte_process_name_t *procs,
* HNP module
******************/
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
abort_peers,
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
.init= init,
.finalize = finalize,
.logfn = orte_errmgr_base_log,
.abort = orte_errmgr_base_abort,
.abort_peers = abort_peers
};
static void proc_errors(int fd, short args, void *cbdata);
@ -89,6 +84,8 @@ static void proc_errors(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_tool: proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -106,6 +103,7 @@ static void proc_errors(int fd, short args, void *cbdata)
/* if we lost our lifeline, then just stop the event loop
* so the main program can cleanly terminate */
if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
ORTE_POST_OBJECT(caddy);
orte_event_base_active = false;
} else {
/* all other errors require abort */

Просмотреть файл

@ -50,6 +50,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
@ -65,32 +66,15 @@
static int init(void);
static int finalize(void);
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
static int ft_event(int state);
/******************
* dvm module
******************/
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_base_abort_peers,
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL,
orte_errmgr_base_execute_error_callbacks
.init = init,
.finalize = finalize,
.logfn = orte_errmgr_base_log,
.abort = orte_errmgr_base_abort,
.abort_peers = orte_errmgr_base_abort_peers
};
@ -146,6 +130,8 @@ static void job_errors(int fd, short args, void *cbdata)
int32_t rc, ret;
int room, *rmptr;
ORTE_ACQUIRE_OBJECT(caddy);
/*
* if orte is trying to shutdown, just let it
*/
@ -248,6 +234,8 @@ static void proc_errors(int fd, short args, void *cbdata)
int32_t i32, *i32ptr;
char *rtmod;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -386,7 +374,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
}
keep_going:
keep_going:
/* ensure we record the failed proc properly so we can report
* the error once we terminate
*/
@ -643,22 +631,3 @@ static void proc_errors(int fd, short args, void *cbdata)
cleanup:
OBJ_RELEASE(caddy);
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores its arguments and always returns ORTE_ERR_NOT_IMPLEMENTED. */
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}
/* Stub: ignores the state and always returns ORTE_SUCCESS. */
static int ft_event(int state)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
@ -63,70 +63,6 @@
BEGIN_C_DECLS
/*
* Structure to describe a predicted process fault.
*
* This can be expanded in the future to support assurance levels, and
* additional information that may wish to be conveyed.
*/
struct orte_errmgr_predicted_proc_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/** Process Name */
orte_process_name_t proc_name;
};
typedef struct orte_errmgr_predicted_proc_t orte_errmgr_predicted_proc_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_proc_t);
/*
* Structure to describe a predicted node fault.
*
* This can be expanded in the future to support assurance levels, and
* additional information that may wish to be conveyed.
*/
struct orte_errmgr_predicted_node_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/** Node Name */
char * node_name;
};
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
/*
* Structure to describe a suggested remapping element for a predicted fault.
*
* This can be expanded in the future to support weights , and
* additional information that may wish to be conveyed.
*/
struct orte_errmgr_predicted_map_t {
/** This is an object, so must have a super */
opal_list_item_t super;
/** Process Name (predicted to fail) */
orte_process_name_t proc_name;
/** Node Name (predicted to fail) */
char * node_name;
/** Process Name (Map to) */
orte_process_name_t map_proc_name;
/** Node Name (Map to) */
char * map_node_name;
/** Just off current node */
bool off_current_node;
/** Pre-map fixed node assignment */
char * pre_map_fixed_node;
};
typedef struct orte_errmgr_predicted_map_t orte_errmgr_predicted_map_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
/*
* Macro definitions
*/
@ -183,84 +119,6 @@ typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *pro
orte_std_cntr_t num_procs,
int error_code);
/**
 * Predicted process/node failure notification
 *
 * @param[in] proc_list     List of processes (or NULL if none)
 * @param[in] node_list     List of nodes (or NULL if none)
 * @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
/**
 * Suggest a node to map a restarting process onto
 *
 * @param[in]     proc      Process that is being mapped
 * @param[in]     oldnode   Previous node where this process resided
 * @param[in,out] node_list List of nodes to select from
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
/**
 * Handle fault tolerance updates
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
/**
 * Register to be warned of an impending migration.
 *
 * Function to perform actions that require the rest of the ORTE layer to be
 * up and running.
 *
 * This function type returns void, so no status is reported to the caller.
 *
 * @param[in] tv Time value supplied by the caller
 *               (NOTE(review): presumably how far ahead of the migration the
 *               warning is wanted - confirm against the implementing component)
 */
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
/**
 * Ordering selector supplied as the `order` argument when an error
 * callback is registered.
 *
 * NOTE(review): the exact placement each value requests is decided by the
 * errmgr implementation and is not visible here - confirm before relying
 * on a particular meaning.
 */
typedef enum {
ORTE_ERRMGR_CALLBACK_FIRST,
ORTE_ERRMGR_CALLBACK_LAST,
ORTE_ERRMGR_CALLBACK_PREPEND,
ORTE_ERRMGR_CALLBACK_APPEND
} orte_errmgr_error_order_t;
/**
 * Register a callback function for faults.
 *
 * This callback function will be used anytime (other than during finalize) the
 * runtime detects and handles a critical failure. The runtime will complete all
 * its stabilization before cycling through all registered callbacks. Callbacks
 * are invoked in the order indicated when they were registered.
 *
 * The callback receives a pointer array of errors
 * (NOTE(review): presumably orte_error_t entries, each naming the process
 * involved in the error and its error code - confirm against the caller).
 *
 * @param[in] cbfunc The callback function.
 * @param[in] order  Ordering selector for this callback (orte_errmgr_error_order_t).
 *
 */
/** One error record: a process name and the error code associated with it */
typedef struct {
orte_process_name_t proc;
int errcode;
} orte_error_t;
/** Signature of a user-supplied error callback; note this is a function type, not a pointer type */
typedef int (orte_errmgr_error_callback_fn_t)(opal_pointer_array_t *errors);
/** Module entry point that registers an error callback with the requested ordering */
typedef int (*orte_errmgr_base_module_register_error_callback_fn_t)(orte_errmgr_error_callback_fn_t *cbfunc,
orte_errmgr_error_order_t order);
/** Module entry point that runs the registered error callbacks on the given errors */
typedef void (*orte_errmgr_base_module_execute_error_callbacks_fn_t)(opal_pointer_array_t *errors);
/*
* Module Structure
*/
@ -273,21 +131,6 @@ struct orte_errmgr_base_module_2_3_0_t {
orte_errmgr_base_module_log_fn_t logfn;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
/** Predicted process/node failure notification */
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_module_ft_event_fn_t ft_event;
/* Register to be warned of impending migration */
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
/* Register a callback function */
orte_errmgr_base_module_register_error_callback_fn_t register_error_callback;
orte_errmgr_base_module_execute_error_callbacks_fn_t execute_error_callbacks;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;

Просмотреть файл

@ -9,6 +9,7 @@
* All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,18 +33,18 @@
* Globals
*/
ORTE_DECLSPEC orte_filem_base_module_t orte_filem = {
orte_filem_base_module_init,
orte_filem_base_module_finalize,
orte_filem_base_none_put,
orte_filem_base_none_put_nb,
orte_filem_base_none_get,
orte_filem_base_none_get_nb,
orte_filem_base_none_rm,
orte_filem_base_none_rm_nb,
orte_filem_base_none_wait,
orte_filem_base_none_wait_all,
orte_filem_base_none_preposition_files,
orte_filem_base_none_link_local_files
.filem_init = orte_filem_base_module_init,
.filem_finalize = orte_filem_base_module_finalize,
.put = orte_filem_base_none_put,
.put_nb = orte_filem_base_none_put_nb,
.get = orte_filem_base_none_get,
.get_nb = orte_filem_base_none_get_nb,
.rm = orte_filem_base_none_rm,
.rm_nb = orte_filem_base_none_rm_nb,
.wait = orte_filem_base_none_wait,
.wait_all = orte_filem_base_none_wait_all,
.preposition_files = orte_filem_base_none_preposition_files,
.link_local_files = orte_filem_base_none_link_local_files
};
bool orte_filem_base_is_active = false;
@ -69,4 +70,3 @@ static int orte_filem_base_open(mca_base_open_flag_t flags)
MCA_BASE_FRAMEWORK_DECLARE(orte, filem, NULL, NULL, orte_filem_base_open, orte_filem_base_close,
mca_filem_base_static_components, 0);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -49,6 +49,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/base/base.h"
@ -61,14 +62,6 @@
static int raw_init(void);
static int raw_finalize(void);
static int raw_put(orte_filem_base_request_t *req);
static int raw_put_nb(orte_filem_base_request_t *req);
static int raw_get(orte_filem_base_request_t *req);
static int raw_get_nb(orte_filem_base_request_t *req);
static int raw_rm(orte_filem_base_request_t *req);
static int raw_rm_nb(orte_filem_base_request_t *req);
static int raw_wait(orte_filem_base_request_t *req);
static int raw_wait_all(opal_list_t *reqs);
static int raw_preposition_files(orte_job_t *jdata,
orte_filem_completion_cbfunc_t cbfunc,
void *cbdata);
@ -76,20 +69,20 @@ static int raw_link_local_files(orte_job_t *jdata,
orte_app_context_t *app);
orte_filem_base_module_t mca_filem_raw_module = {
raw_init,
raw_finalize,
.filem_init = raw_init,
.filem_finalize = raw_finalize,
/* we don't use any of the following */
raw_put,
raw_put_nb,
raw_get,
raw_get_nb,
raw_rm,
raw_rm_nb,
raw_wait,
raw_wait_all,
.put = orte_filem_base_none_put,
.put_nb = orte_filem_base_none_put_nb,
.get = orte_filem_base_none_get,
.get_nb = orte_filem_base_none_get_nb,
.rm = orte_filem_base_none_rm,
.rm_nb = orte_filem_base_none_rm_nb,
.wait = orte_filem_base_none_wait,
.wait_all = orte_filem_base_none_wait_all,
/* now the APIs we *do* use */
raw_preposition_files,
raw_link_local_files
.preposition_files = raw_preposition_files,
.link_local_files = raw_link_local_files
};
static opal_list_t outbound_files;
@ -164,46 +157,6 @@ static int raw_finalize(void)
return ORTE_SUCCESS;
}
/*
 * Stub for the filem "put" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_put(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "put_nb" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_put_nb(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "get" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_get(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "get_nb" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_get_nb(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "rm" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_rm(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "rm_nb" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_rm_nb(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "wait" API slot.
 *
 * The raw component does not use this API: the request is ignored and
 * success is always reported.
 */
static int raw_wait(orte_filem_base_request_t *req)
{
    (void)req;  /* intentionally unused */

    return ORTE_SUCCESS;
}
/*
 * Stub for the filem "wait_all" API slot.
 *
 * The raw component does not use this API: the request list is ignored
 * and success is always reported.
 */
static int raw_wait_all(opal_list_t *reqs)
{
    (void)reqs;  /* intentionally unused */

    return ORTE_SUCCESS;
}
static void xfer_complete(int status, orte_filem_raw_xfer_t *xfer)
{
orte_filem_raw_outbound_t *outbound = xfer->outbound;
@ -586,8 +539,9 @@ static int raw_preposition_files(orte_job_t *jdata,
opal_list_append(&outbound->xfers, &xfer->super);
opal_event_set(orte_event_base, &xfer->ev, fd, OPAL_EV_READ, send_chunk, xfer);
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
opal_event_add(&xfer->ev, 0);
xfer->pending = true;
ORTE_POST_OBJECT(xfer);
opal_event_add(&xfer->ev, 0);
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&fsets);
@ -804,6 +758,8 @@ static void send_chunk(int fd, short argc, void *cbdata)
opal_buffer_t chunk;
orte_grpcomm_signature_t *sig;
ORTE_ACQUIRE_OBJECT(rev);
/* flag that event has fired */
rev->pending = false;
@ -815,6 +771,7 @@ static void send_chunk(int fd, short argc, void *cbdata)
/* non-blocking, retry */
if (EAGAIN == errno || EINTR == errno) {
ORTE_POST_OBJECT(rev);
opal_event_add(&rev->ev, 0);
return;
}
@ -891,8 +848,9 @@ static void send_chunk(int fd, short argc, void *cbdata)
return;
} else {
/* restart the read event */
opal_event_add(&rev->ev, 0);
rev->pending = true;
ORTE_POST_OBJECT(rev);
opal_event_add(&rev->ev, 0);
}
}
@ -1116,7 +1074,8 @@ static void recv_files(int status, orte_process_name_t* sender,
}
}
free(tmp);
opal_event_set(orte_event_base, &incoming->ev, incoming->fd, OPAL_EV_WRITE, write_handler, incoming);
opal_event_set(orte_event_base, &incoming->ev, incoming->fd,
OPAL_EV_WRITE, write_handler, incoming);
opal_event_set_priority(&incoming->ev, ORTE_MSG_PRI);
}
/* create an output object for this data */
@ -1135,8 +1094,9 @@ static void recv_files(int status, orte_process_name_t* sender,
if (!incoming->pending) {
/* add the event */
opal_event_add(&incoming->ev, 0);
incoming->pending = true;
ORTE_POST_OBJECT(incoming);
opal_event_add(&incoming->ev, 0);
}
/* cleanup */
@ -1154,6 +1114,8 @@ static void write_handler(int fd, short event, void *cbdata)
char homedir[MAXPATHLEN];
int rc;
ORTE_ACQUIRE_OBJECT(sink);
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
"%s write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1226,8 +1188,9 @@ static void write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again
* when the fd is ready.
*/
opal_event_add(&sink->ev, 0);
sink->pending = true;
ORTE_POST_OBJECT(sink);
opal_event_add(&sink->ev, 0);
return;
}
/* otherwise, something bad happened so all we can do is abort
@ -1250,8 +1213,9 @@ static void write_handler(int fd, short event, void *cbdata)
/* leave the write event running so it will call us again
* when the fd is ready
*/
opal_event_add(&sink->ev, 0);
sink->pending = true;
ORTE_POST_OBJECT(sink);
opal_event_add(&sink->ev, 0);
return;
}
OBJ_RELEASE(output);

Просмотреть файл

@ -44,6 +44,7 @@
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/grpcomm/grpcomm.h"
@ -144,6 +145,8 @@ static void allgather_stub(int fd, short args, void *cbdata)
orte_grpcomm_coll_t *coll;
uint32_t *seq_number;
ORTE_ACQUIRE_OBJECT(cd);
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:base:allgather stub",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -212,6 +215,7 @@ int orte_grpcomm_API_allgather(orte_grpcomm_signature_t *sig,
cd->cbdata = cbdata;
opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, allgather_stub, cd);
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
ORTE_POST_OBJECT(cd);
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -52,6 +52,7 @@
#include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/threads.h"
BEGIN_C_DECLS
@ -163,6 +164,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
opal_event_set_priority(ep->wev->ev, ORTE_MSG_PRI); \
} \
*(snk) = ep; \
ORTE_POST_OBJECT(ep); \
} while(0);
/* add list of structs that has name of proc + orte_iof_tag_t - when
@ -192,6 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
opal_event_set_priority(rev->ev, ORTE_MSG_PRI); \
if ((actv)) { \
rev->active = true; \
ORTE_POST_OBJECT(rev); \
opal_event_add(rev->ev, 0); \
} \
} while(0);

Просмотреть файл

@ -38,6 +38,7 @@
#include "opal/util/output.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
@ -147,7 +148,7 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
output->numbytes = numbytes;
goto process;
construct:
construct:
starttaglen = strlen(starttag);
endtaglen = strlen(endtag);
endtagged = false;
@ -249,7 +250,7 @@ construct:
}
output->numbytes = k;
process:
process:
/* add this data to the write list for this fd */
opal_list_append(&channel->outputs, &output->super);
@ -262,8 +263,9 @@ process:
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s write:output adding write event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
opal_event_add(channel->ev, 0);
channel->pending = true;
ORTE_POST_OBJECT(channel);
opal_event_add(channel->ev, 0);
}
return num_buffered;
@ -303,6 +305,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
orte_iof_write_output_t *output;
int num_written;
ORTE_ACQUIRE_OBJECT(sink);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -356,8 +360,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
}
OBJ_RELEASE(output);
}
ABORT:
ABORT:
opal_event_del(wev->ev);
wev->pending = false;
ORTE_POST_OBJECT(wev);
}

Просмотреть файл

@ -47,6 +47,7 @@
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/iof/base/base.h"
@ -214,10 +215,13 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
}
}
proct->revstdout->active = true;
ORTE_POST_OBJECT(proct->revstdout);
opal_event_add(proct->revstdout->ev, 0);
proct->revstderr->active = true;
ORTE_POST_OBJECT(proct->revstderr);
opal_event_add(proct->revstderr->ev, 0);
proct->revstddiag->active = true;
ORTE_POST_OBJECT(proct->revstddiag);
opal_event_add(proct->revstddiag->ev, 0);
}
return ORTE_SUCCESS;
@ -299,6 +303,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
*/
if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
    /* Mark the stdin read event active, then publish that update before
     * handing the event to the event base. The object posted must be the
     * one whose event is being added (stdinev), matching every other
     * place that re-arms the stdin read event; posting the proc's stdout
     * read event here referenced an unrelated object. */
    mca_iof_hnp_component.stdinev->active = true;
    ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
    opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
}
} else {
@ -515,6 +520,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
orte_iof_write_output_t *output;
int num_written;
ORTE_ACQUIRE_OBJECT(sink);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s hnp:stdin:write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -558,6 +565,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
* when the fd is ready.
*/
wev->pending = true;
ORTE_POST_OBJECT(wev);
opal_event_add(wev->ev, 0);
goto CHECK;
}
@ -583,13 +591,14 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
* when the fd is ready.
*/
wev->pending = true;
ORTE_POST_OBJECT(wev);
opal_event_add(wev->ev, 0);
goto CHECK;
}
OBJ_RELEASE(output);
}
CHECK:
CHECK:
if (NULL != mca_iof_hnp_component.stdinev &&
!orte_abnormal_term_ordered &&
!mca_iof_hnp_component.stdinev->active) {
@ -610,6 +619,7 @@ CHECK:
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"restarting read event"));
mca_iof_hnp_component.stdinev->active = true;
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
}
}

Просмотреть файл

@ -35,6 +35,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -48,10 +49,13 @@ static void restart_stdin(int fd, short event, void *cbdata)
{
orte_timer_t *tm = (orte_timer_t*)cbdata;
ORTE_ACQUIRE_OBJECT(tm);
if (NULL != mca_iof_hnp_component.stdinev &&
!orte_job_term_ordered &&
!mca_iof_hnp_component.stdinev->active) {
mca_iof_hnp_component.stdinev->active = true;
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
}
@ -74,7 +78,11 @@ bool orte_iof_hnp_stdin_check(int fd)
void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata)
{
bool should_process = orte_iof_hnp_stdin_check(0);
bool should_process;
ORTE_ACQUIRE_OBJECT(mca_iof_hnp_component.stdinev);
should_process = orte_iof_hnp_stdin_check(0);
if (should_process) {
mca_iof_hnp_component.stdinev->active = true;
@ -99,6 +107,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
bool exclusive;
orte_iof_sink_t *sink;
ORTE_ACQUIRE_OBJECT(rev);
/* read up to the fragment size */
numbytes = read(fd, data, sizeof(data));
@ -293,6 +303,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
}
/* re-add the event */
ORTE_POST_OBJECT(rev);
opal_event_add(rev->ev, 0);
return;

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,6 +41,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/iof/iof.h"
@ -81,6 +82,7 @@ void orte_iof_hnp_recv(int status, orte_process_name_t* sender,
!orte_job_term_ordered &&
!mca_iof_hnp_component.stdinev->active) {
mca_iof_hnp_component.stdinev->active = true;
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
}
goto CLEAN_RETURN;

Просмотреть файл

@ -42,6 +42,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
@ -190,10 +191,13 @@ SETUP:
*/
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
proct->revstdout->active = true;
ORTE_POST_OBJECT(proct->revstdout);
opal_event_add(proct->revstdout->ev, 0);
proct->revstderr->active = true;
ORTE_POST_OBJECT(proct->revstderr);
opal_event_add(proct->revstderr->ev, 0);
proct->revstddiag->active = true;
ORTE_POST_OBJECT(proct->revstddiag);
opal_event_add(proct->revstddiag->ev, 0);
}
return ORTE_SUCCESS;
@ -367,6 +371,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
orte_iof_write_output_t *output;
int num_written;
ORTE_ACQUIRE_OBJECT(sink);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
"%s orted:stdin:write:handler writing data to %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -400,6 +406,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
* when the fd is ready.
*/
wev->pending = true;
ORTE_POST_OBJECT(wev);
opal_event_add(wev->ev, 0);
goto CHECK;
}
@ -430,6 +437,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
* when the fd is ready.
*/
wev->pending = true;
ORTE_POST_OBJECT(wev);
opal_event_add(wev->ev, 0);
goto CHECK;
}

Просмотреть файл

@ -35,6 +35,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
@ -52,6 +53,8 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
int32_t numbytes;
orte_iof_proc_t *proct = (orte_iof_proc_t*)rev->proc;
ORTE_ACQUIRE_OBJECT(rev);
/* read up to the fragment size */
#if !defined(__WINDOWS__)
numbytes = read(fd, data, sizeof(data));
@ -100,6 +103,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
}
if (!proct->copy) {
/* re-add the event */
ORTE_POST_OBJECT(rev);
opal_event_add(rev->ev, 0);
return;
}
@ -137,6 +141,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
orte_rml_send_callback, NULL);
/* re-add the event */
ORTE_POST_OBJECT(rev);
opal_event_add(rev->ev, 0);
return;

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,6 +25,7 @@
#include "opal/util/argv.h"
#include "orte/util/attr.h"
#include "orte/util/threads.h"
#include "orte/mca/notifier/base/base.h"
@ -38,6 +39,8 @@ void orte_notifier_base_log(int sd, short args, void *cbdata)
orte_notifier_active_module_t *imod;
int i;
ORTE_ACQUIRE_OBJECT(req);
/* if no modules are active, then there is nothing to do */
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
return;
@ -74,6 +77,8 @@ void orte_notifier_base_event(int sd, short args, void *cbdata)
orte_notifier_active_module_t *imod;
int i;
ORTE_ACQUIRE_OBJECT(req);
/* if no modules are active, then there is nothing to do */
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
return;
@ -110,6 +115,8 @@ void orte_notifier_base_report(int sd, short args, void *cbdata)
orte_notifier_active_module_t *imod;
int i;
ORTE_ACQUIRE_OBJECT(req);
/* if no modules are active, then there is nothing to do */
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
return;

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -51,6 +51,7 @@
#include "orte/types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/threads.h"
BEGIN_C_DECLS
@ -63,7 +64,7 @@ ORTE_DECLSPEC extern int orte_notifier_debug_output;
* The code has NOT been audited for use of malloc, so this still
* may fail to get the "OUT_OF_RESOURCE" message out. Oh Well.
*/
#define ORTE_NOTIFIER_MAX_BUF 512
#define ORTE_NOTIFIER_MAX_BUF 512
/* Severities */
typedef enum {
@ -136,6 +137,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
OPAL_EV_WRITE, orte_notifier_base_log, (_n)); \
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
ORTE_POST_OBJECT(_n); \
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
} while(0);
@ -160,6 +162,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
ORTE_POST_OBJECT(_n); \
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
} while(0);
@ -183,6 +186,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
OPAL_EV_WRITE, orte_notifier_base_event, (_n)); \
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
ORTE_POST_OBJECT(_n); \
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
} while(0);

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -50,18 +50,10 @@
/* Static API's */
static void mylog(orte_notifier_base_severity_t severity, int errcode,
const char *msg, va_list ap);
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
const char *filename,
const char *topic, va_list ap);
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
orte_process_name_t *peer_proc,
const char *msg, va_list ap);
/* Module */
orte_notifier_base_module_t orte_notifier_smtp_module = {
NULL,
NULL,
mylog,
.log = mylog
};
typedef enum {

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -48,11 +48,11 @@ static void myreport(orte_notifier_request_t *req);
/* Module def */
orte_notifier_base_module_t orte_notifier_syslog_module = {
init,
finalize,
mylog,
myevent,
myreport
.init = init,
.finalize = finalize,
.log = mylog,
.event = myevent,
.report = myreport
};
@ -130,4 +130,3 @@ static void myreport(orte_notifier_request_t *req)
orte_job_state_to_str(req->state),
(NULL == req->msg) ? "<N/A>" : req->msg);
}

Просмотреть файл

@ -81,6 +81,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/nidmap.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/orted/orted.h"
@ -582,6 +583,8 @@ static void timer_cb(int fd, short event, void *cbdata)
orte_timer_t *tm = (orte_timer_t*)cbdata;
orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
ORTE_ACQUIRE_OBJECT(tm);
/* increment the number of retries */
ll->retries++;
@ -629,6 +632,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
char *pathenv = NULL, *mpiexec_pathenv = NULL;
char *full_search;
ORTE_ACQUIRE_OBJECT(cd);
/* thread-protect common values */
cd->env = opal_argv_copy(app->env);
@ -820,6 +825,8 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
opal_event_base_t *evb;
char *effective_dir = NULL;
ORTE_ACQUIRE_OBJECT(caddy);
opal_output_verbose(5, orte_odls_base_framework.framework_output,
"%s local:launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

Просмотреть файл

@ -127,6 +127,7 @@
#include "orte/mca/plm/plm.h"
#include "orte/mca/rtc/rtc.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
@ -157,11 +158,11 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
* Module
*/
orte_odls_base_module_t orte_odls_default_module = {
orte_odls_base_default_get_add_procs_data,
orte_odls_default_launch_local_procs,
orte_odls_default_kill_local_procs,
orte_odls_default_signal_local_procs,
orte_odls_default_restart_proc
.get_add_procs_data = orte_odls_base_default_get_add_procs_data,
.launch_local_procs = orte_odls_default_launch_local_procs,
.kill_local_procs = orte_odls_default_kill_local_procs,
.signal_local_procs = orte_odls_default_signal_local_procs,
.restart_proc = orte_odls_default_restart_proc
};

Просмотреть файл

@ -42,9 +42,11 @@
#include "opal/class/opal_hash_table.h"
#include "opal/class/opal_list.h"
#include "opal/util/timings.h"
#include "orte/mca/mca.h"
#include "opal/mca/event/event.h"
#include "orte/mca/mca.h"
#include "orte/util/threads.h"
#include "orte/mca/oob/oob.h"
BEGIN_C_DECLS
@ -119,11 +121,8 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
__FILE__, __LINE__); \
cd = OBJ_NEW(orte_oob_send_t); \
cd->msg = (m); \
opal_event_set(orte_oob_base.ev_base, &cd->ev, -1, \
OPAL_EV_WRITE, \
orte_oob_base_send_nb, cd); \
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
ORTE_THREADSHIFT(cd, orte_oob_base.ev_base, \
orte_oob_base_send_nb, ORTE_MSG_PRI); \
}while(0)
/* Our contact info is actually subject to change as transports
@ -168,11 +167,11 @@ typedef struct {
} mca_oob_uri_req_t;
OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
#define ORTE_OOB_SET_URI(u) \
do { \
mca_oob_uri_req_t *rq; \
rq = OBJ_NEW(mca_oob_uri_req_t); \
rq->uri = strdup((u)); \
#define ORTE_OOB_SET_URI(u) \
do { \
mca_oob_uri_req_t *rq; \
rq = OBJ_NEW(mca_oob_uri_req_t); \
rq->uri = strdup((u)); \
orte_oob_base_set_addr(0, 0, (void*)rq); \
}while(0)

Просмотреть файл

@ -21,7 +21,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/threads.h"
#include "orte/mca/oob/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/state/base/base.h"
@ -32,7 +32,7 @@ static void process_uri(char *uri);
void orte_oob_base_send_nb(int fd, short args, void *cbdata)
{
orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
orte_rml_send_t *msg = cd->msg;
orte_rml_send_t *msg;
mca_base_component_list_item_t *cli;
orte_oob_base_peer_t *pr;
int rc;
@ -42,7 +42,10 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
bool reachable;
char *uri;
ORTE_ACQUIRE_OBJECT(cd);
/* done with this. release it now */
msg = cd->msg;
OBJ_RELEASE(cd);
opal_output_verbose(5, orte_oob_base_framework.framework_output,
@ -276,7 +279,7 @@ void orte_oob_base_get_addr(char **uri)
}
}
unblock:
unblock:
*uri = final;
}
@ -303,7 +306,10 @@ OBJ_CLASS_INSTANCE(mca_oob_uri_req_t,
void orte_oob_base_set_addr(int fd, short args, void *cbdata)
{
mca_oob_uri_req_t *req = (mca_oob_uri_req_t*)cbdata;
char *uri = req->uri;
char *uri;
ORTE_ACQUIRE_OBJECT(req);
uri = req->uri;
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s: set_addr to uri %s",

Просмотреть файл

@ -62,6 +62,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/parse_options.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/oob/tcp/oob_tcp.h"
@ -253,6 +254,8 @@ static void recv_handler(int sd, short flg, void *cbdata)
mca_oob_tcp_hdr_t hdr;
mca_oob_tcp_peer_t *peer;
ORTE_ACQUIRE_OBJECT(op);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:tcp:recv:handler called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

Просмотреть файл

@ -74,6 +74,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/parse_options.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -698,6 +699,9 @@ static void cleanup(int sd, short args, void *cbdata)
{
opal_list_item_t * item;
bool *active = (bool*)cbdata;
ORTE_ACQUIRE_OBJECT(active);
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
OBJ_RELEASE(item);
}
@ -756,6 +760,7 @@ static void component_shutdown(void)
opal_event_set(orte_event_base, &ev, -1,
OPAL_EV_WRITE, cleanup, &active);
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
ORTE_POST_OBJECT(active);
opal_event_active(&ev, OPAL_EV_WRITE, 1);
ORTE_WAIT_FOR_COMPLETION(active);
} else {
@ -1062,6 +1067,8 @@ void mca_oob_tcp_component_set_module(int fd, short args, void *cbdata)
int rc;
orte_oob_base_peer_t *bpr;
ORTE_ACQUIRE_OBJECT(pop);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:set_module called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1093,6 +1100,8 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
orte_oob_base_peer_t *bpr;
int rc;
ORTE_ACQUIRE_OBJECT(pop);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:lost connection called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1128,6 +1137,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
int rc;
orte_oob_base_peer_t *bpr;
ORTE_ACQUIRE_OBJECT(mop);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:no route called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1162,6 +1173,8 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
orte_rml_send_t *snd;
orte_oob_base_peer_t *bpr;
ORTE_ACQUIRE_OBJECT(mop);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:unknown hop called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1235,6 +1248,8 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
{
mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
ORTE_ACQUIRE_OBJECT(pop);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:failed_to_connect called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -63,6 +63,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
@ -152,7 +153,7 @@ static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
{
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
mca_oob_tcp_peer_t *peer = op->peer;
mca_oob_tcp_peer_t *peer;
int rc;
opal_socklen_t addrlen = 0;
mca_oob_tcp_addr_t *addr;
@ -160,6 +161,9 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
mca_oob_tcp_send_t *snd;
bool connected = false;
ORTE_ACQUIRE_OBJECT(op);
peer = op->peer;
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s orte_tcp_peer_try_connect: "
"attempting to connect to proc %s",
@ -586,8 +590,9 @@ void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t *peer)
ORTE_NAME_PRINT(&(peer->name)));
if (!peer->recv_ev_active) {
opal_event_add(&peer->recv_event, 0);
peer->recv_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->recv_event, 0);
}
} else {
opal_output(0, "%s tcp_peer_complete_connect: unable to send connect ack to %s",
@ -608,6 +613,8 @@ static int tcp_peer_send_blocking(int sd, void* data, size_t size)
size_t cnt = 0;
int retval;
ORTE_ACQUIRE_OBJECT(ptr);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s send blocking of %"PRIsize_t" bytes to socket %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -949,8 +956,9 @@ static void tcp_peer_connected(mca_oob_tcp_peer_t* peer)
opal_list_remove_first(&peer->send_queue);
}
if (NULL != peer->send_msg && !peer->send_ev_active) {
opal_event_add(&peer->send_event, 0);
peer->send_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->send_event, 0);
}
}
@ -1214,8 +1222,9 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer)
tcp_peer_connected(peer);
if (!peer->recv_ev_active) {
opal_event_add(&peer->recv_event, 0);
peer->recv_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->recv_event, 0);
}
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
mca_oob_tcp_peer_dump(peer, "accepted");

Просмотреть файл

@ -32,6 +32,7 @@
#include <sys/socket.h>
#endif
#include "orte/util/threads.h"
#include "oob_tcp.h"
#include "oob_tcp_peer.h"
@ -59,10 +60,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
ORTE_NAME_PRINT((&(p)->name))); \
cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \
cop->peer = (p); \
opal_event_set((p)->ev_base, &cop->ev, -1, \
OPAL_EV_WRITE, (cbfunc), cop); \
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
opal_event_active(&cop->ev, OPAL_EV_WRITE, 1); \
ORTE_THREADSHIFT(cop, (p)->ev_base, (cbfunc), ORTE_MSG_PRI); \
} while(0);
#define ORTE_ACTIVATE_TCP_ACCEPT_STATE(s, a, cbfunc) \
@ -72,6 +70,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
opal_event_set(orte_oob_base.ev_base, &cop->ev, s, \
OPAL_EV_READ, (cbfunc), cop); \
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
ORTE_POST_OBJECT(cop); \
opal_event_add(&cop->ev, 0); \
} while(0);
@ -88,6 +87,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
opal_event_evtimer_set((p)->ev_base, \
&cop->ev, \
(cbfunc), cop); \
ORTE_POST_OBJECT(cop); \
opal_event_evtimer_add(&cop->ev, (tv)); \
} while(0);

Просмотреть файл

@ -66,6 +66,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/parse_options.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/oob/tcp/oob_tcp.h"
@ -162,6 +163,7 @@ int orte_oob_tcp_start_listening(void)
connection_event_handler,
0);
opal_event_set_priority(&listener->event, ORTE_MSG_PRI);
ORTE_POST_OBJECT(listener);
opal_event_add(&listener->event, 0);
}
@ -816,6 +818,7 @@ static void* listen_thread(opal_object_t *obj)
}
/* activate the event */
ORTE_POST_OBJECT(pending_connection);
opal_event_active(&pending_connection->ev, OPAL_EV_WRITE, 1);
accepted_connections++;
}
@ -858,6 +861,8 @@ static void connection_handler(int sd, short flags, void* cbdata)
new_connection = (mca_oob_tcp_pending_connection_t*)cbdata;
ORTE_ACQUIRE_OBJECT(new_connection);
opal_output_verbose(4, orte_oob_base_framework.framework_output,
"%s connection_handler: working connection "
"(%d, %d) %s:%d\n",

Просмотреть файл

@ -27,6 +27,7 @@
#include "opal/mca/event/event.h"
#include "orte/util/threads.h"
#include "oob_tcp.h"
#include "oob_tcp_sendrecv.h"
@ -87,10 +88,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t);
if (NULL != proxy) { \
pop->rtmod = strdup(proxy); \
} \
opal_event_set(orte_oob_base.ev_base, &pop->ev, -1, \
OPAL_EV_WRITE, (cbfunc), pop); \
opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \
opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \
ORTE_THREADSHIFT(pop, orte_oob_base.ev_base, \
(cbfunc), ORTE_MSG_PRI); \
} while(0);
#endif /* _MCA_OOB_TCP_PEER_H_ */

Просмотреть файл

@ -64,6 +64,7 @@
#include "opal/mca/event/event.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
@ -82,7 +83,10 @@
void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
{
mca_oob_tcp_send_t *snd = (mca_oob_tcp_send_t*)cbdata;
mca_oob_tcp_peer_t *peer = (mca_oob_tcp_peer_t*)snd->peer;
mca_oob_tcp_peer_t *peer;
ORTE_ACQUIRE_OBJECT(snd);
peer = (mca_oob_tcp_peer_t*)snd->peer;
/* if there is no message on-deck, put this one there */
if (NULL == peer->send_msg) {
@ -99,8 +103,9 @@ void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
} else {
/* ensure the send event is active */
if (!peer->send_ev_active) {
opal_event_add(&peer->send_event, 0);
peer->send_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->send_event, 0);
}
}
}
@ -196,9 +201,12 @@ static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
{
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
mca_oob_tcp_send_t* msg = peer->send_msg;
mca_oob_tcp_send_t* msg;
int rc;
ORTE_ACQUIRE_OBJECT(peer);
msg = peer->send_msg;
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s tcp:send_handler called to send to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -424,6 +432,8 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
int rc;
orte_rml_send_t *snd;
ORTE_ACQUIRE_OBJECT(peer);
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s:tcp:recv:handler called for peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -437,8 +447,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* we connected! Start the send/recv events */
if (!peer->recv_ev_active) {
opal_event_add(&peer->recv_event, 0);
peer->recv_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->recv_event, 0);
}
if (peer->timer_ev_active) {
opal_event_del(&peer->timer_event);
@ -449,8 +460,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
}
if (NULL != peer->send_msg && !peer->send_ev_active) {
opal_event_add(&peer->send_event, 0);
peer->send_ev_active = true;
ORTE_POST_OBJECT(peer);
opal_event_add(&peer->send_event, 0);
}
/* update our state */
peer->state = MCA_OOB_TCP_CONNECTED;

Просмотреть файл

@ -28,7 +28,7 @@
#include "opal/class/opal_list.h"
#include "orte/mca/rml/base/base.h"
#include "orte/util/threads.h"
#include "oob_tcp.h"
#include "oob_tcp_hdr.h"
@ -82,10 +82,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
do { \
(s)->peer = (struct mca_oob_tcp_peer_t*)(p); \
(s)->activate = (f); \
opal_event_set((p)->ev_base, &(s)->ev, -1, \
OPAL_EV_WRITE, mca_oob_tcp_queue_msg, (s)); \
opal_event_set_priority(&(s)->ev, ORTE_MSG_PRI); \
opal_event_active(&(s)->ev, OPAL_EV_WRITE, 1); \
ORTE_THREADSHIFT((s), (p)->ev_base, \
mca_oob_tcp_queue_msg, ORTE_MSG_PRI); \
} while(0)
/* queue a message to be sent by one of our modules - must
@ -134,7 +132,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
/* add to the msg queue for this peer */ \
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
}while(0);
} while(0)
/* queue a message to be sent by one of our modules upon completing
* the connection process - must provide the following params:
@ -182,7 +180,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
/* add to the msg queue for this peer */ \
MCA_OOB_TCP_QUEUE_MSG((p), _s, false); \
}while(0);
} while(0)
/* queue a message for relay by one of our modules - must
* provide the following params:
@ -217,7 +215,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
/* add to the msg queue for this peer */ \
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
}while(0);
} while(0)
/* State machine for processing message */
typedef struct {
@ -237,10 +235,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_op_t);
ORTE_NAME_PRINT(&((ms)->dst))); \
mop = OBJ_NEW(mca_oob_tcp_msg_op_t); \
mop->msg = (ms); \
opal_event_set((ms)->peer->ev_base, &mop->ev, -1, \
OPAL_EV_WRITE, (cbfunc), mop); \
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
ORTE_THREADSHIFT(mop, (ms)->peer->ev_base, \
(cbfunc), ORTE_MSG_PRI); \
} while(0);
typedef struct {
@ -285,11 +281,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
mop->hop.jobid = (h)->jobid; \
mop->hop.vpid = (h)->vpid; \
/* this goes to the OOB framework, so use that event base */ \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
OPAL_EV_WRITE, (cbfunc), mop); \
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
} while(0);
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
(cbfunc), ORTE_MSG_PRI); \
} while(0)
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
do { \
@ -305,10 +299,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
mop->hop.vpid = (h)->vpid; \
/* this goes to the component, so use the framework \
* event base */ \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
OPAL_EV_WRITE, (c), mop); \
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
} while(0);
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
(c), ORTE_MSG_PRI); \
} while(0)
#endif /* _MCA_OOB_TCP_SENDRECV_H_ */

Просмотреть файл

@ -55,13 +55,14 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/basename.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
@ -187,6 +188,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
char *ltmp;
ORTE_ACQUIRE_OBJECT(state);
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/

Просмотреть файл

@ -74,6 +74,7 @@
#include "orte/util/pre_condition_transports.h"
#include "orte/util/proc_info.h"
#include "orte/util/regex.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "orte/util/hostfile/hostfile.h"
@ -129,6 +130,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
orte_node_t *node;
int i;
ORTE_ACQUIRE_OBJECT(caddy);
/* if we are not launching, then we just assume that all
* daemons share our topology */
if (orte_do_not_launch) {
@ -182,6 +185,8 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* move the state machine along */
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
@ -194,6 +199,8 @@ void orte_plm_base_daemons_launched(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* do NOT increment the state - we wait for the
* daemons to report that they have actually
* started before moving to the right state
@ -217,6 +224,8 @@ void orte_plm_base_vm_ready(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
@ -233,6 +242,8 @@ void orte_plm_base_mapping_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* move the state machine along */
caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
@ -252,6 +263,8 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
orte_job_t *parent;
orte_process_name_t name, *nptr;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setup_job",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -357,6 +370,8 @@ void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* nothing to do here but move along */
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
OBJ_RELEASE(caddy);
@ -372,6 +387,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
int i, rc;
char *serial_number;
ORTE_ACQUIRE_OBJECT(caddy);
opal_output_verbose(5, orte_plm_base_framework.framework_output,
"%s complete_setup on job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -465,6 +482,8 @@ static void timer_cb(int fd, short event, void *cbdata)
orte_job_t *jdata = (orte_job_t*)cbdata;
orte_timer_t *timer=NULL;
ORTE_ACQUIRE_OBJECT(jdata);
/* declare launch failed */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
@ -486,6 +505,8 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
orte_timer_t *timer;
orte_grpcomm_signature_t *sig;
ORTE_ACQUIRE_OBJECT(caddy);
/* convenience */
jdata = caddy->jdata;
@ -587,6 +608,7 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
timer->tv.tv_sec = orte_startup_timeout;
timer->tv.tv_usec = 0;
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
ORTE_POST_OBJECT(timer);
opal_event_evtimer_add(timer->ev, &timer->tv);
}
@ -605,6 +627,8 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
opal_buffer_t *answer;
int room, *rmptr;
ORTE_ACQUIRE_OBJECT(caddy);
/* convenience */
jdata = caddy->jdata;
@ -720,6 +744,8 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
opal_buffer_t *answer;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* convenience */
jdata = caddy->jdata;
@ -793,7 +819,7 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
return;
}
cleanup:
cleanup:
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);

Просмотреть файл

@ -14,7 +14,7 @@
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -38,6 +38,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
@ -114,6 +115,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
{
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(state);
/* there are no daemons to launch, so just trigger the
* daemon-launch-complete state
*/

Просмотреть файл

@ -66,6 +66,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/state/state.h"
#include "orte/util/threads.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/base.h"
@ -171,7 +172,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
orte_std_cntr_t nnode;
orte_job_t *daemons;
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = state->jdata;
orte_job_t *jdata;
ORTE_ACQUIRE_OBJECT(state);
jdata = state->jdata;
/* start by setting up the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

Просмотреть файл

@ -80,6 +80,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/util/proc_info.h"
#include "orte/util/threads.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -926,6 +927,8 @@ static void process_launch_list(int fd, short args, void *cbdata)
pid_t pid;
orte_plm_rsh_caddy_t *caddy;
ORTE_ACQUIRE_OBJECT(caddy);
while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
item = opal_list_remove_first(&launch_list);
if (NULL == item) {
@ -1021,6 +1024,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
orte_namelist_t *child;
char *rtmod;
ORTE_ACQUIRE_OBJECT(state);
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/
@ -1285,6 +1290,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:rsh: activating launch event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_POST_OBJECT(state);
opal_event_active(&launch_event, EV_WRITE, 1);
/* now that we've launched the daemons, let the daemon callback

Просмотреть файл

@ -61,6 +61,7 @@
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_quit.h"
@ -108,7 +109,6 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
*/
static pid_t primary_srun_pid = 0;
static bool primary_pid_set = false;
static bool launching_daemons;
static void launch_daemons(int fd, short args, void *cbdata);
/**
@ -189,6 +189,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
orte_job_t *daemons;
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(state);
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: LAUNCH DAEMONS CALLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -545,27 +547,18 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
/* if we are in the launch phase, then any termination is bad */
if (launching_daemons) {
/* report that one or more daemons failed to launch so we can exit */
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: daemon failed during launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* notify the error manager */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
} else {
/* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error
/* abort only if the status returned is non-zero - i.e., if
* the orteds exited with an error
*/
if (0 != proc->exit_code) {
/* an orted must have died unexpectedly - report
* that the daemon has failed so we exit
*/
if (0 != proc->exit_code) {
/* an orted must have died unexpectedly after launch - report
* that the daemon has failed so we exit
*/
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: daemon failed while running",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: daemon failed while running",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
} else {
/* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == proc->pid) {
/* in this case, we just want to fire the proper trigger so
@ -579,6 +572,7 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
}
/* done with this dummy */
OBJ_RELEASE(proc);
}
@ -602,6 +596,13 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
free(exec_argv);
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
/* if this is the primary launch - i.e., not a comm_spawn of a
* child job - then save the pid
*/
if (0 < srun_pid && !primary_pid_set) {
primary_srun_pid = srun_pid;
primary_pid_set = true;
}
/* setup a dummy proc object to track the srun */
dummy = OBJ_NEW(orte_proc_t);
@ -692,14 +693,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
sides of the fork... */
setpgid(srun_pid, srun_pid);
/* if this is the primary launch - i.e., not a comm_spawn of a
* child job - then save the pid
*/
if (!primary_pid_set) {
primary_srun_pid = srun_pid;
primary_pid_set = true;
}
free(exec_argv);
}

Просмотреть файл

@ -63,6 +63,7 @@
#include "opal/util/basename.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h"
@ -185,6 +186,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
int32_t launchid, *ldptr;
char *prefix_dir = NULL;
ORTE_ACQUIRE_OBJECT(state);
jdata = state->jdata;
/* if we are launching debugger daemons, then just go
@ -403,7 +406,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:tm:launch: finished spawning orteds",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
cleanup:
cleanup:
/* cleanup */
OBJ_RELEASE(state);
@ -421,6 +424,8 @@ static void poll_spawns(int fd, short args, void *cbdata)
int local_err;
tm_event_t event;
ORTE_ACQUIRE_OBJECT(state);
/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
@ -435,7 +440,7 @@ static void poll_spawns(int fd, short args, void *cbdata)
}
failed_launch = false;
cleanup:
cleanup:
/* cleanup */
OBJ_RELEASE(state);

Просмотреть файл

@ -45,6 +45,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/comm/comm.h"
#include "orte/util/error_strings.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_quit.h"
@ -115,6 +116,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
char *hosts=NULL;
ORTE_ACQUIRE_OBJECT(caddy);
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:allocate",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -36,6 +36,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/mca/state/state.h"
#include "orte/mca/rmaps/base/base.h"
@ -45,7 +46,7 @@
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
orte_node_t *node;
int rc, i, ppx = 0;
bool did_map, given, pernode = false;
@ -54,6 +55,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
orte_vpid_t nprocs;
orte_app_context_t *app;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
jdata->state = ORTE_JOB_STATE_MAP;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,

Просмотреть файл

@ -29,6 +29,7 @@
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/rml/base/base.h"
@ -87,8 +88,10 @@ static void cleanup(int sd, short args, void *cbdata)
{
volatile bool *active = (volatile bool*)cbdata;
ORTE_ACQUIRE_OBJECT(active);
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
if (NULL != active) {
ORTE_POST_OBJECT(active);
*active = false;
}
}
@ -128,6 +131,7 @@ static int orte_rml_base_close(void)
opal_event_set(orte_event_base, &ev, -1,
OPAL_EV_WRITE, cleanup, (void*)&active);
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
ORTE_POST_OBJECT(ev);
opal_event_active(&ev, OPAL_EV_WRITE, 1);
ORTE_WAIT_FOR_COMPLETION(active);
} else {
@ -243,12 +247,14 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
{
orte_rml_recv_cb_t *blob = (orte_rml_recv_cb_t*)cbdata;
ORTE_ACQUIRE_OBJECT(blob);
/* transfer the sender */
blob->name.jobid = sender->jobid;
blob->name.vpid = sender->vpid;
/* just copy the payload to the buf */
opal_dss.copy_payload(&blob->data, buffer);
/* flag as complete */
ORTE_POST_OBJECT(blob);
blob->active = false;
}

Просмотреть файл

@ -42,6 +42,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/base.h"
@ -57,6 +58,8 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata)
orte_rml_posted_recv_t *post, *recv;
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
ORTE_ACQUIRE_OBJECT(req);
opal_output_verbose(5, orte_rml_base_framework.framework_output,
"%s posting recv",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -159,6 +162,8 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
opal_buffer_t buf;
ORTE_ACQUIRE_OBJECT(msg);
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output,
"%s message received from %s for tag %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -30,6 +30,7 @@
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/mca/rml/base/base.h"
@ -269,11 +270,7 @@ void orte_rml_API_recv_nb(orte_process_name_t* peer,
req->post->persistent = persistent;
req->post->cbfunc.iov = cbfunc;
req->post->cbdata = cbdata;
opal_event_set(orte_event_base, &req->ev, -1,
OPAL_EV_WRITE,
orte_rml_base_post_recv, req);
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
}
/** Receive non-blocking buffer message */
@ -300,11 +297,7 @@ void orte_rml_API_recv_buffer_nb(orte_process_name_t* peer,
req->post->persistent = persistent;
req->post->cbfunc.buffer = cbfunc;
req->post->cbdata = cbdata;
opal_event_set(orte_event_base, &req->ev, -1,
OPAL_EV_WRITE,
orte_rml_base_post_recv, req);
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
}
/** Cancel posted non-blocking receive */
@ -316,6 +309,8 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
"%s rml_recv_cancel for peer %s tag %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), tag);
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
if (!orte_event_base_active) {
/* no event will be processed any more, so simply return. */
return;
@ -328,11 +323,7 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
req->post->peer.jobid = peer->jobid;
req->post->peer.vpid = peer->vpid;
req->post->tag = tag;
opal_event_set(orte_event_base, &req->ev, -1,
OPAL_EV_WRITE,
orte_rml_base_post_recv, req);
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
}
/** Purge information */

Просмотреть файл

@ -29,6 +29,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/oob/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/base/base.h"
@ -39,6 +40,8 @@ static void send_self_exe(int fd, short args, void* data)
{
orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data;
ORTE_ACQUIRE_OBJECT(xfer);
OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
"%s rml_send_to_self callback executing for tag %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag));
@ -130,9 +133,7 @@ int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod,
xfer->tag = tag;
xfer->cbdata = cbdata;
/* setup the event for the send callback */
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
/* copy the message for the recv */
rcv = OBJ_NEW(orte_rml_recv_t);
@ -235,9 +236,7 @@ int orte_rml_oob_send_buffer_nb(struct orte_rml_base_module_t *mod,
xfer->tag = tag;
xfer->cbdata = cbdata;
/* setup the event for the send callback */
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
/* copy the message for the recv */
rcv = OBJ_NEW(orte_rml_recv_t);

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
@ -39,11 +39,9 @@ static void set(orte_job_t *jdata,
int write_fd);
orte_rtc_base_module_t orte_rtc_hwloc_module = {
init,
finalize,
NULL,
set,
NULL
.init = init,
.finalize = finalize,
.set = set
};
static int init(void)

Просмотреть файл

@ -36,6 +36,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/mca/state/base/base.h"
#include "orte/mca/state/base/state_private.h"
@ -78,9 +79,7 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
caddy->job_state = state;
OBJ_RETAIN(jdata);
}
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
return;
}
}
@ -107,14 +106,12 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
caddy->job_state = state;
OBJ_RETAIN(jdata);
}
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
"%s ACTIVATING JOB %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(state), s->priority));
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
"%s ACTIVATING JOB %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(state), s->priority));
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
}
@ -262,9 +259,7 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
caddy = OBJ_NEW(orte_state_caddy_t);
caddy->name = *proc;
caddy->proc_state = state;
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
return;
}
}
@ -288,14 +283,12 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
caddy = OBJ_NEW(orte_state_caddy_t);
caddy->name = *proc;
caddy->proc_state = state;
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
"%s ACTIVATING PROC %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), s->priority));
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
opal_event_set_priority(&caddy->ev, s->priority);
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
"%s ACTIVATING PROC %s STATE %s PRI %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state), s->priority));
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
}
int orte_state_base_add_proc_state(orte_proc_state_t state,
@ -443,7 +436,10 @@ void orte_state_base_local_launch_complete(int fd, short argc, void *cbdata)
void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
"%s state:base:cleanup on job %s",
@ -460,9 +456,12 @@ void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
void orte_state_base_report_progress(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
OBJ_RELEASE(caddy);
@ -659,14 +658,18 @@ static void _send_notification(int status,
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_process_name_t *proc = &caddy->name;
orte_proc_state_t state = caddy->proc_state;
orte_process_name_t *proc;
orte_proc_state_t state;
orte_job_t *jdata;
orte_proc_t *pdata;
int i;
char *rtmod;
orte_process_name_t parent, target, *npptr;
ORTE_ACQUIRE_OBJECT(caddy);
proc = &caddy->name;
state = caddy->proc_state;
opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:track_procs called for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -811,8 +814,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
orte_proc_t *proc;
int i;
orte_std_cntr_t j;
@ -827,6 +829,9 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
void *nptr;
char *rtmod;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
opal_output_verbose(2, orte_state_base_framework.framework_output,
"%s state:base:check_job_complete on job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -31,6 +31,7 @@
#include "orte/mca/routed/routed.h"
#include "orte/util/nidmap.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_wait.h"
@ -223,6 +224,8 @@ static void init_complete(int sd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* nothing to do here but move along - if it is the
* daemon job, then next step is allocate */
if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
@ -249,6 +252,8 @@ static void vm_ready(int fd, short args, void *cbdata)
int32_t numbytes;
char *nidmap;
ORTE_ACQUIRE_OBJECT(caddy);
/* if this is my job, then we are done */
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
/* send the daemon map to every daemon in this DVM - we
@ -353,8 +358,7 @@ static void vm_ready(int fd, short args, void *cbdata)
static void check_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
orte_proc_t *proc;
int i;
orte_node_t *node;
@ -362,6 +366,9 @@ static void check_complete(int fd, short args, void *cbdata)
orte_std_cntr_t index;
char *rtmod;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
opal_output_verbose(2, orte_state_base_framework.framework_output,
"%s state:dvm:check_job_complete on job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -472,7 +479,10 @@ static void check_complete(int fd, short args, void *cbdata)
static void cleanup_job(int sd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = caddy->jdata;
orte_job_t *jdata;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = caddy->jdata;
/* remove this object from the job array */
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);

Просмотреть файл

@ -26,6 +26,7 @@
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_quit.h"
#include "orte/mca/state/state.h"
@ -196,12 +197,15 @@ static int finalize(void)
static void allocation_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = state->jdata;
orte_job_t *jdata;
orte_job_t *daemons;
orte_topology_t *t;
orte_node_t *node;
int i;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = state->jdata;
jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
/* get the daemon job object */
@ -252,7 +256,10 @@ static void allocation_complete(int fd, short args, void *cbdata)
static void map_complete(int fd, short args, void *cbdata)
{
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = state->jdata;
orte_job_t *jdata;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = state->jdata;
jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
/* move to the map stage */
@ -265,7 +272,10 @@ static void map_complete(int fd, short args, void *cbdata)
static void vm_ready(int fd, short args, void *cbdata)
{
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata = state->jdata;
orte_job_t *jdata;
ORTE_ACQUIRE_OBJECT(caddy);
jdata = state->jdata;
/* now that the daemons are launched, we are ready
* to roll

Просмотреть файл

@ -27,6 +27,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
#include "orte/util/threads.h"
#include "orte/orted/pmix/pmix_server_internal.h"
#include "orte/runtime/orte_data_server.h"
#include "orte/runtime/orte_quit.h"
@ -165,6 +166,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
orte_proc_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
ORTE_ACQUIRE_OBJECT(caddy);
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted:track_jobs sending local launch complete for job %s",
@ -251,8 +254,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
static void track_procs(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_process_name_t *proc = &caddy->name;
orte_proc_state_t state = caddy->proc_state;
orte_process_name_t *proc;
orte_proc_state_t state;
orte_job_t *jdata;
orte_proc_t *pdata, *pptr;
opal_buffer_t *alert;
@ -264,6 +267,10 @@ static void track_procs(int fd, short argc, void *cbdata)
orte_node_t *node;
orte_process_name_t target;
ORTE_ACQUIRE_OBJECT(caddy);
proc = &caddy->name;
state = caddy->proc_state;
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted:track_procs called for proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -77,6 +77,7 @@
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/compress.h"
#include "orte/util/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
@ -919,6 +920,7 @@ int orte_daemon(int argc, char *argv[])
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
/* ensure all local procs are dead */
orte_odls.kill_local_procs(NULL);

Просмотреть файл

@ -68,6 +68,7 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "pmix_server.h"
@ -350,6 +351,8 @@ static void _mdxresp(int sd, short args, void *cbdata)
int rc;
opal_buffer_t *reply;
ORTE_ACQUIRE_OBJECT(req);
/* check us out of the hotel */
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
@ -399,6 +402,8 @@ static void modex_resp(int status,
pmix_server_req_t *req = (pmix_server_req_t*)cbdata;
opal_buffer_t xfer;
ORTE_ACQUIRE_OBJECT(req);
req->status = status;
/* we need to preserve the data as the caller
* will free it upon our return */
@ -413,6 +418,7 @@ static void modex_resp(int status,
opal_event_set(orte_event_base, &(req->ev),
-1, OPAL_EV_WRITE, _mdxresp, req);
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(req);
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
}
static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,

Просмотреть файл

@ -44,6 +44,7 @@
#include "orte/mca/rmaps/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
@ -103,6 +104,8 @@ static void spawn(int sd, short args, void *cbdata)
opal_buffer_t *buf;
orte_plm_cmd_flag_t command;
ORTE_ACQUIRE_OBJECT(req);
/* add this request to our tracker hotel */
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
@ -351,6 +354,8 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
orte_job_t *jdata;
opal_buffer_t buf;
ORTE_ACQUIRE_OBJECT(cd);
/* if we failed to get the required data, then just inform
* the embedded server that the connect cannot succeed */
if (ORTE_SUCCESS != status || NULL == data) {
@ -402,6 +407,8 @@ static void _cnct(int sd, short args, void *cbdata)
orte_job_t *jdata;
int rc = ORTE_SUCCESS;
ORTE_ACQUIRE_OBJECT(cd);
/* at some point, we need to add bookeeping to track which
* procs are "connected" so we know who to notify upon
* termination or failure. For now, we have to ensure
@ -477,6 +484,8 @@ static void mdxcbfunc(int status,
{
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(cd);
/* ack the call */
if (NULL != cd->cbfunc) {
cd->cbfunc(status, cd->cbdata);

Просмотреть файл

@ -38,6 +38,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
@ -59,6 +60,8 @@ static void pmix_server_release(int status, opal_buffer_t *buf, void *cbdata)
int32_t ndata = 0;
int rc = OPAL_SUCCESS;
ORTE_ACQUIRE_OBJECT(cd);
/* unload the buffer */
if (NULL != buf) {
rc = opal_dss.unload(buf, (void**)&data, &ndata);
@ -135,6 +138,8 @@ static void dmodex_req(int sd, short args, void *cbdata)
uint8_t *data=NULL;
int32_t sz=0;
ORTE_ACQUIRE_OBJECT(rq);
/* a race condition exists here because of the thread-shift - it is
* possible that data for the specified proc arrived while we were
* waiting to be serviced. In that case, the tracker that would have

Просмотреть файл

@ -43,6 +43,7 @@
#include "orte/mca/schizo/schizo.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/plm/plm.h"
@ -57,6 +58,8 @@ static void _client_conn(int sd, short args, void *cbdata)
orte_proc_t *p, *ptr;
int i;
ORTE_ACQUIRE_OBJECT(cd);
if (NULL != cd->server_object) {
/* we were passed back the orte_proc_t */
p = (orte_proc_t*)cd->server_object;
@ -106,6 +109,8 @@ static void _client_finalized(int sd, short args, void *cbdata)
orte_proc_t *p, *ptr;
int i;
ORTE_ACQUIRE_OBJECT(cd);
if (NULL != cd->server_object) {
/* we were passed back the orte_proc_t */
p = (orte_proc_t*)cd->server_object;
@ -164,6 +169,8 @@ static void _client_abort(int sd, short args, void *cbdata)
orte_proc_t *p, *ptr;
int i;
ORTE_ACQUIRE_OBJECT(cd);
if (NULL != cd->server_object) {
p = (orte_proc_t*)cd->server_object;
} else {
@ -214,6 +221,8 @@ static void _register_events(int sd, short args, void *cbdata)
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
opal_value_t *info;
ORTE_ACQUIRE_OBJECT(cd);
/* the OPAL layer "owns" the list, but let's deconstruct it
* here so we don't have to duplicate the data */
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
@ -246,6 +255,8 @@ static void _deregister_events(int sd, short args, void *cbdata)
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
opal_value_t *info, *iptr, *nptr;
ORTE_ACQUIRE_OBJECT(cd);
/* the OPAL layer "owns" the list, but let's deconstruct it
* here for consistency */
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
@ -281,6 +292,8 @@ static void _notify_release(int status, void *cbdata)
{
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(cd);
if (NULL != cd->info) {
OPAL_LIST_RELEASE(cd->info);
}
@ -465,6 +478,8 @@ static void _query(int sd, short args, void *cbdata)
opal_pstats_t pstat;
float pss;
ORTE_ACQUIRE_OBJECT(cd);
opal_output_verbose(2, orte_pmix_server_globals.output,
"%s processing query",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -654,6 +669,7 @@ int pmix_server_query_fn(opal_process_name_t *requestor,
opal_event_set(orte_event_base, &(cd->ev), -1,
OPAL_EV_WRITE, _query, cd);
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(cd);
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
return ORTE_SUCCESS;
@ -669,6 +685,8 @@ static void _toolconn(int sd, short args, void *cbdata)
orte_process_name_t tool;
int rc;
ORTE_ACQUIRE_OBJECT(cd);
opal_output_verbose(2, orte_pmix_server_globals.output,
"%s TOOL CONNECTION PROCESSING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@ -768,6 +786,7 @@ void pmix_tool_connected_fn(opal_list_t *info,
opal_event_set(orte_event_base, &(cd->ev), -1,
OPAL_EV_WRITE, _toolconn, cd);
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(cd);
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
}

Просмотреть файл

@ -43,9 +43,11 @@
#include "opal/mca/event/event.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/proc.h"
#include "opal/sys/atomic.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/threads.h"
BEGIN_C_DECLS
@ -119,6 +121,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
opal_event_set(orte_event_base, &(_req->ev), \
-1, OPAL_EV_WRITE, (cf), _req); \
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
ORTE_POST_OBJECT(_req); \
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
} while(0);
@ -133,6 +136,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
opal_event_set(orte_event_base, &(_req->ev), \
-1, OPAL_EV_WRITE, (cf), _req); \
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
ORTE_POST_OBJECT(_req); \
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
} while(0);
@ -147,6 +151,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
opal_event_set(orte_event_base, &(_cd->ev), -1, \
OPAL_EV_WRITE, (fn), _cd); \
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
ORTE_POST_OBJECT(_cd); \
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
} while(0);
@ -165,6 +170,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
opal_event_set(orte_event_base, &(_cd->ev), -1, \
OPAL_EV_WRITE, (fn), _cd); \
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
ORTE_POST_OBJECT(_cd); \
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
} while(0);

Просмотреть файл

@ -39,6 +39,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_data_server.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
@ -150,6 +151,8 @@ static void execute(int sd, short args, void *cbdata)
opal_buffer_t *xfer;
orte_process_name_t *target;
ORTE_ACQUIRE_OBJECT(req);
if (!orte_pmix_server_globals.pubsub_init) {
/* we need to initialize our connection to the server */
if (ORTE_SUCCESS != (rc = init_server())) {
@ -298,6 +301,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
opal_event_set(orte_event_base, &(req->ev),
-1, OPAL_EV_WRITE, execute, req);
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(req);
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
return OPAL_SUCCESS;
@ -395,6 +399,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
opal_event_set(orte_event_base, &(req->ev),
-1, OPAL_EV_WRITE, execute, req);
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(req);
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
return OPAL_SUCCESS;
@ -483,6 +488,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
opal_event_set(orte_event_base, &(req->ev),
-1, OPAL_EV_WRITE, execute, req);
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
ORTE_POST_OBJECT(req);
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
return OPAL_SUCCESS;

Просмотреть файл

@ -54,6 +54,7 @@
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
@ -75,6 +76,8 @@ void orte_quit(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
ORTE_ACQUIRE_OBJECT(caddy);
/* cleanup */
if (NULL != caddy) {
OBJ_RELEASE(caddy);
@ -135,6 +138,7 @@ void orte_quit(int fd, short args, void *cbdata)
* so we will exit
*/
orte_event_base_active = false;
ORTE_POST_OBJECT(orte_event_base_active);
/* break out of the event loop */
opal_event_base_loopbreak(orte_event_base);
}

Просмотреть файл

@ -13,7 +13,7 @@
* reserved.
* Copyright (c) 2008 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -60,6 +60,7 @@
#include "orte/constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -188,6 +189,8 @@ static void cancel_callback(int fd, short args, void *cbdata)
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
orte_wait_tracker_t *t2;
ORTE_ACQUIRE_OBJECT(trk);
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
if (t2->child == trk->child) {
opal_list_remove_item(&pending_cbs, &t2->super);
@ -214,9 +217,7 @@ void orte_wait_cb_cancel(orte_proc_t *child)
trk = OBJ_NEW(orte_wait_tracker_t);
OBJ_RETAIN(child); // protect against race conditions
trk->child = child;
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, cancel_callback, trk);
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
ORTE_THREADSHIFT(trk, orte_event_base, cancel_callback, ORTE_SYS_PRI);
}
@ -228,6 +229,8 @@ static void wait_signal_callback(int fd, short event, void *arg)
pid_t pid;
orte_wait_tracker_t *t2;
ORTE_ACQUIRE_OBJECT(signal);
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) {
return;
}

Просмотреть файл

@ -13,7 +13,7 @@
* et Automatique. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -48,6 +48,7 @@
#include "orte/types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/threads.h"
BEGIN_C_DECLS
@ -95,6 +96,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
struct timespec tp = {0, 100000}; \
nanosleep(&tp, NULL); \
} \
ORTE_ACQUIRE_OBJECT(flg); \
}while(0);
/**
@ -135,6 +137,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
"defining timeout: %ld sec %ld usec at %s:%d", \
(long)tmp->tv.tv_sec, (long)tmp->tv.tv_usec, \
__FILE__, __LINE__)); \
ORTE_POST_OBJECT(tmp); \
opal_event_evtimer_add(tmp->ev, &tmp->tv); \
}while(0); \
@ -161,6 +164,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
"defining timer event: %ld sec %ld usec at %s:%d", \
(long)tm->tv.tv_sec, (long)tm->tv.tv_usec, \
__FILE__, __LINE__)); \
ORTE_POST_OBJECT(tm); \
opal_event_evtimer_add(tm->ev, &tm->tv); \
}while(0); \

Просмотреть файл

@ -1,7 +1,7 @@
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
mapper reducer opal_hotel orte_dfs ulfm pmixtool
mapper reducer opal_hotel orte_dfs ulfm pmixtool threads
all: $(PROGS)
@ -19,3 +19,6 @@ oob_stress:
pmixtool:
ortecc -o pmixtool pmixtool.c -lpmix
# Memory-consistency test for the thread-shift design.  Built without
# optimization (-O0).  Libraries are listed after the source file, as in the
# pmixtool rule, so that linkers which resolve symbols left-to-right still
# pick up pthread and hwloc.
threads:
	ortecc -O0 -g threads.c -o threads -lpthread -lhwloc

335
orte/test/system/threads.c Обычный файл
Просмотреть файл

@ -0,0 +1,335 @@
/*
* Test program for memory consistency in a thread shifting design
*
*
* Run:
* ./threads ITERATIONS [MODE]
* ./threads 9000000 3
*
* Example:
* ./threads 9000000 0 --> Will fail, no memory barriers
* ./threads 9000000 1 --> Will fail, no WMB
* ./threads 9000000 2 --> Will fail, no RMB
* ./threads 9000000 3 --> Success
* ./threads 9000000 4 --> Success
* ./threads 9000000 5 --> N/A
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#include <hwloc.h>
#include "opal/sys/atomic.h"
// Max value for an int16_t.
// Used as the modulus that keeps the per-iteration counter stored in
// data.int16 within the int16_t range.
#define MAX_VAL 32767
// Payload handed from the support thread back to the main thread.
// In this file the support thread sets 'type' to 999 and writes data.int16;
// the main thread checks exactly those two fields. The remaining union
// members are not referenced by the code in this file.
typedef struct {
int type;
union {
bool flag;
int integer;
int8_t int8;
int16_t int16;
int32_t int32;
int64_t int64;
//char padding[1];
} data;
} my_value_t;
// Structure to handoff work to the peer thread.
//   working - raised by the main thread to request work, lowered by the
//             support thread once the value has been stored
//   ptr     - set by the main thread to the address of its 'val' pointer;
//             the support thread casts it back to my_value_t** and stores
//             the newly allocated value through it
typedef struct {
volatile bool working;
void *ptr; // Note that adding a volatile here has no effect
} thread_handoff_t;
// Shared object to handoff work (one instance, used by both threads)
thread_handoff_t handoff;
// Indicates if the test has finished.
// Set by the main thread after its work loop and polled by the support
// thread's loop condition. Declared volatile, like handoff.working, so the
// polling load is re-issued on every pass instead of being hoisted out of
// the loop when the file is built with optimization enabled.
volatile bool time_to_stop = false;
// Progress reporting
// PERC_INC is the step, in percent, between two progress lines.
#define PERC_INC 10.0
// Next completion percentage at which the main loop prints a progress line;
// advanced by PERC_INC after each report.
double perc_report_after = PERC_INC;
// Completion percentage of the iteration currently being run.
double perc_current = 0.0;
// Memory barrier modes (bit flags tested with '&' at each barrier site).
// main() maps the numeric MODE argument onto these:
//   0 -> NONE, 1 -> RMB, 2 -> WMB, 3 -> ALL, 4 -> MB, 5 -> XMB
// MB_MODE_XMB is reported in the banner, but no barrier site in this file
// tests it.
#define MB_MODE_NONE 0x0
#define MB_MODE_RMB 0x1
#define MB_MODE_WMB 0x2
#define MB_MODE_MB 0x4
#define MB_MODE_XMB 0x8
#define MB_MODE_ALL (MB_MODE_RMB | MB_MODE_WMB)
// Active barrier mode; defaults to read + write barriers.
int mb_mode = MB_MODE_ALL;
// Shared hwloc topology (so we only have to read it once).
// Initialized and loaded in main(), queried by bind_me_to().
static hwloc_topology_t topo;
// Which object we are binding to
// 4 - sockets with 5 cores each
// 20 - cores with 8 PUs each
// NOTE(review): the counts above presumably describe the machine the test
// was developed on - confirm before relying on them.
//#define OBJ_TYPE HWLOC_OBJ_SOCKET
#define OBJ_TYPE HWLOC_OBJ_CORE
/*
 * Some basic timing support
 *
 * acc_time accumulates the seconds spent in the measured part of each
 * iteration; start_time / stop_time bracket one iteration; delta holds the
 * derived per-iteration average in microseconds.
 */
double acc_time, start_time, stop_time, delta;

/*
 * Current wall-clock time from gettimeofday(), expressed as seconds
 * (with a fractional microsecond part) in a double.
 */
static double get_ts_gettimeofday(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
}
/*
* Bind either the main or support thread far away from each other.
* main_thread == true selects the first OBJ_TYPE object in the topology,
* false selects the last one.
*/
void bind_me_to(bool main_thread);
/*
* Support thread to do the memory allocation and xfer.
* Thread start routine passed to pthread_create(); 'arg' is passed as NULL.
*/
void *value_xfer_thread(void *arg);
/*
 * Main thread
 *
 * Usage: ./threads [ITERATIONS [MODE]]
 *   ITERATIONS  number of handoffs to perform (default 10)
 *   MODE        0-5, selects which memory barriers are issued (default 3)
 *
 * For every iteration the main thread publishes the address of its local
 * 'val' pointer through 'handoff', raises handoff.working, waits for the
 * support thread to lower it again, and then verifies the value that thread
 * allocated. Exits with -1 on the first inconsistency or setup failure and
 * returns 0 on success.
 */
int main(int argc, char **argv) {
    pthread_t support_thread;
    int rc, max_iters = 10, cur_iter;
    my_value_t *val = NULL;
    int mode;

    /*
     * Parse command line arguments
     */
    if( argc > 1 ) {
        max_iters = atoi(argv[1]);
    }

    if( argc > 2 ) {
        mode = atoi(argv[2]);
        if( 0 > mode || mode > 5 ) {
            // The adjacent literals below form ONE format string. A comma
            // between them would make the following literal the %d argument.
            printf("Error: Invalid mode %d\n"
                   "\tNone = 0\n"
                   "\tRMB = 1\n"
                   "\tWMB = 2\n"
                   "\tBoth = 3\n"
                   "\tMB Only = 4\n"
                   "\tXMB Only = 5\n",
                   mode);
            exit(-1);
        }
    }
    else {
        mode = 3;
    }

    switch(mode) {
    case 0:
        mb_mode = MB_MODE_NONE;
        break;
    case 1:
        mb_mode = MB_MODE_RMB;
        break;
    case 2:
        mb_mode = MB_MODE_WMB;
        break;
    case 3:
        mb_mode = MB_MODE_ALL;
        break;
    case 4:
        mb_mode = MB_MODE_MB;
        break;
    case 5:
        mb_mode = MB_MODE_XMB;
        break;
    default:
        // Not reachable: mode was range-checked above
        break;
    }

    // Load hwloc topology. bind_me_to() queries it, so stop on failure
    // instead of continuing with an unusable topology.
    if( 0 != hwloc_topology_init(&topo) ) {
        printf("Error: Failed to initialize the hwloc topology\n");
        exit(-1);
    }
    if( 0 != hwloc_topology_load(topo) ) {
        printf("Error: Failed to load the hwloc topology\n");
        hwloc_topology_destroy(topo);
        exit(-1);
    }

    // Display banner
    printf("---------------------------\n");
    printf("Iterations: %10d\n", max_iters);
    printf("Mode R MB : %10s\n", (mb_mode & MB_MODE_RMB ? "Enabled" : "Disabled") );
    printf("Mode W MB : %10s\n", (mb_mode & MB_MODE_WMB ? "Enabled" : "Disabled") );
    printf("Mode - MB : %10s\n", (mb_mode & MB_MODE_MB ? "Enabled" : "Disabled") );
    printf("Mode X MB : %10s\n", (mb_mode & MB_MODE_XMB ? "Enabled" : "Disabled") );
    printf("---------------------------\n");

    bind_me_to(true);

    handoff.working = false;

    /*
     * Launch supporting thread
     */
    rc = pthread_create(&support_thread, NULL, value_xfer_thread, NULL);
    if( 0 != rc ) {
        printf("Error: Failed to create a thread! %d\n", rc);
        exit(-1);
    }

    /*
     * Main work loop
     */
    acc_time = 0.0;
    for(cur_iter = 0; cur_iter < max_iters; ++cur_iter) {
        perc_current = (cur_iter / ((double)max_iters)) * 100.0;
        if( perc_current > perc_report_after ) {
            // cur_iter is > 0 here because perc_current exceeded a
            // positive threshold
            delta = (acc_time / cur_iter) * 1000000;
            printf("%6.1f %% complete : Iteration %10d / %10d : %6.1f usec / iter\n",
                   perc_current, cur_iter+1, max_iters, delta);
            perc_report_after += PERC_INC;
        }

        start_time = get_ts_gettimeofday();

        // Initialize values
        val = NULL;
        handoff.ptr = &val;

        // NOTE(review): a write barrier is the conventional choice before
        // publishing and a read barrier after acquiring. The pairing below
        // is kept exactly as written because the expected per-mode results
        // in the file header refer to it - confirm before changing.
        if( mb_mode & MB_MODE_RMB ) {
            opal_atomic_rmb();
        }
        if( mb_mode & MB_MODE_MB ) {
            opal_atomic_mb();
        }

        handoff.working = true;

        // Wait for work to finish
        while( handoff.working ) {
            usleep(1);
        }

        if( mb_mode & MB_MODE_WMB ) {
            opal_atomic_wmb();
        }
        if( mb_mode & MB_MODE_MB ) {
            opal_atomic_mb();
        }

        // Inspect values for correctness
        if( NULL == val ) {
            printf("[%10d / %10d] Error: val = %s\n", cur_iter+1, max_iters,
                   (NULL == val ? "NULL" : "Valid") );
            exit(-1);
        }
        else if( 999 != val->type ) {
            printf("[%10d / %10d] Error: val->type = %d\n", cur_iter+1, max_iters, val->type);
            exit(-1);
        }
        else if( (cur_iter+1)%MAX_VAL != val->data.int16 ) {
            printf("[%10d / %10d] Error: val->data.int16 = %d\n", cur_iter+1, max_iters, val->data.int16);
            exit(-1);
        }

        stop_time = get_ts_gettimeofday();
        acc_time += (stop_time - start_time);

        // Yes, this is a memory leak!
        // I need to make sure that the supporting thread is not reusing a
        // previous storage location when it calls malloc. This is to emulate
        // a program that calls malloc after the value was acquired, possibly
        // reusing this memory location.
        //free(val);
        val = NULL;
    }

    // Avoid a 0/0 (NaN) average when no iteration was requested
    delta = (max_iters > 0) ? (acc_time / max_iters) * 1000000 : 0.0;

    /*
     * All done - Cleanup
     */
    time_to_stop = true;
    rc = pthread_join(support_thread, NULL);
    if( 0 != rc ) {
        printf("Error: Failed to join a thread! %d\n", rc);
        exit(-1);
    }

    hwloc_topology_destroy(topo);

    printf("Success - %6.1f usec / iter\n", delta);

    return 0;
}
/*
 * Support thread body.
 *
 * Binds itself away from the main thread, then polls until time_to_stop is
 * set. Whenever handoff.working is raised it allocates a my_value_t, stores
 * it through the pointer published in handoff.ptr, fills in 'type' (999) and
 * 'data.int16' (a running counter modulo MAX_VAL), and lowers
 * handoff.working to release the main thread.
 *
 * arg is unused. The thread terminates through pthread_exit(NULL).
 */
void *value_xfer_thread(void *arg) {
    my_value_t **val = NULL;
    static int var = 0;

    (void)arg;

    // Bind this thread away from the main thread
    bind_me_to(false);

    while( !time_to_stop ) {
        if( handoff.working ) {
            // Make sure I have the right pointer
            if( mb_mode & MB_MODE_WMB ) {
                opal_atomic_wmb();
            }
            if( mb_mode & MB_MODE_MB ) {
                opal_atomic_mb();
            }

            // Allocate and set the value. The pointer is stored first and
            // the fields afterwards, exactly as the test intends.
            val = (my_value_t**)handoff.ptr;
            (*val) = malloc(sizeof(my_value_t));
            if( NULL == (*val) ) {
                // Out of memory: fail loudly rather than dereference NULL
                printf("Error: Failed to allocate a value\n");
                exit(-1);
            }
            (*val)->type = 999;
            (*val)->data.int16 = (++var)%MAX_VAL;

            // Make sure main thread can see the value
            // See 'Examples' -> 'Global thread flag' discussion here:
            // https://www.ibm.com/developerworks/systems/articles/powerpc.html
            if( mb_mode & MB_MODE_RMB ) {
                opal_atomic_rmb();
            }
            if( mb_mode & MB_MODE_MB ) {
                opal_atomic_mb();
            }

            // Release main thread
            handoff.working = false;
        }
        else {
            // wait for work
            usleep(1);
        }
    }

    pthread_exit(NULL);
}
/*
 * Bind the calling thread to one OBJ_TYPE object of the shared topology and
 * print the resulting binding.
 *
 * main_thread: true  -> bind to the first OBJ_TYPE object
 *              false -> bind to the last OBJ_TYPE object
 *
 * Exits with -1 when no suitable object exists or the cpuset used for
 * reporting cannot be allocated. A failed bind is reported but not fatal.
 */
void bind_me_to(bool main_thread) {
    int num_objs;
    hwloc_cpuset_t set;
    char *buffer = NULL;
    hwloc_obj_t obj = NULL;

    // The count can be 0 (no such object) or negative (the type exists at
    // several levels); in both cases there is nothing to look up.
    num_objs = hwloc_get_nbobjs_by_type(topo, OBJ_TYPE);
    if( num_objs > 0 ) {
        obj = hwloc_get_obj_by_type(topo, OBJ_TYPE, (main_thread ? 0 : num_objs-1));
    }

    if( NULL == obj || obj->type != OBJ_TYPE ) {
        printf("Error: Invalid object\n");
        exit(-1);
    }

    if( 0 != hwloc_set_cpubind(topo, obj->cpuset, HWLOC_CPUBIND_THREAD) ) {
        // Keep going: the binding actually in effect is printed below
        printf("Warning: Failed to bind the %s thread\n", (main_thread ? "Main" : "Peer"));
    }

    set = hwloc_bitmap_alloc();
    if( NULL == set ) {
        printf("Error: Failed to allocate a cpuset\n");
        exit(-1);
    }
    hwloc_get_cpubind(topo, set, HWLOC_CPUBIND_THREAD);

    // buffer stays NULL if the bitmap could not be rendered; never hand a
    // NULL pointer to a %s conversion
    hwloc_bitmap_asprintf(&buffer, set);
    printf("%s : [objs = %d] : cpuset is %s\n", (main_thread ? "Main" : "Peer"), num_objs,
           (NULL != buffer ? buffer : "(unknown)"));

    free(buffer);
    hwloc_bitmap_free(set);
}

Просмотреть файл

@ -84,6 +84,7 @@
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/orted/orted.h"
@ -490,6 +491,7 @@ int main(int argc, char *argv[])
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
/* cleanup and leave */
orte_finalize();

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -54,6 +54,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/orted/orted.h"
@ -283,6 +284,7 @@ int main(int argc, char *argv[])
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
/* should never get here, but if we do... */

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -58,6 +58,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/threads.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/runtime/orte_quit.h"
@ -532,6 +533,7 @@ SEND:
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
/***************
* Cleanup

Просмотреть файл

@ -87,6 +87,7 @@
#include "orte/mca/state/state.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/threads.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
@ -198,6 +199,7 @@ int orterun(int argc, char *argv[])
while (orte_event_base_active && launchst.active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
if (orte_debug_flag) {
opal_output(0, "Job %s has launched",
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
@ -209,6 +211,7 @@ int orterun(int argc, char *argv[])
while (orte_event_base_active && completest.active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
if (ORTE_PROC_IS_HNP) {
/* ensure all local procs are dead */

Просмотреть файл

@ -43,14 +43,14 @@ AM_LFLAGS = -Porte_util_hostfile_
LEX_OUTPUT_ROOT = lex.orte_util_hostfile_
headers += \
util/name_fns.h \
util/name_fns.h \
util/proc_info.h \
util/session_dir.h \
util/show_help.h \
util/error_strings.h \
util/context_fns.h \
util/parse_options.h \
util/pre_condition_transports.h \
util/context_fns.h \
util/parse_options.h \
util/pre_condition_transports.h \
util/hnp_contact.h \
util/hostfile/hostfile.h \
util/hostfile/hostfile_lex.h \
@ -60,7 +60,8 @@ headers += \
util/regex.h \
util/attr.h \
util/listener.h \
util/compress.h
util/compress.h \
util/threads.h
lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
util/error_strings.c \
@ -68,9 +69,9 @@ lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
util/proc_info.c \
util/session_dir.c \
util/show_help.c \
util/context_fns.c \
util/parse_options.c \
util/pre_condition_transports.c \
util/context_fns.c \
util/parse_options.c \
util/pre_condition_transports.c \
util/hnp_contact.c \
util/hostfile/hostfile_lex.l \
util/hostfile/hostfile.c \

38
orte/util/threads.h Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
/*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef ORTE_THREADS_H
#define ORTE_THREADS_H
#include "orte_config.h"
#include "opal/sys/atomic.h"
/* provide macros for forward-proofing the shifting
* of objects between threads - at some point, we
* may revamp our threading model */
/* Post an object to another thread. For now the expansion is
* nothing more than a call to opal_atomic_wmb(), i.e. we only
* have a memory barrier (presumably a write barrier - confirm
* against opal/sys/atomic.h).
* The argument o does not appear in the expansion, so it is
* never evaluated; it only names the object being handed off. */
#define ORTE_POST_OBJECT(o) opal_atomic_wmb()
/* Acquire an object from another thread. For now the expansion
* is nothing more than a call to opal_atomic_rmb(), i.e. we only
* have a memory barrier (presumably a read barrier - confirm
* against opal/sys/atomic.h).
* The argument o does not appear in the expansion, so it is
* never evaluated; it only names the object being received. */
#define ORTE_ACQUIRE_OBJECT(o) opal_atomic_rmb()
/* Define a threadshift macro: arrange for callback f to be invoked
* with x as its argument by setting up and activating the event
* embedded in x.
*
* x - pointer to an object that has a member named ev; the
* event is set up in place at &x->ev and x itself is
* passed as the callback argument
* eb - first argument to opal_event_set() (presumably the event
* base that should run the callback - confirm)
* f - callback handed to opal_event_set()
* p - priority handed to opal_event_set_priority()
*
* The steps are order-sensitive: the event is set up and given
* its priority, ORTE_POST_OBJECT() is issued, and only then is
* the event activated with opal_event_active().
*
* x is expanded several times, so do not pass an expression with
* side effects. The body is wrapped in do { ... } while(0) so the
* macro can be used as a single statement.
*
* NOTE(review): this header only includes orte_config.h and
* opal/sys/atomic.h directly; confirm that the opal_event_*
* declarations and OPAL_EV_WRITE are visible wherever this macro
* is expanded. */
#define ORTE_THREADSHIFT(x, eb, f, p) \
do { \
opal_event_set((eb), &((x)->ev), -1, OPAL_EV_WRITE, (f), (x)); \
opal_event_set_priority(&((x)->ev), (p)); \
ORTE_POST_OBJECT((x)); \
opal_event_active(&((x)->ev), OPAL_EV_WRITE, 1); \
} while(0)
#endif /* ORTE_THREADS_H */