Merge pull request #3659 from rhc54/topic/threads
Update OPAL and ORTE for thread safety
Этот коммит содержится в:
Коммит
21fba8b7f3
1
.gitignore
поставляемый
1
.gitignore
поставляемый
@ -475,6 +475,7 @@ orte/test/system/opal_db
|
|||||||
orte/test/system/ulfm
|
orte/test/system/ulfm
|
||||||
orte/test/system/pmixtool
|
orte/test/system/pmixtool
|
||||||
orte/test/system/orte_notify
|
orte/test/system/orte_notify
|
||||||
|
orte/test/system/threads
|
||||||
|
|
||||||
orte/tools/orte-checkpoint/orte-checkpoint
|
orte/tools/orte-checkpoint/orte-checkpoint
|
||||||
orte/tools/orte-checkpoint/orte-checkpoint.1
|
orte/tools/orte-checkpoint/orte-checkpoint.1
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#include "opal/mca/hwloc/base/base.h"
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "opal/runtime/opal_progress_threads.h"
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
|
#include "opal/threads/threads.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/error.h"
|
#include "opal/util/error.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
@ -164,6 +165,7 @@ static void return_local_event_hdlr(int status, opal_list_t *results,
|
|||||||
pmix_status_t pstatus;
|
pmix_status_t pstatus;
|
||||||
size_t n;
|
size_t n;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
if (NULL != cd->pmixcbfunc) {
|
if (NULL != cd->pmixcbfunc) {
|
||||||
op = OBJ_NEW(pmix2x_opcaddy_t);
|
op = OBJ_NEW(pmix2x_opcaddy_t);
|
||||||
|
|
||||||
@ -203,6 +205,8 @@ static void _event_hdlr(int sd, short args, void *cbdata)
|
|||||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||||
opal_pmix2x_event_t *event;
|
opal_pmix2x_event_t *event;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s _EVENT_HDLR RECEIVED NOTIFICATION FOR HANDLER %d OF STATUS %d",
|
"%s _EVENT_HDLR RECEIVED NOTIFICATION FOR HANDLER %d OF STATUS %d",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (int)cd->id, cd->status);
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (int)cd->id, cd->status);
|
||||||
@ -312,6 +316,7 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id,
|
|||||||
/* now push it into the local thread */
|
/* now push it into the local thread */
|
||||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||||
-1, EV_WRITE, _event_hdlr, cd);
|
-1, EV_WRITE, _event_hdlr, cd);
|
||||||
|
OPAL_POST_OBJECT(cd);
|
||||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -986,6 +991,7 @@ static void errreg_cbfunc (pmix_status_t status,
|
|||||||
{
|
{
|
||||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
op->event->index = errhandler_ref;
|
op->event->index = errhandler_ref;
|
||||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||||
"PMIX2x errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
"PMIX2x errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||||
@ -1003,6 +1009,7 @@ static void _reg_hdlr(int sd, short args, void *cbdata)
|
|||||||
opal_value_t *kv;
|
opal_value_t *kv;
|
||||||
size_t n;
|
size_t n;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s REGISTER HANDLER CODES %s",
|
"%s REGISTER HANDLER CODES %s",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
@ -1067,6 +1074,7 @@ static void _dereg_hdlr(int sd, short args, void *cbdata)
|
|||||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||||
opal_pmix2x_event_t *event;
|
opal_pmix2x_event_t *event;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
/* look for this event */
|
/* look for this event */
|
||||||
OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) {
|
OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) {
|
||||||
if (cd->handler == event->index) {
|
if (cd->handler == event->index) {
|
||||||
@ -1116,6 +1124,8 @@ static void _notify(int sd, short args, void *cbdata)
|
|||||||
pmix_data_range_t prange;
|
pmix_data_range_t prange;
|
||||||
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
op = OBJ_NEW(pmix2x_opcaddy_t);
|
op = OBJ_NEW(pmix2x_opcaddy_t);
|
||||||
|
|
||||||
/* convert the status */
|
/* convert the status */
|
||||||
@ -1204,6 +1214,8 @@ static void infocbfunc(pmix_status_t status,
|
|||||||
opal_value_t *iptr;
|
opal_value_t *iptr;
|
||||||
size_t n;
|
size_t n;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* convert the array of pmix_info_t to the list of info */
|
/* convert the array of pmix_info_t to the list of info */
|
||||||
if (NULL != info) {
|
if (NULL != info) {
|
||||||
results = OBJ_NEW(opal_list_t);
|
results = OBJ_NEW(opal_list_t);
|
||||||
@ -1294,6 +1306,8 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
|||||||
{
|
{
|
||||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
if (NULL != op->opcbfunc) {
|
if (NULL != op->opcbfunc) {
|
||||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||||
}
|
}
|
||||||
|
@ -156,6 +156,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
|||||||
_cd->cbdata = (cd); \
|
_cd->cbdata = (cd); \
|
||||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||||
-1, EV_WRITE, (fn), (_cd)); \
|
-1, EV_WRITE, (fn), (_cd)); \
|
||||||
|
OPAL_POST_OBJECT(_cd); \
|
||||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
@ -170,6 +171,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
|||||||
_cd->cbdata = (cd); \
|
_cd->cbdata = (cd); \
|
||||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||||
-1, EV_WRITE, (fn), (_cd)); \
|
-1, EV_WRITE, (fn), (_cd)); \
|
||||||
|
OPAL_POST_OBJECT(_cd); \
|
||||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
@ -185,6 +187,7 @@ OBJ_CLASS_DECLARATION(pmix2x_threadshift_t);
|
|||||||
_cd->cbdata = (cd); \
|
_cd->cbdata = (cd); \
|
||||||
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
opal_event_assign(&((_cd)->ev), opal_pmix_base.evbase, \
|
||||||
-1, EV_WRITE, (fn), (_cd)); \
|
-1, EV_WRITE, (fn), (_cd)); \
|
||||||
|
OPAL_POST_OBJECT(_cd); \
|
||||||
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
opal_event_active(&((_cd)->ev), EV_WRITE, 1); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "opal/hash_string.h"
|
#include "opal/hash_string.h"
|
||||||
|
#include "opal/threads/threads.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
|
|
||||||
@ -44,6 +45,7 @@ static bool initialized = false;
|
|||||||
while ((a)) { \
|
while ((a)) { \
|
||||||
usleep(10); \
|
usleep(10); \
|
||||||
} \
|
} \
|
||||||
|
OPAL_ACQUIRE_OBJECT(a); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
@ -53,11 +55,14 @@ static void errreg_cbfunc (pmix_status_t status,
|
|||||||
{
|
{
|
||||||
opal_pmix2x_event_t *event = (opal_pmix2x_event_t*)cbdata;
|
opal_pmix2x_event_t *event = (opal_pmix2x_event_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(event);
|
||||||
|
|
||||||
event->index = errhandler_ref;
|
event->index = errhandler_ref;
|
||||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||||
"PMIX client errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
"PMIX client errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||||
status, (unsigned long)errhandler_ref);
|
status, (unsigned long)errhandler_ref);
|
||||||
regactive = false;
|
regactive = false;
|
||||||
|
OPAL_POST_OBJECT(regactive);
|
||||||
}
|
}
|
||||||
|
|
||||||
int pmix2x_client_init(opal_list_t *ilist)
|
int pmix2x_client_init(opal_list_t *ilist)
|
||||||
@ -272,6 +277,7 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
|||||||
{
|
{
|
||||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
if (NULL != op->opcbfunc) {
|
if (NULL != op->opcbfunc) {
|
||||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||||
}
|
}
|
||||||
@ -521,6 +527,8 @@ static void val_cbfunc(pmix_status_t status,
|
|||||||
int rc;
|
int rc;
|
||||||
opal_value_t val, *v=NULL;
|
opal_value_t val, *v=NULL;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
rc = pmix2x_convert_opalrc(status);
|
rc = pmix2x_convert_opalrc(status);
|
||||||
if (PMIX_SUCCESS == status && NULL != kv) {
|
if (PMIX_SUCCESS == status && NULL != kv) {
|
||||||
rc = pmix2x_value_unload(&val, kv);
|
rc = pmix2x_value_unload(&val, kv);
|
||||||
@ -768,6 +776,8 @@ static void lk_cbfunc(pmix_status_t status,
|
|||||||
size_t n;
|
size_t n;
|
||||||
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
opal_pmix2x_jobid_trkr_t *job, *jptr;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
/* this is in the PMIx local thread - need to threadshift to
|
/* this is in the PMIx local thread - need to threadshift to
|
||||||
* our own thread as we will be accessing framework-global
|
* our own thread as we will be accessing framework-global
|
||||||
* lists and objects */
|
* lists and objects */
|
||||||
@ -817,7 +827,7 @@ static void lk_cbfunc(pmix_status_t status,
|
|||||||
}
|
}
|
||||||
r = &results;
|
r = &results;
|
||||||
}
|
}
|
||||||
release:
|
release:
|
||||||
/* execute the callback */
|
/* execute the callback */
|
||||||
op->lkcbfunc(rc, r, op->cbdata);
|
op->lkcbfunc(rc, r, op->cbdata);
|
||||||
|
|
||||||
@ -994,6 +1004,8 @@ static void spcbfunc(pmix_status_t status,
|
|||||||
opal_jobid_t jobid=OPAL_JOBID_INVALID;
|
opal_jobid_t jobid=OPAL_JOBID_INVALID;
|
||||||
opal_pmix2x_jobid_trkr_t *job;
|
opal_pmix2x_jobid_trkr_t *job;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
/* this is in the PMIx local thread - need to threadshift to
|
/* this is in the PMIx local thread - need to threadshift to
|
||||||
* our own thread as we will be accessing framework-global
|
* our own thread as we will be accessing framework-global
|
||||||
* lists and objects */
|
* lists and objects */
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#include "opal/mca/hwloc/base/base.h"
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "opal/runtime/opal_progress_threads.h"
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
|
#include "opal/threads/threads.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/error.h"
|
#include "opal/util/error.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
@ -142,6 +143,7 @@ static void opal_opcbfunc(int status, void *cbdata)
|
|||||||
{
|
{
|
||||||
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
|
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(opalcaddy);
|
||||||
if (NULL != opalcaddy->opcbfunc) {
|
if (NULL != opalcaddy->opcbfunc) {
|
||||||
opalcaddy->opcbfunc(pmix2x_convert_opalrc(status), opalcaddy->cbdata);
|
opalcaddy->opcbfunc(pmix2x_convert_opalrc(status), opalcaddy->cbdata);
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
#include "opal/mca/hwloc/base/base.h"
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "opal/runtime/opal_progress_threads.h"
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
|
#include "opal/threads/threads.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/error.h"
|
#include "opal/util/error.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
@ -58,6 +59,7 @@ static size_t errhdler_ref = 0;
|
|||||||
while ((a)) { \
|
while ((a)) { \
|
||||||
usleep(10); \
|
usleep(10); \
|
||||||
} \
|
} \
|
||||||
|
OPAL_ACQUIRE_OBJECT(a); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
static void errreg_cbfunc (pmix_status_t status,
|
static void errreg_cbfunc (pmix_status_t status,
|
||||||
@ -66,10 +68,12 @@ static void errreg_cbfunc (pmix_status_t status,
|
|||||||
{
|
{
|
||||||
volatile bool *active = (volatile bool*)cbdata;
|
volatile bool *active = (volatile bool*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(active);
|
||||||
errhdler_ref = errhandler_ref;
|
errhdler_ref = errhandler_ref;
|
||||||
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(5, opal_pmix_base_framework.framework_output,
|
||||||
"PMIX server errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
"PMIX server errreg_cbfunc - error handler registered status=%d, reference=%lu",
|
||||||
status, (unsigned long)errhandler_ref);
|
status, (unsigned long)errhandler_ref);
|
||||||
|
OPAL_POST_OBJECT(active);
|
||||||
*active = false;
|
*active = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,11 +81,14 @@ static void opcbfunc(pmix_status_t status, void *cbdata)
|
|||||||
{
|
{
|
||||||
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
if (NULL != op->opcbfunc) {
|
if (NULL != op->opcbfunc) {
|
||||||
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
op->opcbfunc(pmix2x_convert_rc(status), op->cbdata);
|
||||||
}
|
}
|
||||||
if (op->active) {
|
if (op->active) {
|
||||||
op->status = status;
|
op->status = status;
|
||||||
|
OPAL_POST_OBJECT(op);
|
||||||
op->active = false;
|
op->active = false;
|
||||||
} else {
|
} else {
|
||||||
OBJ_RELEASE(op);
|
OBJ_RELEASE(op);
|
||||||
@ -92,6 +99,7 @@ static void op2cbfunc(pmix_status_t status, void *cbdata)
|
|||||||
{
|
{
|
||||||
volatile bool *active = (volatile bool*)cbdata;
|
volatile bool *active = (volatile bool*)cbdata;
|
||||||
|
|
||||||
|
OPAL_POST_OBJECT(active);
|
||||||
*active = false;
|
*active = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,6 +173,7 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
|
|||||||
static void fincb(pmix_status_t status, void *cbdata)
|
static void fincb(pmix_status_t status, void *cbdata)
|
||||||
{
|
{
|
||||||
volatile bool *active = (volatile bool*)cbdata;
|
volatile bool *active = (volatile bool*)cbdata;
|
||||||
|
OPAL_POST_OBJECT(active);
|
||||||
*active = false;
|
*active = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -211,6 +220,8 @@ static void _reg_nspace(int sd, short args, void *cbdata)
|
|||||||
opal_pmix2x_jobid_trkr_t *job;
|
opal_pmix2x_jobid_trkr_t *job;
|
||||||
pmix2x_opcaddy_t op;
|
pmix2x_opcaddy_t op;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* we must threadshift this request as we might not be in an event
|
/* we must threadshift this request as we might not be in an event
|
||||||
* and we are going to access framework-global lists/objects */
|
* and we are going to access framework-global lists/objects */
|
||||||
|
|
||||||
@ -301,6 +312,7 @@ int pmix2x_server_register_nspace(opal_jobid_t jobid,
|
|||||||
} else {
|
} else {
|
||||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||||
-1, EV_WRITE, _reg_nspace, cd);
|
-1, EV_WRITE, _reg_nspace, cd);
|
||||||
|
OPAL_POST_OBJECT(cd);
|
||||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -311,10 +323,12 @@ static void tdcbfunc(pmix_status_t status, void *cbdata)
|
|||||||
{
|
{
|
||||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
if (NULL != cd->opcbfunc) {
|
if (NULL != cd->opcbfunc) {
|
||||||
cd->opcbfunc(pmix2x_convert_rc(status), cd->cbdata);
|
cd->opcbfunc(pmix2x_convert_rc(status), cd->cbdata);
|
||||||
}
|
}
|
||||||
if (cd->active) {
|
if (cd->active) {
|
||||||
|
OPAL_POST_OBJECT(cd);
|
||||||
cd->active = false;
|
cd->active = false;
|
||||||
} else {
|
} else {
|
||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
@ -326,6 +340,7 @@ static void _dereg_nspace(int sd, short args, void *cbdata)
|
|||||||
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata;
|
||||||
opal_pmix2x_jobid_trkr_t *jptr;
|
opal_pmix2x_jobid_trkr_t *jptr;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
/* if we don't already have it, we can ignore this */
|
/* if we don't already have it, we can ignore this */
|
||||||
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
||||||
if (jptr->jobid == cd->jobid) {
|
if (jptr->jobid == cd->jobid) {
|
||||||
@ -361,6 +376,7 @@ void pmix2x_server_deregister_nspace(opal_jobid_t jobid,
|
|||||||
} else {
|
} else {
|
||||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||||
-1, EV_WRITE, _dereg_nspace, cd);
|
-1, EV_WRITE, _dereg_nspace, cd);
|
||||||
|
OPAL_POST_OBJECT(cd);
|
||||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -397,6 +413,7 @@ static void _dereg_client(int sd, short args, void *cbdata)
|
|||||||
opal_pmix2x_jobid_trkr_t *jptr;
|
opal_pmix2x_jobid_trkr_t *jptr;
|
||||||
pmix_proc_t p;
|
pmix_proc_t p;
|
||||||
|
|
||||||
|
OPAL_ACQUIRE_OBJECT(cd);
|
||||||
/* if we don't already have it, we can ignore this */
|
/* if we don't already have it, we can ignore this */
|
||||||
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
|
||||||
if (jptr->jobid == cd->source->jobid) {
|
if (jptr->jobid == cd->source->jobid) {
|
||||||
@ -431,6 +448,7 @@ void pmix2x_server_deregister_client(const opal_process_name_t *proc,
|
|||||||
} else {
|
} else {
|
||||||
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
opal_event_assign(&cd->ev, opal_pmix_base.evbase,
|
||||||
-1, EV_WRITE, _dereg_client, cd);
|
-1, EV_WRITE, _dereg_client, cd);
|
||||||
|
OPAL_POST_OBJECT(cd);
|
||||||
opal_event_active(&cd->ev, EV_WRITE, 1);
|
opal_event_active(&cd->ev, EV_WRITE, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -114,6 +115,19 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_thread_t);
|
|||||||
opal_condition_broadcast((cnd)); \
|
opal_condition_broadcast((cnd)); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
|
/* provide a macro for forward-proofing the shifting
|
||||||
|
* of objects between libevent threads - at some point, we
|
||||||
|
* may revamp that threading model */
|
||||||
|
|
||||||
|
/* post an object to another thread - for now, we
|
||||||
|
* only have a memory barrier */
|
||||||
|
#define OPAL_POST_OBJECT(o) opal_atomic_wmb()
|
||||||
|
|
||||||
|
/* acquire an object from another thread - for now,
|
||||||
|
* we only have a memory barrier */
|
||||||
|
#define OPAL_ACQUIRE_OBJECT(o) opal_atomic_rmb()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
OPAL_DECLSPEC int opal_thread_start(opal_thread_t *);
|
OPAL_DECLSPEC int opal_thread_start(opal_thread_t *);
|
||||||
OPAL_DECLSPEC int opal_thread_join(opal_thread_t *, void **thread_return);
|
OPAL_DECLSPEC int opal_thread_join(opal_thread_t *, void **thread_return);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 Research Organization for Information Science
|
* Copyright (c) 2014 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -33,6 +33,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -507,6 +508,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
opal_list_t lt;
|
opal_list_t lt;
|
||||||
opal_namelist_t *nm;
|
opal_namelist_t *nm;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* get the scheme to determine if we can process locally or not */
|
/* get the scheme to determine if we can process locally or not */
|
||||||
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
@ -661,7 +664,7 @@ static void dfs_open(char *uri,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_close(int fd, short args, void *cbdata)
|
static void process_close(int fd, short args, void *cbdata)
|
||||||
@ -672,6 +675,8 @@ static void process_close(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s closing fd %d",
|
"%s closing fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -757,7 +762,7 @@ static void dfs_close(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_sizes(int fd, short args, void *cbdata)
|
static void process_sizes(int fd, short args, void *cbdata)
|
||||||
@ -769,6 +774,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing get_size on fd %d",
|
"%s processing get_size on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -881,7 +888,7 @@ static void dfs_get_file_size(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -895,6 +902,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing seek on fd %d",
|
"%s processing seek on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1035,7 +1044,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_reads(int fd, short args, void *cbdata)
|
static void process_reads(int fd, short args, void *cbdata)
|
||||||
@ -1048,6 +1057,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
|||||||
int64_t i64;
|
int64_t i64;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||||
|
|
||||||
/* look in our local records for this fd */
|
/* look in our local records for this fd */
|
||||||
trk = NULL;
|
trk = NULL;
|
||||||
for (item = opal_list_get_first(&active_files);
|
for (item = opal_list_get_first(&active_files);
|
||||||
@ -1145,7 +1156,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_posts(int fd, short args, void *cbdata)
|
static void process_posts(int fd, short args, void *cbdata)
|
||||||
@ -1154,6 +1165,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* we will get confirmation in our receive function, so
|
/* we will get confirmation in our receive function, so
|
||||||
* add this request to our list */
|
* add this request to our list */
|
||||||
dfs->id = req_id++;
|
dfs->id = req_id++;
|
||||||
@ -1212,7 +1225,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_getfm(int fd, short args, void *cbdata)
|
static void process_getfm(int fd, short args, void *cbdata)
|
||||||
@ -1221,6 +1234,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* we will get confirmation in our receive function, so
|
/* we will get confirmation in our receive function, so
|
||||||
* add this request to our list */
|
* add this request to our list */
|
||||||
dfs->id = req_id++;
|
dfs->id = req_id++;
|
||||||
@ -1275,7 +1290,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dfs_load_file_maps(orte_jobid_t jobid,
|
static void dfs_load_file_maps(orte_jobid_t jobid,
|
||||||
@ -1298,4 +1313,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
|||||||
cbfunc(cbdata);
|
cbfunc(cbdata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -76,14 +77,6 @@ typedef struct {
|
|||||||
} orte_dfs_request_t;
|
} orte_dfs_request_t;
|
||||||
OBJ_CLASS_DECLARATION(orte_dfs_request_t);
|
OBJ_CLASS_DECLARATION(orte_dfs_request_t);
|
||||||
|
|
||||||
#define ORTE_DFS_POST_REQUEST(d, cb) \
|
|
||||||
do { \
|
|
||||||
opal_event_set(orte_event_base, &((d)->ev), \
|
|
||||||
-1, OPAL_EV_WRITE, (cb), (d)); \
|
|
||||||
opal_event_set_priority(&((d)->ev), ORTE_SYS_PRI); \
|
|
||||||
opal_event_active(&((d)->ev), OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -35,6 +35,7 @@
|
|||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -304,6 +305,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
int v;
|
int v;
|
||||||
orte_node_t *node, *nptr;
|
orte_node_t *node, *nptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* get the scheme to determine if we can process locally or not */
|
/* get the scheme to determine if we can process locally or not */
|
||||||
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
||||||
OBJ_RELEASE(dfs);
|
OBJ_RELEASE(dfs);
|
||||||
@ -465,7 +468,7 @@ static void dfs_open(char *uri,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_close(int fd, short args, void *cbdata)
|
static void process_close(int fd, short args, void *cbdata)
|
||||||
@ -476,6 +479,8 @@ static void process_close(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s closing fd %d",
|
"%s closing fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -561,7 +566,7 @@ static void dfs_close(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_sizes(int fd, short args, void *cbdata)
|
static void process_sizes(int fd, short args, void *cbdata)
|
||||||
@ -573,6 +578,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing get_size on fd %d",
|
"%s processing get_size on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -665,7 +672,7 @@ static void dfs_get_file_size(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -679,6 +686,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing seek on fd %d",
|
"%s processing seek on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -814,7 +823,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_reads(int fd, short args, void *cbdata)
|
static void process_reads(int fd, short args, void *cbdata)
|
||||||
@ -827,6 +836,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
|||||||
int64_t i64;
|
int64_t i64;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||||
|
|
||||||
/* look in our local records for this fd */
|
/* look in our local records for this fd */
|
||||||
trk = NULL;
|
trk = NULL;
|
||||||
for (item = opal_list_get_first(&active_files);
|
for (item = opal_list_get_first(&active_files);
|
||||||
@ -924,7 +935,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_posts(int fd, short args, void *cbdata)
|
static void process_posts(int fd, short args, void *cbdata)
|
||||||
@ -935,6 +946,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
|||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s posting file map containing %d bytes for target %s",
|
"%s posting file map containing %d bytes for target %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1009,7 +1022,7 @@ static void dfs_post_file_map(opal_buffer_t *buffer,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int get_job_maps(orte_dfs_jobfm_t *jfm,
|
static int get_job_maps(orte_dfs_jobfm_t *jfm,
|
||||||
@ -1057,6 +1070,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
|||||||
int32_t n, ntotal;
|
int32_t n, ntotal;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* if the target job is WILDCARD, then process
|
/* if the target job is WILDCARD, then process
|
||||||
* data for all jobids - else, find the one
|
* data for all jobids - else, find the one
|
||||||
*/
|
*/
|
||||||
@ -1120,7 +1135,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_load(int fd, short args, void *cbdata)
|
static void process_load(int fd, short args, void *cbdata)
|
||||||
@ -1135,6 +1150,8 @@ static void process_load(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
opal_buffer_t *xfer;
|
opal_buffer_t *xfer;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* see if we already have a tracker for this job */
|
/* see if we already have a tracker for this job */
|
||||||
jfm = NULL;
|
jfm = NULL;
|
||||||
for (item = opal_list_get_first(&file_maps);
|
for (item = opal_list_get_first(&file_maps);
|
||||||
@ -1233,7 +1250,7 @@ static void dfs_load_file_maps(orte_jobid_t jobid,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_load);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_load, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_purge(int fd, short args, void *cbdata)
|
static void process_purge(int fd, short args, void *cbdata)
|
||||||
@ -1242,6 +1259,8 @@ static void process_purge(int fd, short args, void *cbdata)
|
|||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_dfs_jobfm_t *jfm, *jptr;
|
orte_dfs_jobfm_t *jfm, *jptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* find the job tracker */
|
/* find the job tracker */
|
||||||
jfm = NULL;
|
jfm = NULL;
|
||||||
for (item = opal_list_get_first(&file_maps);
|
for (item = opal_list_get_first(&file_maps);
|
||||||
@ -1288,7 +1307,7 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_purge);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_purge, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -2368,4 +2387,3 @@ static void remote_read(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
OBJ_RELEASE(req);
|
OBJ_RELEASE(req);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -32,6 +32,7 @@
|
|||||||
#include "orte/util/error_strings.h"
|
#include "orte/util/error_strings.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -449,6 +450,8 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
opal_list_t lt;
|
opal_list_t lt;
|
||||||
opal_namelist_t *nm;
|
opal_namelist_t *nm;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s PROCESSING OPEN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
"%s PROCESSING OPEN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
|
||||||
@ -583,7 +586,7 @@ static void dfs_open(char *uri,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_opens);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_opens, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_close(int fd, short args, void *cbdata)
|
static void process_close(int fd, short args, void *cbdata)
|
||||||
@ -594,6 +597,8 @@ static void process_close(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(close_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s closing fd %d",
|
"%s closing fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -673,7 +678,7 @@ static void dfs_close(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_close);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_close, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_sizes(int fd, short args, void *cbdata)
|
static void process_sizes(int fd, short args, void *cbdata)
|
||||||
@ -684,6 +689,8 @@ static void process_sizes(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(size_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing get_size on fd %d",
|
"%s processing get_size on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -775,7 +782,7 @@ static void dfs_get_file_size(int fd,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_sizes);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_sizes, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -788,6 +795,8 @@ static void process_seeks(int fd, short args, void *cbdata)
|
|||||||
int64_t i64;
|
int64_t i64;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(seek_dfs);
|
||||||
|
|
||||||
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
opal_output_verbose(1, orte_dfs_base_framework.framework_output,
|
||||||
"%s processing seek on fd %d",
|
"%s processing seek on fd %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -885,7 +894,7 @@ static void dfs_seek(int fd, long offset, int whence,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_seeks);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_seeks, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_reads(int fd, short args, void *cbdata)
|
static void process_reads(int fd, short args, void *cbdata)
|
||||||
@ -897,6 +906,8 @@ static void process_reads(int fd, short args, void *cbdata)
|
|||||||
int64_t i64;
|
int64_t i64;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(read_dfs);
|
||||||
|
|
||||||
/* look in our local records for this fd */
|
/* look in our local records for this fd */
|
||||||
trk = NULL;
|
trk = NULL;
|
||||||
for (item = opal_list_get_first(&active_files);
|
for (item = opal_list_get_first(&active_files);
|
||||||
@ -979,7 +990,7 @@ static void dfs_read(int fd, uint8_t *buffer,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_reads);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_reads, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_posts(int fd, short args, void *cbdata)
|
static void process_posts(int fd, short args, void *cbdata)
|
||||||
@ -988,6 +999,8 @@ static void process_posts(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* we will get confirmation in our receive function, so
|
/* we will get confirmation in our receive function, so
|
||||||
* add this request to our list */
|
* add this request to our list */
|
||||||
dfs->id = req_id++;
|
dfs->id = req_id++;
|
||||||
@ -1046,7 +1059,7 @@ static void dfs_post_file_map(opal_buffer_t *bo,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_posts);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_posts, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_getfm(int fd, short args, void *cbdata)
|
static void process_getfm(int fd, short args, void *cbdata)
|
||||||
@ -1055,6 +1068,8 @@ static void process_getfm(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buffer;
|
opal_buffer_t *buffer;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(dfs);
|
||||||
|
|
||||||
/* we will get confirmation in our receive function, so
|
/* we will get confirmation in our receive function, so
|
||||||
* add this request to our list */
|
* add this request to our list */
|
||||||
dfs->id = req_id++;
|
dfs->id = req_id++;
|
||||||
@ -1109,7 +1124,7 @@ static void dfs_get_file_map(orte_process_name_t *target,
|
|||||||
dfs->cbdata = cbdata;
|
dfs->cbdata = cbdata;
|
||||||
|
|
||||||
/* post it for processing */
|
/* post it for processing */
|
||||||
ORTE_DFS_POST_REQUEST(dfs, process_getfm);
|
ORTE_THREADSHIFT(dfs, orte_event_base, process_getfm, ORTE_SYS_PRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dfs_load_file_maps(orte_jobid_t jobid,
|
static void dfs_load_file_maps(orte_jobid_t jobid,
|
||||||
@ -1132,4 +1147,3 @@ static void dfs_purge_file_maps(orte_jobid_t jobid,
|
|||||||
cbfunc(cbdata);
|
cbfunc(cbdata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||||
|
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -26,5 +27,4 @@ headers += \
|
|||||||
libmca_errmgr_la_SOURCES += \
|
libmca_errmgr_la_SOURCES += \
|
||||||
base/errmgr_base_select.c \
|
base/errmgr_base_select.c \
|
||||||
base/errmgr_base_frame.c \
|
base/errmgr_base_frame.c \
|
||||||
base/errmgr_base_fns.c \
|
base/errmgr_base_fns.c
|
||||||
base/errmgr_base_tool.c
|
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 Research Organization for Information Science
|
* Copyright (c) 2014 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -82,99 +82,6 @@
|
|||||||
#include "orte/mca/errmgr/base/base.h"
|
#include "orte/mca/errmgr/base/base.h"
|
||||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||||
|
|
||||||
/*
|
|
||||||
* Object stuff
|
|
||||||
*/
|
|
||||||
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item);
|
|
||||||
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_errmgr_predicted_proc_t,
|
|
||||||
opal_list_item_t,
|
|
||||||
orte_errmgr_predicted_proc_construct,
|
|
||||||
orte_errmgr_predicted_proc_destruct);
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
|
|
||||||
{
|
|
||||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
|
|
||||||
{
|
|
||||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item);
|
|
||||||
void orte_errmgr_predicted_node_destruct( orte_errmgr_predicted_node_t *item);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_errmgr_predicted_node_t,
|
|
||||||
opal_list_item_t,
|
|
||||||
orte_errmgr_predicted_node_construct,
|
|
||||||
orte_errmgr_predicted_node_destruct);
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_node_construct(orte_errmgr_predicted_node_t *item)
|
|
||||||
{
|
|
||||||
item->node_name = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_node_destruct( orte_errmgr_predicted_node_t *item)
|
|
||||||
{
|
|
||||||
if( NULL != item->node_name ) {
|
|
||||||
free(item->node_name);
|
|
||||||
item->node_name = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item);
|
|
||||||
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item);
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
|
|
||||||
opal_list_item_t,
|
|
||||||
orte_errmgr_predicted_map_construct,
|
|
||||||
orte_errmgr_predicted_map_destruct);
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
|
||||||
{
|
|
||||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
|
|
||||||
item->node_name = NULL;
|
|
||||||
|
|
||||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
|
|
||||||
item->map_node_name = NULL;
|
|
||||||
item->off_current_node = false;
|
|
||||||
item->pre_map_fixed_node = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
|
||||||
{
|
|
||||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
|
|
||||||
if( NULL != item->node_name ) {
|
|
||||||
free(item->node_name);
|
|
||||||
item->node_name = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
|
||||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
|
||||||
|
|
||||||
if( NULL != item->map_node_name ) {
|
|
||||||
free(item->map_node_name);
|
|
||||||
item->map_node_name = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
item->off_current_node = false;
|
|
||||||
|
|
||||||
if( NULL != item->pre_map_fixed_node ) {
|
|
||||||
free(item->pre_map_fixed_node);
|
|
||||||
item->pre_map_fixed_node = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Public interfaces
|
* Public interfaces
|
||||||
*/
|
*/
|
||||||
@ -231,12 +138,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
|||||||
/* No way to reach here */
|
/* No way to reach here */
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_errmgr_base_register_migration_warning(struct timeval *tv)
|
|
||||||
{
|
|
||||||
/* stub function - ignore */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||||
orte_std_cntr_t num_procs,
|
orte_std_cntr_t num_procs,
|
||||||
int error_code)
|
int error_code)
|
||||||
@ -244,195 +145,6 @@ int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
|||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
|
|
||||||
orte_errmgr_error_order_t order)
|
|
||||||
{
|
|
||||||
orte_errmgr_cback_t *cb, *cbcur;
|
|
||||||
|
|
||||||
/* check the order to see what to do */
|
|
||||||
switch(order) {
|
|
||||||
case ORTE_ERRMGR_CALLBACK_FIRST:
|
|
||||||
/* only one can be so designated */
|
|
||||||
if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks))) {
|
|
||||||
if (ORTE_ERRMGR_CALLBACK_FIRST == cb->order) {
|
|
||||||
return ORTE_ERR_NOT_SUPPORTED;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
|
||||||
cb->order = order;
|
|
||||||
cb->callback =cbfunc;
|
|
||||||
opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_CALLBACK_LAST:
|
|
||||||
/* only one can be so designated */
|
|
||||||
if (NULL != (cb = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks))) {
|
|
||||||
if (ORTE_ERRMGR_CALLBACK_LAST == cb->order) {
|
|
||||||
return ORTE_ERR_NOT_SUPPORTED;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
|
||||||
cb->order = order;
|
|
||||||
cb->callback = cbfunc;
|
|
||||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_CALLBACK_PREPEND:
|
|
||||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
|
||||||
cb->order = order;
|
|
||||||
cb->callback =cbfunc;
|
|
||||||
if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_first(&orte_errmgr_base.error_cbacks)) &&
|
|
||||||
ORTE_ERRMGR_CALLBACK_FIRST == cbcur->order) {
|
|
||||||
opal_list_insert(&orte_errmgr_base.error_cbacks, &cb->super, 1);
|
|
||||||
} else {
|
|
||||||
opal_list_prepend(&orte_errmgr_base.error_cbacks, &cb->super);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_CALLBACK_APPEND:
|
|
||||||
cb = OBJ_NEW(orte_errmgr_cback_t);
|
|
||||||
cb->order = order;
|
|
||||||
cb->callback =cbfunc;
|
|
||||||
if (NULL != (cbcur = (orte_errmgr_cback_t*)opal_list_get_last(&orte_errmgr_base.error_cbacks)) &&
|
|
||||||
ORTE_ERRMGR_CALLBACK_LAST == cbcur->order) {
|
|
||||||
opal_list_insert_pos(&orte_errmgr_base.error_cbacks, &cbcur->super, &cb->super);
|
|
||||||
} else {
|
|
||||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
|
||||||
}
|
|
||||||
opal_list_append(&orte_errmgr_base.error_cbacks, &cb->super);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors)
|
|
||||||
{
|
|
||||||
orte_errmgr_cback_t *cb;
|
|
||||||
char *errstring=NULL;
|
|
||||||
orte_error_t *err;
|
|
||||||
int errcode = ORTE_ERROR_DEFAULT_EXIT_CODE;
|
|
||||||
|
|
||||||
/* if no callbacks have been provided, then we abort */
|
|
||||||
if (0 == opal_list_get_size(&orte_errmgr_base.error_cbacks)) {
|
|
||||||
/* take the first entry, if available */
|
|
||||||
if (NULL != errors &&
|
|
||||||
(NULL != (err = (orte_error_t*)opal_pointer_array_get_item(errors, 0)))) {
|
|
||||||
errstring = (char*)ORTE_ERROR_NAME(err->errcode);
|
|
||||||
errcode = err->errcode;
|
|
||||||
}
|
|
||||||
if (NULL == errstring) {
|
|
||||||
/* if the error is silent, say nothing */
|
|
||||||
orte_errmgr.abort(errcode, NULL);
|
|
||||||
}
|
|
||||||
orte_errmgr.abort(errcode, "Executing default error callback: %s", errstring);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* cycle across the provided callbacks until we complete the list
|
|
||||||
* or one reports that no further action is required
|
|
||||||
*/
|
|
||||||
OPAL_LIST_FOREACH(cb, &orte_errmgr_base.error_cbacks, orte_errmgr_cback_t) {
|
|
||||||
if (ORTE_SUCCESS == cb->callback(errors)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/********************
|
|
||||||
* Utility functions
|
|
||||||
********************/
|
|
||||||
#if OPAL_ENABLE_FT_CR
|
|
||||||
|
|
||||||
void orte_errmgr_base_migrate_state_notify(int state)
|
|
||||||
{
|
|
||||||
switch(state) {
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
|
|
||||||
opal_output(0, "%d: Migration failed for process %s.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
|
|
||||||
opal_output(0, "%d: Migration successful for process %s.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_NONE:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_REQUEST:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_RUNNING:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_STARTUP:
|
|
||||||
case ORTE_ERRMGR_MIGRATE_MAX:
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, orte_process_name_t *proc)
|
|
||||||
{
|
|
||||||
if (NULL != proc) {
|
|
||||||
switch(state) {
|
|
||||||
case ORTE_PROC_STATE_ABORTED:
|
|
||||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
|
||||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
|
||||||
case ORTE_PROC_STATE_TERMINATED:
|
|
||||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
|
||||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
|
||||||
opal_output(0, "%d: Process %s is dead.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
|
||||||
opal_output(0, "%d: Process %s is unreachable.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_COMM_FAILED:
|
|
||||||
opal_output(0, "%d: Failed to communicate with process %s.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
|
||||||
opal_output(0, "%d: Process %s has called abort.",
|
|
||||||
orte_process_info.pid, ORTE_JOBID_PRINT(proc->jobid));
|
|
||||||
break;
|
|
||||||
case ORTE_PROC_STATE_MIGRATING:
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int orte_errmgr_base_migrate_state_str(char ** state_str, int state)
|
|
||||||
{
|
|
||||||
switch(state) {
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_NONE:
|
|
||||||
*state_str = strdup(" -- ");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_REQUEST:
|
|
||||||
*state_str = strdup("Requested");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_RUNNING:
|
|
||||||
*state_str = strdup("Running");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT:
|
|
||||||
*state_str = strdup("Checkpointing");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_STARTUP:
|
|
||||||
*state_str = strdup("Restarting");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_FINISH:
|
|
||||||
*state_str = strdup("Finished");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_ERROR:
|
|
||||||
*state_str = strdup("Error");
|
|
||||||
break;
|
|
||||||
case ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS:
|
|
||||||
*state_str = strdup("Error: Migration in progress");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
asprintf(state_str, "Unknown %d", state);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR
|
#if OPAL_ENABLE_FT_CR
|
||||||
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
|
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -53,17 +53,11 @@ orte_errmgr_base_t orte_errmgr_base = {{{0}}};
|
|||||||
|
|
||||||
/* Public module provides a wrapper around previous functions */
|
/* Public module provides a wrapper around previous functions */
|
||||||
orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
||||||
NULL, /* init */
|
.init = NULL, /* init */
|
||||||
NULL, /* finalize */
|
.finalize = NULL, /* finalize */
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
.abort = orte_errmgr_base_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
.abort_peers = orte_errmgr_base_abort_peers
|
||||||
NULL, /* predicted_fault */
|
|
||||||
NULL, /* suggest_map_targets */
|
|
||||||
NULL, /* ft_event */
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
orte_errmgr_base_register_error_callback,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
/* NOTE: ABSOLUTELY MUST initialize this
|
/* NOTE: ABSOLUTELY MUST initialize this
|
||||||
* struct to include the log function as it
|
* struct to include the log function as it
|
||||||
@ -71,16 +65,7 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
|||||||
* opened yet due to error
|
* opened yet due to error
|
||||||
*/
|
*/
|
||||||
orte_errmgr_base_module_t orte_errmgr = {
|
orte_errmgr_base_module_t orte_errmgr = {
|
||||||
NULL,
|
.logfn = orte_errmgr_base_log
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_log,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static int orte_errmgr_base_close(void)
|
static int orte_errmgr_base_close(void)
|
||||||
@ -118,7 +103,3 @@ static int orte_errmgr_base_open(mca_base_open_flag_t flags)
|
|||||||
MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
|
MCA_BASE_FRAMEWORK_DECLARE(orte, errmgr, "ORTE Error Manager", NULL,
|
||||||
orte_errmgr_base_open, orte_errmgr_base_close,
|
orte_errmgr_base_open, orte_errmgr_base_close,
|
||||||
mca_errmgr_base_static_components, 0);
|
mca_errmgr_base_static_components, 0);
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(orte_errmgr_cback_t,
|
|
||||||
opal_list_item_t,
|
|
||||||
NULL, NULL);
|
|
||||||
|
@ -1,441 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
|
||||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
||||||
* of Tennessee Research Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "orte_config.h"
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#if HAVE_SYS_TYPES_H
|
|
||||||
#include <sys/types.h>
|
|
||||||
#endif /* HAVE_SYS_TYPES_H */
|
|
||||||
#ifdef HAVE_UNISTD_H
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif /* HAVE_UNISTD_H */
|
|
||||||
#if HAVE_SYS_TYPES_H
|
|
||||||
#include <sys/types.h>
|
|
||||||
#endif /* HAVE_SYS_TYPES_H */
|
|
||||||
#if HAVE_SYS_STAT_H
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#endif /* HAVE_SYS_STAT_H */
|
|
||||||
#ifdef HAVE_DIRENT_H
|
|
||||||
#include <dirent.h>
|
|
||||||
#endif /* HAVE_DIRENT_H */
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
#include "opal/dss/dss.h"
|
|
||||||
|
|
||||||
#include "orte/mca/mca.h"
|
|
||||||
#include "opal/mca/base/base.h"
|
|
||||||
|
|
||||||
#include "opal/util/os_dirpath.h"
|
|
||||||
#include "opal/util/output.h"
|
|
||||||
#include "opal/util/basename.h"
|
|
||||||
#include "opal/util/argv.h"
|
|
||||||
#include "opal/mca/crs/crs.h"
|
|
||||||
#include "opal/mca/crs/base/base.h"
|
|
||||||
|
|
||||||
#include "orte/mca/rml/rml.h"
|
|
||||||
#include "orte/mca/rml/rml_types.h"
|
|
||||||
#include "orte/mca/snapc/snapc.h"
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
#include "orte/util/name_fns.h"
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/errmgr/base/base.h"
|
|
||||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This file contains function for the HNP to communicate with the
|
|
||||||
* orte-migrate command.
|
|
||||||
*/
|
|
||||||
#if OPAL_ENABLE_FT_CR
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* Local Functions
|
|
||||||
******************/
|
|
||||||
static int errmgr_base_tool_start_cmdline_listener(void);
|
|
||||||
static int errmgr_base_tool_stop_cmdline_listener(void);
|
|
||||||
|
|
||||||
static void errmgr_base_tool_cmdline_recv(int status,
|
|
||||||
orte_process_name_t* sender,
|
|
||||||
opal_buffer_t* buffer,
|
|
||||||
orte_rml_tag_t tag,
|
|
||||||
void* cbdata);
|
|
||||||
|
|
||||||
/******************
|
|
||||||
* Object stuff
|
|
||||||
******************/
|
|
||||||
static orte_process_name_t errmgr_cmdline_sender = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
|
||||||
static bool errmgr_cmdline_recv_issued = false;
|
|
||||||
static int errmgr_tool_initialized = false;
|
|
||||||
|
|
||||||
/********************
|
|
||||||
* Module Functions
|
|
||||||
********************/
|
|
||||||
int orte_errmgr_base_tool_init(void)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if( (++errmgr_tool_initialized) != 1 ) {
|
|
||||||
if( errmgr_tool_initialized < 1 ) {
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Only HNP communicates with tools */
|
|
||||||
if (! ORTE_PROC_IS_HNP) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Setup command line migrate tool request listener
|
|
||||||
*/
|
|
||||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_start_cmdline_listener()) ) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int orte_errmgr_base_tool_finalize(void)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if( (--errmgr_tool_initialized) != 0 ) {
|
|
||||||
if( errmgr_tool_initialized < 0 ) {
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Only HNP communicates with tools */
|
|
||||||
if (! ORTE_PROC_IS_HNP) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Clean up listeners
|
|
||||||
*/
|
|
||||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_stop_cmdline_listener()) ) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int orte_errmgr_base_migrate_update(int status)
|
|
||||||
{
|
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
|
||||||
opal_buffer_t *loc_buffer = NULL;
|
|
||||||
orte_errmgr_tool_cmd_flag_t command = ORTE_ERRMGR_MIGRATE_TOOL_UPDATE_CMD;
|
|
||||||
|
|
||||||
/* Only HNP communicates with tools */
|
|
||||||
if (! ORTE_PROC_IS_HNP) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If this is an invalid state, then return an error
|
|
||||||
*/
|
|
||||||
if( ORTE_ERRMGR_MIGRATE_MAX < status ) {
|
|
||||||
opal_output(orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Error: Invalid state %d < (Max %d)",
|
|
||||||
status, ORTE_ERRMGR_MIGRATE_MAX);
|
|
||||||
return ORTE_ERR_BAD_PARAM;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Report the status over the notifier interface
|
|
||||||
*/
|
|
||||||
orte_errmgr_base_migrate_state_notify(status);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the caller is indicating that they are finished and ready for another
|
|
||||||
* command, then repost the RML listener.
|
|
||||||
*/
|
|
||||||
if( ORTE_ERRMGR_MIGRATE_STATE_NONE == status ) {
|
|
||||||
if( ORTE_SUCCESS != (ret = errmgr_base_tool_start_cmdline_listener()) ) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Noop if invalid peer, or peer not specified
|
|
||||||
*/
|
|
||||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Do not send to self, as that is silly.
|
|
||||||
*/
|
|
||||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, &errmgr_cmdline_sender) ) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Warning: Do not send to self!\n"));
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Sending update command <status %d>\n",
|
|
||||||
status));
|
|
||||||
|
|
||||||
/********************
|
|
||||||
* Send over the status of the checkpoint
|
|
||||||
* - migration state
|
|
||||||
********************/
|
|
||||||
if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
|
|
||||||
exit_status = ORTE_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_ERRMGR_MIGRATE_TOOL_CMD)) ) {
|
|
||||||
opal_output(orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Error: DSS Pack (cmd) Failure (ret = %d)\n",
|
|
||||||
ret);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &status, 1, OPAL_INT))) {
|
|
||||||
opal_output(orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Error: DSS Pack (status) Failure (ret = %d)\n",
|
|
||||||
ret);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&errmgr_cmdline_sender,
|
|
||||||
loc_buffer, ORTE_RML_TAG_MIGRATE,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
opal_output(orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:update() Error: Send (status) Failure (ret = %d)\n",
|
|
||||||
ret);
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
exit_status = ret;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
if(NULL != loc_buffer) {
|
|
||||||
OBJ_RELEASE(loc_buffer);
|
|
||||||
loc_buffer = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/********************
|
|
||||||
* Utility functions
|
|
||||||
********************/
|
|
||||||
|
|
||||||
/********************
|
|
||||||
* Local Functions
|
|
||||||
********************/
|
|
||||||
static int errmgr_base_tool_start_cmdline_listener(void)
|
|
||||||
{
|
|
||||||
if (errmgr_cmdline_recv_issued && ORTE_PROC_IS_HNP) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool: Startup Command Line Channel"));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Coordinator command listener
|
|
||||||
*/
|
|
||||||
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
|
|
||||||
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE,
|
|
||||||
0, errmgr_base_tool_cmdline_recv, NULL);
|
|
||||||
|
|
||||||
errmgr_cmdline_recv_issued = true;
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static int errmgr_base_tool_stop_cmdline_listener(void)
|
|
||||||
{
|
|
||||||
int exit_status = ORTE_SUCCESS;
|
|
||||||
|
|
||||||
if (!errmgr_cmdline_recv_issued && ORTE_PROC_IS_HNP) {
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool: Shutdown Command Line Channel"));
|
|
||||||
|
|
||||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE);
|
|
||||||
|
|
||||||
errmgr_cmdline_recv_issued = false;
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************
|
|
||||||
* Listener Callbacks
|
|
||||||
*****************/
|
|
||||||
static void errmgr_base_tool_cmdline_recv(int status,
|
|
||||||
orte_process_name_t* sender,
|
|
||||||
opal_buffer_t* buffer,
|
|
||||||
orte_rml_tag_t tag,
|
|
||||||
void* cbdata)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
orte_process_name_t swap_dest;
|
|
||||||
orte_errmgr_tool_cmd_flag_t command;
|
|
||||||
orte_std_cntr_t count = 1;
|
|
||||||
char *off_nodes = NULL;
|
|
||||||
char *off_procs = NULL;
|
|
||||||
char *onto_nodes = NULL;
|
|
||||||
char **split_off_nodes = NULL;
|
|
||||||
char **split_off_procs = NULL;
|
|
||||||
char **split_onto_nodes = NULL;
|
|
||||||
opal_list_t *proc_list = NULL;
|
|
||||||
opal_list_t *node_list = NULL;
|
|
||||||
opal_list_t *suggested_map_list = NULL;
|
|
||||||
orte_errmgr_predicted_proc_t *off_proc = NULL;
|
|
||||||
orte_errmgr_predicted_node_t *off_node = NULL;
|
|
||||||
orte_errmgr_predicted_map_t *onto_map = NULL;
|
|
||||||
int cnt = 0, i;
|
|
||||||
|
|
||||||
|
|
||||||
if( ORTE_RML_TAG_MIGRATE != tag ) {
|
|
||||||
opal_output(orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:recv() Error: Unknown tag: Received a command message from %s (tag = %d).",
|
|
||||||
ORTE_NAME_PRINT(sender), tag);
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:recv() Command Line: Start a migration operation [Sender = %s]",
|
|
||||||
ORTE_NAME_PRINT(sender)));
|
|
||||||
|
|
||||||
errmgr_cmdline_recv_issued = false; /* Not a persistent RML message */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we are already interacting with a command line tool then reject this
|
|
||||||
* request. Since we only allow the processing of one tool command at a
|
|
||||||
* time.
|
|
||||||
*/
|
|
||||||
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
|
||||||
swap_dest.jobid = errmgr_cmdline_sender.jobid;
|
|
||||||
swap_dest.vpid = errmgr_cmdline_sender.vpid;
|
|
||||||
|
|
||||||
errmgr_cmdline_sender = *sender;
|
|
||||||
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
|
|
||||||
|
|
||||||
errmgr_cmdline_sender.jobid = swap_dest.jobid;
|
|
||||||
errmgr_cmdline_sender.vpid = swap_dest.vpid;
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
errmgr_cmdline_sender = *sender;
|
|
||||||
|
|
||||||
count = 1;
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_ERRMGR_MIGRATE_TOOL_CMD))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* orte-migrate has requested that a checkpoint be taken
|
|
||||||
*/
|
|
||||||
if (ORTE_ERRMGR_MIGRATE_TOOL_INIT_CMD == command) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:recv() Command line requested process migration [command %d]\n",
|
|
||||||
command));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Unpack the buffer from the orte-migrate command
|
|
||||||
*/
|
|
||||||
count = 1;
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_procs), &count, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(off_nodes), &count, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(onto_nodes), &count, OPAL_STRING))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Parse the comma separated list
|
|
||||||
*/
|
|
||||||
proc_list = OBJ_NEW(opal_list_t);
|
|
||||||
node_list = OBJ_NEW(opal_list_t);
|
|
||||||
suggested_map_list = OBJ_NEW(opal_list_t);
|
|
||||||
|
|
||||||
split_off_procs = opal_argv_split(off_procs, ',');
|
|
||||||
cnt = opal_argv_count(split_off_procs);
|
|
||||||
if( cnt > 0 ) {
|
|
||||||
for(i = 0; i < cnt; ++i) {
|
|
||||||
off_proc = OBJ_NEW(orte_errmgr_predicted_proc_t);
|
|
||||||
off_proc->proc_name.vpid = atoi(split_off_procs[i]);
|
|
||||||
opal_list_append(proc_list, &(off_proc->super));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
split_off_nodes = opal_argv_split(off_nodes, ',');
|
|
||||||
cnt = opal_argv_count(split_off_nodes);
|
|
||||||
if( cnt > 0 ) {
|
|
||||||
for(i = 0; i < cnt; ++i) {
|
|
||||||
off_node = OBJ_NEW(orte_errmgr_predicted_node_t);
|
|
||||||
off_node->node_name = strdup(split_off_nodes[i]);
|
|
||||||
opal_list_append(node_list, &(off_node->super));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
split_onto_nodes = opal_argv_split(onto_nodes, ',');
|
|
||||||
cnt = opal_argv_count(split_onto_nodes);
|
|
||||||
if( cnt > 0 ) {
|
|
||||||
for(i = 0; i < cnt; ++i) {
|
|
||||||
onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
|
|
||||||
onto_map->map_node_name = strdup(split_onto_nodes[i]);
|
|
||||||
opal_list_append(suggested_map_list, &(onto_map->super));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Pass to the predicted fault function to see how they would like to progress
|
|
||||||
*/
|
|
||||||
orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list);
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Unknown command
|
|
||||||
*/
|
|
||||||
else {
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_framework.framework_output,
|
|
||||||
"errmgr:base:tool:recv() Command line sent an unknown command (command %d)\n",
|
|
||||||
command));
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -12,6 +12,7 @@
|
|||||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -53,14 +54,6 @@ typedef struct {
|
|||||||
|
|
||||||
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
||||||
|
|
||||||
/* define a struct to hold registered error callbacks */
|
|
||||||
typedef struct {
|
|
||||||
opal_list_item_t super;
|
|
||||||
orte_errmgr_error_order_t order;
|
|
||||||
orte_errmgr_error_callback_fn_t *callback;
|
|
||||||
} orte_errmgr_cback_t;
|
|
||||||
OBJ_CLASS_DECLARATION(orte_errmgr_cback_t);
|
|
||||||
|
|
||||||
/* declare the base default module */
|
/* declare the base default module */
|
||||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
|
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
|
||||||
|
|
||||||
@ -75,12 +68,5 @@ ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
|||||||
orte_std_cntr_t num_procs,
|
orte_std_cntr_t num_procs,
|
||||||
int error_code);
|
int error_code);
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
|
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_errmgr_base_register_error_callback(orte_errmgr_error_callback_fn_t *cbfunc,
|
|
||||||
orte_errmgr_error_order_t order);
|
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_errmgr_base_execute_error_callbacks(opal_pointer_array_t *errors);
|
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
#endif
|
#endif
|
||||||
|
@ -56,17 +56,11 @@
|
|||||||
* HNP module
|
* HNP module
|
||||||
******************/
|
******************/
|
||||||
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
.abort = orte_errmgr_base_abort,
|
||||||
abort_peers,
|
.abort_peers = abort_peers
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
orte_errmgr_base_register_error_callback,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static void proc_errors(int fd, short args, void *cbdata);
|
static void proc_errors(int fd, short args, void *cbdata);
|
||||||
@ -77,6 +71,7 @@ static void register_cbfunc(int status, size_t errhndler, void *cbdata)
|
|||||||
{
|
{
|
||||||
volatile bool *active = (volatile bool*)cbdata;
|
volatile bool *active = (volatile bool*)cbdata;
|
||||||
myerrhandle = errhndler;
|
myerrhandle = errhndler;
|
||||||
|
ORTE_POST_OBJECT(active);
|
||||||
*active = false;
|
*active = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,7 +107,7 @@ static void notify_cbfunc(int status,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* push it into our event base */
|
/* push it into our event base */
|
||||||
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, state);
|
ORTE_ACTIVATE_PROC_STATE((orte_process_name_t*)source, state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/************************
|
/************************
|
||||||
@ -154,8 +149,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
char *nodename;
|
char *nodename;
|
||||||
orte_error_t err;
|
|
||||||
opal_pointer_array_t errors;
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
"%s errmgr:default_app: proc %s state %s",
|
"%s errmgr:default_app: proc %s state %s",
|
||||||
@ -171,14 +166,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* pass the error to the error_callbacks for processing */
|
|
||||||
OBJ_CONSTRUCT(&errors, opal_pointer_array_t);
|
|
||||||
opal_pointer_array_init(&errors, 1, INT_MAX, 1);
|
|
||||||
err.errcode = caddy->proc_state;
|
|
||||||
err.proc = caddy->name;
|
|
||||||
opal_pointer_array_add(&errors, &err);
|
|
||||||
|
|
||||||
|
|
||||||
if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
|
if (ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == caddy->proc_state) {
|
||||||
/* we can't send a message - print a message */
|
/* we can't send a message - print a message */
|
||||||
nodename = orte_get_proc_hostname(&caddy->name);
|
nodename = orte_get_proc_hostname(&caddy->name);
|
||||||
@ -197,9 +184,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
orte_abnormal_term_ordered = true;
|
orte_abnormal_term_ordered = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_errmgr_base_execute_error_callbacks(&errors);
|
|
||||||
OBJ_DESTRUCT(&errors);
|
|
||||||
|
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,6 +50,7 @@
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_locks.h"
|
#include "orte/runtime/orte_locks.h"
|
||||||
@ -66,32 +67,15 @@ static int init(void);
|
|||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
static void hnp_abort(int error_code, char *fmt, ...);
|
static void hnp_abort(int error_code, char *fmt, ...);
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map);
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list);
|
|
||||||
|
|
||||||
static int ft_event(int state);
|
|
||||||
|
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* default_hnp module
|
* default_hnp module
|
||||||
******************/
|
******************/
|
||||||
orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
|
orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
hnp_abort,
|
.abort = hnp_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
.abort_peers = orte_errmgr_base_abort_peers
|
||||||
predicted_fault,
|
|
||||||
suggest_map_targets,
|
|
||||||
ft_event,
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -129,6 +113,7 @@ static int finalize(void)
|
|||||||
static void wakeup(int sd, short args, void *cbdata)
|
static void wakeup(int sd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
/* nothing more we can do */
|
/* nothing more we can do */
|
||||||
|
ORTE_ACQUIRE_OBJECT(cbdata);
|
||||||
orte_quit(0, 0, NULL);
|
orte_quit(0, 0, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -187,6 +172,7 @@ static void hnp_abort(int error_code, char *fmt, ...)
|
|||||||
timer->tv.tv_usec = 0;
|
timer->tv.tv_usec = 0;
|
||||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||||
|
ORTE_POST_OBJECT(timer);
|
||||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -202,6 +188,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
|||||||
int32_t rc, ret;
|
int32_t rc, ret;
|
||||||
int room, *rmptr;
|
int room, *rmptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if orte is trying to shutdown, just let it
|
* if orte is trying to shutdown, just let it
|
||||||
*/
|
*/
|
||||||
@ -363,6 +351,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
int32_t i32, *i32ptr;
|
int32_t i32, *i32ptr;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
"%s errmgr:default_hnp: for proc %s state %s",
|
"%s errmgr:default_hnp: for proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -798,25 +788,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ft_event(int state)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************
|
/*****************
|
||||||
* Local Functions
|
* Local Functions
|
||||||
*****************/
|
*****************/
|
||||||
|
@ -33,6 +33,7 @@
|
|||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/iof/base/base.h"
|
#include "orte/mca/iof/base/base.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -60,32 +61,16 @@
|
|||||||
static int init(void);
|
static int init(void);
|
||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
static void orted_abort(int error_code, char *fmt, ...);
|
static void orted_abort(int error_code, char *fmt, ...);
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map);
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list);
|
|
||||||
|
|
||||||
static int ft_event(int state);
|
|
||||||
|
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* default_orted module
|
* default_orted module
|
||||||
******************/
|
******************/
|
||||||
orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
|
orte_errmgr_base_module_t orte_errmgr_default_orted_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
orted_abort,
|
.abort = orted_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
.abort_peers = orte_errmgr_base_abort_peers
|
||||||
predicted_fault,
|
|
||||||
suggest_map_targets,
|
|
||||||
ft_event,
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Local functions */
|
/* Local functions */
|
||||||
@ -125,6 +110,7 @@ static int finalize(void)
|
|||||||
static void wakeup(int sd, short args, void *cbdata)
|
static void wakeup(int sd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
/* nothing more we can do */
|
/* nothing more we can do */
|
||||||
|
ORTE_ACQUIRE_OBJECT(cbdata);
|
||||||
orte_quit(0, 0, NULL);
|
orte_quit(0, 0, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -231,6 +217,7 @@ static void orted_abort(int error_code, char *fmt, ...)
|
|||||||
timer->tv.tv_usec = 0;
|
timer->tv.tv_usec = 0;
|
||||||
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
|
||||||
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
|
||||||
|
ORTE_POST_OBJECT(timer);
|
||||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -244,6 +231,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
|||||||
orte_plm_cmd_flag_t cmd;
|
orte_plm_cmd_flag_t cmd;
|
||||||
opal_buffer_t *alert;
|
opal_buffer_t *alert;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if orte is trying to shutdown, just let it
|
* if orte is trying to shutdown, just let it
|
||||||
*/
|
*/
|
||||||
@ -330,6 +319,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
int rc=ORTE_SUCCESS;
|
int rc=ORTE_SUCCESS;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||||
"%s errmgr:default_orted:proc_errors process %s error state %s",
|
"%s errmgr:default_orted:proc_errors process %s error state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -724,26 +715,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ft_event(int state)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*****************
|
/*****************
|
||||||
* Local Functions
|
* Local Functions
|
||||||
*****************/
|
*****************/
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#include "orte/util/error_strings.h"
|
#include "orte/util/error_strings.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
@ -54,17 +55,11 @@ static int abort_peers(orte_process_name_t *procs,
|
|||||||
* HNP module
|
* HNP module
|
||||||
******************/
|
******************/
|
||||||
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
|
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
|
||||||
init,
|
.init= init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
.abort = orte_errmgr_base_abort,
|
||||||
abort_peers,
|
.abort_peers = abort_peers
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
orte_errmgr_base_register_error_callback,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static void proc_errors(int fd, short args, void *cbdata);
|
static void proc_errors(int fd, short args, void *cbdata);
|
||||||
@ -89,6 +84,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
"%s errmgr:default_tool: proc %s state %s",
|
"%s errmgr:default_tool: proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -106,6 +103,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
/* if we lost our lifeline, then just stop the event loop
|
/* if we lost our lifeline, then just stop the event loop
|
||||||
* so the main program can cleanly terminate */
|
* so the main program can cleanly terminate */
|
||||||
if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||||
|
ORTE_POST_OBJECT(caddy);
|
||||||
orte_event_base_active = false;
|
orte_event_base_active = false;
|
||||||
} else {
|
} else {
|
||||||
/* all other errors require abort */
|
/* all other errors require abort */
|
||||||
|
@ -50,6 +50,7 @@
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_locks.h"
|
#include "orte/runtime/orte_locks.h"
|
||||||
@ -65,32 +66,15 @@
|
|||||||
static int init(void);
|
static int init(void);
|
||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map);
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list);
|
|
||||||
|
|
||||||
static int ft_event(int state);
|
|
||||||
|
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* dvm module
|
* dvm module
|
||||||
******************/
|
******************/
|
||||||
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
orte_errmgr_base_log,
|
.logfn = orte_errmgr_base_log,
|
||||||
orte_errmgr_base_abort,
|
.abort = orte_errmgr_base_abort,
|
||||||
orte_errmgr_base_abort_peers,
|
.abort_peers = orte_errmgr_base_abort_peers
|
||||||
predicted_fault,
|
|
||||||
suggest_map_targets,
|
|
||||||
ft_event,
|
|
||||||
orte_errmgr_base_register_migration_warning,
|
|
||||||
NULL,
|
|
||||||
orte_errmgr_base_execute_error_callbacks
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -146,6 +130,8 @@ static void job_errors(int fd, short args, void *cbdata)
|
|||||||
int32_t rc, ret;
|
int32_t rc, ret;
|
||||||
int room, *rmptr;
|
int room, *rmptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if orte is trying to shutdown, just let it
|
* if orte is trying to shutdown, just let it
|
||||||
*/
|
*/
|
||||||
@ -248,6 +234,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
int32_t i32, *i32ptr;
|
int32_t i32, *i32ptr;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||||
"%s errmgr:dvm: for proc %s state %s",
|
"%s errmgr:dvm: for proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -643,22 +631,3 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
cleanup:
|
cleanup:
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int predicted_fault(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int suggest_map_targets(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list)
|
|
||||||
{
|
|
||||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int ft_event(int state)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -63,70 +63,6 @@
|
|||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
/*
|
|
||||||
* Structure to describe a predicted process fault.
|
|
||||||
*
|
|
||||||
* This can be expanded in the future to support assurance levels, and
|
|
||||||
* additional information that may wish to be conveyed.
|
|
||||||
*/
|
|
||||||
struct orte_errmgr_predicted_proc_t {
|
|
||||||
/** This is an object, so must have a super */
|
|
||||||
opal_list_item_t super;
|
|
||||||
|
|
||||||
/** Process Name */
|
|
||||||
orte_process_name_t proc_name;
|
|
||||||
};
|
|
||||||
typedef struct orte_errmgr_predicted_proc_t orte_errmgr_predicted_proc_t;
|
|
||||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_proc_t);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Structure to describe a predicted node fault.
|
|
||||||
*
|
|
||||||
* This can be expanded in the future to support assurance levels, and
|
|
||||||
* additional information that may wish to be conveyed.
|
|
||||||
*/
|
|
||||||
struct orte_errmgr_predicted_node_t {
|
|
||||||
/** This is an object, so must have a super */
|
|
||||||
opal_list_item_t super;
|
|
||||||
|
|
||||||
/** Node Name */
|
|
||||||
char * node_name;
|
|
||||||
};
|
|
||||||
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
|
||||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Structure to describe a suggested remapping element for a predicted fault.
|
|
||||||
*
|
|
||||||
* This can be expanded in the future to support weights , and
|
|
||||||
* additional information that may wish to be conveyed.
|
|
||||||
*/
|
|
||||||
struct orte_errmgr_predicted_map_t {
|
|
||||||
/** This is an object, so must have a super */
|
|
||||||
opal_list_item_t super;
|
|
||||||
|
|
||||||
/** Process Name (predicted to fail) */
|
|
||||||
orte_process_name_t proc_name;
|
|
||||||
|
|
||||||
/** Node Name (predicted to fail) */
|
|
||||||
char * node_name;
|
|
||||||
|
|
||||||
/** Process Name (Map to) */
|
|
||||||
orte_process_name_t map_proc_name;
|
|
||||||
|
|
||||||
/** Node Name (Map to) */
|
|
||||||
char * map_node_name;
|
|
||||||
|
|
||||||
/** Just off current node */
|
|
||||||
bool off_current_node;
|
|
||||||
|
|
||||||
/** Pre-map fixed node assignment */
|
|
||||||
char * pre_map_fixed_node;
|
|
||||||
};
|
|
||||||
typedef struct orte_errmgr_predicted_map_t orte_errmgr_predicted_map_t;
|
|
||||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_map_t);
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Macro definitions
|
* Macro definitions
|
||||||
*/
|
*/
|
||||||
@ -183,84 +119,6 @@ typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *pro
|
|||||||
orte_std_cntr_t num_procs,
|
orte_std_cntr_t num_procs,
|
||||||
int error_code);
|
int error_code);
|
||||||
|
|
||||||
/**
|
|
||||||
* Predicted process/node failure notification
|
|
||||||
*
|
|
||||||
* @param[in] proc_list List of processes (or NULL if none)
|
|
||||||
* @param[in] node_list List of nodes (or NULL if none)
|
|
||||||
* @param[in] suggested_map List of mapping suggestions to use on recovery (or NULL if none)
|
|
||||||
*
|
|
||||||
* @retval ORTE_SUCCESS The operation completed successfully
|
|
||||||
* @retval ORTE_ERROR An unspecifed error occurred
|
|
||||||
*/
|
|
||||||
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(opal_list_t *proc_list,
|
|
||||||
opal_list_t *node_list,
|
|
||||||
opal_list_t *suggested_map);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Suggest a node to map a restarting process onto
|
|
||||||
*
|
|
||||||
* @param[in] proc Process that is being mapped
|
|
||||||
* @param[in] oldnode Previous node where this process resided
|
|
||||||
* @param[in|out] node_list List of nodes to select from
|
|
||||||
*
|
|
||||||
* @retval ORTE_SUCCESS The operation completed successfully
|
|
||||||
* @retval ORTE_ERROR An unspecifed error occurred
|
|
||||||
*/
|
|
||||||
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
|
|
||||||
orte_node_t *oldnode,
|
|
||||||
opal_list_t *node_list);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Handle fault tolerance updates
|
|
||||||
*
|
|
||||||
* @param[in] state Fault tolerance state update
|
|
||||||
*
|
|
||||||
* @retval ORTE_SUCCESS The operation completed successfully
|
|
||||||
* @retval ORTE_ERROR An unspecifed error occurred
|
|
||||||
*/
|
|
||||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Function to perform actions that require the rest of the ORTE layer to be up
|
|
||||||
* and running.
|
|
||||||
*
|
|
||||||
* @retval ORTE_SUCCESS The operation completed successfully
|
|
||||||
* @retval ORTE_ERROR An unspecified error occured
|
|
||||||
*/
|
|
||||||
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
|
||||||
|
|
||||||
typedef enum {
|
|
||||||
ORTE_ERRMGR_CALLBACK_FIRST,
|
|
||||||
ORTE_ERRMGR_CALLBACK_LAST,
|
|
||||||
ORTE_ERRMGR_CALLBACK_PREPEND,
|
|
||||||
ORTE_ERRMGR_CALLBACK_APPEND
|
|
||||||
} orte_errmgr_error_order_t;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Register a callback function for faults.
|
|
||||||
*
|
|
||||||
* This callback function will be used anytime (other than during finalize) the
|
|
||||||
* runtime detects and handles a critical failure. The runtime will complete all
|
|
||||||
* its stabilization before cycling thru all registered callbacks. The order of
|
|
||||||
* the callbacks will proceed in the indicated order with which they were registered.
|
|
||||||
*
|
|
||||||
* The parameter to the callback function will be the orte_process_name_t
|
|
||||||
* of the process involved in the error.
|
|
||||||
*
|
|
||||||
* @param[in] cbfunc The callback function.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
typedef struct {
|
|
||||||
orte_process_name_t proc;
|
|
||||||
int errcode;
|
|
||||||
} orte_error_t;
|
|
||||||
|
|
||||||
typedef int (orte_errmgr_error_callback_fn_t)(opal_pointer_array_t *errors);
|
|
||||||
typedef int (*orte_errmgr_base_module_register_error_callback_fn_t)(orte_errmgr_error_callback_fn_t *cbfunc,
|
|
||||||
orte_errmgr_error_order_t order);
|
|
||||||
typedef void (*orte_errmgr_base_module_execute_error_callbacks_fn_t)(opal_pointer_array_t *errors);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Module Structure
|
* Module Structure
|
||||||
*/
|
*/
|
||||||
@ -273,21 +131,6 @@ struct orte_errmgr_base_module_2_3_0_t {
|
|||||||
orte_errmgr_base_module_log_fn_t logfn;
|
orte_errmgr_base_module_log_fn_t logfn;
|
||||||
orte_errmgr_base_module_abort_fn_t abort;
|
orte_errmgr_base_module_abort_fn_t abort;
|
||||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||||
|
|
||||||
/** Predicted process/node failure notification */
|
|
||||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
|
||||||
/** Suggest a node to map a restarting process onto */
|
|
||||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
|
||||||
|
|
||||||
/** Handle any FT Notifications */
|
|
||||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
|
||||||
|
|
||||||
/* Register to be warned of impending migration */
|
|
||||||
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
|
||||||
|
|
||||||
/* Register a callback function */
|
|
||||||
orte_errmgr_base_module_register_error_callback_fn_t register_error_callback;
|
|
||||||
orte_errmgr_base_module_execute_error_callbacks_fn_t execute_error_callbacks;
|
|
||||||
};
|
};
|
||||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved
|
* All rights reserved
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -32,18 +33,18 @@
|
|||||||
* Globals
|
* Globals
|
||||||
*/
|
*/
|
||||||
ORTE_DECLSPEC orte_filem_base_module_t orte_filem = {
|
ORTE_DECLSPEC orte_filem_base_module_t orte_filem = {
|
||||||
orte_filem_base_module_init,
|
.filem_init = orte_filem_base_module_init,
|
||||||
orte_filem_base_module_finalize,
|
.filem_finalize = orte_filem_base_module_finalize,
|
||||||
orte_filem_base_none_put,
|
.put = orte_filem_base_none_put,
|
||||||
orte_filem_base_none_put_nb,
|
.put_nb = orte_filem_base_none_put_nb,
|
||||||
orte_filem_base_none_get,
|
.get = orte_filem_base_none_get,
|
||||||
orte_filem_base_none_get_nb,
|
.get_nb = orte_filem_base_none_get_nb,
|
||||||
orte_filem_base_none_rm,
|
.rm = orte_filem_base_none_rm,
|
||||||
orte_filem_base_none_rm_nb,
|
.rm_nb = orte_filem_base_none_rm_nb,
|
||||||
orte_filem_base_none_wait,
|
.wait = orte_filem_base_none_wait,
|
||||||
orte_filem_base_none_wait_all,
|
.wait_all = orte_filem_base_none_wait_all,
|
||||||
orte_filem_base_none_preposition_files,
|
.preposition_files = orte_filem_base_none_preposition_files,
|
||||||
orte_filem_base_none_link_local_files
|
.link_local_files = orte_filem_base_none_link_local_files
|
||||||
};
|
};
|
||||||
bool orte_filem_base_is_active = false;
|
bool orte_filem_base_is_active = false;
|
||||||
|
|
||||||
@ -69,4 +70,3 @@ static int orte_filem_base_open(mca_base_open_flag_t flags)
|
|||||||
|
|
||||||
MCA_BASE_FRAMEWORK_DECLARE(orte, filem, NULL, NULL, orte_filem_base_open, orte_filem_base_close,
|
MCA_BASE_FRAMEWORK_DECLARE(orte, filem, NULL, NULL, orte_filem_base_open, orte_filem_base_close,
|
||||||
mca_filem_base_static_components, 0);
|
mca_filem_base_static_components, 0);
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved
|
* All rights reserved
|
||||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -49,6 +49,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/grpcomm/base/base.h"
|
#include "orte/mca/grpcomm/base/base.h"
|
||||||
@ -61,14 +62,6 @@
|
|||||||
|
|
||||||
static int raw_init(void);
|
static int raw_init(void);
|
||||||
static int raw_finalize(void);
|
static int raw_finalize(void);
|
||||||
static int raw_put(orte_filem_base_request_t *req);
|
|
||||||
static int raw_put_nb(orte_filem_base_request_t *req);
|
|
||||||
static int raw_get(orte_filem_base_request_t *req);
|
|
||||||
static int raw_get_nb(orte_filem_base_request_t *req);
|
|
||||||
static int raw_rm(orte_filem_base_request_t *req);
|
|
||||||
static int raw_rm_nb(orte_filem_base_request_t *req);
|
|
||||||
static int raw_wait(orte_filem_base_request_t *req);
|
|
||||||
static int raw_wait_all(opal_list_t *reqs);
|
|
||||||
static int raw_preposition_files(orte_job_t *jdata,
|
static int raw_preposition_files(orte_job_t *jdata,
|
||||||
orte_filem_completion_cbfunc_t cbfunc,
|
orte_filem_completion_cbfunc_t cbfunc,
|
||||||
void *cbdata);
|
void *cbdata);
|
||||||
@ -76,20 +69,20 @@ static int raw_link_local_files(orte_job_t *jdata,
|
|||||||
orte_app_context_t *app);
|
orte_app_context_t *app);
|
||||||
|
|
||||||
orte_filem_base_module_t mca_filem_raw_module = {
|
orte_filem_base_module_t mca_filem_raw_module = {
|
||||||
raw_init,
|
.filem_init = raw_init,
|
||||||
raw_finalize,
|
.filem_finalize = raw_finalize,
|
||||||
/* we don't use any of the following */
|
/* we don't use any of the following */
|
||||||
raw_put,
|
.put = orte_filem_base_none_put,
|
||||||
raw_put_nb,
|
.put_nb = orte_filem_base_none_put_nb,
|
||||||
raw_get,
|
.get = orte_filem_base_none_get,
|
||||||
raw_get_nb,
|
.get_nb = orte_filem_base_none_get_nb,
|
||||||
raw_rm,
|
.rm = orte_filem_base_none_rm,
|
||||||
raw_rm_nb,
|
.rm_nb = orte_filem_base_none_rm_nb,
|
||||||
raw_wait,
|
.wait = orte_filem_base_none_wait,
|
||||||
raw_wait_all,
|
.wait_all = orte_filem_base_none_wait_all,
|
||||||
/* now the APIs we *do* use */
|
/* now the APIs we *do* use */
|
||||||
raw_preposition_files,
|
.preposition_files = raw_preposition_files,
|
||||||
raw_link_local_files
|
.link_local_files = raw_link_local_files
|
||||||
};
|
};
|
||||||
|
|
||||||
static opal_list_t outbound_files;
|
static opal_list_t outbound_files;
|
||||||
@ -164,46 +157,6 @@ static int raw_finalize(void)
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int raw_put(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_put_nb(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_get(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_get_nb(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_rm(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_rm_nb(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_wait(orte_filem_base_request_t *req)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int raw_wait_all(opal_list_t *reqs)
|
|
||||||
{
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void xfer_complete(int status, orte_filem_raw_xfer_t *xfer)
|
static void xfer_complete(int status, orte_filem_raw_xfer_t *xfer)
|
||||||
{
|
{
|
||||||
orte_filem_raw_outbound_t *outbound = xfer->outbound;
|
orte_filem_raw_outbound_t *outbound = xfer->outbound;
|
||||||
@ -586,8 +539,9 @@ static int raw_preposition_files(orte_job_t *jdata,
|
|||||||
opal_list_append(&outbound->xfers, &xfer->super);
|
opal_list_append(&outbound->xfers, &xfer->super);
|
||||||
opal_event_set(orte_event_base, &xfer->ev, fd, OPAL_EV_READ, send_chunk, xfer);
|
opal_event_set(orte_event_base, &xfer->ev, fd, OPAL_EV_READ, send_chunk, xfer);
|
||||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
||||||
opal_event_add(&xfer->ev, 0);
|
|
||||||
xfer->pending = true;
|
xfer->pending = true;
|
||||||
|
ORTE_POST_OBJECT(xfer);
|
||||||
|
opal_event_add(&xfer->ev, 0);
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&fsets);
|
OBJ_DESTRUCT(&fsets);
|
||||||
@ -804,6 +758,8 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
|||||||
opal_buffer_t chunk;
|
opal_buffer_t chunk;
|
||||||
orte_grpcomm_signature_t *sig;
|
orte_grpcomm_signature_t *sig;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(rev);
|
||||||
|
|
||||||
/* flag that event has fired */
|
/* flag that event has fired */
|
||||||
rev->pending = false;
|
rev->pending = false;
|
||||||
|
|
||||||
@ -815,6 +771,7 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
|||||||
|
|
||||||
/* non-blocking, retry */
|
/* non-blocking, retry */
|
||||||
if (EAGAIN == errno || EINTR == errno) {
|
if (EAGAIN == errno || EINTR == errno) {
|
||||||
|
ORTE_POST_OBJECT(rev);
|
||||||
opal_event_add(&rev->ev, 0);
|
opal_event_add(&rev->ev, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -891,8 +848,9 @@ static void send_chunk(int fd, short argc, void *cbdata)
|
|||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
/* restart the read event */
|
/* restart the read event */
|
||||||
opal_event_add(&rev->ev, 0);
|
|
||||||
rev->pending = true;
|
rev->pending = true;
|
||||||
|
ORTE_POST_OBJECT(rev);
|
||||||
|
opal_event_add(&rev->ev, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1116,7 +1074,8 @@ static void recv_files(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(tmp);
|
free(tmp);
|
||||||
opal_event_set(orte_event_base, &incoming->ev, incoming->fd, OPAL_EV_WRITE, write_handler, incoming);
|
opal_event_set(orte_event_base, &incoming->ev, incoming->fd,
|
||||||
|
OPAL_EV_WRITE, write_handler, incoming);
|
||||||
opal_event_set_priority(&incoming->ev, ORTE_MSG_PRI);
|
opal_event_set_priority(&incoming->ev, ORTE_MSG_PRI);
|
||||||
}
|
}
|
||||||
/* create an output object for this data */
|
/* create an output object for this data */
|
||||||
@ -1135,8 +1094,9 @@ static void recv_files(int status, orte_process_name_t* sender,
|
|||||||
|
|
||||||
if (!incoming->pending) {
|
if (!incoming->pending) {
|
||||||
/* add the event */
|
/* add the event */
|
||||||
opal_event_add(&incoming->ev, 0);
|
|
||||||
incoming->pending = true;
|
incoming->pending = true;
|
||||||
|
ORTE_POST_OBJECT(incoming);
|
||||||
|
opal_event_add(&incoming->ev, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
@ -1154,6 +1114,8 @@ static void write_handler(int fd, short event, void *cbdata)
|
|||||||
char homedir[MAXPATHLEN];
|
char homedir[MAXPATHLEN];
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(sink);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
|
||||||
"%s write:handler writing data to %d",
|
"%s write:handler writing data to %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1226,8 +1188,9 @@ static void write_handler(int fd, short event, void *cbdata)
|
|||||||
/* leave the write event running so it will call us again
|
/* leave the write event running so it will call us again
|
||||||
* when the fd is ready.
|
* when the fd is ready.
|
||||||
*/
|
*/
|
||||||
opal_event_add(&sink->ev, 0);
|
|
||||||
sink->pending = true;
|
sink->pending = true;
|
||||||
|
ORTE_POST_OBJECT(sink);
|
||||||
|
opal_event_add(&sink->ev, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
/* otherwise, something bad happened so all we can do is abort
|
/* otherwise, something bad happened so all we can do is abort
|
||||||
@ -1250,8 +1213,9 @@ static void write_handler(int fd, short event, void *cbdata)
|
|||||||
/* leave the write event running so it will call us again
|
/* leave the write event running so it will call us again
|
||||||
* when the fd is ready
|
* when the fd is ready
|
||||||
*/
|
*/
|
||||||
opal_event_add(&sink->ev, 0);
|
|
||||||
sink->pending = true;
|
sink->pending = true;
|
||||||
|
ORTE_POST_OBJECT(sink);
|
||||||
|
opal_event_add(&sink->ev, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(output);
|
OBJ_RELEASE(output);
|
||||||
|
@ -44,6 +44,7 @@
|
|||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
#include "orte/mca/grpcomm/grpcomm.h"
|
||||||
@ -144,6 +145,8 @@ static void allgather_stub(int fd, short args, void *cbdata)
|
|||||||
orte_grpcomm_coll_t *coll;
|
orte_grpcomm_coll_t *coll;
|
||||||
uint32_t *seq_number;
|
uint32_t *seq_number;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
|
||||||
"%s grpcomm:base:allgather stub",
|
"%s grpcomm:base:allgather stub",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
@ -212,6 +215,7 @@ int orte_grpcomm_API_allgather(orte_grpcomm_signature_t *sig,
|
|||||||
cd->cbdata = cbdata;
|
cd->cbdata = cbdata;
|
||||||
opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, allgather_stub, cd);
|
opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, allgather_stub, cd);
|
||||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(cd);
|
||||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -52,6 +52,7 @@
|
|||||||
#include "orte/mca/iof/iof.h"
|
#include "orte/mca/iof/iof.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
@ -163,6 +164,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
|||||||
opal_event_set_priority(ep->wev->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(ep->wev->ev, ORTE_MSG_PRI); \
|
||||||
} \
|
} \
|
||||||
*(snk) = ep; \
|
*(snk) = ep; \
|
||||||
|
ORTE_POST_OBJECT(ep); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
/* add list of structs that has name of proc + orte_iof_tag_t - when
|
/* add list of structs that has name of proc + orte_iof_tag_t - when
|
||||||
@ -192,6 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
|||||||
opal_event_set_priority(rev->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(rev->ev, ORTE_MSG_PRI); \
|
||||||
if ((actv)) { \
|
if ((actv)) { \
|
||||||
rev->active = true; \
|
rev->active = true; \
|
||||||
|
ORTE_POST_OBJECT(rev); \
|
||||||
opal_event_add(rev->ev, 0); \
|
opal_event_add(rev->ev, 0); \
|
||||||
} \
|
} \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
@ -38,6 +38,7 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
@ -147,7 +148,7 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
|
|||||||
output->numbytes = numbytes;
|
output->numbytes = numbytes;
|
||||||
goto process;
|
goto process;
|
||||||
|
|
||||||
construct:
|
construct:
|
||||||
starttaglen = strlen(starttag);
|
starttaglen = strlen(starttag);
|
||||||
endtaglen = strlen(endtag);
|
endtaglen = strlen(endtag);
|
||||||
endtagged = false;
|
endtagged = false;
|
||||||
@ -249,7 +250,7 @@ construct:
|
|||||||
}
|
}
|
||||||
output->numbytes = k;
|
output->numbytes = k;
|
||||||
|
|
||||||
process:
|
process:
|
||||||
/* add this data to the write list for this fd */
|
/* add this data to the write list for this fd */
|
||||||
opal_list_append(&channel->outputs, &output->super);
|
opal_list_append(&channel->outputs, &output->super);
|
||||||
|
|
||||||
@ -262,8 +263,9 @@ process:
|
|||||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||||
"%s write:output adding write event",
|
"%s write:output adding write event",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
opal_event_add(channel->ev, 0);
|
|
||||||
channel->pending = true;
|
channel->pending = true;
|
||||||
|
ORTE_POST_OBJECT(channel);
|
||||||
|
opal_event_add(channel->ev, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return num_buffered;
|
return num_buffered;
|
||||||
@ -303,6 +305,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
|||||||
orte_iof_write_output_t *output;
|
orte_iof_write_output_t *output;
|
||||||
int num_written;
|
int num_written;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(sink);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||||
"%s write:handler writing data to %d",
|
"%s write:handler writing data to %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -356,8 +360,8 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
|||||||
}
|
}
|
||||||
OBJ_RELEASE(output);
|
OBJ_RELEASE(output);
|
||||||
}
|
}
|
||||||
ABORT:
|
ABORT:
|
||||||
opal_event_del(wev->ev);
|
opal_event_del(wev->ev);
|
||||||
wev->pending = false;
|
wev->pending = false;
|
||||||
|
ORTE_POST_OBJECT(wev);
|
||||||
}
|
}
|
||||||
|
@ -47,6 +47,7 @@
|
|||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
|
|
||||||
#include "orte/mca/iof/base/base.h"
|
#include "orte/mca/iof/base/base.h"
|
||||||
@ -214,10 +215,13 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
proct->revstdout->active = true;
|
proct->revstdout->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstdout);
|
||||||
opal_event_add(proct->revstdout->ev, 0);
|
opal_event_add(proct->revstdout->ev, 0);
|
||||||
proct->revstderr->active = true;
|
proct->revstderr->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstderr);
|
||||||
opal_event_add(proct->revstderr->ev, 0);
|
opal_event_add(proct->revstderr->ev, 0);
|
||||||
proct->revstddiag->active = true;
|
proct->revstddiag->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstddiag);
|
||||||
opal_event_add(proct->revstddiag->ev, 0);
|
opal_event_add(proct->revstddiag->ev, 0);
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -299,6 +303,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
|||||||
*/
|
*/
|
||||||
if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
|
if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
|
||||||
mca_iof_hnp_component.stdinev->active = true;
|
mca_iof_hnp_component.stdinev->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstdout);
|
||||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -515,6 +520,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
orte_iof_write_output_t *output;
|
orte_iof_write_output_t *output;
|
||||||
int num_written;
|
int num_written;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(sink);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||||
"%s hnp:stdin:write:handler writing data to %d",
|
"%s hnp:stdin:write:handler writing data to %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -558,6 +565,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
* when the fd is ready.
|
* when the fd is ready.
|
||||||
*/
|
*/
|
||||||
wev->pending = true;
|
wev->pending = true;
|
||||||
|
ORTE_POST_OBJECT(wev);
|
||||||
opal_event_add(wev->ev, 0);
|
opal_event_add(wev->ev, 0);
|
||||||
goto CHECK;
|
goto CHECK;
|
||||||
}
|
}
|
||||||
@ -583,13 +591,14 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
* when the fd is ready.
|
* when the fd is ready.
|
||||||
*/
|
*/
|
||||||
wev->pending = true;
|
wev->pending = true;
|
||||||
|
ORTE_POST_OBJECT(wev);
|
||||||
opal_event_add(wev->ev, 0);
|
opal_event_add(wev->ev, 0);
|
||||||
goto CHECK;
|
goto CHECK;
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(output);
|
OBJ_RELEASE(output);
|
||||||
}
|
}
|
||||||
|
|
||||||
CHECK:
|
CHECK:
|
||||||
if (NULL != mca_iof_hnp_component.stdinev &&
|
if (NULL != mca_iof_hnp_component.stdinev &&
|
||||||
!orte_abnormal_term_ordered &&
|
!orte_abnormal_term_ordered &&
|
||||||
!mca_iof_hnp_component.stdinev->active) {
|
!mca_iof_hnp_component.stdinev->active) {
|
||||||
@ -610,6 +619,7 @@ CHECK:
|
|||||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||||
"restarting read event"));
|
"restarting read event"));
|
||||||
mca_iof_hnp_component.stdinev->active = true;
|
mca_iof_hnp_component.stdinev->active = true;
|
||||||
|
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
@ -48,10 +49,13 @@ static void restart_stdin(int fd, short event, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(tm);
|
||||||
|
|
||||||
if (NULL != mca_iof_hnp_component.stdinev &&
|
if (NULL != mca_iof_hnp_component.stdinev &&
|
||||||
!orte_job_term_ordered &&
|
!orte_job_term_ordered &&
|
||||||
!mca_iof_hnp_component.stdinev->active) {
|
!mca_iof_hnp_component.stdinev->active) {
|
||||||
mca_iof_hnp_component.stdinev->active = true;
|
mca_iof_hnp_component.stdinev->active = true;
|
||||||
|
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -74,7 +78,11 @@ bool orte_iof_hnp_stdin_check(int fd)
|
|||||||
|
|
||||||
void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata)
|
void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata)
|
||||||
{
|
{
|
||||||
bool should_process = orte_iof_hnp_stdin_check(0);
|
bool should_process;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(mca_iof_hnp_component.stdinev);
|
||||||
|
|
||||||
|
should_process = orte_iof_hnp_stdin_check(0);
|
||||||
|
|
||||||
if (should_process) {
|
if (should_process) {
|
||||||
mca_iof_hnp_component.stdinev->active = true;
|
mca_iof_hnp_component.stdinev->active = true;
|
||||||
@ -99,6 +107,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
|||||||
bool exclusive;
|
bool exclusive;
|
||||||
orte_iof_sink_t *sink;
|
orte_iof_sink_t *sink;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(rev);
|
||||||
|
|
||||||
/* read up to the fragment size */
|
/* read up to the fragment size */
|
||||||
numbytes = read(fd, data, sizeof(data));
|
numbytes = read(fd, data, sizeof(data));
|
||||||
|
|
||||||
@ -293,6 +303,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* re-add the event */
|
/* re-add the event */
|
||||||
|
ORTE_POST_OBJECT(rev);
|
||||||
opal_event_add(rev->ev, 0);
|
opal_event_add(rev->ev, 0);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -41,6 +41,7 @@
|
|||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/iof/iof.h"
|
#include "orte/mca/iof/iof.h"
|
||||||
@ -81,6 +82,7 @@ void orte_iof_hnp_recv(int status, orte_process_name_t* sender,
|
|||||||
!orte_job_term_ordered &&
|
!orte_job_term_ordered &&
|
||||||
!mca_iof_hnp_component.stdinev->active) {
|
!mca_iof_hnp_component.stdinev->active) {
|
||||||
mca_iof_hnp_component.stdinev->active = true;
|
mca_iof_hnp_component.stdinev->active = true;
|
||||||
|
ORTE_POST_OBJECT(mca_iof_hnp_component.stdinev);
|
||||||
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
|
||||||
}
|
}
|
||||||
goto CLEAN_RETURN;
|
goto CLEAN_RETURN;
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -190,10 +191,13 @@ SETUP:
|
|||||||
*/
|
*/
|
||||||
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
|
if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
|
||||||
proct->revstdout->active = true;
|
proct->revstdout->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstdout);
|
||||||
opal_event_add(proct->revstdout->ev, 0);
|
opal_event_add(proct->revstdout->ev, 0);
|
||||||
proct->revstderr->active = true;
|
proct->revstderr->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstderr);
|
||||||
opal_event_add(proct->revstderr->ev, 0);
|
opal_event_add(proct->revstderr->ev, 0);
|
||||||
proct->revstddiag->active = true;
|
proct->revstddiag->active = true;
|
||||||
|
ORTE_POST_OBJECT(proct->revstddiag);
|
||||||
opal_event_add(proct->revstddiag->ev, 0);
|
opal_event_add(proct->revstddiag->ev, 0);
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -367,6 +371,8 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
orte_iof_write_output_t *output;
|
orte_iof_write_output_t *output;
|
||||||
int num_written;
|
int num_written;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(sink);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
|
||||||
"%s orted:stdin:write:handler writing data to %d",
|
"%s orted:stdin:write:handler writing data to %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -400,6 +406,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
* when the fd is ready.
|
* when the fd is ready.
|
||||||
*/
|
*/
|
||||||
wev->pending = true;
|
wev->pending = true;
|
||||||
|
ORTE_POST_OBJECT(wev);
|
||||||
opal_event_add(wev->ev, 0);
|
opal_event_add(wev->ev, 0);
|
||||||
goto CHECK;
|
goto CHECK;
|
||||||
}
|
}
|
||||||
@ -430,6 +437,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
|
|||||||
* when the fd is ready.
|
* when the fd is ready.
|
||||||
*/
|
*/
|
||||||
wev->pending = true;
|
wev->pending = true;
|
||||||
|
ORTE_POST_OBJECT(wev);
|
||||||
opal_event_add(wev->ev, 0);
|
opal_event_add(wev->ev, 0);
|
||||||
goto CHECK;
|
goto CHECK;
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
@ -52,6 +53,8 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
|||||||
int32_t numbytes;
|
int32_t numbytes;
|
||||||
orte_iof_proc_t *proct = (orte_iof_proc_t*)rev->proc;
|
orte_iof_proc_t *proct = (orte_iof_proc_t*)rev->proc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(rev);
|
||||||
|
|
||||||
/* read up to the fragment size */
|
/* read up to the fragment size */
|
||||||
#if !defined(__WINDOWS__)
|
#if !defined(__WINDOWS__)
|
||||||
numbytes = read(fd, data, sizeof(data));
|
numbytes = read(fd, data, sizeof(data));
|
||||||
@ -100,6 +103,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
|||||||
}
|
}
|
||||||
if (!proct->copy) {
|
if (!proct->copy) {
|
||||||
/* re-add the event */
|
/* re-add the event */
|
||||||
|
ORTE_POST_OBJECT(rev);
|
||||||
opal_event_add(rev->ev, 0);
|
opal_event_add(rev->ev, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -137,6 +141,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
|||||||
orte_rml_send_callback, NULL);
|
orte_rml_send_callback, NULL);
|
||||||
|
|
||||||
/* re-add the event */
|
/* re-add the event */
|
||||||
|
ORTE_POST_OBJECT(rev);
|
||||||
opal_event_add(rev->ev, 0);
|
opal_event_add(rev->ev, 0);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -25,6 +25,7 @@
|
|||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
|
|
||||||
#include "orte/util/attr.h"
|
#include "orte/util/attr.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/notifier/base/base.h"
|
#include "orte/mca/notifier/base/base.h"
|
||||||
|
|
||||||
|
|
||||||
@ -38,6 +39,8 @@ void orte_notifier_base_log(int sd, short args, void *cbdata)
|
|||||||
orte_notifier_active_module_t *imod;
|
orte_notifier_active_module_t *imod;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
/* if no modules are active, then there is nothing to do */
|
/* if no modules are active, then there is nothing to do */
|
||||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||||
return;
|
return;
|
||||||
@ -74,6 +77,8 @@ void orte_notifier_base_event(int sd, short args, void *cbdata)
|
|||||||
orte_notifier_active_module_t *imod;
|
orte_notifier_active_module_t *imod;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
/* if no modules are active, then there is nothing to do */
|
/* if no modules are active, then there is nothing to do */
|
||||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||||
return;
|
return;
|
||||||
@ -110,6 +115,8 @@ void orte_notifier_base_report(int sd, short args, void *cbdata)
|
|||||||
orte_notifier_active_module_t *imod;
|
orte_notifier_active_module_t *imod;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
/* if no modules are active, then there is nothing to do */
|
/* if no modules are active, then there is nothing to do */
|
||||||
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
|
||||||
return;
|
return;
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
|
* Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved.
|
||||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -51,6 +51,7 @@
|
|||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
@ -136,6 +137,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
|||||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||||
OPAL_EV_WRITE, orte_notifier_base_log, (_n)); \
|
OPAL_EV_WRITE, orte_notifier_base_log, (_n)); \
|
||||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_n); \
|
||||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -160,6 +162,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
|||||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||||
OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
|
OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
|
||||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_n); \
|
||||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -183,6 +186,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
|
|||||||
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
|
||||||
OPAL_EV_WRITE, orte_notifier_base_event, (_n)); \
|
OPAL_EV_WRITE, orte_notifier_base_event, (_n)); \
|
||||||
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_n); \
|
||||||
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -50,18 +50,10 @@
|
|||||||
/* Static API's */
|
/* Static API's */
|
||||||
static void mylog(orte_notifier_base_severity_t severity, int errcode,
|
static void mylog(orte_notifier_base_severity_t severity, int errcode,
|
||||||
const char *msg, va_list ap);
|
const char *msg, va_list ap);
|
||||||
static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
|
|
||||||
const char *filename,
|
|
||||||
const char *topic, va_list ap);
|
|
||||||
static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
|
|
||||||
orte_process_name_t *peer_proc,
|
|
||||||
const char *msg, va_list ap);
|
|
||||||
|
|
||||||
/* Module */
|
/* Module */
|
||||||
orte_notifier_base_module_t orte_notifier_smtp_module = {
|
orte_notifier_base_module_t orte_notifier_smtp_module = {
|
||||||
NULL,
|
.log = mylog
|
||||||
NULL,
|
|
||||||
mylog,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -48,11 +48,11 @@ static void myreport(orte_notifier_request_t *req);
|
|||||||
|
|
||||||
/* Module def */
|
/* Module def */
|
||||||
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
mylog,
|
.log = mylog,
|
||||||
myevent,
|
.event = myevent,
|
||||||
myreport
|
.report = myreport
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -130,4 +130,3 @@ static void myreport(orte_notifier_request_t *req)
|
|||||||
orte_job_state_to_str(req->state),
|
orte_job_state_to_str(req->state),
|
||||||
(NULL == req->msg) ? "<N/A>" : req->msg);
|
(NULL == req->msg) ? "<N/A>" : req->msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,6 +81,7 @@
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/orted/orted.h"
|
#include "orte/orted/orted.h"
|
||||||
@ -582,6 +583,8 @@ static void timer_cb(int fd, short event, void *cbdata)
|
|||||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||||
orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
|
orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(tm);
|
||||||
|
|
||||||
/* increment the number of retries */
|
/* increment the number of retries */
|
||||||
ll->retries++;
|
ll->retries++;
|
||||||
|
|
||||||
@ -629,6 +632,8 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
|
|||||||
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
||||||
char *full_search;
|
char *full_search;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* thread-protect common values */
|
/* thread-protect common values */
|
||||||
cd->env = opal_argv_copy(app->env);
|
cd->env = opal_argv_copy(app->env);
|
||||||
|
|
||||||
@ -820,6 +825,8 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
|||||||
opal_event_base_t *evb;
|
opal_event_base_t *evb;
|
||||||
char *effective_dir = NULL;
|
char *effective_dir = NULL;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||||
"%s local:launch",
|
"%s local:launch",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
@ -127,6 +127,7 @@
|
|||||||
#include "orte/mca/plm/plm.h"
|
#include "orte/mca/plm/plm.h"
|
||||||
#include "orte/mca/rtc/rtc.h"
|
#include "orte/mca/rtc/rtc.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/odls/base/base.h"
|
#include "orte/mca/odls/base/base.h"
|
||||||
#include "orte/mca/odls/base/odls_private.h"
|
#include "orte/mca/odls/base/odls_private.h"
|
||||||
@ -157,11 +158,11 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
|
|||||||
* Module
|
* Module
|
||||||
*/
|
*/
|
||||||
orte_odls_base_module_t orte_odls_default_module = {
|
orte_odls_base_module_t orte_odls_default_module = {
|
||||||
orte_odls_base_default_get_add_procs_data,
|
.get_add_procs_data = orte_odls_base_default_get_add_procs_data,
|
||||||
orte_odls_default_launch_local_procs,
|
.launch_local_procs = orte_odls_default_launch_local_procs,
|
||||||
orte_odls_default_kill_local_procs,
|
.kill_local_procs = orte_odls_default_kill_local_procs,
|
||||||
orte_odls_default_signal_local_procs,
|
.signal_local_procs = orte_odls_default_signal_local_procs,
|
||||||
orte_odls_default_restart_proc
|
.restart_proc = orte_odls_default_restart_proc
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,9 +42,11 @@
|
|||||||
#include "opal/class/opal_hash_table.h"
|
#include "opal/class/opal_hash_table.h"
|
||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
#include "opal/util/timings.h"
|
#include "opal/util/timings.h"
|
||||||
#include "orte/mca/mca.h"
|
|
||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
|
|
||||||
|
#include "orte/mca/mca.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/oob/oob.h"
|
#include "orte/mca/oob/oob.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
@ -119,11 +121,8 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
|
|||||||
__FILE__, __LINE__); \
|
__FILE__, __LINE__); \
|
||||||
cd = OBJ_NEW(orte_oob_send_t); \
|
cd = OBJ_NEW(orte_oob_send_t); \
|
||||||
cd->msg = (m); \
|
cd->msg = (m); \
|
||||||
opal_event_set(orte_oob_base.ev_base, &cd->ev, -1, \
|
ORTE_THREADSHIFT(cd, orte_oob_base.ev_base, \
|
||||||
OPAL_EV_WRITE, \
|
orte_oob_base_send_nb, ORTE_MSG_PRI); \
|
||||||
orte_oob_base_send_nb, cd); \
|
|
||||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
}while(0)
|
}while(0)
|
||||||
|
|
||||||
/* Our contact info is actually subject to change as transports
|
/* Our contact info is actually subject to change as transports
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/oob/base/base.h"
|
#include "orte/mca/oob/base/base.h"
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
#if OPAL_ENABLE_FT_CR == 1
|
||||||
#include "orte/mca/state/base/base.h"
|
#include "orte/mca/state/base/base.h"
|
||||||
@ -32,7 +32,7 @@ static void process_uri(char *uri);
|
|||||||
void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
|
orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
|
||||||
orte_rml_send_t *msg = cd->msg;
|
orte_rml_send_t *msg;
|
||||||
mca_base_component_list_item_t *cli;
|
mca_base_component_list_item_t *cli;
|
||||||
orte_oob_base_peer_t *pr;
|
orte_oob_base_peer_t *pr;
|
||||||
int rc;
|
int rc;
|
||||||
@ -42,7 +42,10 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
|||||||
bool reachable;
|
bool reachable;
|
||||||
char *uri;
|
char *uri;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* done with this. release it now */
|
/* done with this. release it now */
|
||||||
|
msg = cd->msg;
|
||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||||
@ -303,7 +306,10 @@ OBJ_CLASS_INSTANCE(mca_oob_uri_req_t,
|
|||||||
void orte_oob_base_set_addr(int fd, short args, void *cbdata)
|
void orte_oob_base_set_addr(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
mca_oob_uri_req_t *req = (mca_oob_uri_req_t*)cbdata;
|
mca_oob_uri_req_t *req = (mca_oob_uri_req_t*)cbdata;
|
||||||
char *uri = req->uri;
|
char *uri;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
uri = req->uri;
|
||||||
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||||
"%s: set_addr to uri %s",
|
"%s: set_addr to uri %s",
|
||||||
|
@ -62,6 +62,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/parse_options.h"
|
#include "orte/util/parse_options.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||||
@ -253,6 +254,8 @@ static void recv_handler(int sd, short flg, void *cbdata)
|
|||||||
mca_oob_tcp_hdr_t hdr;
|
mca_oob_tcp_hdr_t hdr;
|
||||||
mca_oob_tcp_peer_t *peer;
|
mca_oob_tcp_peer_t *peer;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(op);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s:tcp:recv:handler called",
|
"%s:tcp:recv:handler called",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
@ -74,6 +74,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/parse_options.h"
|
#include "orte/util/parse_options.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
@ -698,6 +699,9 @@ static void cleanup(int sd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
opal_list_item_t * item;
|
opal_list_item_t * item;
|
||||||
bool *active = (bool*)cbdata;
|
bool *active = (bool*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(active);
|
||||||
|
|
||||||
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
|
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
@ -756,6 +760,7 @@ static void component_shutdown(void)
|
|||||||
opal_event_set(orte_event_base, &ev, -1,
|
opal_event_set(orte_event_base, &ev, -1,
|
||||||
OPAL_EV_WRITE, cleanup, &active);
|
OPAL_EV_WRITE, cleanup, &active);
|
||||||
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
||||||
|
ORTE_POST_OBJECT(active);
|
||||||
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
||||||
ORTE_WAIT_FOR_COMPLETION(active);
|
ORTE_WAIT_FOR_COMPLETION(active);
|
||||||
} else {
|
} else {
|
||||||
@ -1062,6 +1067,8 @@ void mca_oob_tcp_component_set_module(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
orte_oob_base_peer_t *bpr;
|
orte_oob_base_peer_t *bpr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(pop);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:set_module called for peer %s",
|
"%s tcp:set_module called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1093,6 +1100,8 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
|
|||||||
orte_oob_base_peer_t *bpr;
|
orte_oob_base_peer_t *bpr;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(pop);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:lost connection called for peer %s",
|
"%s tcp:lost connection called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1128,6 +1137,8 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
orte_oob_base_peer_t *bpr;
|
orte_oob_base_peer_t *bpr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(mop);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:no route called for peer %s",
|
"%s tcp:no route called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1162,6 +1173,8 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
|||||||
orte_rml_send_t *snd;
|
orte_rml_send_t *snd;
|
||||||
orte_oob_base_peer_t *bpr;
|
orte_oob_base_peer_t *bpr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(mop);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:unknown hop called for peer %s",
|
"%s tcp:unknown hop called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -1235,6 +1248,8 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
|
mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(pop);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:failed_to_connect called for peer %s",
|
"%s tcp:failed_to_connect called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -63,6 +63,7 @@
|
|||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
@ -152,7 +153,7 @@ static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
|
|||||||
void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
|
||||||
mca_oob_tcp_peer_t *peer = op->peer;
|
mca_oob_tcp_peer_t *peer;
|
||||||
int rc;
|
int rc;
|
||||||
opal_socklen_t addrlen = 0;
|
opal_socklen_t addrlen = 0;
|
||||||
mca_oob_tcp_addr_t *addr;
|
mca_oob_tcp_addr_t *addr;
|
||||||
@ -160,6 +161,9 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
|
|||||||
mca_oob_tcp_send_t *snd;
|
mca_oob_tcp_send_t *snd;
|
||||||
bool connected = false;
|
bool connected = false;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(op);
|
||||||
|
peer = op->peer;
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s orte_tcp_peer_try_connect: "
|
"%s orte_tcp_peer_try_connect: "
|
||||||
"attempting to connect to proc %s",
|
"attempting to connect to proc %s",
|
||||||
@ -586,8 +590,9 @@ void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t *peer)
|
|||||||
ORTE_NAME_PRINT(&(peer->name)));
|
ORTE_NAME_PRINT(&(peer->name)));
|
||||||
|
|
||||||
if (!peer->recv_ev_active) {
|
if (!peer->recv_ev_active) {
|
||||||
opal_event_add(&peer->recv_event, 0);
|
|
||||||
peer->recv_ev_active = true;
|
peer->recv_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->recv_event, 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "%s tcp_peer_complete_connect: unable to send connect ack to %s",
|
opal_output(0, "%s tcp_peer_complete_connect: unable to send connect ack to %s",
|
||||||
@ -608,6 +613,8 @@ static int tcp_peer_send_blocking(int sd, void* data, size_t size)
|
|||||||
size_t cnt = 0;
|
size_t cnt = 0;
|
||||||
int retval;
|
int retval;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(ptr);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s send blocking of %"PRIsize_t" bytes to socket %d",
|
"%s send blocking of %"PRIsize_t" bytes to socket %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -949,8 +956,9 @@ static void tcp_peer_connected(mca_oob_tcp_peer_t* peer)
|
|||||||
opal_list_remove_first(&peer->send_queue);
|
opal_list_remove_first(&peer->send_queue);
|
||||||
}
|
}
|
||||||
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
||||||
opal_event_add(&peer->send_event, 0);
|
|
||||||
peer->send_ev_active = true;
|
peer->send_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->send_event, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1214,8 +1222,9 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer)
|
|||||||
|
|
||||||
tcp_peer_connected(peer);
|
tcp_peer_connected(peer);
|
||||||
if (!peer->recv_ev_active) {
|
if (!peer->recv_ev_active) {
|
||||||
opal_event_add(&peer->recv_event, 0);
|
|
||||||
peer->recv_ev_active = true;
|
peer->recv_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->recv_event, 0);
|
||||||
}
|
}
|
||||||
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
if (OOB_TCP_DEBUG_CONNECT <= opal_output_get_verbosity(orte_oob_base_framework.framework_output)) {
|
||||||
mca_oob_tcp_peer_dump(peer, "accepted");
|
mca_oob_tcp_peer_dump(peer, "accepted");
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
#include <sys/socket.h>
|
#include <sys/socket.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "oob_tcp.h"
|
#include "oob_tcp.h"
|
||||||
#include "oob_tcp_peer.h"
|
#include "oob_tcp_peer.h"
|
||||||
|
|
||||||
@ -59,10 +60,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
|||||||
ORTE_NAME_PRINT((&(p)->name))); \
|
ORTE_NAME_PRINT((&(p)->name))); \
|
||||||
cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \
|
cop = OBJ_NEW(mca_oob_tcp_conn_op_t); \
|
||||||
cop->peer = (p); \
|
cop->peer = (p); \
|
||||||
opal_event_set((p)->ev_base, &cop->ev, -1, \
|
ORTE_THREADSHIFT(cop, (p)->ev_base, (cbfunc), ORTE_MSG_PRI); \
|
||||||
OPAL_EV_WRITE, (cbfunc), cop); \
|
|
||||||
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&cop->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_ACCEPT_STATE(s, a, cbfunc) \
|
#define ORTE_ACTIVATE_TCP_ACCEPT_STATE(s, a, cbfunc) \
|
||||||
@ -72,6 +70,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
|||||||
opal_event_set(orte_oob_base.ev_base, &cop->ev, s, \
|
opal_event_set(orte_oob_base.ev_base, &cop->ev, s, \
|
||||||
OPAL_EV_READ, (cbfunc), cop); \
|
OPAL_EV_READ, (cbfunc), cop); \
|
||||||
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(&cop->ev, ORTE_MSG_PRI); \
|
||||||
|
ORTE_POST_OBJECT(cop); \
|
||||||
opal_event_add(&cop->ev, 0); \
|
opal_event_add(&cop->ev, 0); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -88,6 +87,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_conn_op_t);
|
|||||||
opal_event_evtimer_set((p)->ev_base, \
|
opal_event_evtimer_set((p)->ev_base, \
|
||||||
&cop->ev, \
|
&cop->ev, \
|
||||||
(cbfunc), cop); \
|
(cbfunc), cop); \
|
||||||
|
ORTE_POST_OBJECT(cop); \
|
||||||
opal_event_evtimer_add(&cop->ev, (tv)); \
|
opal_event_evtimer_add(&cop->ev, (tv)); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
|
@ -66,6 +66,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/parse_options.h"
|
#include "orte/util/parse_options.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/oob/tcp/oob_tcp.h"
|
#include "orte/mca/oob/tcp/oob_tcp.h"
|
||||||
@ -162,6 +163,7 @@ int orte_oob_tcp_start_listening(void)
|
|||||||
connection_event_handler,
|
connection_event_handler,
|
||||||
0);
|
0);
|
||||||
opal_event_set_priority(&listener->event, ORTE_MSG_PRI);
|
opal_event_set_priority(&listener->event, ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(listener);
|
||||||
opal_event_add(&listener->event, 0);
|
opal_event_add(&listener->event, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -816,6 +818,7 @@ static void* listen_thread(opal_object_t *obj)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* activate the event */
|
/* activate the event */
|
||||||
|
ORTE_POST_OBJECT(pending_connection);
|
||||||
opal_event_active(&pending_connection->ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&pending_connection->ev, OPAL_EV_WRITE, 1);
|
||||||
accepted_connections++;
|
accepted_connections++;
|
||||||
}
|
}
|
||||||
@ -858,6 +861,8 @@ static void connection_handler(int sd, short flags, void* cbdata)
|
|||||||
|
|
||||||
new_connection = (mca_oob_tcp_pending_connection_t*)cbdata;
|
new_connection = (mca_oob_tcp_pending_connection_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(new_connection);
|
||||||
|
|
||||||
opal_output_verbose(4, orte_oob_base_framework.framework_output,
|
opal_output_verbose(4, orte_oob_base_framework.framework_output,
|
||||||
"%s connection_handler: working connection "
|
"%s connection_handler: working connection "
|
||||||
"(%d, %d) %s:%d\n",
|
"(%d, %d) %s:%d\n",
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
|
|
||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
|
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "oob_tcp.h"
|
#include "oob_tcp.h"
|
||||||
#include "oob_tcp_sendrecv.h"
|
#include "oob_tcp_sendrecv.h"
|
||||||
|
|
||||||
@ -87,10 +88,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t);
|
|||||||
if (NULL != proxy) { \
|
if (NULL != proxy) { \
|
||||||
pop->rtmod = strdup(proxy); \
|
pop->rtmod = strdup(proxy); \
|
||||||
} \
|
} \
|
||||||
opal_event_set(orte_oob_base.ev_base, &pop->ev, -1, \
|
ORTE_THREADSHIFT(pop, orte_oob_base.ev_base, \
|
||||||
OPAL_EV_WRITE, (cbfunc), pop); \
|
(cbfunc), ORTE_MSG_PRI); \
|
||||||
opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
#endif /* _MCA_OOB_TCP_PEER_H_ */
|
#endif /* _MCA_OOB_TCP_PEER_H_ */
|
||||||
|
@ -64,6 +64,7 @@
|
|||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
@ -82,7 +83,10 @@
|
|||||||
void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
mca_oob_tcp_send_t *snd = (mca_oob_tcp_send_t*)cbdata;
|
mca_oob_tcp_send_t *snd = (mca_oob_tcp_send_t*)cbdata;
|
||||||
mca_oob_tcp_peer_t *peer = (mca_oob_tcp_peer_t*)snd->peer;
|
mca_oob_tcp_peer_t *peer;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(snd);
|
||||||
|
peer = (mca_oob_tcp_peer_t*)snd->peer;
|
||||||
|
|
||||||
/* if there is no message on-deck, put this one there */
|
/* if there is no message on-deck, put this one there */
|
||||||
if (NULL == peer->send_msg) {
|
if (NULL == peer->send_msg) {
|
||||||
@ -99,8 +103,9 @@ void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
|||||||
} else {
|
} else {
|
||||||
/* ensure the send event is active */
|
/* ensure the send event is active */
|
||||||
if (!peer->send_ev_active) {
|
if (!peer->send_ev_active) {
|
||||||
opal_event_add(&peer->send_event, 0);
|
|
||||||
peer->send_ev_active = true;
|
peer->send_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->send_event, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -196,9 +201,12 @@ static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
|
|||||||
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
|
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
|
||||||
{
|
{
|
||||||
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
|
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
|
||||||
mca_oob_tcp_send_t* msg = peer->send_msg;
|
mca_oob_tcp_send_t* msg;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(peer);
|
||||||
|
msg = peer->send_msg;
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s tcp:send_handler called to send to peer %s",
|
"%s tcp:send_handler called to send to peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -424,6 +432,8 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
orte_rml_send_t *snd;
|
orte_rml_send_t *snd;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(peer);
|
||||||
|
|
||||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||||
"%s:tcp:recv:handler called for peer %s",
|
"%s:tcp:recv:handler called for peer %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -437,8 +447,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
/* we connected! Start the send/recv events */
|
/* we connected! Start the send/recv events */
|
||||||
if (!peer->recv_ev_active) {
|
if (!peer->recv_ev_active) {
|
||||||
opal_event_add(&peer->recv_event, 0);
|
|
||||||
peer->recv_ev_active = true;
|
peer->recv_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->recv_event, 0);
|
||||||
}
|
}
|
||||||
if (peer->timer_ev_active) {
|
if (peer->timer_ev_active) {
|
||||||
opal_event_del(&peer->timer_event);
|
opal_event_del(&peer->timer_event);
|
||||||
@ -449,8 +460,9 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
|
|||||||
peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
|
peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
|
||||||
}
|
}
|
||||||
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
if (NULL != peer->send_msg && !peer->send_ev_active) {
|
||||||
opal_event_add(&peer->send_event, 0);
|
|
||||||
peer->send_ev_active = true;
|
peer->send_ev_active = true;
|
||||||
|
ORTE_POST_OBJECT(peer);
|
||||||
|
opal_event_add(&peer->send_event, 0);
|
||||||
}
|
}
|
||||||
/* update our state */
|
/* update our state */
|
||||||
peer->state = MCA_OOB_TCP_CONNECTED;
|
peer->state = MCA_OOB_TCP_CONNECTED;
|
||||||
|
@ -28,7 +28,7 @@
|
|||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "oob_tcp.h"
|
#include "oob_tcp.h"
|
||||||
#include "oob_tcp_hdr.h"
|
#include "oob_tcp_hdr.h"
|
||||||
|
|
||||||
@ -82,10 +82,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
|||||||
do { \
|
do { \
|
||||||
(s)->peer = (struct mca_oob_tcp_peer_t*)(p); \
|
(s)->peer = (struct mca_oob_tcp_peer_t*)(p); \
|
||||||
(s)->activate = (f); \
|
(s)->activate = (f); \
|
||||||
opal_event_set((p)->ev_base, &(s)->ev, -1, \
|
ORTE_THREADSHIFT((s), (p)->ev_base, \
|
||||||
OPAL_EV_WRITE, mca_oob_tcp_queue_msg, (s)); \
|
mca_oob_tcp_queue_msg, ORTE_MSG_PRI); \
|
||||||
opal_event_set_priority(&(s)->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&(s)->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
/* queue a message to be sent by one of our modules - must
|
/* queue a message to be sent by one of our modules - must
|
||||||
@ -134,7 +132,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
|||||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||||
/* add to the msg queue for this peer */ \
|
/* add to the msg queue for this peer */ \
|
||||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
||||||
}while(0);
|
} while(0)
|
||||||
|
|
||||||
/* queue a message to be sent by one of our modules upon completing
|
/* queue a message to be sent by one of our modules upon completing
|
||||||
* the connection process - must provide the following params:
|
* the connection process - must provide the following params:
|
||||||
@ -182,7 +180,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
|||||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||||
/* add to the msg queue for this peer */ \
|
/* add to the msg queue for this peer */ \
|
||||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, false); \
|
MCA_OOB_TCP_QUEUE_MSG((p), _s, false); \
|
||||||
}while(0);
|
} while(0)
|
||||||
|
|
||||||
/* queue a message for relay by one of our modules - must
|
/* queue a message for relay by one of our modules - must
|
||||||
* provide the following params:
|
* provide the following params:
|
||||||
@ -217,7 +215,7 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
|
|||||||
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
_s->sdbytes = sizeof(mca_oob_tcp_hdr_t); \
|
||||||
/* add to the msg queue for this peer */ \
|
/* add to the msg queue for this peer */ \
|
||||||
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
MCA_OOB_TCP_QUEUE_MSG((p), _s, true); \
|
||||||
}while(0);
|
} while(0)
|
||||||
|
|
||||||
/* State machine for processing message */
|
/* State machine for processing message */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -237,10 +235,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_op_t);
|
|||||||
ORTE_NAME_PRINT(&((ms)->dst))); \
|
ORTE_NAME_PRINT(&((ms)->dst))); \
|
||||||
mop = OBJ_NEW(mca_oob_tcp_msg_op_t); \
|
mop = OBJ_NEW(mca_oob_tcp_msg_op_t); \
|
||||||
mop->msg = (ms); \
|
mop->msg = (ms); \
|
||||||
opal_event_set((ms)->peer->ev_base, &mop->ev, -1, \
|
ORTE_THREADSHIFT(mop, (ms)->peer->ev_base, \
|
||||||
OPAL_EV_WRITE, (cbfunc), mop); \
|
(cbfunc), ORTE_MSG_PRI); \
|
||||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -285,11 +281,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
mop->hop.jobid = (h)->jobid; \
|
mop->hop.jobid = (h)->jobid; \
|
||||||
mop->hop.vpid = (h)->vpid; \
|
mop->hop.vpid = (h)->vpid; \
|
||||||
/* this goes to the OOB framework, so use that event base */ \
|
/* this goes to the OOB framework, so use that event base */ \
|
||||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
|
||||||
OPAL_EV_WRITE, (cbfunc), mop); \
|
(cbfunc), ORTE_MSG_PRI); \
|
||||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
} while(0)
|
||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||||
do { \
|
do { \
|
||||||
@ -305,10 +299,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
mop->hop.vpid = (h)->vpid; \
|
mop->hop.vpid = (h)->vpid; \
|
||||||
/* this goes to the component, so use the framework \
|
/* this goes to the component, so use the framework \
|
||||||
* event base */ \
|
* event base */ \
|
||||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
ORTE_THREADSHIFT(mop, orte_oob_base.ev_base, \
|
||||||
OPAL_EV_WRITE, (c), mop); \
|
(c), ORTE_MSG_PRI); \
|
||||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
} while(0)
|
||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
|
||||||
|
|
||||||
#endif /* _MCA_OOB_TCP_SENDRECV_H_ */
|
#endif /* _MCA_OOB_TCP_SENDRECV_H_ */
|
||||||
|
@ -55,13 +55,14 @@
|
|||||||
#include "opal/mca/installdirs/installdirs.h"
|
#include "opal/mca/installdirs/installdirs.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
#include "opal/util/path.h"
|
#include "opal/util/path.h"
|
||||||
#include "opal/util/basename.h"
|
#include "opal/util/basename.h"
|
||||||
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rmaps/rmaps.h"
|
#include "orte/mca/rmaps/rmaps.h"
|
||||||
@ -187,6 +188,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
char *ltmp;
|
char *ltmp;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
/* if we are launching debugger daemons, then just go
|
/* if we are launching debugger daemons, then just go
|
||||||
* do it - no new daemons will be launched
|
* do it - no new daemons will be launched
|
||||||
*/
|
*/
|
||||||
|
@ -74,6 +74,7 @@
|
|||||||
#include "orte/util/pre_condition_transports.h"
|
#include "orte/util/pre_condition_transports.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/regex.h"
|
#include "orte/util/regex.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/mca/state/base/base.h"
|
#include "orte/mca/state/base/base.h"
|
||||||
#include "orte/util/hostfile/hostfile.h"
|
#include "orte/util/hostfile/hostfile.h"
|
||||||
@ -129,6 +130,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
|||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* if we are not launching, then we just assume that all
|
/* if we are not launching, then we just assume that all
|
||||||
* daemons share our topology */
|
* daemons share our topology */
|
||||||
if (orte_do_not_launch) {
|
if (orte_do_not_launch) {
|
||||||
@ -182,6 +185,8 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* move the state machine along */
|
/* move the state machine along */
|
||||||
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||||
@ -194,6 +199,8 @@ void orte_plm_base_daemons_launched(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* do NOT increment the state - we wait for the
|
/* do NOT increment the state - we wait for the
|
||||||
* daemons to report that they have actually
|
* daemons to report that they have actually
|
||||||
* started before moving to the right state
|
* started before moving to the right state
|
||||||
@ -217,6 +224,8 @@ void orte_plm_base_vm_ready(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* progress the job */
|
/* progress the job */
|
||||||
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
|
caddy->jdata->state = ORTE_JOB_STATE_VM_READY;
|
||||||
|
|
||||||
@ -233,6 +242,8 @@ void orte_plm_base_mapping_complete(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* move the state machine along */
|
/* move the state machine along */
|
||||||
caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
caddy->jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
||||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
|
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_SYSTEM_PREP);
|
||||||
@ -252,6 +263,8 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
|
|||||||
orte_job_t *parent;
|
orte_job_t *parent;
|
||||||
orte_process_name_t name, *nptr;
|
orte_process_name_t name, *nptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:base:setup_job",
|
"%s plm:base:setup_job",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
@ -357,6 +370,8 @@ void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* nothing to do here but move along */
|
/* nothing to do here but move along */
|
||||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
@ -372,6 +387,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
|||||||
int i, rc;
|
int i, rc;
|
||||||
char *serial_number;
|
char *serial_number;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_plm_base_framework.framework_output,
|
opal_output_verbose(5, orte_plm_base_framework.framework_output,
|
||||||
"%s complete_setup on job %s",
|
"%s complete_setup on job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -465,6 +482,8 @@ static void timer_cb(int fd, short event, void *cbdata)
|
|||||||
orte_job_t *jdata = (orte_job_t*)cbdata;
|
orte_job_t *jdata = (orte_job_t*)cbdata;
|
||||||
orte_timer_t *timer=NULL;
|
orte_timer_t *timer=NULL;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(jdata);
|
||||||
|
|
||||||
/* declare launch failed */
|
/* declare launch failed */
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
|
|
||||||
@ -486,6 +505,8 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
|||||||
orte_timer_t *timer;
|
orte_timer_t *timer;
|
||||||
orte_grpcomm_signature_t *sig;
|
orte_grpcomm_signature_t *sig;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* convenience */
|
/* convenience */
|
||||||
jdata = caddy->jdata;
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
@ -587,6 +608,7 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
|||||||
timer->tv.tv_sec = orte_startup_timeout;
|
timer->tv.tv_sec = orte_startup_timeout;
|
||||||
timer->tv.tv_usec = 0;
|
timer->tv.tv_usec = 0;
|
||||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
|
orte_set_attribute(&jdata->attributes, ORTE_JOB_FAILURE_TIMER_EVENT, ORTE_ATTR_LOCAL, timer, OPAL_PTR);
|
||||||
|
ORTE_POST_OBJECT(timer);
|
||||||
opal_event_evtimer_add(timer->ev, &timer->tv);
|
opal_event_evtimer_add(timer->ev, &timer->tv);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -605,6 +627,8 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *answer;
|
opal_buffer_t *answer;
|
||||||
int room, *rmptr;
|
int room, *rmptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* convenience */
|
/* convenience */
|
||||||
jdata = caddy->jdata;
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
@ -720,6 +744,8 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
|||||||
opal_buffer_t *answer;
|
opal_buffer_t *answer;
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* convenience */
|
/* convenience */
|
||||||
jdata = caddy->jdata;
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
* Copyright (c) 2011 IBM Corporation. All rights reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -38,6 +38,7 @@
|
|||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
@ -114,6 +115,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
/* there are no daemons to launch, so just trigger the
|
/* there are no daemons to launch, so just trigger the
|
||||||
* daemon-launch-complete state
|
* daemon-launch-complete state
|
||||||
*/
|
*/
|
||||||
|
@ -66,6 +66,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rmaps/rmaps.h"
|
#include "orte/mca/rmaps/rmaps.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/plm/plm.h"
|
#include "orte/mca/plm/plm.h"
|
||||||
#include "orte/mca/plm/base/base.h"
|
#include "orte/mca/plm/base/base.h"
|
||||||
@ -171,7 +172,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
orte_std_cntr_t nnode;
|
orte_std_cntr_t nnode;
|
||||||
orte_job_t *daemons;
|
orte_job_t *daemons;
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = state->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
jdata = state->jdata;
|
||||||
|
|
||||||
/* start by setting up the virtual machine */
|
/* start by setting up the virtual machine */
|
||||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
|
@ -80,6 +80,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
@ -926,6 +927,8 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
|||||||
pid_t pid;
|
pid_t pid;
|
||||||
orte_plm_rsh_caddy_t *caddy;
|
orte_plm_rsh_caddy_t *caddy;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
|
while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
|
||||||
item = opal_list_remove_first(&launch_list);
|
item = opal_list_remove_first(&launch_list);
|
||||||
if (NULL == item) {
|
if (NULL == item) {
|
||||||
@ -1021,6 +1024,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
orte_namelist_t *child;
|
orte_namelist_t *child;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
/* if we are launching debugger daemons, then just go
|
/* if we are launching debugger daemons, then just go
|
||||||
* do it - no new daemons will be launched
|
* do it - no new daemons will be launched
|
||||||
*/
|
*/
|
||||||
@ -1285,6 +1290,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:rsh: activating launch event",
|
"%s plm:rsh: activating launch event",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
ORTE_POST_OBJECT(state);
|
||||||
opal_event_active(&launch_event, EV_WRITE, 1);
|
opal_event_active(&launch_event, EV_WRITE, 1);
|
||||||
|
|
||||||
/* now that we've launched the daemons, let the daemon callback
|
/* now that we've launched the daemons, let the daemon callback
|
||||||
|
@ -61,6 +61,7 @@
|
|||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
@ -108,7 +109,6 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
|
|||||||
*/
|
*/
|
||||||
static pid_t primary_srun_pid = 0;
|
static pid_t primary_srun_pid = 0;
|
||||||
static bool primary_pid_set = false;
|
static bool primary_pid_set = false;
|
||||||
static bool launching_daemons;
|
|
||||||
static void launch_daemons(int fd, short args, void *cbdata);
|
static void launch_daemons(int fd, short args, void *cbdata);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -189,6 +189,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
orte_job_t *daemons;
|
orte_job_t *daemons;
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:slurm: LAUNCH DAEMONS CALLED",
|
"%s plm:slurm: LAUNCH DAEMONS CALLED",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
@ -545,27 +547,18 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
|||||||
|
|
||||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
|
|
||||||
/* if we are in the launch phase, then any termination is bad */
|
/* abort only if the status returned is non-zero - i.e., if
|
||||||
if (launching_daemons) {
|
* the orteds exited with an error
|
||||||
/* report that one or more daemons failed to launch so we can exit */
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
|
||||||
"%s plm:slurm: daemon failed during launch",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
|
||||||
/* notify the error manager */
|
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
|
||||||
} else {
|
|
||||||
/* if this is after launch, then we need to abort only if the status
|
|
||||||
* returned is non-zero - i.e., if the orteds exited with an error
|
|
||||||
*/
|
*/
|
||||||
if (0 != proc->exit_code) {
|
if (0 != proc->exit_code) {
|
||||||
/* an orted must have died unexpectedly after launch - report
|
/* an orted must have died unexpectedly - report
|
||||||
* that the daemon has failed so we exit
|
* that the daemon has failed so we exit
|
||||||
*/
|
*/
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:slurm: daemon failed while running",
|
"%s plm:slurm: daemon failed while running",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED);
|
||||||
}
|
} else {
|
||||||
/* otherwise, check to see if this is the primary pid */
|
/* otherwise, check to see if this is the primary pid */
|
||||||
if (primary_srun_pid == proc->pid) {
|
if (primary_srun_pid == proc->pid) {
|
||||||
/* in this case, we just want to fire the proper trigger so
|
/* in this case, we just want to fire the proper trigger so
|
||||||
@ -579,6 +572,7 @@ static void srun_wait_cb(orte_proc_t *proc, void* cbdata){
|
|||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* done with this dummy */
|
/* done with this dummy */
|
||||||
OBJ_RELEASE(proc);
|
OBJ_RELEASE(proc);
|
||||||
}
|
}
|
||||||
@ -602,6 +596,13 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
|||||||
free(exec_argv);
|
free(exec_argv);
|
||||||
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
return ORTE_ERR_SYS_LIMITS_CHILDREN;
|
||||||
}
|
}
|
||||||
|
/* if this is the primary launch - i.e., not a comm_spawn of a
|
||||||
|
* child job - then save the pid
|
||||||
|
*/
|
||||||
|
if (0 < srun_pid && !primary_pid_set) {
|
||||||
|
primary_srun_pid = srun_pid;
|
||||||
|
primary_pid_set = true;
|
||||||
|
}
|
||||||
|
|
||||||
/* setup a dummy proc object to track the srun */
|
/* setup a dummy proc object to track the srun */
|
||||||
dummy = OBJ_NEW(orte_proc_t);
|
dummy = OBJ_NEW(orte_proc_t);
|
||||||
@ -692,14 +693,6 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
|
|||||||
sides of the fork... */
|
sides of the fork... */
|
||||||
setpgid(srun_pid, srun_pid);
|
setpgid(srun_pid, srun_pid);
|
||||||
|
|
||||||
/* if this is the primary launch - i.e., not a comm_spawn of a
|
|
||||||
* child job - then save the pid
|
|
||||||
*/
|
|
||||||
if (!primary_pid_set) {
|
|
||||||
primary_srun_pid = srun_pid;
|
|
||||||
primary_pid_set = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(exec_argv);
|
free(exec_argv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -63,6 +63,7 @@
|
|||||||
#include "opal/util/basename.h"
|
#include "opal/util/basename.h"
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
@ -185,6 +186,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
int32_t launchid, *ldptr;
|
int32_t launchid, *ldptr;
|
||||||
char *prefix_dir = NULL;
|
char *prefix_dir = NULL;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
jdata = state->jdata;
|
jdata = state->jdata;
|
||||||
|
|
||||||
/* if we are launching debugger daemons, then just go
|
/* if we are launching debugger daemons, then just go
|
||||||
@ -421,6 +424,8 @@ static void poll_spawns(int fd, short args, void *cbdata)
|
|||||||
int local_err;
|
int local_err;
|
||||||
tm_event_t event;
|
tm_event_t event;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(state);
|
||||||
|
|
||||||
/* TM poll for all the spawns */
|
/* TM poll for all the spawns */
|
||||||
for (i = 0; i < launched; ++i) {
|
for (i = 0; i < launched; ++i) {
|
||||||
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
|
||||||
|
@ -45,6 +45,7 @@
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/comm/comm.h"
|
#include "orte/util/comm/comm.h"
|
||||||
#include "orte/util/error_strings.h"
|
#include "orte/util/error_strings.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
|
|
||||||
@ -115,6 +116,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
|
|||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
char *hosts=NULL;
|
char *hosts=NULL;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
||||||
"%s ras:base:allocate",
|
"%s ras:base:allocate",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
|
|
||||||
#include "orte/mca/rmaps/base/base.h"
|
#include "orte/mca/rmaps/base/base.h"
|
||||||
@ -45,7 +46,7 @@
|
|||||||
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
int rc, i, ppx = 0;
|
int rc, i, ppx = 0;
|
||||||
bool did_map, given, pernode = false;
|
bool did_map, given, pernode = false;
|
||||||
@ -54,6 +55,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
|||||||
orte_vpid_t nprocs;
|
orte_vpid_t nprocs;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
jdata->state = ORTE_JOB_STATE_MAP;
|
jdata->state = ORTE_JOB_STATE_MAP;
|
||||||
|
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
|
|
||||||
@ -87,8 +88,10 @@ static void cleanup(int sd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
volatile bool *active = (volatile bool*)cbdata;
|
volatile bool *active = (volatile bool*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(active);
|
||||||
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
|
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
|
||||||
if (NULL != active) {
|
if (NULL != active) {
|
||||||
|
ORTE_POST_OBJECT(active);
|
||||||
*active = false;
|
*active = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,6 +131,7 @@ static int orte_rml_base_close(void)
|
|||||||
opal_event_set(orte_event_base, &ev, -1,
|
opal_event_set(orte_event_base, &ev, -1,
|
||||||
OPAL_EV_WRITE, cleanup, (void*)&active);
|
OPAL_EV_WRITE, cleanup, (void*)&active);
|
||||||
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
|
||||||
|
ORTE_POST_OBJECT(ev);
|
||||||
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
opal_event_active(&ev, OPAL_EV_WRITE, 1);
|
||||||
ORTE_WAIT_FOR_COMPLETION(active);
|
ORTE_WAIT_FOR_COMPLETION(active);
|
||||||
} else {
|
} else {
|
||||||
@ -243,12 +247,14 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
|
|||||||
{
|
{
|
||||||
orte_rml_recv_cb_t *blob = (orte_rml_recv_cb_t*)cbdata;
|
orte_rml_recv_cb_t *blob = (orte_rml_recv_cb_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(blob);
|
||||||
/* transfer the sender */
|
/* transfer the sender */
|
||||||
blob->name.jobid = sender->jobid;
|
blob->name.jobid = sender->jobid;
|
||||||
blob->name.vpid = sender->vpid;
|
blob->name.vpid = sender->vpid;
|
||||||
/* just copy the payload to the buf */
|
/* just copy the payload to the buf */
|
||||||
opal_dss.copy_payload(&blob->data, buffer);
|
opal_dss.copy_payload(&blob->data, buffer);
|
||||||
/* flag as complete */
|
/* flag as complete */
|
||||||
|
ORTE_POST_OBJECT(blob);
|
||||||
blob->active = false;
|
blob->active = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
@ -57,6 +58,8 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata)
|
|||||||
orte_rml_posted_recv_t *post, *recv;
|
orte_rml_posted_recv_t *post, *recv;
|
||||||
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_rml_base_framework.framework_output,
|
opal_output_verbose(5, orte_rml_base_framework.framework_output,
|
||||||
"%s posting recv",
|
"%s posting recv",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
@ -159,6 +162,8 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
|
|||||||
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(msg);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output,
|
||||||
"%s message received from %s for tag %d",
|
"%s message received from %s for tag %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -30,6 +30,7 @@
|
|||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
|
|
||||||
@ -269,11 +270,7 @@ void orte_rml_API_recv_nb(orte_process_name_t* peer,
|
|||||||
req->post->persistent = persistent;
|
req->post->persistent = persistent;
|
||||||
req->post->cbfunc.iov = cbfunc;
|
req->post->cbfunc.iov = cbfunc;
|
||||||
req->post->cbdata = cbdata;
|
req->post->cbdata = cbdata;
|
||||||
opal_event_set(orte_event_base, &req->ev, -1,
|
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||||
OPAL_EV_WRITE,
|
|
||||||
orte_rml_base_post_recv, req);
|
|
||||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
|
||||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Receive non-blocking buffer message */
|
/** Receive non-blocking buffer message */
|
||||||
@ -300,11 +297,7 @@ void orte_rml_API_recv_buffer_nb(orte_process_name_t* peer,
|
|||||||
req->post->persistent = persistent;
|
req->post->persistent = persistent;
|
||||||
req->post->cbfunc.buffer = cbfunc;
|
req->post->cbfunc.buffer = cbfunc;
|
||||||
req->post->cbdata = cbdata;
|
req->post->cbdata = cbdata;
|
||||||
opal_event_set(orte_event_base, &req->ev, -1,
|
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||||
OPAL_EV_WRITE,
|
|
||||||
orte_rml_base_post_recv, req);
|
|
||||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
|
||||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Cancel posted non-blocking receive */
|
/** Cancel posted non-blocking receive */
|
||||||
@ -316,6 +309,8 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
|
|||||||
"%s rml_recv_cancel for peer %s tag %d",
|
"%s rml_recv_cancel for peer %s tag %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(peer), tag);
|
ORTE_NAME_PRINT(peer), tag);
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
if (!orte_event_base_active) {
|
if (!orte_event_base_active) {
|
||||||
/* no event will be processed any more, so simply return. */
|
/* no event will be processed any more, so simply return. */
|
||||||
return;
|
return;
|
||||||
@ -328,11 +323,7 @@ void orte_rml_API_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
|
|||||||
req->post->peer.jobid = peer->jobid;
|
req->post->peer.jobid = peer->jobid;
|
||||||
req->post->peer.vpid = peer->vpid;
|
req->post->peer.vpid = peer->vpid;
|
||||||
req->post->tag = tag;
|
req->post->tag = tag;
|
||||||
opal_event_set(orte_event_base, &req->ev, -1,
|
ORTE_THREADSHIFT(req, orte_event_base, orte_rml_base_post_recv, ORTE_MSG_PRI);
|
||||||
OPAL_EV_WRITE,
|
|
||||||
orte_rml_base_post_recv, req);
|
|
||||||
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
|
|
||||||
opal_event_active(&req->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Purge information */
|
/** Purge information */
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/oob/base/base.h"
|
#include "orte/mca/oob/base/base.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
@ -39,6 +40,8 @@ static void send_self_exe(int fd, short args, void* data)
|
|||||||
{
|
{
|
||||||
orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data;
|
orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(xfer);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
|
||||||
"%s rml_send_to_self callback executing for tag %d",
|
"%s rml_send_to_self callback executing for tag %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag));
|
||||||
@ -130,9 +133,7 @@ int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod,
|
|||||||
xfer->tag = tag;
|
xfer->tag = tag;
|
||||||
xfer->cbdata = cbdata;
|
xfer->cbdata = cbdata;
|
||||||
/* setup the event for the send callback */
|
/* setup the event for the send callback */
|
||||||
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
|
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
|
||||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
|
||||||
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
|
|
||||||
|
|
||||||
/* copy the message for the recv */
|
/* copy the message for the recv */
|
||||||
rcv = OBJ_NEW(orte_rml_recv_t);
|
rcv = OBJ_NEW(orte_rml_recv_t);
|
||||||
@ -235,9 +236,7 @@ int orte_rml_oob_send_buffer_nb(struct orte_rml_base_module_t *mod,
|
|||||||
xfer->tag = tag;
|
xfer->tag = tag;
|
||||||
xfer->cbdata = cbdata;
|
xfer->cbdata = cbdata;
|
||||||
/* setup the event for the send callback */
|
/* setup the event for the send callback */
|
||||||
opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
|
ORTE_THREADSHIFT(xfer, orte_event_base, send_self_exe, ORTE_MSG_PRI);
|
||||||
opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
|
|
||||||
opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);
|
|
||||||
|
|
||||||
/* copy the message for the recv */
|
/* copy the message for the recv */
|
||||||
rcv = OBJ_NEW(orte_rml_recv_t);
|
rcv = OBJ_NEW(orte_rml_recv_t);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
|
* Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -39,11 +39,9 @@ static void set(orte_job_t *jdata,
|
|||||||
int write_fd);
|
int write_fd);
|
||||||
|
|
||||||
orte_rtc_base_module_t orte_rtc_hwloc_module = {
|
orte_rtc_base_module_t orte_rtc_hwloc_module = {
|
||||||
init,
|
.init = init,
|
||||||
finalize,
|
.finalize = finalize,
|
||||||
NULL,
|
.set = set
|
||||||
set,
|
|
||||||
NULL
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static int init(void)
|
static int init(void)
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/routed.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/state/base/base.h"
|
#include "orte/mca/state/base/base.h"
|
||||||
#include "orte/mca/state/base/state_private.h"
|
#include "orte/mca/state/base/state_private.h"
|
||||||
@ -78,9 +79,7 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
|
|||||||
caddy->job_state = state;
|
caddy->job_state = state;
|
||||||
OBJ_RETAIN(jdata);
|
OBJ_RETAIN(jdata);
|
||||||
}
|
}
|
||||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||||
opal_event_set_priority(&caddy->ev, s->priority);
|
|
||||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -112,9 +111,7 @@ void orte_state_base_activate_job_state(orte_job_t *jdata,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
|
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
|
||||||
orte_job_state_to_str(state), s->priority));
|
orte_job_state_to_str(state), s->priority));
|
||||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||||
opal_event_set_priority(&caddy->ev, s->priority);
|
|
||||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -262,9 +259,7 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
|
|||||||
caddy = OBJ_NEW(orte_state_caddy_t);
|
caddy = OBJ_NEW(orte_state_caddy_t);
|
||||||
caddy->name = *proc;
|
caddy->name = *proc;
|
||||||
caddy->proc_state = state;
|
caddy->proc_state = state;
|
||||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||||
opal_event_set_priority(&caddy->ev, s->priority);
|
|
||||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -293,9 +288,7 @@ void orte_state_base_activate_proc_state(orte_process_name_t *proc,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(proc),
|
ORTE_NAME_PRINT(proc),
|
||||||
orte_proc_state_to_str(state), s->priority));
|
orte_proc_state_to_str(state), s->priority));
|
||||||
opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
|
ORTE_THREADSHIFT(caddy, orte_event_base, s->cbfunc, s->priority);
|
||||||
opal_event_set_priority(&caddy->ev, s->priority);
|
|
||||||
opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_state_base_add_proc_state(orte_proc_state_t state,
|
int orte_state_base_add_proc_state(orte_proc_state_t state,
|
||||||
@ -443,7 +436,10 @@ void orte_state_base_local_launch_complete(int fd, short argc, void *cbdata)
|
|||||||
void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
|
void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||||
"%s state:base:cleanup on job %s",
|
"%s state:base:cleanup on job %s",
|
||||||
@ -460,7 +456,10 @@ void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
|
|||||||
void orte_state_base_report_progress(int fd, short argc, void *cbdata)
|
void orte_state_base_report_progress(int fd, short argc, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||||
@ -659,14 +658,18 @@ static void _send_notification(int status,
|
|||||||
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_process_name_t *proc = &caddy->name;
|
orte_process_name_t *proc;
|
||||||
orte_proc_state_t state = caddy->proc_state;
|
orte_proc_state_t state;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_proc_t *pdata;
|
orte_proc_t *pdata;
|
||||||
int i;
|
int i;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
orte_process_name_t parent, target, *npptr;
|
orte_process_name_t parent, target, *npptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
proc = &caddy->name;
|
||||||
|
state = caddy->proc_state;
|
||||||
|
|
||||||
opal_output_verbose(5, orte_state_base_framework.framework_output,
|
opal_output_verbose(5, orte_state_base_framework.framework_output,
|
||||||
"%s state:base:track_procs called for proc %s state %s",
|
"%s state:base:track_procs called for proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -811,8 +814,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
|||||||
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
orte_proc_t *proc;
|
orte_proc_t *proc;
|
||||||
int i;
|
int i;
|
||||||
orte_std_cntr_t j;
|
orte_std_cntr_t j;
|
||||||
@ -827,6 +829,9 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
|||||||
void *nptr;
|
void *nptr;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||||
"%s state:base:check_job_complete on job %s",
|
"%s state:base:check_job_complete on job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/routed.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
@ -223,6 +224,8 @@ static void init_complete(int sd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* nothing to do here but move along - if it is the
|
/* nothing to do here but move along - if it is the
|
||||||
* daemon job, then next step is allocate */
|
* daemon job, then next step is allocate */
|
||||||
if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||||
@ -249,6 +252,8 @@ static void vm_ready(int fd, short args, void *cbdata)
|
|||||||
int32_t numbytes;
|
int32_t numbytes;
|
||||||
char *nidmap;
|
char *nidmap;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* if this is my job, then we are done */
|
/* if this is my job, then we are done */
|
||||||
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
||||||
/* send the daemon map to every daemon in this DVM - we
|
/* send the daemon map to every daemon in this DVM - we
|
||||||
@ -353,8 +358,7 @@ static void vm_ready(int fd, short args, void *cbdata)
|
|||||||
static void check_complete(int fd, short args, void *cbdata)
|
static void check_complete(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
orte_proc_t *proc;
|
orte_proc_t *proc;
|
||||||
int i;
|
int i;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
@ -362,6 +366,9 @@ static void check_complete(int fd, short args, void *cbdata)
|
|||||||
orte_std_cntr_t index;
|
orte_std_cntr_t index;
|
||||||
char *rtmod;
|
char *rtmod;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||||
"%s state:dvm:check_job_complete on job %s",
|
"%s state:dvm:check_job_complete on job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -472,7 +479,10 @@ static void check_complete(int fd, short args, void *cbdata)
|
|||||||
static void cleanup_job(int sd, short args, void *cbdata)
|
static void cleanup_job(int sd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = caddy->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = caddy->jdata;
|
||||||
|
|
||||||
/* remove this object from the job array */
|
/* remove this object from the job array */
|
||||||
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
#include "orte/mca/rmaps/base/base.h"
|
#include "orte/mca/rmaps/base/base.h"
|
||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/routed.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
|
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
@ -196,12 +197,15 @@ static int finalize(void)
|
|||||||
static void allocation_complete(int fd, short args, void *cbdata)
|
static void allocation_complete(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = state->jdata;
|
orte_job_t *jdata;
|
||||||
orte_job_t *daemons;
|
orte_job_t *daemons;
|
||||||
orte_topology_t *t;
|
orte_topology_t *t;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = state->jdata;
|
||||||
|
|
||||||
jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||||
|
|
||||||
/* get the daemon job object */
|
/* get the daemon job object */
|
||||||
@ -252,7 +256,10 @@ static void allocation_complete(int fd, short args, void *cbdata)
|
|||||||
static void map_complete(int fd, short args, void *cbdata)
|
static void map_complete(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = state->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = state->jdata;
|
||||||
|
|
||||||
jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
jdata->state = ORTE_JOB_STATE_MAP_COMPLETE;
|
||||||
/* move to the map stage */
|
/* move to the map stage */
|
||||||
@ -265,7 +272,10 @@ static void map_complete(int fd, short args, void *cbdata)
|
|||||||
static void vm_ready(int fd, short args, void *cbdata)
|
static void vm_ready(int fd, short args, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||||
orte_job_t *jdata = state->jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
jdata = state->jdata;
|
||||||
|
|
||||||
/* now that the daemons are launched, we are ready
|
/* now that the daemons are launched, we are ready
|
||||||
* to roll
|
* to roll
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/routed.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/orted/pmix/pmix_server_internal.h"
|
#include "orte/orted/pmix/pmix_server_internal.h"
|
||||||
#include "orte/runtime/orte_data_server.h"
|
#include "orte/runtime/orte_data_server.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
@ -165,6 +166,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
|
|||||||
orte_proc_t *child;
|
orte_proc_t *child;
|
||||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
|
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||||
"%s state:orted:track_jobs sending local launch complete for job %s",
|
"%s state:orted:track_jobs sending local launch complete for job %s",
|
||||||
@ -251,8 +254,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
|
|||||||
static void track_procs(int fd, short argc, void *cbdata)
|
static void track_procs(int fd, short argc, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_process_name_t *proc = &caddy->name;
|
orte_process_name_t *proc;
|
||||||
orte_proc_state_t state = caddy->proc_state;
|
orte_proc_state_t state;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_proc_t *pdata, *pptr;
|
orte_proc_t *pdata, *pptr;
|
||||||
opal_buffer_t *alert;
|
opal_buffer_t *alert;
|
||||||
@ -264,6 +267,10 @@ static void track_procs(int fd, short argc, void *cbdata)
|
|||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
orte_process_name_t target;
|
orte_process_name_t target;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
proc = &caddy->name;
|
||||||
|
state = caddy->proc_state;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
|
||||||
"%s state:orted:track_procs called for proc %s state %s",
|
"%s state:orted:track_procs called for proc %s state %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -77,6 +77,7 @@
|
|||||||
#include "orte/mca/rml/base/rml_contact.h"
|
#include "orte/mca/rml/base/rml_contact.h"
|
||||||
#include "orte/util/pre_condition_transports.h"
|
#include "orte/util/pre_condition_transports.h"
|
||||||
#include "orte/util/compress.h"
|
#include "orte/util/compress.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
@ -919,6 +920,7 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
while (orte_event_base_active) {
|
while (orte_event_base_active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
|
|
||||||
/* ensure all local procs are dead */
|
/* ensure all local procs are dead */
|
||||||
orte_odls.kill_local_procs(NULL);
|
orte_odls.kill_local_procs(NULL);
|
||||||
|
@ -68,6 +68,7 @@
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "pmix_server.h"
|
#include "pmix_server.h"
|
||||||
@ -350,6 +351,8 @@ static void _mdxresp(int sd, short args, void *cbdata)
|
|||||||
int rc;
|
int rc;
|
||||||
opal_buffer_t *reply;
|
opal_buffer_t *reply;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
/* check us out of the hotel */
|
/* check us out of the hotel */
|
||||||
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
|
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
|
||||||
|
|
||||||
@ -399,6 +402,8 @@ static void modex_resp(int status,
|
|||||||
pmix_server_req_t *req = (pmix_server_req_t*)cbdata;
|
pmix_server_req_t *req = (pmix_server_req_t*)cbdata;
|
||||||
opal_buffer_t xfer;
|
opal_buffer_t xfer;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
req->status = status;
|
req->status = status;
|
||||||
/* we need to preserve the data as the caller
|
/* we need to preserve the data as the caller
|
||||||
* will free it upon our return */
|
* will free it upon our return */
|
||||||
@ -413,6 +418,7 @@ static void modex_resp(int status,
|
|||||||
opal_event_set(orte_event_base, &(req->ev),
|
opal_event_set(orte_event_base, &(req->ev),
|
||||||
-1, OPAL_EV_WRITE, _mdxresp, req);
|
-1, OPAL_EV_WRITE, _mdxresp, req);
|
||||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(req);
|
||||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||||
}
|
}
|
||||||
static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||||
|
@ -44,6 +44,7 @@
|
|||||||
#include "orte/mca/rmaps/base/base.h"
|
#include "orte/mca/rmaps/base/base.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
|
|
||||||
@ -103,6 +104,8 @@ static void spawn(int sd, short args, void *cbdata)
|
|||||||
opal_buffer_t *buf;
|
opal_buffer_t *buf;
|
||||||
orte_plm_cmd_flag_t command;
|
orte_plm_cmd_flag_t command;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
/* add this request to our tracker hotel */
|
/* add this request to our tracker hotel */
|
||||||
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
|
||||||
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
|
||||||
@ -351,6 +354,8 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
|
|||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* if we failed to get the required data, then just inform
|
/* if we failed to get the required data, then just inform
|
||||||
* the embedded server that the connect cannot succeed */
|
* the embedded server that the connect cannot succeed */
|
||||||
if (ORTE_SUCCESS != status || NULL == data) {
|
if (ORTE_SUCCESS != status || NULL == data) {
|
||||||
@ -402,6 +407,8 @@ static void _cnct(int sd, short args, void *cbdata)
|
|||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
int rc = ORTE_SUCCESS;
|
int rc = ORTE_SUCCESS;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* at some point, we need to add bookeeping to track which
|
/* at some point, we need to add bookeeping to track which
|
||||||
* procs are "connected" so we know who to notify upon
|
* procs are "connected" so we know who to notify upon
|
||||||
* termination or failure. For now, we have to ensure
|
* termination or failure. For now, we have to ensure
|
||||||
@ -477,6 +484,8 @@ static void mdxcbfunc(int status,
|
|||||||
{
|
{
|
||||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* ack the call */
|
/* ack the call */
|
||||||
if (NULL != cd->cbfunc) {
|
if (NULL != cd->cbfunc) {
|
||||||
cd->cbfunc(status, cd->cbdata);
|
cd->cbfunc(status, cd->cbdata);
|
||||||
|
@ -38,6 +38,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
#include "orte/mca/grpcomm/grpcomm.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -59,6 +60,8 @@ static void pmix_server_release(int status, opal_buffer_t *buf, void *cbdata)
|
|||||||
int32_t ndata = 0;
|
int32_t ndata = 0;
|
||||||
int rc = OPAL_SUCCESS;
|
int rc = OPAL_SUCCESS;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* unload the buffer */
|
/* unload the buffer */
|
||||||
if (NULL != buf) {
|
if (NULL != buf) {
|
||||||
rc = opal_dss.unload(buf, (void**)&data, &ndata);
|
rc = opal_dss.unload(buf, (void**)&data, &ndata);
|
||||||
@ -135,6 +138,8 @@ static void dmodex_req(int sd, short args, void *cbdata)
|
|||||||
uint8_t *data=NULL;
|
uint8_t *data=NULL;
|
||||||
int32_t sz=0;
|
int32_t sz=0;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(rq);
|
||||||
|
|
||||||
/* a race condition exists here because of the thread-shift - it is
|
/* a race condition exists here because of the thread-shift - it is
|
||||||
* possible that data for the specified proc arrived while we were
|
* possible that data for the specified proc arrived while we were
|
||||||
* waiting to be serviced. In that case, the tracker that would have
|
* waiting to be serviced. In that case, the tracker that would have
|
||||||
|
@ -43,6 +43,7 @@
|
|||||||
#include "orte/mca/schizo/schizo.h"
|
#include "orte/mca/schizo/schizo.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/plm/plm.h"
|
#include "orte/mca/plm/plm.h"
|
||||||
@ -57,6 +58,8 @@ static void _client_conn(int sd, short args, void *cbdata)
|
|||||||
orte_proc_t *p, *ptr;
|
orte_proc_t *p, *ptr;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
if (NULL != cd->server_object) {
|
if (NULL != cd->server_object) {
|
||||||
/* we were passed back the orte_proc_t */
|
/* we were passed back the orte_proc_t */
|
||||||
p = (orte_proc_t*)cd->server_object;
|
p = (orte_proc_t*)cd->server_object;
|
||||||
@ -106,6 +109,8 @@ static void _client_finalized(int sd, short args, void *cbdata)
|
|||||||
orte_proc_t *p, *ptr;
|
orte_proc_t *p, *ptr;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
if (NULL != cd->server_object) {
|
if (NULL != cd->server_object) {
|
||||||
/* we were passed back the orte_proc_t */
|
/* we were passed back the orte_proc_t */
|
||||||
p = (orte_proc_t*)cd->server_object;
|
p = (orte_proc_t*)cd->server_object;
|
||||||
@ -164,6 +169,8 @@ static void _client_abort(int sd, short args, void *cbdata)
|
|||||||
orte_proc_t *p, *ptr;
|
orte_proc_t *p, *ptr;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
if (NULL != cd->server_object) {
|
if (NULL != cd->server_object) {
|
||||||
p = (orte_proc_t*)cd->server_object;
|
p = (orte_proc_t*)cd->server_object;
|
||||||
} else {
|
} else {
|
||||||
@ -214,6 +221,8 @@ static void _register_events(int sd, short args, void *cbdata)
|
|||||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||||
opal_value_t *info;
|
opal_value_t *info;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* the OPAL layer "owns" the list, but let's deconstruct it
|
/* the OPAL layer "owns" the list, but let's deconstruct it
|
||||||
* here so we don't have to duplicate the data */
|
* here so we don't have to duplicate the data */
|
||||||
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
||||||
@ -246,6 +255,8 @@ static void _deregister_events(int sd, short args, void *cbdata)
|
|||||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||||
opal_value_t *info, *iptr, *nptr;
|
opal_value_t *info, *iptr, *nptr;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
/* the OPAL layer "owns" the list, but let's deconstruct it
|
/* the OPAL layer "owns" the list, but let's deconstruct it
|
||||||
* here for consistency */
|
* here for consistency */
|
||||||
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
while (NULL != (info = (opal_value_t*)opal_list_remove_first(cd->info))) {
|
||||||
@ -281,6 +292,8 @@ static void _notify_release(int status, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
if (NULL != cd->info) {
|
if (NULL != cd->info) {
|
||||||
OPAL_LIST_RELEASE(cd->info);
|
OPAL_LIST_RELEASE(cd->info);
|
||||||
}
|
}
|
||||||
@ -465,6 +478,8 @@ static void _query(int sd, short args, void *cbdata)
|
|||||||
opal_pstats_t pstat;
|
opal_pstats_t pstat;
|
||||||
float pss;
|
float pss;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||||
"%s processing query",
|
"%s processing query",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
@ -654,6 +669,7 @@ int pmix_server_query_fn(opal_process_name_t *requestor,
|
|||||||
opal_event_set(orte_event_base, &(cd->ev), -1,
|
opal_event_set(orte_event_base, &(cd->ev), -1,
|
||||||
OPAL_EV_WRITE, _query, cd);
|
OPAL_EV_WRITE, _query, cd);
|
||||||
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(cd);
|
||||||
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -669,6 +685,8 @@ static void _toolconn(int sd, short args, void *cbdata)
|
|||||||
orte_process_name_t tool;
|
orte_process_name_t tool;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(cd);
|
||||||
|
|
||||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||||
"%s TOOL CONNECTION PROCESSING",
|
"%s TOOL CONNECTION PROCESSING",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
@ -768,6 +786,7 @@ void pmix_tool_connected_fn(opal_list_t *info,
|
|||||||
opal_event_set(orte_event_base, &(cd->ev), -1,
|
opal_event_set(orte_event_base, &(cd->ev), -1,
|
||||||
OPAL_EV_WRITE, _toolconn, cd);
|
OPAL_EV_WRITE, _toolconn, cd);
|
||||||
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(cd->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(cd);
|
||||||
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(cd->ev), OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -43,9 +43,11 @@
|
|||||||
#include "opal/mca/event/event.h"
|
#include "opal/mca/event/event.h"
|
||||||
#include "opal/mca/pmix/pmix.h"
|
#include "opal/mca/pmix/pmix.h"
|
||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
|
#include "opal/sys/atomic.h"
|
||||||
|
|
||||||
#include "orte/mca/grpcomm/base/base.h"
|
#include "orte/mca/grpcomm/base/base.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
@ -119,6 +121,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
|||||||
opal_event_set(orte_event_base, &(_req->ev), \
|
opal_event_set(orte_event_base, &(_req->ev), \
|
||||||
-1, OPAL_EV_WRITE, (cf), _req); \
|
-1, OPAL_EV_WRITE, (cf), _req); \
|
||||||
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_req); \
|
||||||
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -133,6 +136,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
|||||||
opal_event_set(orte_event_base, &(_req->ev), \
|
opal_event_set(orte_event_base, &(_req->ev), \
|
||||||
-1, OPAL_EV_WRITE, (cf), _req); \
|
-1, OPAL_EV_WRITE, (cf), _req); \
|
||||||
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_req); \
|
||||||
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -147,6 +151,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
|||||||
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
||||||
OPAL_EV_WRITE, (fn), _cd); \
|
OPAL_EV_WRITE, (fn), _cd); \
|
||||||
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_cd); \
|
||||||
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
@ -165,6 +170,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
|
|||||||
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
opal_event_set(orte_event_base, &(_cd->ev), -1, \
|
||||||
OPAL_EV_WRITE, (fn), _cd); \
|
OPAL_EV_WRITE, (fn), _cd); \
|
||||||
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
|
||||||
|
ORTE_POST_OBJECT(_cd); \
|
||||||
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
|
@ -39,6 +39,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_data_server.h"
|
#include "orte/runtime/orte_data_server.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -150,6 +151,8 @@ static void execute(int sd, short args, void *cbdata)
|
|||||||
opal_buffer_t *xfer;
|
opal_buffer_t *xfer;
|
||||||
orte_process_name_t *target;
|
orte_process_name_t *target;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(req);
|
||||||
|
|
||||||
if (!orte_pmix_server_globals.pubsub_init) {
|
if (!orte_pmix_server_globals.pubsub_init) {
|
||||||
/* we need to initialize our connection to the server */
|
/* we need to initialize our connection to the server */
|
||||||
if (ORTE_SUCCESS != (rc = init_server())) {
|
if (ORTE_SUCCESS != (rc = init_server())) {
|
||||||
@ -298,6 +301,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
|
|||||||
opal_event_set(orte_event_base, &(req->ev),
|
opal_event_set(orte_event_base, &(req->ev),
|
||||||
-1, OPAL_EV_WRITE, execute, req);
|
-1, OPAL_EV_WRITE, execute, req);
|
||||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(req);
|
||||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
@ -395,6 +399,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
|
|||||||
opal_event_set(orte_event_base, &(req->ev),
|
opal_event_set(orte_event_base, &(req->ev),
|
||||||
-1, OPAL_EV_WRITE, execute, req);
|
-1, OPAL_EV_WRITE, execute, req);
|
||||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(req);
|
||||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
@ -483,6 +488,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
|
|||||||
opal_event_set(orte_event_base, &(req->ev),
|
opal_event_set(orte_event_base, &(req->ev),
|
||||||
-1, OPAL_EV_WRITE, execute, req);
|
-1, OPAL_EV_WRITE, execute, req);
|
||||||
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
opal_event_set_priority(&(req->ev), ORTE_MSG_PRI);
|
||||||
|
ORTE_POST_OBJECT(req);
|
||||||
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
opal_event_active(&(req->ev), OPAL_EV_WRITE, 1);
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
|
@ -54,6 +54,7 @@
|
|||||||
|
|
||||||
#include "orte/util/session_dir.h"
|
#include "orte/util/session_dir.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
@ -75,6 +76,8 @@ void orte_quit(int fd, short args, void *cbdata)
|
|||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(caddy);
|
||||||
|
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
if (NULL != caddy) {
|
if (NULL != caddy) {
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
@ -135,6 +138,7 @@ void orte_quit(int fd, short args, void *cbdata)
|
|||||||
* so we will exit
|
* so we will exit
|
||||||
*/
|
*/
|
||||||
orte_event_base_active = false;
|
orte_event_base_active = false;
|
||||||
|
ORTE_POST_OBJECT(orte_event_base_active);
|
||||||
/* break out of the event loop */
|
/* break out of the event loop */
|
||||||
opal_event_base_loopbreak(orte_event_base);
|
opal_event_base_loopbreak(orte_event_base);
|
||||||
}
|
}
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||||
* et Automatique. All rights reserved.
|
* et Automatique. All rights reserved.
|
||||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -60,6 +60,7 @@
|
|||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
@ -188,6 +189,8 @@ static void cancel_callback(int fd, short args, void *cbdata)
|
|||||||
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
|
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
|
||||||
orte_wait_tracker_t *t2;
|
orte_wait_tracker_t *t2;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(trk);
|
||||||
|
|
||||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||||
if (t2->child == trk->child) {
|
if (t2->child == trk->child) {
|
||||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||||
@ -214,9 +217,7 @@ void orte_wait_cb_cancel(orte_proc_t *child)
|
|||||||
trk = OBJ_NEW(orte_wait_tracker_t);
|
trk = OBJ_NEW(orte_wait_tracker_t);
|
||||||
OBJ_RETAIN(child); // protect against race conditions
|
OBJ_RETAIN(child); // protect against race conditions
|
||||||
trk->child = child;
|
trk->child = child;
|
||||||
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, cancel_callback, trk);
|
ORTE_THREADSHIFT(trk, orte_event_base, cancel_callback, ORTE_SYS_PRI);
|
||||||
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
|
|
||||||
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -228,6 +229,8 @@ static void wait_signal_callback(int fd, short event, void *arg)
|
|||||||
pid_t pid;
|
pid_t pid;
|
||||||
orte_wait_tracker_t *t2;
|
orte_wait_tracker_t *t2;
|
||||||
|
|
||||||
|
ORTE_ACQUIRE_OBJECT(signal);
|
||||||
|
|
||||||
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) {
|
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* et Automatique. All rights reserved.
|
* et Automatique. All rights reserved.
|
||||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -48,6 +48,7 @@
|
|||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
@ -95,6 +96,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
|||||||
struct timespec tp = {0, 100000}; \
|
struct timespec tp = {0, 100000}; \
|
||||||
nanosleep(&tp, NULL); \
|
nanosleep(&tp, NULL); \
|
||||||
} \
|
} \
|
||||||
|
ORTE_ACQUIRE_OBJECT(flg); \
|
||||||
}while(0);
|
}while(0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -135,6 +137,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
|||||||
"defining timeout: %ld sec %ld usec at %s:%d", \
|
"defining timeout: %ld sec %ld usec at %s:%d", \
|
||||||
(long)tmp->tv.tv_sec, (long)tmp->tv.tv_usec, \
|
(long)tmp->tv.tv_sec, (long)tmp->tv.tv_usec, \
|
||||||
__FILE__, __LINE__)); \
|
__FILE__, __LINE__)); \
|
||||||
|
ORTE_POST_OBJECT(tmp); \
|
||||||
opal_event_evtimer_add(tmp->ev, &tmp->tv); \
|
opal_event_evtimer_add(tmp->ev, &tmp->tv); \
|
||||||
}while(0); \
|
}while(0); \
|
||||||
|
|
||||||
@ -161,6 +164,7 @@ ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
|||||||
"defining timer event: %ld sec %ld usec at %s:%d", \
|
"defining timer event: %ld sec %ld usec at %s:%d", \
|
||||||
(long)tm->tv.tv_sec, (long)tm->tv.tv_usec, \
|
(long)tm->tv.tv_sec, (long)tm->tv.tv_usec, \
|
||||||
__FILE__, __LINE__)); \
|
__FILE__, __LINE__)); \
|
||||||
|
ORTE_POST_OBJECT(tm); \
|
||||||
opal_event_evtimer_add(tm->ev, &tm->tv); \
|
opal_event_evtimer_add(tm->ev, &tm->tv); \
|
||||||
}while(0); \
|
}while(0); \
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
|
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
|
||||||
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
|
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
|
||||||
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
|
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
|
||||||
mapper reducer opal_hotel orte_dfs ulfm pmixtool
|
mapper reducer opal_hotel orte_dfs ulfm pmixtool threads
|
||||||
|
|
||||||
all: $(PROGS)
|
all: $(PROGS)
|
||||||
|
|
||||||
@ -19,3 +19,6 @@ oob_stress:
|
|||||||
|
|
||||||
pmixtool:
|
pmixtool:
|
||||||
ortecc -o pmixtool pmixtool.c -lpmix
|
ortecc -o pmixtool pmixtool.c -lpmix
|
||||||
|
|
||||||
|
# Build the thread-shift memory-consistency test.  Libraries are listed after
# the source file (as the pmixtool rule does) so that linkers which resolve
# symbols left-to-right, or run with --as-needed, still pull them in.
threads:
	ortecc -O0 -g threads.c -o threads -lpthread -lhwloc
|
||||||
|
335
orte/test/system/threads.c
Обычный файл
335
orte/test/system/threads.c
Обычный файл
@ -0,0 +1,335 @@
|
|||||||
|
/*
|
||||||
|
* Test program for memory consistency in a thread shifting design
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Run:
|
||||||
|
* ./threads ITERATIONS [MODE]
|
||||||
|
* ./threads 9000000 3
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* ./threads 9000000 0 --> Will fail, no memory barriers
|
||||||
|
* ./threads 9000000 1 --> Will fail, no WMB
|
||||||
|
* ./threads 9000000 2 --> Will fail, no RMB
|
||||||
|
* ./threads 9000000 3 --> Success
|
||||||
|
* ./threads 9000000 4 --> Success
|
||||||
|
* ./threads 9000000 5 --> N/A
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>

#include <hwloc.h>

#include "opal/sys/atomic.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Max value for an int16_t; the test payload is reduced modulo this value
 * before being stored in data.int16. */
#define MAX_VAL 32767

/*
 * Value object that the peer thread allocates and fills in, and that the
 * main thread then inspects.  'type' is a tag, 'data' is a union of the
 * scalar payloads; this test only ever writes and reads data.int16.
 */
typedef struct {
    int type;
    union {
        bool    flag;
        int     integer;
        int8_t  int8;
        int16_t int16;
        int32_t int32;
        int64_t int64;
    } data;
} my_value_t;
|
||||||
|
|
||||||
|
/*
 * Structure to handoff work to the peer thread.
 *
 * The main thread stores a pointer in 'ptr' and then sets 'working' to true;
 * the peer thread polls 'working', services the request, and sets it back to
 * false to release the main thread.
 */
typedef struct {
    volatile bool working;  /* true while a request is outstanding */
    void *ptr;              /* Note that adding a volatile here has no effect */
} thread_handoff_t;

/* Shared object to handoff work */
thread_handoff_t handoff;

/* Indicates if the test has finished (set by main, polled by the peer) */
bool time_to_stop = false;
|
||||||
|
|
||||||
|
/*
 * Progress reporting: a status line is printed each time the completed
 * percentage passes perc_report_after, which then advances by PERC_INC.
 */
#define PERC_INC 10.0
double perc_report_after = PERC_INC;
double perc_current      = 0.0;

/*
 * Memory barrier modes.  These are bit flags tested against mb_mode to
 * decide which opal_atomic_*mb() calls are issued around the handoff.
 * MB_MODE_XMB is selectable and shown in the banner, but no barrier in this
 * file is keyed on it.
 */
#define MB_MODE_NONE 0x0
#define MB_MODE_RMB  0x1
#define MB_MODE_WMB  0x2
#define MB_MODE_MB   0x4
#define MB_MODE_XMB  0x8
#define MB_MODE_ALL  (MB_MODE_RMB | MB_MODE_WMB)
int mb_mode = MB_MODE_ALL;
|
||||||
|
|
||||||
|
|
||||||
|
// Shared hwloc topology (so we only have to read it once)
|
||||||
|
static hwloc_topology_t topo;
|
||||||
|
// Which object we are binding to
|
||||||
|
// 4 - sockets with 5 cores each
|
||||||
|
// 20 - cores with 8 PUs each
|
||||||
|
//#define OBJ_TYPE HWLOC_OBJ_SOCKET
|
||||||
|
#define OBJ_TYPE HWLOC_OBJ_CORE
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Some basic timing support
|
||||||
|
*/
|
||||||
|
double acc_time, start_time, stop_time, delta;
|
||||||
|
static double get_ts_gettimeofday(void) {
|
||||||
|
double ret;
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
ret = tv.tv_sec;
|
||||||
|
ret += (double)tv.tv_usec / 1000000.0;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Bind either the main or support thread far away from each other
|
||||||
|
*/
|
||||||
|
void bind_me_to(bool main_thread);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Support thread to do the memory allocation and xfer
|
||||||
|
*/
|
||||||
|
void *value_xfer_thread(void *arg);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Main thread
|
||||||
|
*/
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
pthread_t support_thread;
|
||||||
|
int rc, i, max_iters = 10, cur_iter;
|
||||||
|
my_value_t *val = NULL;
|
||||||
|
int mode;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parse command line arguments
|
||||||
|
*/
|
||||||
|
if( argc > 1 ) {
|
||||||
|
max_iters = atoi(argv[1]);
|
||||||
|
}
|
||||||
|
if( argc > 2 ) {
|
||||||
|
mode = atoi(argv[2]);
|
||||||
|
if( 0 > mode || mode > 5 ) {
|
||||||
|
printf("Error: Invalid mode %d\n"
|
||||||
|
"\tNone = 0\n"
|
||||||
|
"\tRMB = 1\n"
|
||||||
|
"\tWMB = 2\n"
|
||||||
|
"\tBoth = 3\n"
|
||||||
|
"\tMB Only = 4\n",
|
||||||
|
"\tXMB Only = 5\n",
|
||||||
|
mode);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
mode = 3;
|
||||||
|
}
|
||||||
|
switch(mode) {
|
||||||
|
case 0:
|
||||||
|
mb_mode = MB_MODE_NONE;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
mb_mode = MB_MODE_RMB;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
mb_mode = MB_MODE_WMB;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
mb_mode = MB_MODE_ALL;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
mb_mode = MB_MODE_MB;
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
mb_mode = MB_MODE_XMB;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load hwloc topology
|
||||||
|
hwloc_topology_init(&topo);
|
||||||
|
hwloc_topology_load(topo);
|
||||||
|
|
||||||
|
// Display banner
|
||||||
|
printf("---------------------------\n");
|
||||||
|
printf("Iterations: %10d\n", max_iters);
|
||||||
|
printf("Mode R MB : %10s\n", (mb_mode & MB_MODE_RMB ? "Enabled" : "Disabled") );
|
||||||
|
printf("Mode W MB : %10s\n", (mb_mode & MB_MODE_WMB ? "Enabled" : "Disabled") );
|
||||||
|
printf("Mode - MB : %10s\n", (mb_mode & MB_MODE_MB ? "Enabled" : "Disabled") );
|
||||||
|
printf("Mode X MB : %10s\n", (mb_mode & MB_MODE_XMB ? "Enabled" : "Disabled") );
|
||||||
|
printf("---------------------------\n");
|
||||||
|
|
||||||
|
bind_me_to(true);
|
||||||
|
handoff.working = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Launch supporting thread
|
||||||
|
*/
|
||||||
|
rc = pthread_create(&support_thread, NULL, value_xfer_thread, NULL);
|
||||||
|
if( 0 != rc ) {
|
||||||
|
printf("Error: Failed to create a thread! %d\n", rc);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Main work loop
|
||||||
|
*/
|
||||||
|
acc_time = 0.0;
|
||||||
|
for(cur_iter = 0; cur_iter < max_iters; ++cur_iter) {
|
||||||
|
perc_current = (cur_iter / ((double)max_iters)) * 100.0;
|
||||||
|
if( perc_current > perc_report_after ) {
|
||||||
|
delta = (acc_time / cur_iter) * 1000000;
|
||||||
|
printf("%6.1f %% complete : Iteration %10d / %10d : %6.1f usec / iter\n",
|
||||||
|
perc_current, cur_iter+1, max_iters, delta);
|
||||||
|
perc_report_after += PERC_INC;
|
||||||
|
}
|
||||||
|
|
||||||
|
start_time = get_ts_gettimeofday();
|
||||||
|
// Initialize values
|
||||||
|
val = NULL;
|
||||||
|
handoff.ptr = &val;
|
||||||
|
if( mb_mode & MB_MODE_RMB ) {
|
||||||
|
opal_atomic_rmb();
|
||||||
|
}
|
||||||
|
if( mb_mode & MB_MODE_MB ) {
|
||||||
|
opal_atomic_mb();
|
||||||
|
}
|
||||||
|
handoff.working = true;
|
||||||
|
|
||||||
|
// Wait for work to finish
|
||||||
|
while( handoff.working ) {
|
||||||
|
usleep(1);
|
||||||
|
}
|
||||||
|
if( mb_mode & MB_MODE_WMB ) {
|
||||||
|
opal_atomic_wmb();
|
||||||
|
}
|
||||||
|
if( mb_mode & MB_MODE_MB ) {
|
||||||
|
opal_atomic_mb();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Inspect values for correctness
|
||||||
|
if( NULL == val ) {
|
||||||
|
printf("[%10d / %10d] Error: val = %s\n", cur_iter+1, max_iters,
|
||||||
|
(NULL == val ? "NULL" : "Valid") );
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
else if( 999 != val->type ) {
|
||||||
|
printf("[%10d / %10d] Error: val->type = %d\n", cur_iter+1, max_iters, val->type);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
else if( (cur_iter+1)%MAX_VAL != val->data.int16 ) {
|
||||||
|
printf("[%10d / %10d] Error: val->data.int16 = %d\n", cur_iter+1, max_iters, val->data.int16);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_time = get_ts_gettimeofday();
|
||||||
|
acc_time += (stop_time - start_time);
|
||||||
|
|
||||||
|
// Yes, this is a memory leak!
|
||||||
|
// I need to make sure that the supporting thread is not reusing a
|
||||||
|
// previous storage location when it calls malloc. This is to emulate
|
||||||
|
// a program that calls malloc after the value was acquired, possibly
|
||||||
|
// reusing this memory location.
|
||||||
|
//free(val);
|
||||||
|
val = NULL;
|
||||||
|
}
|
||||||
|
delta = (acc_time / max_iters) * 1000000;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All done - Cleanup
|
||||||
|
*/
|
||||||
|
time_to_stop = true;
|
||||||
|
|
||||||
|
rc = pthread_join(support_thread, NULL);
|
||||||
|
if( 0 != rc ) {
|
||||||
|
printf("Error: Failed to join a thread! %d\n", rc);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
hwloc_topology_destroy(topo);
|
||||||
|
|
||||||
|
printf("Success - %6.1f usec / iter\n", delta);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *value_xfer_thread(void *arg) {
|
||||||
|
my_value_t **val = NULL;
|
||||||
|
static int var = 0;
|
||||||
|
|
||||||
|
// Bind this thread away from the main thread
|
||||||
|
bind_me_to(false);
|
||||||
|
|
||||||
|
while( !time_to_stop ) {
|
||||||
|
if( handoff.working ) {
|
||||||
|
// Make sure I have the right pointer
|
||||||
|
if( mb_mode & MB_MODE_WMB ) {
|
||||||
|
opal_atomic_wmb();
|
||||||
|
}
|
||||||
|
if( mb_mode & MB_MODE_MB ) {
|
||||||
|
opal_atomic_mb();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate and set the value
|
||||||
|
val = (my_value_t**)handoff.ptr;
|
||||||
|
(*val) = malloc(sizeof(my_value_t));
|
||||||
|
(*val)->type = 999;
|
||||||
|
(*val)->data.int16 = (++var)%MAX_VAL;
|
||||||
|
|
||||||
|
// Make sure main thread can see the value
|
||||||
|
// See 'Examples' -> 'Global thread flag' discussion here:
|
||||||
|
// https://www.ibm.com/developerworks/systems/articles/powerpc.html
|
||||||
|
if( mb_mode & MB_MODE_RMB ) {
|
||||||
|
opal_atomic_rmb();
|
||||||
|
}
|
||||||
|
if( mb_mode & MB_MODE_MB ) {
|
||||||
|
opal_atomic_mb();
|
||||||
|
}
|
||||||
|
// Release main thread
|
||||||
|
handoff.working = false;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// wait for work
|
||||||
|
usleep(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pthread_exit(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void bind_me_to(bool main_thread) {
|
||||||
|
int num_objs;
|
||||||
|
hwloc_cpuset_t set;
|
||||||
|
char *buffer = NULL;
|
||||||
|
hwloc_obj_t obj;
|
||||||
|
|
||||||
|
num_objs = hwloc_get_nbobjs_by_type(topo, OBJ_TYPE);
|
||||||
|
|
||||||
|
if( main_thread ) {
|
||||||
|
obj = hwloc_get_obj_by_type(topo, OBJ_TYPE, 0);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
obj = hwloc_get_obj_by_type(topo, OBJ_TYPE, num_objs-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( obj->type == OBJ_TYPE ) {
|
||||||
|
hwloc_set_cpubind(topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
printf("Error: Invalid object\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
set = hwloc_bitmap_alloc();
|
||||||
|
hwloc_get_cpubind(topo, set, HWLOC_CPUBIND_THREAD);
|
||||||
|
hwloc_bitmap_asprintf(&buffer, set);
|
||||||
|
printf("%s : [objs = %d] : cpuset is %s\n", (main_thread ? "Main" : "Peer"), num_objs, buffer);
|
||||||
|
free(buffer);
|
||||||
|
hwloc_bitmap_free(set);
|
||||||
|
}
|
@ -84,6 +84,7 @@
|
|||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/orted/orted.h"
|
#include "orte/orted/orted.h"
|
||||||
|
|
||||||
@ -490,6 +491,7 @@ int main(int argc, char *argv[])
|
|||||||
while (orte_event_base_active) {
|
while (orte_event_base_active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
|
|
||||||
/* cleanup and leave */
|
/* cleanup and leave */
|
||||||
orte_finalize();
|
orte_finalize();
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -54,6 +54,7 @@
|
|||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/orted/orted.h"
|
#include "orte/orted/orted.h"
|
||||||
@ -283,6 +284,7 @@ int main(int argc, char *argv[])
|
|||||||
while (orte_event_base_active) {
|
while (orte_event_base_active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
|
|
||||||
/* should never get here, but if we do... */
|
/* should never get here, but if we do... */
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -58,6 +58,7 @@
|
|||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/mca/rml/base/rml_contact.h"
|
#include "orte/mca/rml/base/rml_contact.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
#include "orte/runtime/orte_quit.h"
|
||||||
@ -532,6 +533,7 @@ SEND:
|
|||||||
while (orte_event_base_active) {
|
while (orte_event_base_active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
|
|
||||||
/***************
|
/***************
|
||||||
* Cleanup
|
* Cleanup
|
||||||
|
@ -87,6 +87,7 @@
|
|||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/util/threads.h"
|
||||||
|
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
@ -198,6 +199,7 @@ int orterun(int argc, char *argv[])
|
|||||||
while (orte_event_base_active && launchst.active) {
|
while (orte_event_base_active && launchst.active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
opal_output(0, "Job %s has launched",
|
opal_output(0, "Job %s has launched",
|
||||||
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
|
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
|
||||||
@ -209,6 +211,7 @@ int orterun(int argc, char *argv[])
|
|||||||
while (orte_event_base_active && completest.active) {
|
while (orte_event_base_active && completest.active) {
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
}
|
}
|
||||||
|
ORTE_ACQUIRE_OBJECT(orte_event_base_active);
|
||||||
|
|
||||||
if (ORTE_PROC_IS_HNP) {
|
if (ORTE_PROC_IS_HNP) {
|
||||||
/* ensure all local procs are dead */
|
/* ensure all local procs are dead */
|
||||||
|
@ -60,7 +60,8 @@ headers += \
|
|||||||
util/regex.h \
|
util/regex.h \
|
||||||
util/attr.h \
|
util/attr.h \
|
||||||
util/listener.h \
|
util/listener.h \
|
||||||
util/compress.h
|
util/compress.h \
|
||||||
|
util/threads.h
|
||||||
|
|
||||||
lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
|
lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
|
||||||
util/error_strings.c \
|
util/error_strings.c \
|
||||||
|
38
orte/util/threads.h
Обычный файл
38
orte/util/threads.h
Обычный файл
@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef ORTE_THREADS_H
|
||||||
|
#define ORTE_THREADS_H
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
|
||||||
|
#include "opal/sys/atomic.h"
|
||||||
|
|
||||||
|
/* provide macros for forward-proofing the shifting
|
||||||
|
* of objects between threads - at some point, we
|
||||||
|
* may revamp our threading model */
|
||||||
|
|
||||||
|
/* Post an object to another thread.  For now this is only a write memory
 * barrier: the macro expands to opal_atomic_wmb() and its argument 'o' is
 * not evaluated. */
#define ORTE_POST_OBJECT(o)     opal_atomic_wmb()
|
||||||
|
|
||||||
|
/* Acquire an object from another thread.  For now this is only a read memory
 * barrier: the macro expands to opal_atomic_rmb() and its argument 'o' is
 * not evaluated. */
#define ORTE_ACQUIRE_OBJECT(o)  opal_atomic_rmb()
|
||||||
|
|
||||||
|
/*
 * Threadshift macro: hand object 'x' to callback 'f' on event base 'eb'.
 *
 *   x   pointer to an object with an 'ev' event member; also passed to the
 *       callback as its data argument (expanded several times, so it should
 *       be free of side effects)
 *   eb  event base on which the event is set
 *   f   callback given to opal_event_set()
 *   p   priority given to opal_event_set_priority()
 *
 * The object is posted with ORTE_POST_OBJECT() before the event is activated.
 * The opal_event_* calls are not declared by this header; presumably the
 * including file provides them.
 */
#define ORTE_THREADSHIFT(x, eb, f, p)                                   \
    do {                                                                \
        opal_event_set((eb), &((x)->ev), -1, OPAL_EV_WRITE, (f), (x));  \
        opal_event_set_priority(&((x)->ev), (p));                       \
        ORTE_POST_OBJECT((x));                                          \
        opal_event_active(&((x)->ev), OPAL_EV_WRITE, 1);                \
    } while(0)
|
||||||
|
|
||||||
|
#endif /* ORTE_THREADS_H */
|
Загрузка…
x
Ссылка в новой задаче
Block a user