2004-09-26 21:43:35 +04:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2006-08-23 07:32:36 +04:00
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-09-26 21:43:35 +04:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Interface for waitpid / async notification of child death with the
|
|
|
|
* libevent runtime system.
|
|
|
|
*/
|
2005-03-14 23:57:21 +03:00
|
|
|
#ifndef ORTE_WAIT_H
|
|
|
|
#define ORTE_WAIT_H
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#include "orte_config.h"
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2004-10-20 05:03:09 +04:00
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
2004-09-26 21:43:35 +04:00
|
|
|
#include <sys/types.h>
|
2004-10-20 05:03:09 +04:00
|
|
|
#endif
|
2008-02-28 04:57:57 +03:00
|
|
|
#ifdef HAVE_SYS_TIME_H
|
|
|
|
#include <sys/time.h>
|
2006-08-23 07:32:36 +04:00
|
|
|
#endif
|
|
|
|
|
2008-02-28 22:58:32 +03:00
|
|
|
#include "opal/dss/dss.h"
|
|
|
|
#include "opal/class/opal_list.h"
|
2008-06-09 18:53:58 +04:00
|
|
|
#include "orte/util/show_help.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "opal/event/event.h"
|
2008-02-28 22:58:32 +03:00
|
|
|
#include "opal/runtime/opal_progress.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2008-02-28 22:58:32 +03:00
|
|
|
#include "orte/mca/rml/rml_types.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
2004-09-26 21:43:35 +04:00
|
|
|
/** typedef for callback function used in \c orte_wait_cb */
|
2005-03-14 23:57:21 +03:00
|
|
|
typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Wait for process termination
|
|
|
|
*
|
2005-03-14 23:57:21 +03:00
|
|
|
* Similar to \c waitpid, \c orte_waitpid utilizes the run-time
|
2004-09-26 21:43:35 +04:00
|
|
|
* event library for process termination notification. The \c
|
|
|
|
* WUNTRACED option is not supported, but the \c WNOHANG option is
|
|
|
|
* supported.
|
|
|
|
*
|
|
|
|
* \note A \c wpid value of \c -1 is not currently supported and will
|
|
|
|
* return an error.
|
|
|
|
*/
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_DECLSPEC pid_t orte_waitpid(pid_t wpid, int *status, int options);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Register a callback for process termination
|
|
|
|
*
|
|
|
|
* Register a callback for notification when \c wpid causes a SIGCHLD.
|
|
|
|
* \c waitpid() will have already been called on the process at this
|
|
|
|
* time.
|
|
|
|
*
|
|
|
|
* If a thread is already blocked in \c orte_waitpid for \c wpid,
|
2006-02-12 04:33:29 +03:00
|
|
|
* this function will return \c ORTE_ERR_EXISTS. It is illegal for
|
2005-03-18 06:43:59 +03:00
|
|
|
* multiple callbacks to be registered for a single \c wpid
|
|
|
|
* (\c ORTE_ERR_EXISTS will be returned in this case).
|
2004-09-26 21:43:35 +04:00
|
|
|
*
|
|
|
|
* \warning It is not legal for \c wpid to be -1 when registering a
|
|
|
|
* callback.
|
|
|
|
*/
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_DECLSPEC int orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
/**
 * NOTE(review): this entry point is undocumented in the header.
 * Presumably it removes the callback previously registered for \c wpid
 * with \c orte_wait_cb -- confirm against the implementation, including
 * the meaning of the int return value.
 */
ORTE_DECLSPEC int orte_wait_cb_cancel(pid_t wpid);
|
2004-10-27 02:11:03 +04:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
/**
 * NOTE(review): this entry point is undocumented in the header.
 * Presumably it suspends delivery of callbacks registered with
 * \c orte_wait_cb until \c orte_wait_cb_enable is called -- confirm
 * against the implementation.
 */
ORTE_DECLSPEC int orte_wait_cb_disable(void);
|
2004-10-27 02:11:03 +04:00
|
|
|
|
2006-08-23 07:32:36 +04:00
|
|
|
/**
 * NOTE(review): this entry point is undocumented in the header.
 * Presumably it resumes delivery of callbacks after
 * \c orte_wait_cb_disable -- confirm against the implementation.
 */
ORTE_DECLSPEC int orte_wait_cb_enable(void);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/**
|
|
|
|
* Setup to wait for an event
|
|
|
|
*
|
|
|
|
* This function is used to setup a pipe that can be used elsewhere
|
|
|
|
* in the code base where we want to wait for some event to
|
|
|
|
* happen. For example, orterun uses this function to setup an event
|
|
|
|
* that is used to notify orterun of abnormal and normal termination
|
|
|
|
* so it can wakeup and exit cleanly.
|
|
|
|
*
|
|
|
|
* The event will be defined so that a write to the provided trigger
|
|
|
|
* pipe will cause the event to trigger and callback to the provided
|
|
|
|
* function
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC int orte_wait_event(opal_event_t **event, int *trig,
|
|
|
|
void (*cbfunc)(int, short, void*));
|
|
|
|
|
2008-02-28 22:58:32 +03:00
|
|
|
/**
|
|
|
|
* In a number of places in the code, we need to wait for something
|
|
|
|
* to complete - for example, waiting for all launched procs to
|
|
|
|
* report into the HNP. In such cases, we want to just call progress
|
|
|
|
* so that any messages get processed, but otherwise "hold" the
|
|
|
|
* program at this spot until the counter achieves the specified
|
|
|
|
* value. We also want to provide a boolean flag, though, so that
|
|
|
|
* we break out of the loop should something go wrong.
|
|
|
|
*/
|
|
|
|
/*
 * ORTE_PROGRESSED_WAIT(failed, counter, limit)
 *
 * Hands the call site's __FILE__/__LINE__ to OPAL_OUTPUT_VERBOSE, then
 * calls opal_progress() repeatedly for as long as (failed) is false and
 * (counter) is less than (limit).
 *
 * All three arguments are re-evaluated on every iteration, so they must
 * be expressions whose values can change while opal_progress() runs and
 * must not carry side effects of their own.
 *
 * The definition ends at the while(0) line; it no longer carries a
 * line-continuation that would splice whatever follows into the macro.
 *
 * NOTE(review): the ';' after while(0) is retained so that call sites
 * written without their own semicolon keep compiling.  As a consequence
 * the macro cannot be the unbraced body of an if that has an else.
 */
#define ORTE_PROGRESSED_WAIT(failed, counter, limit)                \
    do {                                                            \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                  \
                            "progressed_wait: %s %d",               \
                            __FILE__, __LINE__));                   \
        while (!(failed) && (counter) < (limit)) {                  \
            opal_progress();                                        \
        }                                                           \
    } while(0);
|
|
|
|
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/**
|
|
|
|
* Trigger a defined event
|
|
|
|
*
|
|
|
|
* This function will trigger a previously-defined event - as setup
|
|
|
|
* by orte_wait_event - by sending a message to the provided pipe
|
|
|
|
*/
|
|
|
|
ORTE_DECLSPEC void orte_trigger_event(int trig);
|
|
|
|
|
|
|
|
/**
|
2008-02-28 22:58:32 +03:00
|
|
|
* Setup an event to process a message
|
|
|
|
*
|
|
|
|
* If we are in an OOB recv callback, we frequently cannot process
|
|
|
|
* the received message until after we return from the callback to
|
|
|
|
* avoid a potential loopback situation - i.e., where processing
|
|
|
|
* the message can result in a message being sent somewhere that
|
|
|
|
* subsequently causes the recv we are in to get called again.
|
|
|
|
* This is typically the problem facing the daemons and HNP.
|
2008-02-28 04:57:57 +03:00
|
|
|
*
|
2008-02-28 22:58:32 +03:00
|
|
|
* To resolve this problem, we place the message to be processed on
|
|
|
|
* a list, and create a zero-time event that calls the function
|
|
|
|
* that will process the received message. The event library kindly
|
|
|
|
* does not trigger this event until after we return from the recv
|
|
|
|
* since the recv itself is considered an "event"! Thus, we will
|
|
|
|
* always execute the specified event cb function -after- leaving
|
|
|
|
* the recv.
|
2008-02-28 04:57:57 +03:00
|
|
|
*/
|
2008-02-28 22:58:32 +03:00
|
|
|
typedef struct {
|
|
|
|
opal_object_t super;
|
Fix a potential, albeit perhaps esoteric, race condition that can occur for fast HNP's, slow orteds, and fast apps. Under those conditions, it is possible for the orted to be caught in its original send of contact info back to the HNP, and thus for the progress stack never to recover back to a high level. In those circumstances, the orted can "hang" when trying to exit.
Add a new function to opal_progress that tells us our recursion depth to support that solution.
Yes, I know this sounds picky, but good ol' Jeff managed to make it happen by driving his cluster near to death...
Also ensure that we declare "failed" for the daemon job when daemons fail instead of the application job. This is important so that orte knows that it cannot use xcast to tell daemons to "exit", nor should it expect all daemons to respond. Otherwise, it is possible to hang.
After lots of testing, decide to default (again) to slurm detecting failed orteds. This proved necessary to avoid rather annoying hangs that were difficult to recover from. There are conditions where slurm will fail to launch all daemons (slurm folks are working on it), and yet again, good ol' Jeff managed to find both of them.
Thanks you Jeff! :-/
This commit was SVN r18611.
2008-06-06 23:36:27 +04:00
|
|
|
opal_event_t *ev;
|
2008-02-28 22:58:32 +03:00
|
|
|
orte_process_name_t sender;
|
|
|
|
opal_buffer_t *buffer;
|
|
|
|
orte_rml_tag_t tag;
|
2008-03-03 19:06:47 +03:00
|
|
|
#if OMPI_ENABLE_DEBUG
|
2008-02-29 23:10:31 +03:00
|
|
|
char *file;
|
|
|
|
int line;
|
2008-03-03 19:06:47 +03:00
|
|
|
#endif
|
2008-02-28 22:58:32 +03:00
|
|
|
} orte_message_event_t;
|
|
|
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
|
|
|
|
|
Fix a potential, albeit perhaps esoteric, race condition that can occur for fast HNP's, slow orteds, and fast apps. Under those conditions, it is possible for the orted to be caught in its original send of contact info back to the HNP, and thus for the progress stack never to recover back to a high level. In those circumstances, the orted can "hang" when trying to exit.
Add a new function to opal_progress that tells us our recursion depth to support that solution.
Yes, I know this sounds picky, but good ol' Jeff managed to make it happen by driving his cluster near to death...
Also ensure that we declare "failed" for the daemon job when daemons fail instead of the application job. This is important so that orte knows that it cannot use xcast to tell daemons to "exit", nor should it expect all daemons to respond. Otherwise, it is possible to hang.
After lots of testing, decide to default (again) to slurm detecting failed orteds. This proved necessary to avoid rather annoying hangs that were difficult to recover from. There are conditions where slurm will fail to launch all daemons (slurm folks are working on it), and yet again, good ol' Jeff managed to find both of them.
Thanks you Jeff! :-/
This commit was SVN r18611.
2008-06-06 23:36:27 +04:00
|
|
|
/*
 * ORTE_MESSAGE_EVENT_DELAY(delay, mev)
 *
 * Re-arm the timer event of an existing message event so that it fires
 * after a delay.
 *
 *   delay  interval in microseconds; split into whole seconds and the
 *          remaining microseconds before being handed to
 *          opal_evtimer_add.  Evaluated twice, so it must not have side
 *          effects.
 *   mev    pointer to the message event whose ->ev member is armed.
 *
 * Both arguments are parenthesized so that compound expressions such as
 * "base + extra" or "&event" expand correctly, and the local timeval has
 * a macro-private name so it cannot capture an identifier used in the
 * caller's arguments.
 *
 * NOTE(review): the ';' after while(0) is retained so that call sites
 * written without their own semicolon keep compiling.
 */
#define ORTE_MESSAGE_EVENT_DELAY(delay, mev)                            \
    do {                                                                \
        struct timeval orte_med_tv_;                                    \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                            "defining message event delay: %s %d",      \
                            __FILE__, __LINE__));                       \
        orte_med_tv_.tv_sec = (delay)/1000000;                          \
        orte_med_tv_.tv_usec = (delay)%1000000;                         \
        opal_evtimer_add((mev)->ev, &orte_med_tv_);                     \
    } while(0);
|
|
|
|
|
2008-02-29 23:10:31 +03:00
|
|
|
/*
 * ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc)
 *
 * Create an orte_message_event_t with OBJ_NEW, copy into it the sender's
 * jobid and vpid, the payload of buf (via opal_dss.copy_payload) and the
 * tag, bind cbfunc to the object's timer event with the new object as the
 * last argument of opal_evtimer_set, and arm that timer with a zero
 * timeout.
 *
 *   sndr    pointer to the sender's name (->jobid and ->vpid are read)
 *   buf     pointer to the received buffer
 *   tg      tag stored in the event
 *   cbfunc  callback handed to opal_evtimer_set
 *
 * When OMPI_ENABLE_DEBUG is set, the event additionally records a
 * strdup'd copy of buf's parent.cls_init_file_name and its
 * parent.cls_init_lineno.
 *
 * sndr and buf are evaluated more than once; pass side-effect-free
 * expressions.  The macro's locals carry macro-private names so that
 * arguments such as "&mev->sender" or "now" refer to the caller's
 * variables rather than to the object being constructed here.
 *
 * NOTE(review): the ';' after while(0) is retained so that call sites
 * written without their own semicolon keep compiling.
 */
#if OMPI_ENABLE_DEBUG

#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc)                       \
    do {                                                                \
        orte_message_event_t *orte_msgev_new_;                          \
        struct timeval orte_msgev_tv_;                                  \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                            "defining message event: %s %d",            \
                            __FILE__, __LINE__));                       \
        orte_msgev_new_ = OBJ_NEW(orte_message_event_t);                \
        orte_msgev_new_->sender.jobid = (sndr)->jobid;                  \
        orte_msgev_new_->sender.vpid = (sndr)->vpid;                    \
        opal_dss.copy_payload(orte_msgev_new_->buffer, (buf));          \
        orte_msgev_new_->tag = (tg);                                    \
        orte_msgev_new_->file =                                         \
            strdup((buf)->parent.cls_init_file_name);                   \
        orte_msgev_new_->line = (buf)->parent.cls_init_lineno;          \
        opal_evtimer_set(orte_msgev_new_->ev, (cbfunc),                 \
                         orte_msgev_new_);                              \
        orte_msgev_tv_.tv_sec = 0;                                      \
        orte_msgev_tv_.tv_usec = 0;                                     \
        opal_evtimer_add(orte_msgev_new_->ev, &orte_msgev_tv_);         \
    } while(0);

#else

#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc)                       \
    do {                                                                \
        orte_message_event_t *orte_msgev_new_;                          \
        struct timeval orte_msgev_tv_;                                  \
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                            "defining message event: %s %d",            \
                            __FILE__, __LINE__));                       \
        orte_msgev_new_ = OBJ_NEW(orte_message_event_t);                \
        orte_msgev_new_->sender.jobid = (sndr)->jobid;                  \
        orte_msgev_new_->sender.vpid = (sndr)->vpid;                    \
        opal_dss.copy_payload(orte_msgev_new_->buffer, (buf));          \
        orte_msgev_new_->tag = (tg);                                    \
        opal_evtimer_set(orte_msgev_new_->ev, (cbfunc),                 \
                         orte_msgev_new_);                              \
        orte_msgev_tv_.tv_sec = 0;                                      \
        orte_msgev_tv_.tv_usec = 0;                                     \
        opal_evtimer_add(orte_msgev_new_->ev, &orte_msgev_tv_);         \
    } while(0);

#endif
|
2008-02-28 22:58:32 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* In a number of places within the code, we want to setup a timer
|
|
|
|
* to detect when some procedure failed to complete. For example,
|
|
|
|
* when we launch the daemons, we frequently have no way to directly
|
|
|
|
* detect that a daemon failed to launch. Setting a timer allows us
|
|
|
|
* to automatically fail out of the launch if we don't hear from a
|
|
|
|
* daemon in some specified time window.
|
|
|
|
*
|
|
|
|
* Computing the amount of time to wait takes a few lines of code, but
|
|
|
|
* this macro encapsulates those lines along with the timer event
|
|
|
|
* definition just as a convenience. It also centralizes the
|
|
|
|
* necessary checks to ensure that the microsecond field is always
|
|
|
|
* less than 1M since some systems care about that, and to ensure
|
|
|
|
* that the computed wait time doesn't exceed the desired max
|
|
|
|
* wait
|
|
|
|
*/
|
|
|
|
/*
 * ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc)
 *
 * Allocate a timer event, bind cbfunc to it (with NULL as the last
 * argument of opal_evtimer_set), arm it, and store it through event.
 *
 *   event    address of an opal_event_t* that receives the new event
 *   n        multiplier applied to deltat
 *   deltat   per-unit wait in microseconds
 *   maxwait  upper bound in microseconds; ignored unless > 0
 *   cbfunc   callback handed to opal_evtimer_set
 *
 * The wait is (deltat) * (n) microseconds, clamped to maxwait when
 * maxwait is positive, then split so that tv_usec stays below one
 * million.
 *
 * If the allocation fails, nothing is armed and *(event) is set to NULL;
 * callers that need the timer should check for that.
 *
 * The macro's locals carry macro-private names so that arguments named
 * "timeout", "now" or "tmp" refer to the caller's variables.  maxwait is
 * evaluated more than once; pass a side-effect-free expression.
 *
 * NOTE(review): the ';' after while(0) is retained so that call sites
 * written without their own semicolon keep compiling.
 */
#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc)              \
    do {                                                                    \
        struct timeval orte_dt_tv_;                                         \
        opal_event_t *orte_dt_ev_;                                          \
        int orte_dt_usec_;                                                  \
        orte_dt_ev_ = (opal_event_t*)malloc(sizeof(opal_event_t));          \
        if (NULL != orte_dt_ev_) {                                          \
            opal_evtimer_set(orte_dt_ev_, (cbfunc), NULL);                  \
            orte_dt_usec_ = (deltat) * (n);                                 \
            if ((maxwait) > 0 && orte_dt_usec_ > (maxwait)) {               \
                orte_dt_usec_ = (maxwait);                                  \
            }                                                               \
            orte_dt_tv_.tv_sec = orte_dt_usec_/1000000;                     \
            orte_dt_tv_.tv_usec = orte_dt_usec_%1000000;                    \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                                "defining timeout: %ld sec %ld usec",       \
                                (long)orte_dt_tv_.tv_sec,                   \
                                (long)orte_dt_tv_.tv_usec));                \
            opal_evtimer_add(orte_dt_ev_, &orte_dt_tv_);                    \
        }                                                                   \
        *(event) = orte_dt_ev_;                                             \
    } while(0);
|
|
|
|
|
2008-03-05 16:51:32 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* There are places in the code where we just want to periodically
|
|
|
|
* wakeup to do something, and then go back to sleep again. Setting
|
|
|
|
* a timer allows us to do this
|
|
|
|
*/
|
|
|
|
/*
 * ORTE_TIMER_EVENT(time, cbfunc)
 *
 * Allocate a timer event, bind cbfunc to it, and arm it to fire after
 * (time) whole seconds.
 *
 *   time    delay in seconds (tv_usec is always set to 0)
 *   cbfunc  callback handed to opal_evtimer_set
 *
 * The newly allocated event itself is passed as the last argument of
 * opal_evtimer_set; the macro keeps no other reference to it.
 * NOTE(review): that presumably makes the callback responsible for
 * re-arming or freeing the event -- confirm against the call sites.
 *
 * If the allocation fails, no timer is armed.
 *
 * The macro's locals carry macro-private names so that a (time)
 * expression mentioning "now" or "tmp" refers to the caller's variables.
 *
 * NOTE(review): the ';' after while(0) is retained so that call sites
 * written without their own semicolon keep compiling.
 */
#define ORTE_TIMER_EVENT(time, cbfunc)                                      \
    do {                                                                    \
        struct timeval orte_te_tv_;                                         \
        opal_event_t *orte_te_ev_;                                          \
        orte_te_ev_ = (opal_event_t*)malloc(sizeof(opal_event_t));          \
        if (NULL != orte_te_ev_) {                                          \
            opal_evtimer_set(orte_te_ev_, (cbfunc), orte_te_ev_);           \
            orte_te_tv_.tv_sec = (time);                                    \
            orte_te_tv_.tv_usec = 0;                                        \
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,                      \
                                "defining timer event: %ld sec",            \
                                (long)orte_te_tv_.tv_sec));                 \
            opal_evtimer_add(orte_te_ev_, &orte_te_tv_);                    \
        }                                                                   \
    } while(0);
|
|
|
|
|
|
|
|
|
2004-09-26 21:43:35 +04:00
|
|
|
/**
|
|
|
|
* \internal
|
|
|
|
*
|
|
|
|
* Initialize the wait system (allocate mutexes, etc.)
|
|
|
|
*/
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_DECLSPEC int orte_wait_init(void);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
/**
|
|
|
|
* Kill all processes we are waiting on.
|
|
|
|
*/
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_DECLSPEC int orte_wait_kill(int sig);
|
2004-09-26 21:43:35 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* \internal
|
|
|
|
*
|
|
|
|
* Finalize the wait system (deallocate mutexes, etc.)
|
|
|
|
*/
|
2006-08-23 07:32:36 +04:00
|
|
|
ORTE_DECLSPEC int orte_wait_finalize(void);
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
END_C_DECLS
|
2004-09-26 21:43:35 +04:00
|
|
|
|
2005-03-14 23:57:21 +03:00
|
|
|
#endif /* #ifndef ORTE_WAIT_H */
|