Merge pull request #3218 from rhc54/topic/pmix2
Update to include the PMIx 2.0 APIs for monitoring and job control.
Этот коммит содержится в:
Коммит
ea84a53faa
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -94,7 +94,9 @@ enum {
|
||||
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
|
||||
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
|
||||
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
|
||||
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
|
||||
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66),
|
||||
OPAL_ERR_HEARTBEAT_ALERT = (OPAL_ERR_BASE - 67),
|
||||
OPAL_ERR_FILE_ALERT = (OPAL_ERR_BASE - 68)
|
||||
};
|
||||
|
||||
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014-2015 Mellanox Technologies, Inc.
|
||||
@ -352,7 +352,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
|
||||
if (NULL != chain->final_cbfunc) {
|
||||
chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata);
|
||||
}
|
||||
|
||||
|
||||
OBJ_RELEASE(chain);
|
||||
|
||||
return;
|
||||
|
@ -473,6 +473,59 @@ pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive,
|
||||
pmix_info_t *info, size_t ninfo,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Request a job control action. The targets array identifies the
|
||||
* processes to which the requested job control action is to be applied.
|
||||
* A NULL value can be used to indicate all processes in the caller's
|
||||
* nspace. PMIX_RANK_WILDCARD can also be used to indicate
|
||||
* that all processes in the given nspace are to be included.
|
||||
*
|
||||
* The directives are provided as pmix_info_t structs in the directives
|
||||
* array. The callback function provides a status to indicate whether or
|
||||
* not the request was granted, and to provide some information as to
|
||||
* the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t
|
||||
* structures. If non-NULL, then the specified release_fn must be called
|
||||
* when the callback function completes - this will be used to release
|
||||
* any provided pmix_info_t array.
|
||||
*/
|
||||
pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Request that something be monitored - e.g., that the server monitor
|
||||
* this process for periodic heartbeats as an indication that the process
|
||||
* has not become "wedged". When a monitor detects the specified alarm
|
||||
* condition, it will generate an event notification using the provided
|
||||
* error code and passing along any available relevant information. It is
|
||||
* up to the caller to register a corresponding event handler.
|
||||
*
|
||||
* Params:
|
||||
*
|
||||
* monitor: attribute indicating the type of monitor being requested - e.g.,
|
||||
* PMIX_MONITOR_FILE to indicate that the requestor is asking that
|
||||
* a file be monitored.
|
||||
*
|
||||
* error: the status code to be used when generating an event notification
|
||||
* alerting that the monitor has been triggered. The range of the
|
||||
* notification defaults to PMIX_RANGE_NAMESPACE - this can be
|
||||
* changed by providing a PMIX_RANGE directive
|
||||
*
|
||||
* directives: characterize the monitoring request (e.g., monitor file size)
|
||||
* and frequency of checking to be done
|
||||
*
|
||||
* cbfunc: provides a status to indicate whether or not the request was granted,
|
||||
* and to provide some information as to the reason for any denial in
|
||||
* the pmix_info_cbfunc_t array of pmix_info_t structures.
|
||||
*
|
||||
* Note: a process can send a heartbeat to the server using the PMIx_Heartbeat
|
||||
* macro provided below */
|
||||
pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* define a special macro to simplify sending of a heartbeat */
|
||||
/* PMIx_Process_monitor_nb takes a (const pmix_info_t *) monitor followed by
 * a pmix_status_t error code, so the heartbeat key has to be wrapped in a
 * pmix_info_t and a status supplied explicitly. No directives and no
 * callback are passed.
 * NOTE(review): assumes PMIx_Process_monitor_nb does not retain the monitor
 * pointer after it returns, since the info lives on the caller's stack -
 * confirm against the implementation. */
#define PMIx_Heartbeat()                                                        \
    do {                                                                        \
        pmix_info_t pmix_hb_info_;                                              \
        PMIX_INFO_CONSTRUCT(&pmix_hb_info_);                                    \
        PMIX_INFO_LOAD(&pmix_hb_info_, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER); \
        PMIx_Process_monitor_nb(&pmix_hb_info_, PMIX_SUCCESS,                   \
                                NULL, 0, NULL, NULL);                           \
        PMIX_INFO_DESTRUCT(&pmix_hb_info_);                                     \
    } while (0)
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -123,6 +123,8 @@ typedef uint32_t pmix_rank_t;
|
||||
// a local system-level PMIx server
|
||||
#define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
|
||||
#define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
|
||||
#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
|
||||
|
||||
|
||||
/* identification attributes */
|
||||
#define PMIX_USERID "pmix.euid" // (uint32_t) effective user id
|
||||
@ -218,8 +220,9 @@ typedef uint32_t pmix_rank_t;
|
||||
#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
|
||||
#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
|
||||
#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
|
||||
#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish
|
||||
#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish
|
||||
#define PMIX_RANGE "pmix.range" // (pmix_data_range_t) value for calls to publish/lookup/unpublish or for
|
||||
// monitoring event notifications
|
||||
#define PMIX_PERSISTENCE "pmix.persist" // (pmix_persistence_t) value for calls to publish
|
||||
#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
|
||||
// not request data from the server if not found
|
||||
#define PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
|
||||
@ -259,66 +262,72 @@ typedef uint32_t pmix_rank_t;
|
||||
#define PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
|
||||
|
||||
/* attributes used to describe "spawn" attributes */
|
||||
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
|
||||
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
|
||||
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
|
||||
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
|
||||
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
|
||||
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
|
||||
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
|
||||
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
|
||||
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
|
||||
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
|
||||
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
|
||||
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
|
||||
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
|
||||
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
|
||||
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
|
||||
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
|
||||
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
|
||||
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
|
||||
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
|
||||
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
|
||||
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
|
||||
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
|
||||
// job - i.e., not part of the "comm_world" of the job
|
||||
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
|
||||
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
|
||||
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
|
||||
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
|
||||
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
|
||||
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
|
||||
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
|
||||
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
|
||||
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
|
||||
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
|
||||
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
|
||||
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
|
||||
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
|
||||
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
|
||||
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
|
||||
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
|
||||
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
|
||||
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
|
||||
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
|
||||
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
|
||||
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
|
||||
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
|
||||
// job - i.e., not part of the "comm_world" of the job
|
||||
|
||||
/* query attributes */
|
||||
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
|
||||
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
|
||||
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
|
||||
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
|
||||
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
|
||||
// returns (pmix_data_array_t) an array of pmix_proc_info_t
|
||||
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
|
||||
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
|
||||
// procs in job on same node
|
||||
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform
|
||||
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes
|
||||
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes
|
||||
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers
|
||||
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
|
||||
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
|
||||
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
|
||||
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
|
||||
// is being requested
|
||||
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
|
||||
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
|
||||
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
|
||||
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
|
||||
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
|
||||
// returns (pmix_data_array_t) an array of pmix_proc_info_t
|
||||
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
|
||||
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
|
||||
// procs in job on same node
|
||||
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // (bool) return operations tool is authorized to perform
|
||||
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // (bool) return a comma-delimited list of supported spawn attributes
|
||||
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // (bool) return a comma-delimited list of supported debug attributes
|
||||
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // (bool) return info on memory usage for the procs indicated in the qualifiers
|
||||
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // (bool) constrain the query to local information only
|
||||
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // (bool) report average values
|
||||
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // (bool) report minimum and maximum value
|
||||
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
|
||||
// is being requested
|
||||
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
|
||||
// for the specified nspace
|
||||
|
||||
/* log attributes */
|
||||
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
|
||||
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
|
||||
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
|
||||
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
|
||||
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
|
||||
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
|
||||
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
|
||||
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
|
||||
#define PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
|
||||
#define PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
|
||||
#define PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
|
||||
#define PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
|
||||
|
||||
/* debugger attributes */
|
||||
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
|
||||
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
|
||||
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
|
||||
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
|
||||
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
|
||||
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
|
||||
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
|
||||
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
|
||||
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
|
||||
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
|
||||
|
||||
/* Resource Manager identification */
|
||||
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
|
||||
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
|
||||
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
|
||||
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
|
||||
|
||||
/* attributes for setting envars */
|
||||
#define PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
|
||||
@ -327,7 +336,6 @@ typedef uint32_t pmix_rank_t;
|
||||
/* attributes relating to allocations */
|
||||
#define PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
|
||||
// which can later be used to query status of the request
|
||||
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (uint32_t) get number of seconds remaining in allocation
|
||||
#define PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
|
||||
#define PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
|
||||
#define PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
|
||||
@ -343,6 +351,38 @@ typedef uint32_t pmix_rank_t;
|
||||
#define PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
|
||||
#define PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
|
||||
|
||||
/* job control attributes */
|
||||
#define PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request
|
||||
#define PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes
|
||||
#define PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes
|
||||
#define PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request
|
||||
// (NULL => cancel all requests from this requestor)
|
||||
#define PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup
|
||||
#define PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID
|
||||
#define PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it
|
||||
#define PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint
|
||||
#define PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint
|
||||
#define PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete; NOTE(review): key string is identical to PMIX_JOB_CTRL_CHECKPOINT_SIGNAL - confirm this is intended
|
||||
#define PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes
|
||||
#define PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned
|
||||
#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
|
||||
#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
|
||||
|
||||
/* monitoring attributes */
|
||||
#define PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
|
||||
#define PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
|
||||
#define PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
|
||||
#define PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
|
||||
// specified action
|
||||
#define PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
|
||||
#define PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
|
||||
#define PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
|
||||
#define PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
|
||||
#define PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
|
||||
#define PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
|
||||
// specified action
|
||||
|
||||
|
||||
/**** PROCESS STATE DEFINITIONS ****/
|
||||
typedef uint8_t pmix_proc_state_t;
|
||||
#define PMIX_PROC_STATE_UNDEF 0 /* undefined process state */
|
||||
@ -455,7 +495,14 @@ typedef int pmix_status_t;
|
||||
#define PMIX_ERR_LOST_CONNECTION_TO_CLIENT (PMIX_ERR_V2X_BASE - 3)
|
||||
/* used by the query system */
|
||||
#define PMIX_QUERY_PARTIAL_SUCCESS (PMIX_ERR_V2X_BASE - 4)
|
||||
/* request responses */
|
||||
#define PMIX_NOTIFY_ALLOC_COMPLETE (PMIX_ERR_V2X_BASE - 5)
|
||||
/* job control */
|
||||
#define PMIX_JCTRL_CHECKPOINT (PMIX_ERR_V2X_BASE - 6)
|
||||
#define PMIX_JCTRL_PREEMPT_ALERT (PMIX_ERR_V2X_BASE - 7)
|
||||
/* monitoring */
|
||||
#define PMIX_MONITOR_HEARTBEAT_ALERT (PMIX_ERR_V2X_BASE - 8)
|
||||
#define PMIX_MONITOR_FILE_ALERT (PMIX_ERR_V2X_BASE - 9)
|
||||
|
||||
/* define a starting point for operational error constants so
|
||||
* we avoid renumbering when making additions */
|
||||
|
@ -328,6 +328,17 @@ typedef pmix_status_t (*pmix_server_alloc_fn_t)(const pmix_proc_t *client,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Execute a job control action on behalf of a client */
|
||||
typedef pmix_status_t (*pmix_server_job_control_fn_t)(const pmix_proc_t *requestor,
|
||||
const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Request that a client be monitored for activity */
|
||||
typedef pmix_status_t (*pmix_server_monitor_fn_t)(const pmix_proc_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
typedef struct pmix_server_module_2_0_0_t {
|
||||
/* v1x interfaces */
|
||||
pmix_server_client_connected_fn_t client_connected;
|
||||
@ -350,12 +361,14 @@ typedef struct pmix_server_module_2_0_0_t {
|
||||
pmix_server_tool_connection_fn_t tool_connected;
|
||||
pmix_server_log_fn_t log;
|
||||
pmix_server_alloc_fn_t allocate;
|
||||
pmix_server_job_control_fn_t job_control;
|
||||
pmix_server_monitor_fn_t monitor;
|
||||
} pmix_server_module_t;
|
||||
|
||||
/**** SERVER SUPPORT INIT/FINALIZE FUNCTIONS ****/
|
||||
|
||||
/* Initialize the server support library, and provide a
|
||||
* pointer to a pmix_server_module_t structure
|
||||
* pointer to a pmix_server_module_t structure
|
||||
* containing the caller's callback functions. The
|
||||
* array of pmix_info_t structs is used to pass
|
||||
* additional info that may be required by the server
|
||||
|
@ -1,6 +1,6 @@
|
||||
# -*- makefile -*-
|
||||
#
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -13,4 +13,5 @@ sources += \
|
||||
common/pmix_query.c \
|
||||
common/pmix_strings.c \
|
||||
common/pmix_log.c \
|
||||
common/pmix_jobdata.c
|
||||
common/pmix_jobdata.c \
|
||||
common/pmix_control.c
|
||||
|
269
opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c
Обычный файл
269
opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c
Обычный файл
@ -0,0 +1,269 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include <src/include/pmix_config.h>
|
||||
|
||||
#include <src/include/types.h>
|
||||
#include <src/include/pmix_stdint.h>
|
||||
#include <src/include/pmix_socket_errno.h>
|
||||
|
||||
#include <pmix.h>
|
||||
#include <pmix_common.h>
|
||||
#include <pmix_server.h>
|
||||
#include <pmix_rename.h>
|
||||
|
||||
#include "src/util/argv.h"
|
||||
#include "src/util/error.h"
|
||||
#include "src/util/output.h"
|
||||
#include "src/buffer_ops/buffer_ops.h"
|
||||
#include "src/mca/ptl/ptl.h"
|
||||
|
||||
#include "src/client/pmix_client_ops.h"
|
||||
#include "src/server/pmix_server_ops.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
|
||||
static void relcbfunc(void *cbdata)
|
||||
{
|
||||
pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata;
|
||||
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"pmix:query release callback");
|
||||
|
||||
if (NULL != cd->info) {
|
||||
PMIX_INFO_FREE(cd->info, cd->ninfo);
|
||||
}
|
||||
PMIX_RELEASE(cd);
|
||||
}
|
||||
static void query_cbfunc(struct pmix_peer_t *peer,
|
||||
pmix_ptl_hdr_t *hdr,
|
||||
pmix_buffer_t *buf, void *cbdata)
|
||||
{
|
||||
pmix_query_caddy_t *cd = (pmix_query_caddy_t*)cbdata;
|
||||
pmix_status_t rc;
|
||||
pmix_shift_caddy_t *results;
|
||||
int cnt;
|
||||
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"pmix:query cback from server");
|
||||
|
||||
results = PMIX_NEW(pmix_shift_caddy_t);
|
||||
|
||||
/* unpack the status */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->status, &cnt, PMIX_STATUS))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto complete;
|
||||
}
|
||||
if (PMIX_SUCCESS != results->status) {
|
||||
goto complete;
|
||||
}
|
||||
|
||||
/* unpack any returned data */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->ninfo, &cnt, PMIX_SIZE))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto complete;
|
||||
}
|
||||
if (0 < results->ninfo) {
|
||||
PMIX_INFO_CREATE(results->info, results->ninfo);
|
||||
cnt = results->ninfo;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, results->info, &cnt, PMIX_INFO))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto complete;
|
||||
}
|
||||
}
|
||||
|
||||
complete:
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"pmix:query cback from server releasing");
|
||||
/* release the caller */
|
||||
if (NULL != cd->cbfunc) {
|
||||
cd->cbfunc(results->status, results->info, results->ninfo, cd->cbdata, relcbfunc, results);
|
||||
}
|
||||
PMIX_RELEASE(cd);
|
||||
}
|
||||
|
||||
/* Non-blocking job control request.
 *
 * targets/ntargets:  processes the action applies to; the targets array
 *                    is only packed when ntargets is non-zero
 * directives/ndirs:  pmix_info_t directives describing the action; only
 *                    packed when ndirs is non-zero
 * cbfunc/cbdata:     user callback and its opaque argument
 *
 * Returns PMIX_ERR_INIT if the library has not been initialized,
 * PMIX_ERR_NOT_SUPPORTED if we are the server and the host provides no
 * job_control entry, PMIX_ERR_UNREACH if we are not connected, the pack
 * error on a packing failure, or otherwise whatever the host's
 * job_control function / pmix_ptl.send_recv returns.
 */
PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
                                              const pmix_info_t directives[], size_t ndirs,
                                              pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_cmd_t cmd = PMIX_JOB_CONTROL_CMD;
    pmix_buffer_t *request;
    pmix_query_caddy_t *caddy;
    pmix_status_t rc;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: job control called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* a server does not relay anything - it passes the request straight
     * to its host and returns the host's answer */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.job_control) {
            /* the host gave us no way to do this */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:job_control handed to RM");
        return pmix_host_server.job_control(&pmix_globals.myid,
                                            targets, ntargets,
                                            directives, ndirs,
                                            cbfunc, cbdata);
    }

    /* everyone else relays the request to the server, which requires
     * an active connection */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    request = PMIX_NEW(pmix_buffer_t);

    /* message layout: cmd, ntargets, [targets], ndirs, [directives] */
    rc = pmix_bfrop.pack(request, &cmd, 1, PMIX_CMD);
    if (PMIX_SUCCESS != rc) {
        PMIX_ERROR_LOG(rc);
        goto discard;
    }

    rc = pmix_bfrop.pack(request, &ntargets, 1, PMIX_SIZE);
    if (PMIX_SUCCESS != rc) {
        PMIX_ERROR_LOG(rc);
        goto discard;
    }
    /* the targets array may legitimately be absent, so it is only
     * packed when the caller supplied at least one entry */
    if (ntargets > 0) {
        rc = pmix_bfrop.pack(request, targets, ntargets, PMIX_PROC);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            goto discard;
        }
    }

    rc = pmix_bfrop.pack(request, &ndirs, 1, PMIX_SIZE);
    if (PMIX_SUCCESS != rc) {
        PMIX_ERROR_LOG(rc);
        goto discard;
    }
    if (ndirs > 0) {
        rc = pmix_bfrop.pack(request, directives, ndirs, PMIX_INFO);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            goto discard;
        }
    }

    /* the receive routine needs to know which user callback to fire
     * once the reply arrives, so carry it along in a caddy */
    caddy = PMIX_NEW(pmix_query_caddy_t);
    caddy->cbfunc = cbfunc;
    caddy->cbdata = cbdata;

    /* queue the message for delivery to the server */
    rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, request, query_cbfunc, (void*)caddy);
    if (PMIX_SUCCESS != rc) {
        PMIX_RELEASE(request);
        PMIX_RELEASE(caddy);
    }
    return rc;

  discard:
    /* packing failed: the request buffer is the only thing we own */
    PMIX_RELEASE(request);
    return rc;
}
|
||||
|
||||
/* Non-blocking request to monitor a process.
 *
 * When this process is a server, the request is handed directly to the
 * host's monitor entry point (PMIX_ERR_NOT_SUPPORTED if the host has none).
 * Otherwise the command, the error code and the directives are packed into
 * a buffer and sent to our server; the reply is delivered through
 * query_cbfunc using a caddy that carries cbfunc/cbdata.
 *
 * NOTE(review): "monitor" is never referenced in this function - it is
 * neither passed to the host nor packed into the message. Confirm whether
 * it is supposed to be relayed.
 *
 * Returns PMIX_ERR_INIT if the library is not initialized, PMIX_ERR_UNREACH
 * if a client has no server connection, PMIX_ERR_NOMEM if the message or
 * the caddy cannot be allocated, the pack error if packing fails, and
 * otherwise the status of the host call or of send_recv.
 */
PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
                                                  const pmix_info_t directives[], size_t ndirs,
                                                  pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_buffer_t *msg;
    pmix_cmd_t cmd = PMIX_MONITOR_CMD;
    pmix_status_t rc;
    pmix_query_caddy_t *cb;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: monitor called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* if we are the server, then we just issue the request and
     * return the response */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.monitor) {
            /* nothing we can do */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:monitor handed to RM");
        rc = pmix_host_server.monitor(&pmix_globals.myid, error,
                                      directives, ndirs, cbfunc, cbdata);
        return rc;
    }

    /* if we are a client, then relay this request to the server */

    /* if we aren't connected, don't attempt to send */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    msg = PMIX_NEW(pmix_buffer_t);
    if (NULL == msg) {
        return PMIX_ERR_NOMEM;
    }

    /* pack the cmd */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) {
        PMIX_ERROR_LOG(rc);
        PMIX_RELEASE(msg);
        return rc;
    }

    /* pack the error */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &error, 1, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        PMIX_RELEASE(msg);
        return rc;
    }

    /* pack the directives - the count always goes first, the array
     * only when there is something in it */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        PMIX_RELEASE(msg);
        return rc;
    }
    if (0 < ndirs) {
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            return rc;
        }
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cb) {
        PMIX_RELEASE(msg);
        return PMIX_ERR_NOMEM;
    }
    cb->cbfunc = cbfunc;
    cb->cbdata = cbdata;

    /* push the message into our event base to send to the server */
    if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))){
        PMIX_RELEASE(msg);
        PMIX_RELEASE(cb);
    }

    return rc;
}
|
@ -257,6 +257,8 @@ static void qcon(pmix_query_caddy_t *p)
|
||||
{
|
||||
p->queries = NULL;
|
||||
p->nqueries = 0;
|
||||
p->targets = NULL;
|
||||
p->ntargets = 0;
|
||||
p->info = NULL;
|
||||
p->ninfo = 0;
|
||||
p->cbfunc = NULL;
|
||||
|
@ -72,7 +72,9 @@ typedef enum {
|
||||
PMIX_DEREGEVENTS_CMD,
|
||||
PMIX_QUERY_CMD,
|
||||
PMIX_LOG_CMD,
|
||||
PMIX_ALLOC_CMD
|
||||
PMIX_ALLOC_CMD,
|
||||
PMIX_JOB_CONTROL_CMD,
|
||||
PMIX_MONITOR_CMD
|
||||
} pmix_cmd_t;
|
||||
|
||||
/* provide a "pretty-print" function for cmds */
|
||||
@ -214,6 +216,8 @@ typedef struct {
|
||||
pmix_status_t status;
|
||||
pmix_query_t *queries;
|
||||
size_t nqueries;
|
||||
pmix_proc_t *targets;
|
||||
size_t ntargets;
|
||||
pmix_info_t *info;
|
||||
size_t ninfo;
|
||||
pmix_info_cbfunc_t cbfunc;
|
||||
|
@ -256,4 +256,13 @@ typedef struct event pmix_event_t;
|
||||
|
||||
/* pass the three arguments straight through to event_active */
#define pmix_event_active(ev, res, ncalls) event_active((ev), (res), (ncalls))

/* timer convenience wrappers: every one of them uses fd -1 and an
 * event mask of 0 where the underlying call asks for them */
#define pmix_event_evtimer_new(base, cbfunc, cbarg) \
    pmix_event_new((base), -1, 0, (cbfunc), (cbarg))

#define pmix_event_evtimer_add(ev, tv) pmix_event_add((ev), (tv))

/* note the argument order: base first here, event first in event_assign */
#define pmix_event_evtimer_set(base, ev, cbfunc, cbarg) \
    event_assign((ev), (base), -1, 0, (event_callback_fn) (cbfunc), (cbarg))

#define pmix_event_evtimer_del(ev) pmix_event_del((ev))
|
||||
|
||||
|
||||
#endif /* PMIX_TYPES_H */
|
||||
|
@ -3,26 +3,27 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
AM_CPPFLAGS = $(LTDLINCL)
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_sensor.la
|
||||
libmca_sensor_la_SOURCES =
|
||||
noinst_LTLIBRARIES = libmca_psensor.la
|
||||
libmca_psensor_la_SOURCES =
|
||||
|
||||
# local files
|
||||
headers = sensor.h \
|
||||
sensor_types.h
|
||||
headers = psensor.h
|
||||
|
||||
libmca_sensor_la_SOURCES += $(headers)
|
||||
libmca_psensor_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
ortedir = $(ompiincludedir)/$(subdir)
|
||||
nobase_orte_HEADERS = $(headers)
|
||||
pmixdir = $(pmixincludedir)/$(subdir)
|
||||
nobase_pmix_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
include base/Makefile.am
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
@ -11,10 +11,9 @@
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h \
|
||||
base/sensor_private.h
|
||||
base/base.h
|
||||
|
||||
libmca_sensor_la_SOURCES += \
|
||||
base/sensor_base_frame.c \
|
||||
base/sensor_base_select.c \
|
||||
base/sensor_base_fns.c
|
||||
libmca_psensor_la_SOURCES += \
|
||||
base/psensor_base_frame.c \
|
||||
base/psensor_base_select.c \
|
||||
base/psensor_base_stubs.c
|
59
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h
Обычный файл
59
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h
Обычный файл
@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef PMIX_PSENSOR_BASE_H_
|
||||
#define PMIX_PSENSOR_BASE_H_
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/mca/mca.h"
|
||||
#include "src/mca/base/pmix_mca_base_framework.h"
|
||||
|
||||
#include "src/mca/psensor/psensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* MCA Framework
|
||||
*/
|
||||
PMIX_EXPORT extern pmix_mca_base_framework_t pmix_psensor_base_framework;
|
||||
|
||||
PMIX_EXPORT int pmix_psensor_base_select(void);
|
||||
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
|
||||
pmix_list_t actives;
|
||||
pmix_event_base_t *evbase;
|
||||
} pmix_psensor_base_t;
|
||||
|
||||
typedef struct {
|
||||
pmix_list_item_t super;
|
||||
pmix_psensor_base_component_t *component;
|
||||
pmix_psensor_base_module_t *module;
|
||||
int priority;
|
||||
} pmix_psensor_active_module_t;
|
||||
PMIX_CLASS_DECLARATION(pmix_psensor_active_module_t);
|
||||
|
||||
PMIX_EXPORT extern pmix_psensor_base_t pmix_psensor_base;
|
||||
|
||||
PMIX_EXPORT pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs);
|
||||
|
||||
PMIX_EXPORT pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
|
||||
char *id);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
103
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c
Обычный файл
103
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c
Обычный файл
@ -0,0 +1,103 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include <pthread.h>
|
||||
#include PMIX_EVENT_HEADER
|
||||
|
||||
#include "src/mca/mca.h"
|
||||
#include "src/mca/base/base.h"
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/runtime/pmix_progress_threads.h"
|
||||
#include "src/include/types.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "src/mca/psensor/base/static-components.h"
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
pmix_psensor_base_module_t pmix_psensor = {
|
||||
pmix_psensor_base_start,
|
||||
pmix_psensor_base_stop
|
||||
};
|
||||
pmix_psensor_base_t pmix_psensor_base = {{{0}}};;
|
||||
|
||||
static bool use_separate_thread = false;
|
||||
|
||||
/* Register the framework's MCA variables. There is exactly one: the
 * boolean "use_separate_thread", stored in the file-scope flag of the
 * same name. The registration result is deliberately ignored and
 * PMIX_SUCCESS is always returned. */
static int pmix_psensor_register(pmix_mca_base_register_flag_t flags)
{
    (void) pmix_mca_base_var_register(
        "pmix", "psensor", "base", "use_separate_thread",
        "Use a separate thread for monitoring local procs",
        PMIX_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
        PMIX_INFO_LVL_9,
        PMIX_MCA_BASE_VAR_SCOPE_READONLY,
        &use_separate_thread);

    return PMIX_SUCCESS;
}
|
||||
|
||||
|
||||
static int pmix_psensor_base_close(void)
|
||||
{
|
||||
PMIX_LIST_DESTRUCT(&pmix_psensor_base.actives);
|
||||
|
||||
if (use_separate_thread && NULL != pmix_psensor_base.evbase) {
|
||||
(void)pmix_progress_thread_stop("PSENSOR");
|
||||
}
|
||||
|
||||
/* Close all remaining available components */
|
||||
return pmix_mca_base_framework_components_close(&pmix_psensor_base_framework, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
static int pmix_psensor_base_open(pmix_mca_base_open_flag_t flags)
|
||||
{
|
||||
/* construct the list of modules */
|
||||
PMIX_CONSTRUCT(&pmix_psensor_base.actives, pmix_list_t);
|
||||
|
||||
if (use_separate_thread) {
|
||||
/* create an event base and progress thread for us */
|
||||
if (NULL == (pmix_psensor_base.evbase = pmix_progress_thread_init("PSENSOR"))) {
|
||||
return PMIX_ERROR;
|
||||
}
|
||||
|
||||
} else {
|
||||
pmix_psensor_base.evbase = pmix_globals.evbase;
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
return pmix_mca_base_framework_components_open(&pmix_psensor_base_framework, flags);
|
||||
}
|
||||
|
||||
PMIX_MCA_BASE_FRAMEWORK_DECLARE(pmix, psensor, "PMIx Monitoring Sensors",
|
||||
pmix_psensor_register,
|
||||
pmix_psensor_base_open, pmix_psensor_base_close,
|
||||
mca_psensor_base_static_components, 0);
|
||||
|
||||
PMIX_CLASS_INSTANCE(pmix_psensor_active_module_t,
|
||||
pmix_list_item_t,
|
||||
NULL, NULL);
|
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "src/mca/mca.h"
|
||||
#include "src/mca/base/base.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
|
||||
static bool selected = false;
|
||||
|
||||
/* Function for selecting a prioritized list of components
|
||||
* from all those that are available. */
|
||||
int pmix_psensor_base_select(void)
|
||||
{
|
||||
pmix_mca_base_component_list_item_t *cli = NULL;
|
||||
pmix_psensor_base_component_t *component = NULL;
|
||||
pmix_psensor_active_module_t *newactive, *active;
|
||||
pmix_mca_base_module_t *mod;
|
||||
int pri;
|
||||
bool inserted;
|
||||
|
||||
if (selected) {
|
||||
/* ensure we don't do this twice */
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
selected = true;
|
||||
|
||||
/* Query all available components and ask if they have a module */
|
||||
PMIX_LIST_FOREACH(cli, &pmix_psensor_base_framework.framework_components, pmix_mca_base_component_list_item_t) {
|
||||
component = (pmix_psensor_base_component_t *) cli->cli_component;
|
||||
|
||||
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
|
||||
"mca:psensor:select: checking available component %s",
|
||||
component->base.pmix_mca_component_name);
|
||||
|
||||
/* get the module for this component */
|
||||
if (PMIX_SUCCESS != component->base.pmix_mca_query_component(&mod, &pri)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* add to our prioritized list of available actives */
|
||||
newactive = PMIX_NEW(pmix_psensor_active_module_t);
|
||||
newactive->priority = pri;
|
||||
newactive->component = component;
|
||||
newactive->module = (pmix_psensor_base_module_t*)mod;
|
||||
|
||||
/* maintain priority order */
|
||||
inserted = false;
|
||||
PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
|
||||
if (newactive->priority > active->priority) {
|
||||
pmix_list_insert_pos(&pmix_psensor_base.actives,
|
||||
(pmix_list_item_t*)active, &newactive->super);
|
||||
inserted = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!inserted) {
|
||||
/* must be lowest priority - add to end */
|
||||
pmix_list_append(&pmix_psensor_base.actives, &newactive->super);
|
||||
}
|
||||
}
|
||||
|
||||
if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) {
|
||||
pmix_output(0, "Final PSENSOR priorities");
|
||||
/* show the prioritized list */
|
||||
PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
|
||||
pmix_output(0, "\tPSENSOR: %s Priority: %d",
|
||||
active->component->base.pmix_mca_component_name, active->priority);
|
||||
}
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;;
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include "src/util/error.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
|
||||
static bool mods_active = false;
|
||||
|
||||
pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs)
|
||||
{
|
||||
pmix_psensor_active_module_t *mod;
|
||||
pmix_status_t rc;
|
||||
|
||||
opal_output_verbose(5, pmix_psensor_base_framework.framework_output,
|
||||
"%s:%d sensor:base: starting sensors",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank);
|
||||
|
||||
/* call the start function of all modules in priority order */
|
||||
PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
|
||||
if (NULL != mod->module->start) {
|
||||
rc = mod->module->start(requestor, error, monitor, directives, ndirs);
|
||||
if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
/* Ask every active module, in priority order, to stop monitoring on
 * behalf of the given requestor.
 *
 * "id" is handed to each module's stop entry unchanged. As in
 * pmix_psensor_base_start, PMIX_ERR_TAKE_NEXT_OPTION is tolerated and any
 * other non-success status is returned immediately; PMIX_SUCCESS is
 * returned once the list is exhausted. */
pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
                                     char *id)
{
    pmix_psensor_active_module_t *mod;
    pmix_status_t rc;

    pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                        "%s:%d sensor:base: stopping sensors",
                        pmix_globals.myid.nspace, pmix_globals.myid.rank);

    /* call the stop function of all modules in priority order */
    PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
        if (NULL != mod->module->stop) {
            rc = mod->module->stop(requestor, id);
            if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
                return rc;
            }
        }
    }

    return PMIX_SUCCESS;
}
|
@ -1,37 +1,37 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_ompidata_DATA = help-orte-sensor-file.txt
|
||||
dist_pmixdata_DATA = help-pmix-psensor-file.txt
|
||||
|
||||
sources = \
|
||||
sensor_file.c \
|
||||
sensor_file.h \
|
||||
sensor_file_component.c
|
||||
psensor_file.c \
|
||||
psensor_file.h \
|
||||
psensor_file_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_sensor_file_DSO
|
||||
if MCA_BUILD_pmix_psensor_file_DSO
|
||||
component_noinst =
|
||||
component_install = mca_sensor_file.la
|
||||
component_install = mca_psensor_file.la
|
||||
else
|
||||
component_noinst = libmca_sensor_file.la
|
||||
component_noinst = libmca_psensor_file.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponentdir = $(pmixlibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_sensor_file_la_SOURCES = $(sources)
|
||||
mca_sensor_file_la_LDFLAGS = -module -avoid-version
|
||||
mca_psensor_file_la_SOURCES = $(sources)
|
||||
mca_psensor_file_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_sensor_file_la_SOURCES =$(sources)
|
||||
libmca_sensor_file_la_LDFLAGS = -module -avoid-version
|
||||
libmca_psensor_file_la_SOURCES =$(sources)
|
||||
libmca_psensor_file_la_LDFLAGS = -module -avoid-version
|
@ -4,9 +4,9 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the file sensor
|
352
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c
Обычный файл
352
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c
Обычный файл
@ -0,0 +1,352 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <src/include/types.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
#include "src/util/error.h"
|
||||
#include "src/util/output.h"
|
||||
#include "src/util/show_help.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "psensor_file.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs);
|
||||
static pmix_status_t stop(pmix_peer_t *requestor, char *id);
|
||||
|
||||
/* instantiate the module */
|
||||
pmix_psensor_base_module_t pmix_psensor_file_module = {
|
||||
.start = start,
|
||||
.stop = stop
|
||||
};
|
||||
|
||||
/* define a tracking object */
|
||||
typedef struct {
|
||||
pmix_list_item_t super;
|
||||
pmix_peer_t *requestor;
|
||||
char *id;
|
||||
bool event_active;
|
||||
pmix_event_t ev;
|
||||
pmix_event_t cdev;
|
||||
struct timeval tv;
|
||||
int tick;
|
||||
char *file;
|
||||
bool file_size;
|
||||
bool file_access;
|
||||
bool file_mod;
|
||||
size_t last_size;
|
||||
time_t last_access;
|
||||
time_t last_mod;
|
||||
uint32_t ndrops;
|
||||
uint32_t nmisses;
|
||||
pmix_status_t error;
|
||||
pmix_data_range_t range;
|
||||
pmix_info_t *info;
|
||||
size_t ninfo;
|
||||
} file_tracker_t;
|
||||
static void ft_constructor(file_tracker_t *ft)
|
||||
{
|
||||
ft->requestor = NULL;
|
||||
ft->id = NULL;
|
||||
ft->event_active = false;
|
||||
ft->tv.tv_sec = 0;
|
||||
ft->tv.tv_usec = 0;
|
||||
ft->tick = 0;
|
||||
ft->file_size = false;
|
||||
ft->file_access = false;
|
||||
ft->file_mod = false;
|
||||
ft->last_size = 0;
|
||||
ft->last_access = 0;
|
||||
ft->last_mod = 0;
|
||||
ft->ndrops = 0;
|
||||
ft->nmisses = 0;
|
||||
ft->error = PMIX_SUCCESS;
|
||||
ft->range = PMIX_RANGE_NAMESPACE;
|
||||
ft->info = NULL;
|
||||
ft->ninfo = 0;
|
||||
}
|
||||
static void ft_destructor(file_tracker_t *ft)
|
||||
{
|
||||
if (NULL != ft->requestor) {
|
||||
PMIX_RELEASE(ft->requestor);
|
||||
}
|
||||
if (NULL != ft->id) {
|
||||
free(ft->id);
|
||||
}
|
||||
if (event_active) {
|
||||
pmix_event_del(&ft->ev);
|
||||
}
|
||||
if (NULL != ft->file) {
|
||||
free(ft->file);
|
||||
}
|
||||
if (NULL != ft->info) {
|
||||
PMIX_INFO_FREE(ft->info, ft->ninfo);
|
||||
}
|
||||
}
|
||||
PMIX_CLASS_INSTANCE(file_tracker_t,
|
||||
pmix_list_item_t,
|
||||
ft_constructor, ft_destructor);
|
||||
|
||||
/* define a local caddy */
|
||||
typedef struct {
|
||||
pmix_object_t super;
|
||||
pmix_event_t ev;
|
||||
pmix_peer_t *requestor;
|
||||
char *id;
|
||||
} file_caddy_t;
|
||||
static void cd_con(file_caddy_t *p)
|
||||
{
|
||||
p->requestor = NULL;
|
||||
p->id = NULL;
|
||||
}
|
||||
static void cd_des(file_caddy_t *p)
|
||||
{
|
||||
if (NULL != (p->requestor)) {
|
||||
PMIX_RELEASE(p->requestor);
|
||||
}
|
||||
if (NULL != p->id) {
|
||||
free(p->id);
|
||||
}
|
||||
}
|
||||
PMIX_CLASS_INSTANCE(file_caddy_t,
|
||||
pmix_object_t,
|
||||
cd_con, cd_des);
|
||||
|
||||
static void file_sample(int sd, short args, void *cbdata);
|
||||
|
||||
static void add_tracker(int sd, short flags, void *cbdata)
|
||||
{
|
||||
file_tracker_t *ft = (file_tracker_t*)cbdata;
|
||||
|
||||
/* add the tracker to our list */
|
||||
pmix_list_append(&mca_psensor_file_component.trackers, &ft->super);
|
||||
|
||||
/* setup the timer event */
|
||||
pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev,
|
||||
file_sample, ft);
|
||||
pmix_event_evtimer_add(&ft->ev, &ft->tv);
|
||||
ft->event_active = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs)
|
||||
{
|
||||
file_tracker_t *ft;
|
||||
pmix_info_t *ptr;
|
||||
size_t n, n2;
|
||||
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] checking file monitoring for requestor %s:%d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
requestor->info->nptr->nspace, requestor->info->rank));
|
||||
|
||||
/* if they didn't ask to monitor a file, then nothing for us to do */
|
||||
if (0 != strcmp(monitor->key, PMIX_MONITOR_FILE)) {
|
||||
return PMIX_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
|
||||
/* setup to track this monitoring operation */
|
||||
ft = PMIX_NEW(file_tracker_t);
|
||||
PMIX_RETAIN(requestor);
|
||||
ft->requestor = requestor;
|
||||
ft->file = strdup(monitor->value.data.string);
|
||||
|
||||
/* check the directives to see if what they want monitored */
|
||||
for (n=0; n < ndirs; n++) {
|
||||
if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_SIZE)) {
|
||||
ft->file_size = directives[n].value.data.flag;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_ACCESS)) {
|
||||
ft->file_access = directives[n].value.data.flag;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_MODIFY)) {
|
||||
ft->file_mod = directives[n].value.data.flag;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_DROPS)) {
|
||||
ft->ndrops = directives[n].value.data.uint32;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_CHECK_TIME)) {
|
||||
ft->tv.tv_sec = directives[n].value.data.uint32;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_RANGE)) {
|
||||
ft->range = directives[n].value.data.range;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == ft->tv.tv_sec ||
|
||||
(!ft->file_size && !ft->file_access && !ft->file_mod)) {
|
||||
/* didn't specify a sample rate, or what should be sampled */
|
||||
PMIX_RELEASE(ft);
|
||||
return PMIX_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* need to push into our event base to add this to our trackers */
|
||||
pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
|
||||
EV_WRITE, add_tracker, ft);
|
||||
pmix_event_active(&ft->cdev, EV_WRITE, 1);
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void del_tracker(int sd, short flags, void *cbdata)
|
||||
{
|
||||
file_caddy_t *cd = (file_caddy_t*)cbdata;
|
||||
file_tracker_t *ft, *ftnext;
|
||||
|
||||
/* remove the tracker from our list */
|
||||
PMIX_LIST_FOREACH_SAFE(ft, ftnext, &mca_psensor_file_component.trackers, file_tracker_t) {
|
||||
if (ft->requestor != cd->requestor) {
|
||||
continue;
|
||||
}
|
||||
if (NULL == cd->id ||
|
||||
(NULL != ft->id && 0 == strcmp(ft->id, cd->id))) {
|
||||
pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super);
|
||||
PMIX_RELEASE(ft);
|
||||
}
|
||||
}
|
||||
PMIX_RELEASE(cd);
|
||||
}
|
||||
|
||||
/* Stop monitoring on behalf of a requestor.
 *
 * The request is packaged in a caddy and handed to the event base, where
 * del_tracker does the actual removal. id may be NULL, which del_tracker
 * treats as "every tracker of this requestor". Always returns
 * PMIX_SUCCESS. */
static pmix_status_t stop(pmix_peer_t *requestor, char *id)
{
    file_caddy_t *cd;

    cd = PMIX_NEW(file_caddy_t);
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;
    /* strdup must not be given NULL; the constructor already left
     * cd->id as NULL for the wildcard case */
    if (NULL != id) {
        cd->id = strdup(id);
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
|
||||
|
||||
/* Completion callback given to PMIx_Notify_event by file_sample: the
 * tracker passed as cbdata is released here. status is not examined. */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
    file_tracker_t *tracker = (file_tracker_t*)cbdata;

    PMIX_RELEASE(tracker);
}
|
||||
|
||||
static void file_sample(int sd, short args, void *cbdata)
|
||||
{
|
||||
file_tracker_t *ft = (file_tracker_t*)cbdata;
|
||||
struct stat buf;
|
||||
pmix_status_t rc;
|
||||
pmix_proc_t source;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] sampling file %s",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
ft->file));
|
||||
|
||||
/* stat the file and get its info */
|
||||
if (0 > stat(ft->file, &buf)) {
|
||||
/* cannot stat file */
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] could not stat %s",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
ft->file));
|
||||
/* re-add the timer, in case this file shows up */
|
||||
pmix_event_evtimer_add(&ft->ev, &ft->tv);
|
||||
return;
|
||||
}
|
||||
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] size %lu access %s\tmod %s",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
(unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime)));
|
||||
|
||||
if (ft->file_size) {
|
||||
if (buf.st_size == ft->last_size) {
|
||||
ft->nmisses++;
|
||||
} else {
|
||||
ft->nmisses = 0;
|
||||
ft->last_size = buf.st_size;
|
||||
}
|
||||
} else if (ft->file_access) {
|
||||
if (buf.st_atime == ft->last_access) {
|
||||
ft->nmisses++;
|
||||
} else {
|
||||
ft->nmisses = 0;
|
||||
ft->last_access = buf.st_atime;
|
||||
}
|
||||
} else if (ft->file_mod) {
|
||||
if (buf.st_mtime == ft->last_mod) {
|
||||
ft->nmisses++;
|
||||
} else {
|
||||
ft->nmisses = 0;
|
||||
ft->last_mod = buf.st_mtime;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK:
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] sampled file %s misses %d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
ft->file, ft->nmisses));
|
||||
|
||||
if (ft->nmisses == ft->ndrops) {
|
||||
if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) {
|
||||
pmix_show_help("help-pmix-psensor-file.txt", "file-stalled", true,
|
||||
ft->file, ft->last_size, ctime(&ft->last_access), ctime(&ft->last_mod));
|
||||
}
|
||||
/* stop monitoring this client */
|
||||
pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super);
|
||||
/* generate an event */
|
||||
(void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
|
||||
source.rank = ft->requestor->info->rank;
|
||||
rc = PMIx_Notify_event(PMIX_MONITOR_FILE_ALERT, &source,
|
||||
ft->range, ft->info, ft->ninfo, opcbfunc, ft);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* re-add the timer */
|
||||
pmix_event_evtimer_add(&ft->ev, &ft->tv);
|
||||
}
|
38
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h
Обычный файл
38
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h
Обычный файл
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* File movement sensor
|
||||
*/
|
||||
#ifndef PMIX_PSENSOR_FILE_H
|
||||
#define PMIX_PSENSOR_FILE_H
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
|
||||
#include "src/mca/psensor/psensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
pmix_psensor_base_component_t super;
|
||||
pmix_list_t trackers;
|
||||
} pmix_psensor_file_component_t;
|
||||
|
||||
extern pmix_psensor_file_component_t mca_psensor_file_component;
|
||||
extern pmix_psensor_base_module_t pmix_psensor_file_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "src/mca/psensor/file/psensor_file.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int psensor_file_open(void);
|
||||
static int psensor_file_close(void);
|
||||
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority);
|
||||
|
||||
pmix_psensor_file_component_t mca_psensor_file_component = {
|
||||
.super = {
|
||||
.base = {
|
||||
PMIX_PSENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
.pmix_mca_component_name = "file",
|
||||
PMIX_MCA_BASE_MAKE_VERSION(component,
|
||||
PMIX_MAJOR_VERSION,
|
||||
PMIX_MINOR_VERSION,
|
||||
PMIX_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
psensor_file_open, /* component open */
|
||||
psensor_file_close, /* component close */
|
||||
psensor_file_query /* component query */
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int psensor_file_open(void)
|
||||
{
|
||||
PMIX_CONSTRUCT(&mca_psensor_file_component.trackers, pmix_list_t);
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: hand back the file sensor module together with a
 * fixed priority of 20 (the original code marks the value as irrelevant).
 */
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority)
{
    *module = (pmix_mca_base_module_t *)&pmix_psensor_file_module;
    *priority = 20;  /* irrelevant */

    return PMIX_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int psensor_file_close(void)
|
||||
{
|
||||
PMIX_LIST_DESTRUCT(&mca_psensor_file_component.trackers);
|
||||
return PMIX_SUCCESS;
|
||||
}
|
38
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am
Обычный файл
38
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am
Обычный файл
@ -0,0 +1,38 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pmixdata_DATA = help-pmix-psensor-heartbeat.txt
|
||||
|
||||
sources = \
|
||||
psensor_heartbeat.c \
|
||||
psensor_heartbeat.h \
|
||||
psensor_heartbeat_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_pmix_psensor_heartbeat_DSO
|
||||
component_noinst =
|
||||
component_install = mca_psensor_heartbeat.la
|
||||
else
|
||||
component_noinst = libmca_psensor_heartbeat.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pmixlibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_psensor_heartbeat_la_SOURCES = $(sources)
|
||||
mca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_psensor_heartbeat_la_SOURCES =$(sources)
|
||||
libmca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
@ -4,9 +4,9 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
@ -18,4 +18,3 @@ Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
@ -0,0 +1,330 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
#include <pthread.h>
|
||||
#include PMIX_EVENT_HEADER
|
||||
|
||||
#include "src/util/argv.h"
|
||||
#include "src/util/error.h"
|
||||
#include "src/util/output.h"
|
||||
#include "src/util/show_help.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
#include "src/mca/ptl/ptl.h"
|
||||
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "psensor_heartbeat.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs);
|
||||
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id);
|
||||
|
||||
/* instantiate the module */
|
||||
pmix_psensor_base_module_t pmix_psensor_heartbeat_module = {
|
||||
.start = heartbeat_start,
|
||||
.stop = heartbeat_stop
|
||||
};
|
||||
|
||||
/* tracker object */
|
||||
/* One instance exists per heartbeat-monitoring request; instances are
 * kept on mca_psensor_heartbeat_component.trackers. */
typedef struct {
|
||||
pmix_list_item_t super;        /* list linkage / object base class */
|
||||
pmix_peer_t *requestor;        /* peer being monitored; retained in heartbeat_start, released by the destructor */
|
||||
char *id;                      /* id string matched in del_tracker; freed by the destructor */
|
||||
bool event_active;             /* set to true once add_tracker has armed the timer */
|
||||
pmix_event_t ev;               /* timer event that fires check_heartbeat */
|
||||
pmix_event_t cdev;             /* event used to run add_tracker inside the psensor event base */
|
||||
struct timeval tv;             /* timer period; tv_sec comes from PMIX_MONITOR_HEARTBEAT_TIME */
|
||||
uint32_t nbeats;               /* beats counted in the current window; reset by check_heartbeat */
|
||||
uint32_t ndrops;               /* value of the PMIX_MONITOR_HEARTBEAT_DROPS directive */
|
||||
uint32_t nmissed;              /* initialized to 0; not otherwise used in this file */
|
||||
pmix_status_t error;           /* the "error" argument passed to heartbeat_start */
|
||||
pmix_data_range_t range;       /* range handed to PMIx_Notify_event (default PMIX_RANGE_NAMESPACE) */
|
||||
pmix_info_t *info;             /* info array handed to PMIx_Notify_event; freed by the destructor */
|
||||
size_t ninfo;                  /* number of elements in info */
|
||||
} pmix_heartbeat_trkr_t;
|
||||
|
||||
/*
 * Constructor for a heartbeat tracker: nothing is attached yet, the
 * timer has no period and is not armed, all counters are zero, and
 * the notification range defaults to PMIX_RANGE_NAMESPACE.
 */
static void ft_constructor(pmix_heartbeat_trkr_t *ft)
{
    /* nothing attached yet */
    ft->requestor = NULL;
    ft->id = NULL;
    ft->info = NULL;
    ft->ninfo = 0;

    /* timer has no period and is not armed */
    ft->event_active = false;
    ft->tv.tv_sec = 0;
    ft->tv.tv_usec = 0;

    /* counters */
    ft->nbeats = 0;
    ft->ndrops = 0;
    ft->nmissed = 0;

    /* reporting defaults */
    ft->error = PMIX_SUCCESS;
    ft->range = PMIX_RANGE_NAMESPACE;
}
|
||||
/*
 * Destructor for a heartbeat tracker: drops the reference held on the
 * requestor, frees the id string, removes the timer event if it was
 * armed, and frees the info array.
 */
static void ft_destructor(pmix_heartbeat_trkr_t *ft)
{
    if (NULL != ft->requestor) {
        PMIX_RELEASE(ft->requestor);
    }
    if (NULL != ft->id) {
        free(ft->id);
    }
    /* event_active is a member of the tracker - a bare "event_active"
     * names nothing in this scope and does not compile */
    if (ft->event_active) {
        pmix_event_del(&ft->ev);
    }
    if (NULL != ft->info) {
        PMIX_INFO_FREE(ft->info, ft->ninfo);
    }
}
|
||||
PMIX_CLASS_INSTANCE(pmix_heartbeat_trkr_t,
|
||||
pmix_list_item_t,
|
||||
ft_constructor, ft_destructor);
|
||||
|
||||
/* define a local caddy */
|
||||
typedef struct {
|
||||
pmix_object_t super;
|
||||
pmix_event_t ev;
|
||||
pmix_peer_t *requestor;
|
||||
char *id;
|
||||
} heartbeat_caddy_t;
|
||||
/* Constructor for the stop-request caddy: no id and no requestor yet. */
static void cd_con(heartbeat_caddy_t *p)
{
    p->id = NULL;
    p->requestor = NULL;
}
|
||||
/* Destructor for the stop-request caddy: drop the requestor reference
 * (if one was taken) and free the id string. */
static void cd_des(heartbeat_caddy_t *p)
{
    if (p->requestor != NULL) {
        PMIX_RELEASE(p->requestor);
    }
    /* free() accepts NULL, so no guard is needed */
    free(p->id);
}
|
||||
PMIX_CLASS_INSTANCE(heartbeat_caddy_t,
|
||||
pmix_object_t,
|
||||
cd_con, cd_des);
|
||||
|
||||
typedef struct {
|
||||
pmix_object_t super;
|
||||
pmix_event_t ev;
|
||||
pmix_peer_t *peer;
|
||||
} pmix_psensor_beat_t;
|
||||
|
||||
/* Constructor for a beat caddy: no peer attached yet. */
static void bcon(pmix_psensor_beat_t *p)
{
    p->peer = NULL;
}
|
||||
/* Destructor for a beat caddy: drop the peer reference if one is held. */
static void bdes(pmix_psensor_beat_t *p)
{
    if (p->peer == NULL) {
        return;
    }
    PMIX_RELEASE(p->peer);
}
|
||||
PMIX_CLASS_INSTANCE(pmix_psensor_beat_t,
|
||||
pmix_object_t,
|
||||
bcon, bdes);
|
||||
|
||||
static void check_heartbeat(int fd, short dummy, void *arg);
|
||||
|
||||
static void add_tracker(int sd, short flags, void *cbdata)
|
||||
{
|
||||
pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
|
||||
|
||||
/* add the tracker to our list */
|
||||
pmix_list_append(&mca_psensor_heartbeat_component.trackers, &ft->super);
|
||||
|
||||
/* setup the timer event */
|
||||
pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev,
|
||||
check_heartbeat, ft);
|
||||
pmix_event_evtimer_add(&ft->ev, &ft->tv);
|
||||
ft->event_active = true;
|
||||
}
|
||||
|
||||
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs)
|
||||
{
|
||||
pmix_heartbeat_trkr_t *ft;
|
||||
size_t n, n2;
|
||||
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] checking heartbeat monitoring for requestor %s:%d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
requestor->info->nptr->nspace, requestor->info->rank));
|
||||
|
||||
/* if they didn't ask for heartbeats, then nothing for us to do */
|
||||
if (0 != strcmp(monitor->key, PMIX_MONITOR_HEARTBEAT)) {
|
||||
return PMIX_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
|
||||
/* setup to track this monitoring operation */
|
||||
ft = PMIX_NEW(pmix_heartbeat_trkr_t);
|
||||
PMIX_RETAIN(requestor);
|
||||
ft->requestor = requestor;
|
||||
ft->error = error;
|
||||
|
||||
/* check the directives to see what they want monitored */
|
||||
for (n=0; n < ndirs; n++) {
|
||||
if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_TIME)) {
|
||||
ft->tv.tv_sec = directives[n].value.data.uint32;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_DROPS)) {
|
||||
ft->ndrops = directives[n].value.data.uint32;
|
||||
} else if (0 == strcmp(directives[n].key, PMIX_RANGE)) {
|
||||
ft->range = directives[n].value.data.range;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == ft->tv.tv_sec) {
|
||||
/* didn't specify a sample rate, or what should be sampled */
|
||||
PMIX_RELEASE(ft);
|
||||
return PMIX_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* need to push into our event base to add this to our trackers */
|
||||
pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
|
||||
EV_WRITE, add_tracker, ft);
|
||||
pmix_event_active(&ft->cdev, EV_WRITE, 1);
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
static void del_tracker(int sd, short flags, void *cbdata)
|
||||
{
|
||||
heartbeat_caddy_t *cd = (heartbeat_caddy_t*)cbdata;
|
||||
pmix_heartbeat_trkr_t *ft, *ftnext;
|
||||
|
||||
/* remove the tracker from our list */
|
||||
PMIX_LIST_FOREACH_SAFE(ft, ftnext, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
|
||||
if (ft->requestor != cd->requestor) {
|
||||
continue;
|
||||
}
|
||||
if (NULL == cd->id ||
|
||||
(NULL != ft->id && 0 == strcmp(ft->id, cd->id))) {
|
||||
pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super);
|
||||
PMIX_RELEASE(ft);
|
||||
}
|
||||
}
|
||||
PMIX_RELEASE(cd);
|
||||
}
|
||||
|
||||
/*
 * Stop heartbeat monitoring started by a requestor.
 *
 * requestor - peer whose monitoring operations are to be stopped
 *             (a reference is retained until the request is processed)
 * id        - id of the operation to stop; NULL stops every operation
 *             started by this requestor (del_tracker treats a NULL
 *             caddy id as "match all")
 *
 * Returns PMIX_ERR_NOMEM if the request cannot be allocated, otherwise
 * PMIX_SUCCESS once the request has been handed to the psensor event base.
 */
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id)
{
    heartbeat_caddy_t *cd;

    cd = PMIX_NEW(heartbeat_caddy_t);
    if (NULL == cd) {
        return PMIX_ERR_NOMEM;
    }
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;

    /* strdup(NULL) is undefined, and NULL is a legal id - the caddy's
     * constructor already left cd->id NULL for that case */
    if (NULL != id) {
        cd->id = strdup(id);
        if (NULL == cd->id) {
            /* must not fall through: a NULL id would be read as
             * "stop everything" by del_tracker */
            PMIX_RELEASE(cd);
            return PMIX_ERR_NOMEM;
        }
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
|
||||
|
||||
/*
 * Completion callback handed to PMIx_Notify_event by check_heartbeat:
 * releases the tracker that was passed as cbdata. The status is ignored.
 */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
    pmix_heartbeat_trkr_t *trk = (pmix_heartbeat_trkr_t*)cbdata;

    PMIX_RELEASE(trk);
}
|
||||
|
||||
/* this function automatically gets periodically called
|
||||
* by the event library so we can check on the state
|
||||
* of the various procs we are monitoring
|
||||
*/
|
||||
static void check_heartbeat(int fd, short dummy, void *cbdata)
|
||||
{
|
||||
pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
|
||||
pmix_status_t rc;
|
||||
pmix_proc_t source;
|
||||
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] sensor:check_heartbeat for proc %s:%d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
|
||||
|
||||
if (0 == ft->nbeats) {
|
||||
/* no heartbeat recvd in last window */
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] sensor:check_heartbeat failed for proc %s:%d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank,
|
||||
ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
|
||||
/* stop monitoring this client */
|
||||
pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super);
|
||||
/* generate an event */
|
||||
(void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
|
||||
source.rank = ft->requestor->info->rank;
|
||||
rc = PMIx_Notify_event(PMIX_MONITOR_HEARTBEAT_ALERT, &source,
|
||||
ft->range, ft->info, ft->ninfo, opcbfunc, ft);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
|
||||
"[%s:%d] sensor:check_heartbeat detected %d beats for proc %s:%d",
|
||||
pmix_globals.myid.nspace, pmix_globals.myid.rank, ft->nbeats,
|
||||
ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
|
||||
}
|
||||
/* reset for next period */
|
||||
ft->nbeats = 0;
|
||||
|
||||
/* reset the timer */
|
||||
pmix_event_evtimer_add(&ft->ev, &ft->tv);
|
||||
}
|
||||
|
||||
static void add_beat(int sd, short args, void *cbdata)
|
||||
{
|
||||
pmix_psensor_beat_t *b = (pmix_psensor_beat_t*)cbdata;
|
||||
pmix_heartbeat_trkr_t *ft;
|
||||
|
||||
/* find this peer in our trackers */
|
||||
PMIX_LIST_FOREACH(ft, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
|
||||
if (ft->requestor == b->peer) {
|
||||
/* increment the beat count */
|
||||
++ft->nbeats;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
PMIX_RELEASE(b);
|
||||
}
|
||||
|
||||
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
|
||||
pmix_ptl_hdr_t *hdr,
|
||||
pmix_buffer_t *buf, void *cbdata)
|
||||
{
|
||||
pmix_psensor_beat_t *b;
|
||||
|
||||
b = PMIX_NEW(pmix_psensor_beat_t);
|
||||
PMIX_RETAIN(peer);
|
||||
b->peer = peer;
|
||||
|
||||
/* shift this to our thread for processing */
|
||||
pmix_event_assign(&b->ev, pmix_psensor_base.evbase, -1,
|
||||
EV_WRITE, add_beat, b);
|
||||
pmix_event_active(&b->ev, EV_WRITE, 1);
|
||||
}
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Heartbeat sensor
|
||||
*/
|
||||
#ifndef PMIX_PSENSOR_HEARTBEAT_H
|
||||
#define PMIX_PSENSOR_HEARTBEAT_H
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <src/include/types.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
#include "src/mca/psensor/psensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
typedef struct {
|
||||
pmix_psensor_base_component_t super;
|
||||
pmix_list_t trackers;
|
||||
} pmix_psensor_heartbeat_component_t;
|
||||
|
||||
PMIX_EXPORT extern pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component;
|
||||
extern pmix_psensor_base_module_t pmix_psensor_heartbeat_module;
|
||||
|
||||
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
|
||||
pmix_ptl_hdr_t *hdr,
|
||||
pmix_buffer_t *buf, void *cbdata);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
#include <pmix_common.h>
|
||||
|
||||
#include "src/mca/ptl/ptl.h"
|
||||
#include "src/mca/psensor/base/base.h"
|
||||
#include "src/mca/psensor/heartbeat/psensor_heartbeat.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int heartbeat_open(void);
|
||||
static int heartbeat_close(void);
|
||||
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority);
|
||||
|
||||
pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = {
|
||||
.super = {
|
||||
.base = {
|
||||
PMIX_PSENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
.pmix_mca_component_name = "heartbeat",
|
||||
PMIX_MCA_BASE_MAKE_VERSION(component,
|
||||
PMIX_MAJOR_VERSION,
|
||||
PMIX_MINOR_VERSION,
|
||||
PMIX_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
heartbeat_open, /* component open */
|
||||
heartbeat_close, /* component close */
|
||||
heartbeat_query /* component query */
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int heartbeat_open(void)
|
||||
{
|
||||
PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t);
|
||||
|
||||
/* setup to receive heartbeats */
|
||||
pmix_ptl.recv(pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT);
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query: hand back the heartbeat sensor module together with
 * a fixed priority of 5 (the original code marks the value as irrelevant).
 */
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority)
{
    *module = (pmix_mca_base_module_t *)&pmix_psensor_heartbeat_module;
    *priority = 5;  /* irrelevant */

    return PMIX_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int heartbeat_close(void)
|
||||
{
|
||||
/* cancel our persistent recv */
|
||||
pmix_ptl.cancel(pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT);
|
||||
|
||||
PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers);
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
86
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h
Обычный файл
86
opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h
Обычный файл
@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* @file:
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef PMIX_PSENSOR_H_
|
||||
#define PMIX_PSENSOR_H_
|
||||
|
||||
#include <src/include/pmix_config.h>
|
||||
|
||||
#include "src/class/pmix_list.h"
|
||||
#include "src/mca/mca.h"
|
||||
#include "src/include/pmix_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* start a sensor operation:
|
||||
*
|
||||
* requestor - the process requesting this operation
|
||||
*
|
||||
* monitor - a PMIx attribute specifying what is to be monitored
|
||||
*
|
||||
* directives - an array of pmix_info_t specifying relevant limits on values, and action
|
||||
* to be taken when limits exceeded. Can include
|
||||
* user-provided "id" string */
|
||||
typedef pmix_status_t (*pmix_psensor_base_module_start_fn_t)(pmix_peer_t *requestor, pmix_status_t error,
|
||||
const pmix_info_t *monitor,
|
||||
const pmix_info_t directives[], size_t ndirs);
|
||||
|
||||
/* stop a sensor operation:
|
||||
*
|
||||
* requestor - the process requesting this operation
|
||||
*
|
||||
* id - the "id" string provided by the user at the time the
|
||||
* affected monitoring operation was started. A NULL indicates
|
||||
* that all operations started by this requestor are to
|
||||
* be terminated */
|
||||
typedef pmix_status_t (*pmix_psensor_base_module_stop_fn_t)(pmix_peer_t *requestor,
|
||||
char *id);
|
||||
|
||||
/* API module */
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
typedef struct pmix_psensor_base_module_1_0_0_t {
|
||||
pmix_psensor_base_module_start_fn_t start;
|
||||
pmix_psensor_base_module_stop_fn_t stop;
|
||||
} pmix_psensor_base_module_t;
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
typedef struct pmix_psensor_base_component_1_0_0_t {
|
||||
pmix_mca_base_component_t base;
|
||||
pmix_mca_base_component_data_t data;
|
||||
} pmix_psensor_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type sensor v1.0.0
|
||||
*/
|
||||
#define PMIX_PSENSOR_BASE_VERSION_1_0_0 \
|
||||
PMIX_MCA_BASE_VERSION_1_0_0("psensor", 1, 0, 0)
|
||||
|
||||
/* Global structure for accessing sensor functions
|
||||
*/
|
||||
PMIX_EXPORT extern pmix_psensor_base_module_t pmix_psensor; /* holds API function pointers */
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* PMIX_PSENSOR_H_ */
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -73,6 +73,7 @@ struct pmix_ptl_globals_t {
|
||||
pmix_list_t actives;
|
||||
bool initialized;
|
||||
pmix_list_t posted_recvs; // list of pmix_ptl_posted_recv_t
|
||||
pmix_list_t unexpected_msgs;
|
||||
int stop_thread[2];
|
||||
bool listen_thread_active;
|
||||
pmix_list_t listeners;
|
||||
@ -93,6 +94,11 @@ PMIX_EXPORT pmix_status_t pmix_ptl_stub_send_oneway(struct pmix_peer_t *peer,
|
||||
pmix_ptl_tag_t tag);
|
||||
PMIX_EXPORT pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
|
||||
pmix_info_t info[], size_t ninfo);
|
||||
PMIX_EXPORT pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
|
||||
pmix_ptl_cbfunc_t cbfunc,
|
||||
pmix_ptl_tag_t tag);
|
||||
PMIX_EXPORT pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
|
||||
pmix_ptl_tag_t tag);
|
||||
|
||||
PMIX_EXPORT pmix_status_t pmix_ptl_base_start_listening(pmix_info_t *info, size_t ninfo);
|
||||
PMIX_EXPORT void pmix_ptl_base_stop_listening(void);
|
||||
|
@ -61,6 +61,8 @@ pmix_ptl_API_t pmix_ptl = {
|
||||
.send_recv = pmix_ptl_stub_send_recv,
|
||||
.send_oneway = pmix_ptl_stub_send_oneway,
|
||||
.connect_to_peer = pmix_ptl_stub_connect_to_peer,
|
||||
.recv = pmix_ptl_stub_register_recv,
|
||||
.cancel = pmix_ptl_stub_cancel_recv,
|
||||
.start_listening = pmix_ptl_base_start_listening,
|
||||
.stop_listening = pmix_ptl_base_stop_listening
|
||||
};
|
||||
@ -88,6 +90,7 @@ static pmix_status_t pmix_ptl_close(void)
|
||||
/* the components will cleanup when closed */
|
||||
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.actives);
|
||||
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.posted_recvs);
|
||||
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.unexpected_msgs);
|
||||
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.listeners);
|
||||
|
||||
return pmix_mca_base_framework_components_close(&pmix_ptl_base_framework, NULL);
|
||||
@ -99,6 +102,7 @@ static pmix_status_t pmix_ptl_open(pmix_mca_base_open_flag_t flags)
|
||||
pmix_ptl_globals.initialized = true;
|
||||
PMIX_CONSTRUCT(&pmix_ptl_globals.actives, pmix_list_t);
|
||||
PMIX_CONSTRUCT(&pmix_ptl_globals.posted_recvs, pmix_list_t);
|
||||
PMIX_CONSTRUCT(&pmix_ptl_globals.unexpected_msgs, pmix_list_t);
|
||||
pmix_ptl_globals.listen_thread_active = false;
|
||||
PMIX_CONSTRUCT(&pmix_ptl_globals.listeners, pmix_list_t);
|
||||
pmix_client_globals.myserver.sd = -1;
|
||||
|
@ -46,7 +46,7 @@
|
||||
|
||||
#include "src/mca/ptl/base/base.h"
|
||||
|
||||
static uint32_t current_tag = 1; // 0 is reserved for system purposes
|
||||
static uint32_t current_tag = PMIX_PTL_TAG_DYNAMIC;
|
||||
|
||||
static void _notify_complete(pmix_status_t status, void *cbdata)
|
||||
{
|
||||
@ -162,7 +162,7 @@ static pmix_status_t send_msg(int sd, pmix_ptl_send_t *msg)
|
||||
} else {
|
||||
iov_count = 1;
|
||||
}
|
||||
retry:
|
||||
retry:
|
||||
rc = writev(sd, iov, iov_count);
|
||||
if (PMIX_LIKELY(rc == remain)) {
|
||||
/* we successfully sent the header and the msg data if any */
|
||||
@ -521,16 +521,16 @@ void pmix_ptl_base_send_recv(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* set the tag */
|
||||
tag = current_tag++;
|
||||
/* take the next tag in the sequence */
|
||||
current_tag++;
|
||||
if (UINT32_MAX == current_tag ) {
|
||||
current_tag = PMIX_PTL_TAG_DYNAMIC;
|
||||
}
|
||||
tag = current_tag;
|
||||
|
||||
if (NULL != ms->cbfunc) {
|
||||
/* if a callback msg is expected, setup a recv for it */
|
||||
req = PMIX_NEW(pmix_ptl_posted_recv_t);
|
||||
/* take the next tag in the sequence */
|
||||
if (UINT32_MAX == current_tag ) {
|
||||
current_tag = 1;
|
||||
}
|
||||
req->tag = tag;
|
||||
req->cbfunc = ms->cbfunc;
|
||||
req->cbdata = ms->cbdata;
|
||||
@ -597,23 +597,29 @@ void pmix_ptl_base_process_msg(int fd, short flags, void *cbdata)
|
||||
buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used;
|
||||
}
|
||||
msg->data = NULL; // protect the data region
|
||||
if (NULL != rcv->cbfunc) {
|
||||
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
|
||||
}
|
||||
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
|
||||
PMIX_DESTRUCT(&buf); // free's the msg data
|
||||
/* also done with the recv, if not a wildcard or the error tag */
|
||||
if (UINT32_MAX != rcv->tag && 0 != rcv->tag) {
|
||||
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
|
||||
PMIX_RELEASE(rcv);
|
||||
}
|
||||
PMIX_RELEASE(msg);
|
||||
return;
|
||||
}
|
||||
/* done with the recv if it is a dynamic tag */
|
||||
if (PMIX_PTL_TAG_DYNAMIC <= rcv->tag && UINT_MAX != rcv->tag) {
|
||||
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
|
||||
PMIX_RELEASE(rcv);
|
||||
}
|
||||
PMIX_RELEASE(msg);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* we get here if no matching recv was found - this is an error */
|
||||
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
|
||||
PMIX_RELEASE(msg);
|
||||
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
|
||||
/* if the tag in this message is above the dynamic marker, then
|
||||
* that is an error */
|
||||
if (PMIX_PTL_TAG_DYNAMIC <= msg->hdr.tag) {
|
||||
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
|
||||
PMIX_RELEASE(msg);
|
||||
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
|
||||
return;
|
||||
}
|
||||
|
||||
/* it is possible that someone may post a recv for this message
|
||||
* at some point, so we have to hold onto it */
|
||||
pmix_list_append(&pmix_ptl_globals.unexpected_msgs, &msg->super);
|
||||
}
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -105,3 +105,92 @@ pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
|
||||
|
||||
return PMIX_ERR_UNREACH;
|
||||
}
|
||||
|
||||
static void post_recv(int fd, short args, void *cbdata)
|
||||
{
|
||||
pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
|
||||
pmix_ptl_recv_t *msg, *nmsg;
|
||||
pmix_buffer_t buf;
|
||||
|
||||
pmix_output_verbose(5, pmix_globals.debug_output,
|
||||
"posting recv on tag %d", req->tag);
|
||||
|
||||
/* add it to the list of recvs */
|
||||
pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super);
|
||||
|
||||
/* now check the unexpected msg queue to see if we already
|
||||
* recvd something for it */
|
||||
PMIX_LIST_FOREACH_SAFE(msg, nmsg, &pmix_ptl_globals.unexpected_msgs, pmix_ptl_recv_t) {
|
||||
if (msg->hdr.tag == req->tag || UINT_MAX == req->tag) {
|
||||
if (NULL != req->cbfunc) {
|
||||
/* construct and load the buffer */
|
||||
PMIX_CONSTRUCT(&buf, pmix_buffer_t);
|
||||
if (NULL != msg->data) {
|
||||
buf.base_ptr = (char*)msg->data;
|
||||
buf.bytes_allocated = buf.bytes_used = msg->hdr.nbytes;
|
||||
buf.unpack_ptr = buf.base_ptr;
|
||||
buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used;
|
||||
}
|
||||
msg->data = NULL; // protect the data region
|
||||
req->cbfunc(msg->peer, &msg->hdr, &buf, req->cbdata);
|
||||
PMIX_DESTRUCT(&buf); // free's the msg data
|
||||
}
|
||||
pmix_list_remove_item(&pmix_ptl_globals.unexpected_msgs, &msg->super);
|
||||
PMIX_RELEASE(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
|
||||
pmix_ptl_cbfunc_t cbfunc,
|
||||
pmix_ptl_tag_t tag)
|
||||
{
|
||||
pmix_ptl_posted_recv_t *req;
|
||||
|
||||
req = PMIX_NEW(pmix_ptl_posted_recv_t);
|
||||
if (NULL == req) {
|
||||
return PMIX_ERR_NOMEM;
|
||||
}
|
||||
req->tag = tag;
|
||||
req->cbfunc = cbfunc;
|
||||
/* have to push this into an event so we can add this
|
||||
* to the list of posted recvs */
|
||||
pmix_event_assign(&(req->ev), pmix_globals.evbase, -1,
|
||||
EV_WRITE, post_recv, req);
|
||||
pmix_event_active(&(req->ev), EV_WRITE, 1);
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
static void cancel_recv(int fd, short args, void *cbdata)
|
||||
{
|
||||
pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
|
||||
pmix_ptl_posted_recv_t *rcv;
|
||||
|
||||
PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) {
|
||||
if (rcv->tag == req->tag) {
|
||||
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
|
||||
PMIX_RELEASE(rcv);
|
||||
PMIX_RELEASE(req);
|
||||
return;
|
||||
}
|
||||
}
|
||||
PMIX_RELEASE(req);
|
||||
}
|
||||
|
||||
pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
|
||||
pmix_ptl_tag_t tag)
|
||||
{
|
||||
pmix_ptl_posted_recv_t *req;
|
||||
|
||||
req = PMIX_NEW(pmix_ptl_posted_recv_t);
|
||||
if (NULL == req) {
|
||||
return PMIX_ERR_NOMEM;
|
||||
}
|
||||
req->tag = tag;
|
||||
/* have to push this into an event so we can modify
|
||||
* the list of posted recvs */
|
||||
pmix_event_assign(&(req->ev), pmix_globals.evbase, -1,
|
||||
EV_WRITE, cancel_recv, req);
|
||||
pmix_event_active(&(req->ev), EV_WRITE, 1);
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Mellanox Technologies, Inc.
|
||||
@ -110,6 +110,15 @@ typedef pmix_status_t (*pmix_ptl_send_fn_t)(struct pmix_peer_t *peer,
|
||||
pmix_buffer_t *bfr,
|
||||
pmix_ptl_tag_t tag);
|
||||
|
||||
/* (ONE-WAY) register a persistent recv */
|
||||
typedef pmix_status_t (*pmix_ptl_recv_fn_t)(struct pmix_peer_t *peer,
|
||||
pmix_ptl_cbfunc_t cbfunc,
|
||||
pmix_ptl_tag_t tag);
|
||||
|
||||
/* Cancel a persistent recv */
|
||||
typedef pmix_status_t (*pmix_ptl_cancel_fn_t)(struct pmix_peer_t *peer,
|
||||
pmix_ptl_tag_t tag);
|
||||
|
||||
/* connect to a peer - this is a blocking function
|
||||
* to establish a connection to a peer. It assigns
|
||||
* the corresponding module to the peer's compat
|
||||
@ -126,6 +135,8 @@ struct pmix_ptl_module_t {
|
||||
pmix_ptl_finalize_fn_t finalize;
|
||||
pmix_ptl_send_recv_fn_t send_recv;
|
||||
pmix_ptl_send_fn_t send;
|
||||
pmix_ptl_recv_fn_t recv;
|
||||
pmix_ptl_cancel_fn_t cancel;
|
||||
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
|
||||
};
|
||||
typedef struct pmix_ptl_module_t pmix_ptl_module_t;
|
||||
@ -152,6 +163,8 @@ typedef struct {
|
||||
pmix_ptl_get_available_modules_fn_t get_available_modules;
|
||||
pmix_ptl_send_recv_fn_t send_recv;
|
||||
pmix_ptl_send_fn_t send_oneway;
|
||||
pmix_ptl_recv_fn_t recv;
|
||||
pmix_ptl_cancel_fn_t cancel;
|
||||
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
|
||||
pmix_ptl_start_listening_fn_t start_listening;
|
||||
pmix_ptl_stop_listening_fn_t stop_listening;
|
||||
|
@ -63,6 +63,16 @@ struct pmix_ptl_module_t;
|
||||
|
||||
/**** MESSAGING STRUCTURES ****/
|
||||
typedef uint32_t pmix_ptl_tag_t;
|
||||
/* define a range of "reserved" tags - these
|
||||
* are tags that are used for persistent recvs
|
||||
* within the system */
|
||||
#define PMIX_PTL_TAG_NOTIFY 0
|
||||
#define PMIX_PTL_TAG_HEARTBEAT 1
|
||||
|
||||
/* define the start of dynamic tags that are
|
||||
* assigned for send/recv operations */
|
||||
#define PMIX_PTL_TAG_DYNAMIC 100
|
||||
|
||||
|
||||
/* header for messages */
|
||||
typedef struct {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -13,6 +13,11 @@
|
||||
|
||||
#include "pmix_config.h"
|
||||
|
||||
#include <pthread.h>
|
||||
#include PMIX_EVENT_HEADER
|
||||
|
||||
#include "src/include/types.h"
|
||||
|
||||
/**
|
||||
* Initialize a progress thread name; if a progress thread is not
|
||||
* already associated with that name, start a progress thread.
|
||||
|
@ -2345,6 +2345,18 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag,
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (PMIX_JOB_CONTROL_CMD == cmd) {
|
||||
PMIX_PEER_CADDY(cd, peer, tag);
|
||||
rc = pmix_server_job_ctrl(peer, buf, query_cbfunc, cd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (PMIX_MONITOR_CMD == cmd) {
|
||||
PMIX_PEER_CADDY(cd, peer, tag);
|
||||
rc = pmix_server_monitor(peer, buf, query_cbfunc, cd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
@ -1562,6 +1562,134 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
|
||||
return rc;
|
||||
}
|
||||
|
||||
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
|
||||
pmix_buffer_t *buf,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
int32_t cnt;
|
||||
pmix_status_t rc;
|
||||
pmix_query_caddy_t *cd;
|
||||
pmix_proc_t proc;
|
||||
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"recvd job control request from client");
|
||||
|
||||
if (NULL == pmix_host_server.job_control) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
cd = PMIX_NEW(pmix_query_caddy_t);
|
||||
cd->cbdata = cbdata;
|
||||
|
||||
/* unpack the number of targets */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ntargets, &cnt, PMIX_SIZE))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
if (0 < cd->ntargets) {
|
||||
PMIX_PROC_CREATE(cd->targets, cd->ntargets);
|
||||
cnt = cd->ntargets;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->targets, &cnt, PMIX_PROC))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
/* unpack the number of info objects */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
/* unpack the info */
|
||||
if (0 < cd->ninfo) {
|
||||
PMIX_INFO_CREATE(cd->info, cd->ninfo);
|
||||
cnt = cd->ninfo;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the requesting peer name */
|
||||
(void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
|
||||
proc.rank = peer->info->rank;
|
||||
|
||||
/* ask the host to execute the request */
|
||||
if (PMIX_SUCCESS != (rc = pmix_host_server.job_control(&proc,
|
||||
cd->targets, cd->ntargets,
|
||||
cd->info, cd->ninfo,
|
||||
cbfunc, cd))) {
|
||||
goto exit;
|
||||
}
|
||||
return PMIX_SUCCESS;
|
||||
|
||||
exit:
|
||||
PMIX_RELEASE(cd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
|
||||
pmix_buffer_t *buf,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
int32_t cnt;
|
||||
pmix_status_t rc, error;
|
||||
pmix_query_caddy_t *cd;
|
||||
pmix_proc_t proc;
|
||||
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"recvd monitor request from client");
|
||||
|
||||
if (NULL == pmix_host_server.monitor) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
cd = PMIX_NEW(pmix_query_caddy_t);
|
||||
cd->cbdata = cbdata;
|
||||
|
||||
/* unpack the error code */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &error, &cnt, PMIX_STATUS))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* unpack the number of directives */
|
||||
cnt = 1;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
/* unpack the directives */
|
||||
if (0 < cd->ninfo) {
|
||||
PMIX_INFO_CREATE(cd->info, cd->ninfo);
|
||||
cnt = cd->ninfo;
|
||||
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
|
||||
PMIX_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the requesting peer name */
|
||||
(void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
|
||||
proc.rank = peer->info->rank;
|
||||
|
||||
/* ask the host to execute the request */
|
||||
if (PMIX_SUCCESS != (rc = pmix_host_server.monitor(&proc, error,
|
||||
cd->info, cd->ninfo,
|
||||
cbfunc, cd))) {
|
||||
goto exit;
|
||||
}
|
||||
return PMIX_SUCCESS;
|
||||
|
||||
exit:
|
||||
PMIX_RELEASE(cd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/***** INSTANCE SERVER LIBRARY CLASSES *****/
|
||||
static void tcon(pmix_server_trkr_t *t)
|
||||
{
|
||||
|
@ -218,6 +218,16 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
|
||||
pmix_buffer_t *buf,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
|
||||
pmix_buffer_t *buf,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer,
|
||||
pmix_buffer_t *buf,
|
||||
pmix_op_cbfunc_t cbfunc,
|
||||
|
@ -56,6 +56,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
|
||||
return "INVALID-KEYVAL";
|
||||
case PMIX_ERR_INVALID_NUM_PARSED:
|
||||
return "INVALID-NUM-PARSED";
|
||||
case PMIX_ERR_TAKE_NEXT_OPTION:
|
||||
return "TAKE-NEXT-OPTION";
|
||||
|
||||
case PMIX_ERR_INVALID_ARGS:
|
||||
return "INVALID-ARGS";
|
||||
@ -157,6 +159,14 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
|
||||
return "PMIX_ERR_WILDCARD";
|
||||
case PMIX_NOTIFY_ALLOC_COMPLETE:
|
||||
return "PMIX ALLOC OPERATION COMPLETE";
|
||||
case PMIX_JCTRL_CHECKPOINT:
|
||||
return "PMIX JOB CONTROL CHECKPOINT";
|
||||
case PMIX_JCTRL_PREEMPT_ALERT:
|
||||
return "PMIX PRE-EMPTION ALERT";
|
||||
case PMIX_MONITOR_HEARTBEAT_ALERT:
|
||||
return "PMIX HEARTBEAT ALERT";
|
||||
case PMIX_MONITOR_FILE_ALERT:
|
||||
return "PMIX FILE MONITOR ALERT";
|
||||
case PMIX_SUCCESS:
|
||||
return "SUCCESS";
|
||||
default:
|
||||
|
@ -37,6 +37,7 @@
|
||||
#define PMIX_ERR_NETWORK_NOT_PARSEABLE (PMIX_INTERNAL_ERR_BASE - 33)
|
||||
#define PMIX_ERR_FILE_OPEN_FAILURE (PMIX_INTERNAL_ERR_BASE - 34)
|
||||
#define PMIX_ERR_FILE_READ_FAILURE (PMIX_INTERNAL_ERR_BASE - 35)
|
||||
#define PMIX_ERR_TAKE_NEXT_OPTION (PMIX_INTERNAL_ERR_BASE - 36)
|
||||
|
||||
#define PMIX_ERROR_LOG(r) \
|
||||
do { \
|
||||
|
@ -493,6 +493,12 @@ int pmix2x_convert_rc(pmix_status_t rc)
|
||||
case PMIX_QUERY_PARTIAL_SUCCESS:
|
||||
return OPAL_ERR_PARTIAL_SUCCESS;
|
||||
|
||||
case PMIX_MONITOR_HEARTBEAT_ALERT:
|
||||
return OPAL_ERR_HEARTBEAT_ALERT;
|
||||
|
||||
case PMIX_MONITOR_FILE_ALERT:
|
||||
return OPAL_ERR_FILE_ALERT;
|
||||
|
||||
case PMIX_ERROR:
|
||||
return OPAL_ERROR;
|
||||
case PMIX_SUCCESS:
|
||||
@ -1333,6 +1339,22 @@ static void pmix2x_log(opal_list_t *info,
|
||||
OBJ_RELEASE(cd);
|
||||
}
|
||||
|
||||
/* Translate a PMIx allocation directive into the corresponding OPAL
 * directive. Any value without a known counterpart is reported as
 * OPAL_PMIX_ALLOC_UNDEF. */
opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir)
{
    /* start from the fallback and overwrite it on a recognized value */
    opal_pmix_alloc_directive_t converted = OPAL_PMIX_ALLOC_UNDEF;

    if (PMIX_ALLOC_NEW == dir) {
        converted = OPAL_PMIX_ALLOC_NEW;
    } else if (PMIX_ALLOC_EXTEND == dir) {
        converted = OPAL_PMIX_ALLOC_EXTEND;
    } else if (PMIX_ALLOC_RELEASE == dir) {
        converted = OPAL_PMIX_ALLOC_RELEASE;
    } else if (PMIX_ALLOC_REAQUIRE == dir) {
        converted = OPAL_PMIX_ALLOC_REAQCUIRE;
    }

    return converted;
}
|
||||
|
||||
/**** INSTANTIATE INTERNAL CLASSES ****/
|
||||
OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t,
|
||||
opal_list_item_t,
|
||||
|
@ -279,6 +279,8 @@ OPAL_MODULE_DECLSPEC void pmix2x_value_load(pmix_value_t *v,
|
||||
OPAL_MODULE_DECLSPEC int pmix2x_value_unload(opal_value_t *kv,
|
||||
const pmix_value_t *v);
|
||||
|
||||
OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_PMIX_EXTERNAL_H */
|
||||
|
@ -45,63 +45,73 @@
|
||||
/* These are the interfaces used by the embedded PMIx server
|
||||
* to call up into ORTE for service requests */
|
||||
|
||||
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
|
||||
int status, const char msg[],
|
||||
pmix_proc_t procs[], size_t nprocs,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
char *data, size_t ndata,
|
||||
pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
|
||||
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
|
||||
int status, const char msg[],
|
||||
pmix_proc_t procs[], size_t nprocs,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
|
||||
char *data, size_t ndata,
|
||||
pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t job_info[], size_t ninfo,
|
||||
const pmix_app_t apps[], size_t napps,
|
||||
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
|
||||
const pmix_info_t job_info[], size_t ninfo,
|
||||
const pmix_app_t apps[], size_t napps,
|
||||
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_notify_event(pmix_status_t code,
|
||||
const pmix_proc_t *source,
|
||||
pmix_data_range_t range,
|
||||
pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_query(pmix_proc_t *proct,
|
||||
pmix_query_t *queryies, size_t nqueries,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
|
||||
const pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_notify_event(pmix_status_t code,
|
||||
const pmix_proc_t *source,
|
||||
pmix_data_range_t range,
|
||||
pmix_info_t info[], size_t ninfo,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static pmix_status_t server_query(pmix_proc_t *proct,
|
||||
pmix_query_t *queryies, size_t nqueries,
|
||||
pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
|
||||
pmix_tool_connection_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
|
||||
pmix_tool_connection_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
static void server_log(const pmix_proc_t *client,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
pmix_server_module_t mymodule = {
|
||||
static pmix_status_t server_allocate(const pmix_proc_t *client,
|
||||
pmix_alloc_directive_t directive,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
static pmix_status_t server_job_control(const pmix_proc_t *requestor,
|
||||
const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
pmix_server_module_t mymodule = {
|
||||
.client_connected = server_client_connected_fn,
|
||||
.client_finalized = server_client_finalized_fn,
|
||||
.abort = server_abort_fn,
|
||||
@ -118,7 +128,11 @@ static void server_log(const pmix_proc_t *client,
|
||||
.notify_event = server_notify_event,
|
||||
.query = server_query,
|
||||
.tool_connected = server_tool_connection,
|
||||
.log = server_log
|
||||
.log = server_log,
|
||||
.allocate = server_allocate,
|
||||
.job_control = server_job_control
|
||||
/* we do not support monitoring, but use the
|
||||
* PMIx internal monitoring capability */
|
||||
};
|
||||
|
||||
opal_pmix_server_module_t *host_module = NULL;
|
||||
@ -1052,3 +1066,117 @@ static void server_log(const pmix_proc_t *proct,
|
||||
&opalcaddy->apps,
|
||||
opal_opcbfunc, opalcaddy);
|
||||
}
|
||||
|
||||
static pmix_status_t server_allocate(const pmix_proc_t *proct,
|
||||
pmix_alloc_directive_t directive,
|
||||
const pmix_info_t data[], size_t ndata,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
pmix2x_opalcaddy_t *opalcaddy;
|
||||
opal_process_name_t requestor;
|
||||
int rc;
|
||||
size_t n;
|
||||
opal_value_t *oinfo;
|
||||
opal_pmix_alloc_directive_t odir;
|
||||
|
||||
if (NULL == host_module || NULL == host_module->allocate) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/* setup the caddy */
|
||||
opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
|
||||
opalcaddy->infocbfunc = cbfunc;
|
||||
opalcaddy->cbdata = cbdata;
|
||||
|
||||
/* convert the requestor */
|
||||
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
requestor.vpid = pmix2x_convert_rank(proct->rank);
|
||||
|
||||
/* convert the directive */
|
||||
odir = pmix2x_convert_allocdir(directive);
|
||||
|
||||
/* convert the data */
|
||||
for (n=0; n < ndata; n++) {
|
||||
oinfo = OBJ_NEW(opal_value_t);
|
||||
opal_list_append(&opalcaddy->info, &oinfo->super);
|
||||
if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &data[n].value))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
}
|
||||
|
||||
/* pass the call upwards */
|
||||
if (OPAL_SUCCESS != (rc = host_module->allocate(&requestor, odir,
|
||||
&opalcaddy->info,
|
||||
info_cbfunc, opalcaddy))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
static pmix_status_t server_job_control(const pmix_proc_t *proct,
|
||||
const pmix_proc_t targets[], size_t ntargets,
|
||||
const pmix_info_t directives[], size_t ndirs,
|
||||
pmix_info_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
pmix2x_opalcaddy_t *opalcaddy;
|
||||
opal_process_name_t requestor;
|
||||
int rc;
|
||||
size_t n;
|
||||
opal_value_t *oinfo;
|
||||
opal_namelist_t *nm;
|
||||
|
||||
if (NULL == host_module || NULL == host_module->job_control) {
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/* setup the caddy */
|
||||
opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
|
||||
opalcaddy->infocbfunc = cbfunc;
|
||||
opalcaddy->cbdata = cbdata;
|
||||
|
||||
/* convert the requestor */
|
||||
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
requestor.vpid = pmix2x_convert_rank(proct->rank);
|
||||
|
||||
/* convert the targets */
|
||||
for (n=0; n < ntargets; n++) {
|
||||
nm = OBJ_NEW(opal_namelist_t);
|
||||
opal_list_append(&opalcaddy->procs, &nm->super);
|
||||
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&nm->name.jobid, targets[n].nspace))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
nm->name.vpid = pmix2x_convert_rank(targets[n].rank);
|
||||
}
|
||||
|
||||
/* convert the directives */
|
||||
for (n=0; n < ndirs; n++) {
|
||||
oinfo = OBJ_NEW(opal_value_t);
|
||||
opal_list_append(&opalcaddy->info, &oinfo->super);
|
||||
if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &directives[n].value))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
}
|
||||
|
||||
/* pass the call upwards */
|
||||
if (OPAL_SUCCESS != (rc = host_module->job_control(&requestor,
|
||||
&opalcaddy->procs,
|
||||
&opalcaddy->info,
|
||||
info_cbfunc, opalcaddy))) {
|
||||
OBJ_RELEASE(opalcaddy);
|
||||
return pmix2x_convert_opalrc(rc);
|
||||
}
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -230,6 +230,19 @@ typedef void (*opal_pmix_connection_cbfunc_t)(int incoming_sd);
|
||||
typedef int (*opal_pmix_server_listener_fn_t)(int listening_sd,
|
||||
opal_pmix_connection_cbfunc_t cbfunc);
|
||||
|
||||
/* Request allocation modifications on behalf of a client */
|
||||
typedef int (*opal_pmix_server_alloc_fn_t)(const opal_process_name_t *client,
|
||||
opal_pmix_alloc_directive_t directive,
|
||||
opal_list_t *data,
|
||||
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Execute a job control action on behalf of a client */
|
||||
typedef int (*opal_pmix_server_job_control_fn_t)(const opal_process_name_t *requestor,
|
||||
opal_list_t *targets, opal_list_t *directives,
|
||||
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* we do not provide a monitoring capability */
|
||||
|
||||
typedef struct opal_pmix_server_module_1_0_0_t {
|
||||
opal_pmix_server_client_connected_fn_t client_connected;
|
||||
opal_pmix_server_client_finalized_fn_t client_finalized;
|
||||
@ -249,6 +262,8 @@ typedef struct opal_pmix_server_module_1_0_0_t {
|
||||
opal_pmix_server_tool_connection_fn_t tool_connected;
|
||||
opal_pmix_server_log_fn_t log;
|
||||
opal_pmix_server_listener_fn_t listener;
|
||||
opal_pmix_server_alloc_fn_t allocate;
|
||||
opal_pmix_server_job_control_fn_t job_control;
|
||||
} opal_pmix_server_module_t;
|
||||
|
||||
|
||||
|
@ -32,6 +32,11 @@ BEGIN_C_DECLS
|
||||
* that key */
|
||||
#define OPAL_PMIX_RANK_WILDCARD UINT32_MAX-1
|
||||
|
||||
/* other special rank values will be used to define
|
||||
* groups of ranks for use in collectives */
|
||||
#define OPAL_PMIX_RANK_LOCAL_NODE UINT32_MAX-2 // all ranks on local node
|
||||
|
||||
|
||||
/* define a set of "standard" attributes that can
|
||||
* be queried. Implementations (and users) are free to extend as
|
||||
* desired, so the get functions need to be capable
|
||||
@ -55,12 +60,15 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_CONNECT_TO_SYSTEM "pmix.cnct.sys" // (bool) The requestor requires that a connection be made only to
|
||||
// a local system-level PMIx server
|
||||
#define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
|
||||
#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
|
||||
#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
|
||||
|
||||
|
||||
/* identification attributes */
|
||||
#define OPAL_PMIX_USERID "pmix.euid" // (uint32_t) effective user id
|
||||
#define OPAL_PMIX_GRPID "pmix.egid" // (uint32_t) effective group id
|
||||
|
||||
|
||||
/* attributes for the rendezvous socket */
|
||||
#define OPAL_PMIX_USOCK_DISABLE "pmix.usock.disable" // (bool) disable legacy usock support
|
||||
#define OPAL_PMIX_SOCKET_MODE "pmix.sockmode" // (uint32_t) POSIX mode_t (9 bits valid)
|
||||
@ -76,6 +84,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_TCP_DISABLE_IPV4 "pmix.tcp.disipv4" // (bool) true to disable IPv4 family
|
||||
#define OPAL_PMIX_TCP_DISABLE_IPV6 "pmix.tcp.disipv6" // (bool) true to disable IPv6 family
|
||||
|
||||
|
||||
/* general proc-level attributes */
|
||||
#define OPAL_PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch
|
||||
#define OPAL_PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc
|
||||
@ -89,6 +98,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
|
||||
#define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories
|
||||
|
||||
|
||||
/* information about relative ranks as assigned by the RM */
|
||||
#define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier
|
||||
#define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job
|
||||
@ -104,25 +114,26 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_LOCALLDR "pmix.lldr" // (uint64_t) opal_identifier of lowest rank on this node within this job
|
||||
#define OPAL_PMIX_APPLDR "pmix.aldr" // (uint32_t) lowest rank in this app within this job
|
||||
#define OPAL_PMIX_PROC_PID "pmix.ppid" // (pid_t) pid of specified proc
|
||||
|
||||
/**** no PMIx equivalent ****/
|
||||
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
|
||||
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
|
||||
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
|
||||
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
|
||||
|
||||
#define OPAL_PMIX_SESSION_ID "pmix.session.id" // (uint32_t) session identifier
|
||||
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
|
||||
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of
|
||||
// whether or not they currently host procs.
|
||||
#define OPAL_PMIX_HOSTNAME "pmix.hname" // (char*) name of the host the specified proc is on
|
||||
#define OPAL_PMIX_NODEID "pmix.nodeid" // (uint32_t) node identifier
|
||||
#define OPAL_PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace
|
||||
#define OPAL_PMIX_LOCAL_PROCS "pmix.lprocs" // (opal_list_t*) list of opal_namelist_t of procs on the specified node
|
||||
#define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace
|
||||
#define OPAL_PMIX_PROC_URI "opal.puri" // (char*) URI containing contact info for proc - NOTE: this is published by procs and
|
||||
// thus cannot be prefixed with "pmix"
|
||||
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
|
||||
|
||||
|
||||
/* Memory info */
|
||||
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
|
||||
#define OPAL_PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon
|
||||
#define OPAL_PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes
|
||||
|
||||
|
||||
/* size info */
|
||||
#define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace
|
||||
#define OPAL_PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job
|
||||
@ -133,11 +144,15 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job
|
||||
#define OPAL_PMIX_NUM_NODES "pmix.num.nodes" // (uint32_t) #nodes in this nspace
|
||||
|
||||
|
||||
/* topology info */
|
||||
#define OPAL_PMIX_NET_TOPO "pmix.ntopo" // (char*) xml-representation of network topology
|
||||
#define OPAL_PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) xml-representation of local node topology
|
||||
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for this job
|
||||
#define OPAL_PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) pointer to the PMIx client's internal topology object
|
||||
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
|
||||
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
|
||||
|
||||
|
||||
/* request-related info */
|
||||
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
|
||||
@ -156,16 +171,19 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
|
||||
// specified operation
|
||||
|
||||
|
||||
/* attribute used by host server to pass data to the server convenience library - the
|
||||
* data will then be parsed and provided to the local clients */
|
||||
#define OPAL_PMIX_PROC_DATA "pmix.pdata" // (pmix_value_array_t) starts with rank, then contains more data
|
||||
#define OPAL_PMIX_NODE_MAP "pmix.nmap" // (char*) regex of nodes containing procs for this job
|
||||
#define OPAL_PMIX_PROC_MAP "pmix.pmap" // (char*) regex describing procs on each node within this job
|
||||
|
||||
|
||||
/* attributes used internally to communicate data from the server to the client */
|
||||
#define OPAL_PMIX_PROC_BLOB "pmix.pblob" // (pmix_byte_object_t) packed blob of process data
|
||||
#define OPAL_PMIX_MAP_BLOB "pmix.mblob" // (pmix_byte_object_t) packed blob of process location
|
||||
|
||||
|
||||
/* error handler registration and notification info keys */
|
||||
#define OPAL_PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler
|
||||
#define OPAL_PMIX_EVENT_JOB_LEVEL "pmix.evjob" // (bool) register for job-specific events only
|
||||
@ -187,7 +205,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
|
||||
|
||||
|
||||
/* attributes used to describe "spawm" attributes */
|
||||
/* attributes used to describe "spawn" attributes */
|
||||
#define OPAL_PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
|
||||
#define OPAL_PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
|
||||
#define OPAL_PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
|
||||
@ -229,19 +247,89 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
|
||||
#define OPAL_PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
|
||||
#define OPAL_PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
|
||||
#define OPAL_PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
|
||||
// is being requested
|
||||
#define OPAL_PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
|
||||
// for the specified nspace
|
||||
|
||||
/* log attributes */
|
||||
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
|
||||
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
|
||||
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
|
||||
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
|
||||
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
|
||||
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
|
||||
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
|
||||
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
|
||||
#define OPAL_PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
|
||||
#define OPAL_PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
|
||||
#define OPAL_PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
|
||||
#define OPAL_PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
|
||||
|
||||
|
||||
/* debugger attributes */
|
||||
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
|
||||
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
|
||||
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
|
||||
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
|
||||
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
|
||||
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
|
||||
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
|
||||
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
|
||||
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
|
||||
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
|
||||
|
||||
|
||||
/* Resource Manager identification */
|
||||
#define OPAL_PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
|
||||
#define OPAL_PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
|
||||
|
||||
|
||||
/* attributes for setting envars */
|
||||
#define OPAL_PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
|
||||
#define OPAL_PMIX_UNSET_ENVAR "pmix.unset.envar" // (char*) unset envar specified in string
|
||||
|
||||
|
||||
/* attributes relating to allocations */
|
||||
#define OPAL_PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
|
||||
// which can later be used to query status of the request
|
||||
#define OPAL_PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
|
||||
#define OPAL_PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
|
||||
#define OPAL_PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
|
||||
#define OPAL_PMIX_ALLOC_NUM_CPU_LIST "pmix.alloc.ncpulist" // (char*) regex of #cpus for each node
|
||||
#define OPAL_PMIX_ALLOC_CPU_LIST "pmix.alloc.cpulist" // (char*) regex of specific cpus indicating the cpus involved.
|
||||
#define OPAL_PMIX_ALLOC_MEM_SIZE "pmix.alloc.msize" // (float) number of Mbytes
|
||||
#define OPAL_PMIX_ALLOC_NETWORK "pmix.alloc.net" // (array) array of pmix_info_t describing network resources. If not
|
||||
// given as part of an info struct that identifies the
|
||||
// impacted nodes, then the description will be applied
|
||||
// across all nodes in the requestor's allocation
|
||||
#define OPAL_PMIX_ALLOC_NETWORK_ID "pmix.alloc.netid" // (char*) name of network
|
||||
#define OPAL_PMIX_ALLOC_BANDWIDTH "pmix.alloc.bw" // (float) Mbits/sec
|
||||
#define OPAL_PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
|
||||
#define OPAL_PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
|
||||
|
||||
|
||||
/* job control attributes */
|
||||
#define OPAL_PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request
|
||||
#define OPAL_PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes
|
||||
#define OPAL_PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes
|
||||
#define OPAL_PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request
|
||||
// (NULL => cancel all requests from this requestor)
|
||||
#define OPAL_PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup
|
||||
#define OPAL_PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID
|
||||
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it
|
||||
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint
|
||||
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint
|
||||
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete -- NOTE(review): this key string is identical to the one used by OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL; confirm against the PMIx definition that the two attributes are really meant to share a key
|
||||
#define OPAL_PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes
|
||||
#define OPAL_PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned
|
||||
#define OPAL_PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
|
||||
#define OPAL_PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
|
||||
|
||||
/* monitoring attributes */
|
||||
#define OPAL_PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
|
||||
#define OPAL_PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
|
||||
#define OPAL_PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
|
||||
#define OPAL_PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
|
||||
// specified action
|
||||
#define OPAL_PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
|
||||
#define OPAL_PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
|
||||
#define OPAL_PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
|
||||
#define OPAL_PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
|
||||
#define OPAL_PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
|
||||
#define OPAL_PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
|
||||
// specified action
|
||||
|
||||
|
||||
/* define a scope for data "put" by PMI per the following:
|
||||
@ -285,6 +373,16 @@ typedef enum {
|
||||
} opal_pmix_persistence_t;
|
||||
|
||||
|
||||
/* define allocation request flags */
|
||||
typedef enum {
    OPAL_PMIX_ALLOC_UNDEF = 0,
    OPAL_PMIX_ALLOC_NEW,
    OPAL_PMIX_ALLOC_EXTEND,
    OPAL_PMIX_ALLOC_RELEASE,
    /* historical (misspelled) name - kept so existing callers keep compiling */
    OPAL_PMIX_ALLOC_REAQCUIRE,
    /* correctly spelled alias; same numeric value as the name above */
    OPAL_PMIX_ALLOC_REACQUIRE = OPAL_PMIX_ALLOC_REAQCUIRE
} opal_pmix_alloc_directive_t;
|
||||
|
||||
|
||||
/**** PMIX INFO STRUCT ****/
|
||||
|
||||
/* NOTE: the pmix_info_t is essentially equivalent to the opal_value_t
|
||||
|
@ -292,6 +292,12 @@ opal_err2str(int errnum, const char **errmsg)
|
||||
case OPAL_ERR_EVENT_REGISTRATION:
|
||||
retval = "Event registration";
|
||||
break;
|
||||
case OPAL_ERR_HEARTBEAT_ALERT:
|
||||
retval = "Heartbeat not received";
|
||||
break;
|
||||
case OPAL_ERR_FILE_ALERT:
|
||||
retval = "File alert - proc may have stalled";
|
||||
break;
|
||||
default:
|
||||
retval = "UNRECOGNIZED";
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat,
|
||||
orte_app_context_t *app,
|
||||
char ***env);
|
||||
ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void);
|
||||
ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void);
|
||||
ORTE_DECLSPEC int orte_schizo_base_get_remaining_time(uint32_t *timeleft);
|
||||
ORTE_DECLSPEC void orte_schizo_base_finalize(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -162,20 +162,20 @@ orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void)
|
||||
return ORTE_SCHIZO_UNDETERMINED;
|
||||
}
|
||||
|
||||
/* Ask the active schizo modules for the time remaining.
 *
 * Walks orte_schizo_base.active_modules; for every module that provides a
 * get_remaining_time entry point, calls it and returns its result unless
 * that result is ORTE_ERR_TAKE_NEXT_OPTION, in which case the next module
 * is tried. The trailing return is reached only when no module answered.
 *
 * NOTE(review): this diff hunk carries no +/- markers; it appears to list
 * each replaced line directly followed by its replacement (the long/void
 * form returning -1, then the int form taking a uint32_t pointer and
 * returning ORTE_ERR_NOT_SUPPORTED) - confirm against the repository.
 */
long orte_schizo_base_get_remaining_time(void)
|
||||
int orte_schizo_base_get_remaining_time(uint32_t *timeleft)
|
||||
{
|
||||
long rc;
|
||||
int rc;
|
||||
orte_schizo_base_active_module_t *mod;
|
||||
|
||||
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
|
||||
if (NULL != mod->module->get_remaining_time) {
|
||||
rc = mod->module->get_remaining_time();
|
||||
rc = mod->module->get_remaining_time(timeleft);
|
||||
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
void orte_schizo_base_finalize(void)
|
||||
|
@ -118,7 +118,7 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void);
|
||||
* and decides it cannot provide the info in the current situation,
|
||||
* then it can return ORTE_ERR_TAKE_NEXT_OPTION to indicate that
|
||||
* another module should be tried */
|
||||
typedef long (*orte_schizo_base_module_get_rem_time_fn_t)(void);
|
||||
typedef int (*orte_schizo_base_module_get_rem_time_fn_t)(uint32_t *timeleft);
|
||||
|
||||
/*
|
||||
* schizo module version 1.3.0
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -29,10 +29,12 @@
|
||||
#include "schizo_slurm.h"
|
||||
|
||||
static orte_schizo_launch_environ_t check_launch_environment(void);
|
||||
static int get_remaining_time(uint32_t *timeleft);
|
||||
static void finalize(void);
|
||||
|
||||
orte_schizo_base_module_t orte_schizo_slurm_module = {
|
||||
.check_launch_environment = check_launch_environment,
|
||||
.get_remaining_time = get_remaining_time,
|
||||
.finalize = finalize
|
||||
};
|
||||
|
||||
@ -123,6 +125,58 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
return myenv;
|
||||
}
|
||||
|
||||
static int get_remaining_time(uint32_t *timeleft)
|
||||
{
|
||||
char output[256], *cmd, *jobid, **res;
|
||||
FILE *fp;
|
||||
uint32_t tleft;
|
||||
size_t cnt;
|
||||
|
||||
/* set the default */
|
||||
*timeleft = UINT32_MAX;
|
||||
|
||||
if (NULL == (jobid = getenv("SLURM_JOBID"))) {
|
||||
return ORTE_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
if (0 > asprintf(&cmd, "squeue -h -j %s -o %%L", jobid)) {
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
fp = popen(cmd, "r");
|
||||
if (NULL == fp) {
|
||||
free(cmd);
|
||||
return ORTE_ERR_FILE_OPEN_FAILURE;
|
||||
}
|
||||
if (NULL == fgets(output, 256, fp)) {
|
||||
free(cmd);
|
||||
return ORTE_ERR_FILE_READ_FAILURE;
|
||||
}
|
||||
free(cmd);
|
||||
/* the output is returned in a colon-delimited set of fields */
|
||||
res = opal_argv_split(output, ':');
|
||||
cnt = opal_argv_count(res);
|
||||
tleft = strtol(res[cnt-1], NULL, 10); // has to be at least one field
|
||||
/* the next field would be minutes */
|
||||
if (1 < cnt) {
|
||||
tleft += 60 * strtol(res[cnt-2], NULL, 10);
|
||||
}
|
||||
/* next field would be hours */
|
||||
if (2 < cnt) {
|
||||
tleft += 3600 * strtol(res[cnt-3], NULL, 10);
|
||||
}
|
||||
/* next field is days */
|
||||
if (3 < cnt) {
|
||||
tleft += 24*3600 * strtol(res[cnt-4], NULL, 10);
|
||||
}
|
||||
/* if there are more fields than that, then it is infinite */
|
||||
if (4 < cnt) {
|
||||
tleft = UINT32_MAX;
|
||||
}
|
||||
opal_argv_free(res);
|
||||
|
||||
*timeleft = tleft;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
int i;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -38,8 +38,8 @@ orte_schizo_base_component_t mca_schizo_slurm_component = {
|
||||
|
||||
static int component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* disqualify ourselves if we are not an app or under slurm */
|
||||
if (!ORTE_PROC_IS_APP) {
|
||||
/* disqualify ourselves if we are not under slurm */
|
||||
if (NULL == getenv("SLURM_JOBID")) {
|
||||
*priority = 0;
|
||||
*module = NULL;
|
||||
return OPAL_ERROR;
|
||||
@ -49,4 +49,3 @@ static int component_query(mca_base_module_t **module, int *priority)
|
||||
*priority = 50;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,39 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_BASE_H
|
||||
#define MCA_SENSOR_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* MCA Framework
|
||||
*/
|
||||
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
|
||||
/* select a component */
|
||||
ORTE_DECLSPEC int orte_sensor_base_select(void);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
@ -1,158 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
static bool mods_active = false;
|
||||
|
||||
/* Start every selected sensor module for the given job and, if at least
 * one module exists and sampling is not already running, allocate the
 * sample buffer and arm the periodic sampling timer.
 * Does nothing when the configured sample rate is not positive. */
void orte_sensor_base_start(orte_jobid_t job)
{
    int idx;
    orte_sensor_active_module_t *active;

    /* sampling disabled - nothing to start */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in priority order, skipping empty slots */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL == active) {
            continue;
        }
        /* any populated slot counts as an active module, even if it has
         * no start entry point */
        mods_active = true;
        if (NULL != active->module->start) {
            active->module->start(job);
        }
    }

    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* buffer that collects the samples */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    /* periodic wake-up for taking a data sample */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
|
||||
|
||||
/* Stop sampling: cancel the sampling timer if it is armed, then invoke the
 * stop entry point of every selected module for the given job.
 * Returns immediately if no module was ever activated. */
void orte_sensor_base_stop(orte_jobid_t job)
{
    int idx;
    orte_sensor_active_module_t *active;

    if (!mods_active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: stopping sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* disarm the periodic sample event first */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    /* notify the modules in priority order, skipping empty slots */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL != active && NULL != active->module->stop) {
            active->module->stop(job);
        }
    }
}
|
||||
|
||||
void orte_sensor_base_sample(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int i;
|
||||
|
||||
if (!mods_active) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* see if we were ordered to stop */
|
||||
if (!orte_sensor_base.active) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:base: sampling sensors",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* call the sample function of all modules in priority order from
|
||||
* highest to lowest - the heartbeat should always be the lowest
|
||||
* priority, so it will send any collected data
|
||||
*/
|
||||
for (i=0; i < orte_sensor_base.modules.size; i++) {
|
||||
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != i_module->module->sample) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:base: sampling component %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
i_module->component->base_version.mca_component_name);
|
||||
i_module->module->sample();
|
||||
}
|
||||
}
|
||||
|
||||
/* restart the timer */
|
||||
opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* Hand a data buffer to the log entry point of the module whose component
 * name equals comp. Only the first matching module is considered; if it
 * has no log function, or no module matches, nothing happens.
 * A NULL comp is silently ignored. */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *active;
    int idx;

    /* without a component name there is nothing to look up */
    if (NULL == comp) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: logging sensor %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

    /* search the module array for the named component */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL == active) {
            continue;
        }
        if (0 != strcmp(comp, active->component->base_version.mca_component_name)) {
            continue;
        }
        /* found it - log if the module is able to, then stop searching */
        if (NULL != active->module->log) {
            active->module->log(data);
        }
        return;
    }
}
|
@ -1,133 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/sensor/base/static-components.h"
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
orte_sensor_base_API_module_t orte_sensor = {
|
||||
orte_sensor_base_start,
|
||||
orte_sensor_base_stop
|
||||
};
|
||||
orte_sensor_base_t orte_sensor_base = {{{0}}};
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static int orte_sensor_base_sample_rate = 0;
|
||||
|
||||
/* Register the framework-level MCA variables:
 *   orte_sensor_base_sample_rate - sample rate in seconds (default 0)
 *   orte_sensor_base_log_samples - whether samples are logged (default false)
 * Each one also gets a deprecated synonym without the "base" component
 * part. The flags argument is not used. Always returns ORTE_SUCCESS. */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int idx;

    /* sample rate */
    orte_sensor_base_sample_rate = 0;
    idx = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                "Sample rate in seconds",
                                MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(idx, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples should be logged */
    orte_sensor_base.log_samples = false;
    idx = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                "Log samples to database",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(idx, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
|
||||
|
||||
static int orte_sensor_base_close(void)
|
||||
{
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int i;
|
||||
|
||||
for (i=0; i < orte_sensor_base.modules.size; i++) {
|
||||
if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL != i_module->module->finalize) {
|
||||
i_module->module->finalize();
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_sensor_base.modules);
|
||||
|
||||
/* Close all remaining available components */
|
||||
return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
static int orte_sensor_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
/* initialize globals */
|
||||
orte_sensor_base.active = false;
|
||||
|
||||
/* construct the array of modules */
|
||||
OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
|
||||
opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);
|
||||
|
||||
/* get the sample rate */
|
||||
orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;
|
||||
orte_sensor_base.rate.tv_usec = 0;
|
||||
|
||||
/* Open up all available components */
|
||||
return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
|
||||
}
|
||||
|
||||
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
|
||||
orte_sensor_base_register,
|
||||
orte_sensor_base_open, orte_sensor_base_close,
|
||||
mca_sensor_base_static_components, 0);
|
||||
|
||||
/* Constructor for orte_sensor_active_module_t: a freshly created entry is
 * marked as sampling. */
static void cons(orte_sensor_active_module_t *entry)
{
    entry->sampling = true;
}
|
||||
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
|
||||
opal_object_t,
|
||||
cons, NULL);
|
@ -1,219 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
|
||||
|
||||
static bool selected = false;
|
||||
|
||||
/**
|
||||
* Function for weeding out sensor components that don't want to run.
|
||||
*
|
||||
* Call the init function on all available components to find out if
|
||||
* they want to run. Select all components that don't fail. Failing
|
||||
* components will be closed and unloaded. The selected modules will
|
||||
* be returned to the caller in a opal_list_t.
|
||||
*/
|
||||
int orte_sensor_base_select(void)
|
||||
{
|
||||
mca_base_component_list_item_t *cli = NULL;
|
||||
orte_sensor_base_component_t *component = NULL;
|
||||
mca_base_module_t *module = NULL;
|
||||
orte_sensor_active_module_t *i_module;
|
||||
int priority = 0, i, j, low_i;
|
||||
opal_pointer_array_t tmp_array;
|
||||
bool none_found;
|
||||
orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
||||
bool duplicate;
|
||||
|
||||
if (selected) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
selected = true;
|
||||
|
||||
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
||||
|
||||
opal_output_verbose(10, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select: Auto-selecting components");
|
||||
|
||||
/*
|
||||
* Traverse the list of available components.
|
||||
* For each call their 'query' functions to determine relative priority.
|
||||
*/
|
||||
none_found = true;
|
||||
OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
|
||||
component = (orte_sensor_base_component_t *) cli->cli_component;
|
||||
|
||||
/*
|
||||
* If there is a query function then use it.
|
||||
*/
|
||||
if (NULL == component->base_version.mca_query_component) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Skipping component [%s]. It does not implement a query function",
|
||||
component->base_version.mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Query this component for the module and priority
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Querying component [%s]",
|
||||
component->base_version.mca_component_name);
|
||||
|
||||
component->base_version.mca_query_component(&module, &priority);
|
||||
|
||||
/*
|
||||
* If no module was returned or negative priority, then skip component
|
||||
*/
|
||||
if (NULL == module || priority < 0) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Skipping component [%s]. Query failed to return a module",
|
||||
component->base_version.mca_component_name );
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check to see if we already have someone who senses the
|
||||
* same things - if so, take the higher priority one
|
||||
*/
|
||||
duplicate = false;
|
||||
for (i=0; i < tmp_array.size; i++) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if (NULL == tmp_module) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
|
||||
if (tmp_module->priority < priority) {
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Replacing component %s with %s - both measure %s",
|
||||
tmp_module->component->base_version.mca_component_name,
|
||||
component->base_version.mca_component_name,
|
||||
component->data_measured);
|
||||
OBJ_RELEASE(tmp_module);
|
||||
opal_pointer_array_set_item(&tmp_array, i, NULL);
|
||||
break;
|
||||
} else {
|
||||
duplicate = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (duplicate) {
|
||||
/* ignore this component */
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
|
||||
component->base_version.mca_component_name,
|
||||
component->data_measured);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append them to the temporary list, we will sort later
|
||||
*/
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Query of component [%s] set priority to %d",
|
||||
component->base_version.mca_component_name, priority);
|
||||
tmp_module = OBJ_NEW(orte_sensor_active_module_t);
|
||||
tmp_module->component = component;
|
||||
tmp_module->module = (orte_sensor_base_module_t*)module;
|
||||
tmp_module->priority = priority;
|
||||
|
||||
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
||||
none_found = false;
|
||||
}
|
||||
|
||||
if (none_found) {
|
||||
/* okay for no modules to be found */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the list by decending priority
|
||||
*/
|
||||
priority = 0;
|
||||
for(j = 0; j < tmp_array.size; ++j) {
|
||||
tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
||||
if( NULL == tmp_module_sw ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
low_i = -1;
|
||||
priority = tmp_module_sw->priority;
|
||||
|
||||
for(i = 0; i < tmp_array.size; ++i) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
||||
if( NULL == tmp_module ) {
|
||||
continue;
|
||||
}
|
||||
if( tmp_module->priority > priority ) {
|
||||
low_i = i;
|
||||
priority = tmp_module->priority;
|
||||
}
|
||||
}
|
||||
|
||||
if( low_i >= 0 ) {
|
||||
tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
||||
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
||||
j--; /* Try this entry again, if it is not the lowest */
|
||||
} else {
|
||||
tmp_module = tmp_module_sw;
|
||||
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
||||
}
|
||||
opal_output_verbose(5, orte_sensor_base_framework.framework_output,
|
||||
"sensor:base:select Add module with priority [%s] %d",
|
||||
tmp_module->component->base_version.mca_component_name, tmp_module->priority);
|
||||
opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
|
||||
}
|
||||
OBJ_DESTRUCT(&tmp_array);
|
||||
|
||||
/*
|
||||
* Initialize each of the modules in priority order from
|
||||
* highest to lowest
|
||||
*/
|
||||
for(i = 0; i < orte_sensor_base.modules.size; ++i) {
|
||||
i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
|
||||
if( NULL == i_module ) {
|
||||
continue;
|
||||
}
|
||||
if( NULL != i_module->module->init ) {
|
||||
if (ORTE_SUCCESS != i_module->module->init()) {
|
||||
/* can't sample - however, if we are the HNP,
|
||||
* then we need this module
|
||||
* anyway so we can log incoming data
|
||||
*/
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
i_module->sampling = false;
|
||||
} else {
|
||||
opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
|
||||
OBJ_RELEASE(i_module);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_PRIVATE_H
|
||||
#define MCA_SENSOR_PRIVATE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
|
||||
/*
|
||||
* Global functions for MCA overall collective open and close
|
||||
*/
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* define a struct to hold framework-global values */
|
||||
typedef struct {
|
||||
opal_pointer_array_t modules;
|
||||
bool log_samples;
|
||||
bool active;
|
||||
struct timeval rate;
|
||||
opal_event_t sample_ev;
|
||||
opal_buffer_t *samples;
|
||||
} orte_sensor_base_t;
|
||||
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
orte_sensor_base_component_t *component;
|
||||
orte_sensor_base_module_t *module;
|
||||
int priority;
|
||||
bool sampling;
|
||||
} orte_sensor_active_module_t;
|
||||
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
|
||||
|
||||
|
||||
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
|
||||
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
|
||||
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
|
||||
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
|
||||
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_file_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_file_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/file/Makefile])

    # build this component only when sensors were requested
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
@ -1,354 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_PARAM_H
|
||||
#include <sys/param.h>
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#ifdef HAVE_TIME_H
|
||||
#include <time.h>
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_file.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void stop(orte_jobid_t job);
|
||||
static void file_sample(void);
|
||||
static void file_log(opal_buffer_t *sample);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_file_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
stop,
|
||||
file_sample,
|
||||
file_log
|
||||
};
|
||||
|
||||
/* define a tracking object */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
char *file;
|
||||
int tick;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int32_t file_size;
|
||||
time_t last_access;
|
||||
time_t last_mod;
|
||||
int limit;
|
||||
} file_tracker_t;
|
||||
/*
 * Class constructor for file_tracker_t.
 *
 * Puts every field into a known state. The three check_* flags in
 * particular are read on each sampling pass, so they must not be left
 * holding whatever the allocator returned.
 */
static void ft_constructor(file_tracker_t *ft)
{
    ft->jobid = ORTE_JOBID_INVALID;
    ft->vpid = ORTE_VPID_INVALID;
    ft->file = NULL;
    ft->tick = 0;
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;
    ft->file_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->limit = 0;
}
|
||||
/*
 * Class destructor for file_tracker_t: gives back the path string.
 */
static void ft_destructor(file_tracker_t *ft)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(ft->file);
}

OBJ_CLASS_INSTANCE(file_tracker_t,
                   opal_list_item_t,
                   ft_constructor, ft_destructor);
|
||||
|
||||
/* local globals */
|
||||
static opal_list_t jobs;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&jobs, opal_list_t);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&jobs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&jobs);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Look up an environment variable in an app_context's environment.
 *
 * @param app      app_context whose NULL-terminated env array is searched
 * @param pattern  exact variable name to look for (without the '=')
 * @param value    if non-NULL, receives a strdup'd copy of the text after
 *                 the '='; the caller must free() it
 *
 * @return true if an entry of the form "<pattern>=<text>" was found (and,
 *         when value was requested, successfully copied); false otherwise.
 *
 * The name must be followed immediately by '=': a plain prefix match would
 * also accept longer variable names that merely start with the pattern,
 * and an entry without any '=' would leave nothing valid to copy.
 */
static bool find_value(orte_app_context_t *app,
                       char *pattern, char **value)
{
    size_t len;
    int i;

    if (NULL == app || NULL == app->env || NULL == pattern) {
        return false;
    }

    len = strlen(pattern);
    for (i = 0; NULL != app->env[i]; i++) {
        if (0 != strncmp(app->env[i], pattern, len) ||
            '=' != app->env[i][len]) {
            continue;
        }
        if (NULL != value) {
            *value = strdup(&app->env[i][len + 1]);
            if (NULL == *value) {
                /* out of memory - report "not found" so callers use defaults */
                return false;
            }
        }
        return true;
    }
    return false;
}
|
||||
|
||||
/*
|
||||
* Start monitoring of local processes
|
||||
*/
|
||||
/*
 * Begin watching a file on behalf of the given job.
 *
 * The file name, the attributes to compare (size / access time /
 * modification time) and the stall limit are each taken from the
 * environment of the job's first app_context when present there, and
 * from the component's registered defaults otherwise. A tracker is then
 * appended to the "jobs" list for file_sample() to inspect.
 *
 * Nothing is tracked when the job is our own, when the job or its
 * app_context cannot be found, or when no file name is available.
 */
static void start(orte_jobid_t jobid)
{
    orte_job_t *jobdat;
    orte_app_context_t *app, *aptr;
    int i;
    char *filename = NULL;
    file_tracker_t *ft;
    char *ptr = NULL;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s starting file monitoring for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* get the local jobdat for this job */
    if (NULL == (jobdat = orte_get_job_data_object(jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* must be at least one app_context, so use the first one found */
    app = NULL;
    for (i = 0; i < jobdat->apps->size; i++) {
        if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) {
            app = aptr;
            break;
        }
    }
    if (NULL == app) {
        /* got a problem */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }

    /* search the environ to get the filename - on success we already own
     * a private copy, so only the component default needs duplicating */
    if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) {
        /* was a default file given */
        if (NULL == mca_sensor_file_component.file) {
            /* can't do anything without a file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sensor:file no file for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jobid)));
            return;
        }
        filename = strdup(mca_sensor_file_component.file);
    }
    if (NULL == filename) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    /* create the tracking object; it takes ownership of filename */
    ft = OBJ_NEW(file_tracker_t);
    if (NULL == ft) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(filename);
        return;
    }
    ft->jobid = jobid;
    ft->file = filename;

    /* search the environ to see what we are checking; every flag is
     * assigned on both paths so the sampler never reads an unset value */
    if (find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) {
        ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    } else {
        ft->check_size = mca_sensor_file_component.check_size;
    }

    if (find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) {
        ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    } else {
        ft->check_access = mca_sensor_file_component.check_access;
    }

    if (find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) {
        ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
        free(ptr);
    } else {
        ft->check_mod = mca_sensor_file_component.check_mod;
    }

    if (find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) {
        ft->limit = (int)strtol(ptr, NULL, 10);
        free(ptr);
    } else {
        ft->limit = mca_sensor_file_component.limit;
    }

    opal_list_append(&jobs, &ft->super);
    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s file %s monitored for %s%s%s with limit %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ft->file, ft->check_size ? "SIZE:" : " ",
                         ft->check_access ? "ACCESS TIME:" : " ",
                         ft->check_mod ? "MOD TIME" : " ", ft->limit));
    return;
}
|
||||
|
||||
|
||||
/*
 * Stop watching files for the given job (or for every job when jobid is
 * ORTE_JOBID_WILDCARD). Matching trackers are unlinked from the "jobs"
 * list and released.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item, *next;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    item = opal_list_get_first(&jobs);
    while (item != opal_list_get_end(&jobs)) {
        /* fetch the successor first: the current item may be released
         * below and must not be touched afterwards */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
        item = next;
    }
    return;
}
|
||||
|
||||
static void file_sample(void)
|
||||
{
|
||||
struct stat buf;
|
||||
opal_list_item_t *item;
|
||||
file_tracker_t *ft;
|
||||
orte_job_t *jdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sampling files",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
for (item = opal_list_get_first(&jobs);
|
||||
item != opal_list_get_end(&jobs);
|
||||
item = opal_list_get_next(item)) {
|
||||
ft = (file_tracker_t*)item;
|
||||
|
||||
/* stat the file and get its size */
|
||||
if (0 > stat(ft->file, &buf)) {
|
||||
/* cannot stat file */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s could not stat %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file));
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s size %lu access %s\tmod %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime)));
|
||||
|
||||
if (ft->check_size) {
|
||||
if (buf.st_size == ft->file_size) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->file_size = buf.st_size;
|
||||
}
|
||||
}
|
||||
if (ft->check_access) {
|
||||
if (buf.st_atime == ft->last_access) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_access = buf.st_atime;
|
||||
}
|
||||
}
|
||||
if (ft->check_mod) {
|
||||
if (buf.st_mtime == ft->last_mod) {
|
||||
ft->tick++;
|
||||
goto CHECK;
|
||||
} else {
|
||||
ft->tick = 0;
|
||||
ft->last_mod = buf.st_mtime;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK:
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sampled file %s tick %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ft->file, ft->tick));
|
||||
|
||||
if (ft->tick == ft->limit) {
|
||||
orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
|
||||
ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod));
|
||||
jdata = orte_get_job_data_object(ft->jobid);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Logging hook for the file sensor. Intentionally a no-op: the buffer is
 * neither read nor modified.
 */
static void file_log(opal_buffer_t *sample)
{
    (void)sample;
}
|
@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* File movement sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FILE_H
|
||||
#define ORTE_SENSOR_FILE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_file_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int sample_rate;
|
||||
char *file;
|
||||
bool check_size;
|
||||
bool check_access;
|
||||
bool check_mod;
|
||||
int limit;
|
||||
};
|
||||
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_file_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,120 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_file.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_file_register (void);
|
||||
static int orte_sensor_file_open(void);
|
||||
static int orte_sensor_file_close(void);
|
||||
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_file_component_t mca_sensor_file_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"file", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_file_open, /* component open */
|
||||
orte_sensor_file_close, /* component close */
|
||||
orte_sensor_file_query, /* component query */
|
||||
orte_sensor_file_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"filemods" // data being sensed
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component register/open/close/init function
|
||||
*/
|
||||
static int orte_sensor_file_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
|
||||
|
||||
/* lookup parameters */
|
||||
mca_sensor_file_component.file = NULL;
|
||||
(void) mca_base_component_var_register (c, "filename", "File to be monitored",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.file);
|
||||
|
||||
mca_sensor_file_component.check_size = false;
|
||||
(void) mca_base_component_var_register (c, "check_size", "Check the file size",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_size);
|
||||
|
||||
mca_sensor_file_component.check_access = false;
|
||||
(void) mca_base_component_var_register (c, "check_access", "Check access time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_access);
|
||||
|
||||
mca_sensor_file_component.check_mod = false;
|
||||
(void) mca_base_component_var_register (c, "check_mod", "Check modification time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.check_mod);
|
||||
|
||||
mca_sensor_file_component.limit = 3;
|
||||
(void) mca_base_component_var_register (c, "limit",
|
||||
"Number of times the sensor can detect no motion before declaring error (default=3)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_ALL_EQ,
|
||||
&mca_sensor_file_component.limit);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_file_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query hook: always offers the file module.
 *
 * @param module    receives the address of orte_sensor_file_module
 * @param priority  receives 20
 * @return ORTE_SUCCESS
 */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_file_module;
    *priority = 20;  /* higher than heartbeat */

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_file_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Files that make up the ft_tester sensor component
sources = \
        sensor_ft_tester.c \
        sensor_ft_tester.h \
        sensor_ft_tester_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_orte_sensor_ft_tester_DSO
component_noinst =
component_install = mca_sensor_ft_tester.la
else
component_noinst = libmca_sensor_ft_tester.la
component_install =
endif

# DSO flavour: installed into the component directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_ft_tester_la_SOURCES = $(sources)
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version

# static flavour: built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_ft_tester_la_SOURCES = $(sources)
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])

    # build this component only when sensors were requested
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
@ -1,41 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Fault-tolerance tester sensor (kills executables/daemons with a configured probability)
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_FT_TESTER_H
|
||||
#define ORTE_SENSOR_FT_TESTER_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "opal/util/alfg.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_ft_tester_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
float fail_prob;
|
||||
float daemon_fail_prob;
|
||||
bool multi_fail;
|
||||
};
|
||||
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
|
||||
|
||||
extern opal_rng_buff_t orte_sensor_ft_rng_buff;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,141 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_ft_tester.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_ft_tester_register (void);
|
||||
static int orte_sensor_ft_tester_open(void);
|
||||
static int orte_sensor_ft_tester_close(void);
|
||||
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"ft_tester", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_ft_tester_open, /* component open */
|
||||
orte_sensor_ft_tester_close, /* component close */
|
||||
orte_sensor_ft_tester_query, /* component query */
|
||||
orte_sensor_ft_tester_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
NULL
|
||||
}
|
||||
};
|
||||
|
||||
static char *daemon_fail_prob = NULL;
|
||||
static char *fail_prob = NULL;
|
||||
opal_rng_buff_t orte_sensor_ft_rng_buff;
|
||||
|
||||
/**
|
||||
* component register/open/close/init function
|
||||
*/
|
||||
static int orte_sensor_ft_tester_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
|
||||
|
||||
fail_prob = NULL;
|
||||
(void) mca_base_component_var_register (c, "fail_prob", "Probability of killing a single executable",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&fail_prob);
|
||||
|
||||
mca_sensor_ft_tester_component.multi_fail = false;
|
||||
(void) mca_base_component_var_register (c, "multi_allowed", "Allow multiple executables to be killed at one time",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_ft_tester_component.multi_fail);
|
||||
|
||||
daemon_fail_prob = NULL;
|
||||
(void) mca_base_component_var_register (c, "daemon_fail_prob", "Probability of killing a daemon",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&daemon_fail_prob);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_ft_tester_open(void)
|
||||
{
|
||||
/* lookup parameters */
|
||||
if (NULL != fail_prob) {
|
||||
mca_sensor_ft_tester_component.fail_prob = strtof(fail_prob, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.fail_prob = 0.0;
|
||||
}
|
||||
|
||||
if (NULL != daemon_fail_prob) {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = strtof(daemon_fail_prob, NULL);
|
||||
if (1.0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
|
||||
/* given in percent */
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob /= 100.0;
|
||||
}
|
||||
} else {
|
||||
mca_sensor_ft_tester_component.daemon_fail_prob = 0.0;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query hook.
 *
 * The module is offered (priority 1) only when at least one of the two
 * failure probabilities is greater than zero; in that case the random
 * number generator is also seeded with this process's pid. Otherwise no
 * module is returned, the priority is 0 and the result is ORTE_ERROR.
 */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    /* nothing to do unless some failure probability was requested */
    if (!(0.0 < mca_sensor_ft_tester_component.fail_prob) &&
        !(0.0 < mca_sensor_ft_tester_component.daemon_fail_prob)) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1;  /* at the bottom */
    *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
    /* seed the RNG --- Not sure if we should assume all procs use
     * the same seed?
     */
    opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid());
    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_ft_tester_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Help text shipped with the component
dist_ompidata_DATA = help-orte-sensor-heartbeat.txt

# Files that make up the heartbeat sensor component
sources = \
        sensor_heartbeat.c \
        sensor_heartbeat.h \
        sensor_heartbeat_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_orte_sensor_heartbeat_DSO
component_noinst =
component_install = mca_sensor_heartbeat.la
else
component_noinst = libmca_sensor_heartbeat.la
component_install =
endif

# DSO flavour: installed into the component directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

# static flavour: built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES = $(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile])

    # build this component only when sensors were requested
    AS_IF([test "$orte_want_sensors" = "1"], [$1], [$2])
])dnl
|
||||
|
@ -1,279 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void start(orte_jobid_t job);
|
||||
static void sample(void);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
|
||||
init,
|
||||
finalize,
|
||||
start,
|
||||
NULL,
|
||||
sample,
|
||||
NULL
|
||||
};
|
||||
|
||||
/* declare the local functions */
|
||||
static void check_heartbeat(int fd, short event, void *arg);
|
||||
static void recv_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
|
||||
/* local globals */
|
||||
static orte_job_t *daemons=NULL;
|
||||
static opal_event_t check_ev;
|
||||
static bool check_active = false;
|
||||
static struct timeval check_time;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s initializing heartbeat recvs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup to receive heartbeats */
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
|
||||
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
ORTE_RML_PERSISTENT,
|
||||
recv_beats, NULL);
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);
|
||||
if (check_active) {
|
||||
opal_event_del(&check_ev);
|
||||
check_active = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Arm the periodic heartbeat check.
 *
 * Does nothing if the check is already armed or if "daemons" was never
 * set (init only sets it on the HNP). The check interval is three times
 * the whole-second part of the framework sampling rate. The job argument
 * is not used.
 */
static void start(orte_jobid_t job)
{
    if (check_active || NULL == daemons) {
        return;
    }

    /* setup the check event */
    check_time.tv_usec = 0;
    check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec;
    opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev);
    opal_event_evtimer_add(&check_ev, &check_time);
    check_active = true;
}
|
||||
|
||||
static void sample(void)
|
||||
{
|
||||
opal_buffer_t *buf;
|
||||
int rc;
|
||||
orte_process_name_t *tgt;
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_CM) {
|
||||
/* we send to our daemon */
|
||||
tgt = ORTE_PROC_MY_DAEMON;
|
||||
} else {
|
||||
tgt = ORTE_PROC_MY_HNP;
|
||||
}
|
||||
/* if my target hasn't been defined yet, ignore - nobody listening yet */
|
||||
if (ORTE_JOBID_INVALID ==tgt->jobid ||
|
||||
ORTE_VPID_INVALID == tgt->vpid) {
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:heartbeat: HNP is not defined",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sending heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we want sampled data included, point to the bucket */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
if (orte_sensor_base.log_samples) {
|
||||
opal_dss.copy_payload(buf, orte_sensor_base.samples);
|
||||
OBJ_RELEASE(orte_sensor_base.samples);
|
||||
/* start a new sample bucket */
|
||||
orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
|
||||
}
|
||||
|
||||
/* send heartbeat */
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(tgt, buf,
|
||||
ORTE_RML_TAG_HEARTBEAT,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
}
|
||||
|
||||
/* this function automatically gets periodically called
|
||||
* by the event library so we can check on the state
|
||||
* of the various orteds
|
||||
*/
|
||||
static void check_heartbeat(int fd, short dummy, void *arg)
|
||||
{
|
||||
int v;
|
||||
orte_proc_t *proc;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:check_heartbeat",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
|
||||
"%s IGNORING CHECK abnorm_term %s fin %s init %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_abnormal_term_ordered ? "TRUE" : "FALSE",
|
||||
orte_finalizing ? "TRUE" : "FALSE",
|
||||
orte_initialized ? "TRUE" : "FALSE"));
|
||||
check_active = false;
|
||||
return;
|
||||
}
|
||||
|
||||
for (v=0; v < daemons->procs->size; v++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
|
||||
continue;
|
||||
}
|
||||
/* ignore myself */
|
||||
if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING != proc->state) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:heartbeat DAEMON %s IS NOT RUNNING",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (0 == proc->beat) {
|
||||
/* no heartbeat recvd in last window */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s sensor:check_heartbeat FAILED for daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), proc->beat));
|
||||
}
|
||||
/* reset for next period */
|
||||
proc->beat = 0;
|
||||
}
|
||||
|
||||
/* reset the timer */
|
||||
opal_event_evtimer_add(tmp, &check_time);
|
||||
}
|
||||
|
||||
static void recv_beats(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
orte_proc_t *proc;
|
||||
int rc, n;
|
||||
char *component=NULL;
|
||||
opal_buffer_t *buf;
|
||||
|
||||
opal_output_verbose(1, orte_sensor_base_framework.framework_output,
|
||||
"%s received beat from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender));
|
||||
|
||||
/* if we are aborting or shutting down, ignore this */
|
||||
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get this daemon's object */
|
||||
if (NULL != daemons) {
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"%s marked beat from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender)));
|
||||
proc->beat++;
|
||||
/* if this daemon has reappeared, reset things */
|
||||
if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) {
|
||||
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* unload any sampled data */
|
||||
n=1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) {
|
||||
if (NULL != buf) {
|
||||
n=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
break;
|
||||
}
|
||||
orte_sensor_base_log(component, buf);
|
||||
OBJ_RELEASE(buf);
|
||||
free(component);
|
||||
n=1;
|
||||
}
|
||||
}
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Heartbeat sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_HEARTBEAT_H
|
||||
#define ORTE_SENSOR_HEARTBEAT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,75 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_heartbeat.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_sensor_heartbeat_open(void);
|
||||
static int orte_sensor_heartbeat_close(void);
|
||||
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_base_component_t mca_sensor_heartbeat_component = {
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"heartbeat", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_heartbeat_open, /* component open */
|
||||
orte_sensor_heartbeat_close, /* component close */
|
||||
orte_sensor_heartbeat_query /* component query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"heartbeat"
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_heartbeat_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query hook: hand back the heartbeat module with priority 5.
 * The priority is kept below that of the other samplers so that their
 * data gets included in the heartbeat.
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
    *priority = 5;
    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_heartbeat_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Help messages shipped with this component
dist_ompidata_DATA = help-orte-sensor-resusage.txt

# Sources shared by the DSO and the static build of the component
sources = \
        sensor_resusage.c \
        sensor_resusage.h \
        sensor_resusage_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_orte_sensor_resusage_DSO
component_noinst =
component_install = mca_sensor_resusage.la
else
component_noinst = libmca_sensor_resusage.la
component_install =
endif

# DSO build
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_resusage_la_SOURCES = $(sources)
mca_sensor_resusage_la_LDFLAGS = -module -avoid-version

# static build
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_resusage_la_SOURCES = $(sources)
libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version
|
@ -1,24 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_orte_sensor_resusage_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [
    AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile])

    # build this component only when sensors were requested
    # (orte_want_sensors = 1); otherwise run the not-found action
    AS_IF([test "$orte_want_sensors" = "1"],
          [$1], [$2])
])dnl
|
||||
|
@ -1,21 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the memory usage sensor
|
||||
#
|
||||
[mem-limit-exceeded]
|
||||
A process has exceeded the specified limit on memory usage:
|
||||
|
||||
Node: %s
|
||||
Process rank: %s
|
||||
Memory used: %luGbytes
|
||||
Memory limit: %luGbytes
|
||||
|
@ -1,478 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_ring_buffer.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/pstat/pstat.h"
|
||||
#include "opal/mca/db/db.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/base/sensor_private.h"
|
||||
#include "sensor_resusage.h"
|
||||
|
||||
/* declare the API functions */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void sample(void);
|
||||
static void res_log(opal_buffer_t *sample);
|
||||
|
||||
/* instantiate the module */
|
||||
orte_sensor_base_module_t orte_sensor_resusage_module = {
|
||||
init,
|
||||
finalize,
|
||||
NULL,
|
||||
NULL,
|
||||
sample,
|
||||
res_log
|
||||
};
|
||||
|
||||
static bool log_enabled = true;
|
||||
static orte_node_t *my_node;
|
||||
static orte_proc_t *my_proc;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* ensure my_proc and my_node are available on the global arrays */
|
||||
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
my_proc = OBJ_NEW(orte_proc_t);
|
||||
my_node = OBJ_NEW(orte_node_t);
|
||||
} else {
|
||||
if (NULL == (my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
if (NULL == (my_node = my_proc->node)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
/* protect the objects */
|
||||
OBJ_RETAIN(my_proc);
|
||||
OBJ_RETAIN(my_node);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void finalize(void)
|
||||
{
|
||||
if (NULL != my_proc) {
|
||||
OBJ_RELEASE(my_proc);
|
||||
}
|
||||
if (NULL != my_node) {
|
||||
OBJ_RELEASE(my_node);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void sample(void)
|
||||
{
|
||||
opal_pstats_t *stats, *st;
|
||||
opal_node_stats_t *nstats, *nst;
|
||||
int rc, i;
|
||||
orte_proc_t *child, *hog=NULL;
|
||||
float in_use, max_mem;
|
||||
opal_buffer_t buf, *bptr;
|
||||
char *comp;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
|
||||
"sample:resusage sampling resource usage"));
|
||||
|
||||
/* setup a buffer for our stats */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
/* pack our name */
|
||||
comp = strdup("resusage");
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
free(comp);
|
||||
|
||||
/* update stats on ourself and the node */
|
||||
stats = OBJ_NEW(opal_pstats_t);
|
||||
nstats = OBJ_NEW(opal_node_stats_t);
|
||||
if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(stats);
|
||||
OBJ_RELEASE(nstats);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/* the stats framework can't know nodename or rank */
|
||||
strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
|
||||
stats->rank = ORTE_PROC_MY_NAME->vpid;
|
||||
/* locally save the stats */
|
||||
if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
|
||||
OBJ_RELEASE(st);
|
||||
}
|
||||
if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
|
||||
/* release the popped value */
|
||||
OBJ_RELEASE(nst);
|
||||
}
|
||||
|
||||
/* pack them */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/* loop through our children and update their stats */
|
||||
if (NULL != orte_local_children) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
stats = OBJ_NEW(opal_pstats_t);
|
||||
if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
|
||||
/* may hit a race condition where the process has
|
||||
* terminated, so just ignore any error
|
||||
*/
|
||||
OBJ_RELEASE(stats);
|
||||
continue;
|
||||
}
|
||||
/* the stats framework can't know nodename or rank */
|
||||
strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN);
|
||||
stats->rank = child->name.vpid;
|
||||
/* store it */
|
||||
if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
|
||||
OBJ_RELEASE(st);
|
||||
}
|
||||
/* pack them */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* xfer any data for transmission */
|
||||
if (0 < buf.bytes_used) {
|
||||
bptr = &buf;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
/* are there any issues with node-level usage? */
|
||||
nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
|
||||
if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s CHECKING NODE MEM",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* compute the percentage of node memory in-use */
|
||||
in_use = 1.0 - (nst->free_mem / nst->total_mem);
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s PERCENT USED: %f LIMIT: %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
in_use, mca_sensor_resusage_component.node_memory_limit));
|
||||
if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
|
||||
/* loop through our children and find the biggest hog */
|
||||
hog = NULL;
|
||||
max_mem = 0.0;
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
|
||||
"%s PROC %s AT VSIZE %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), st->vsize));
|
||||
if (max_mem < st->vsize) {
|
||||
hog = child;
|
||||
max_mem = st->vsize;
|
||||
}
|
||||
}
|
||||
if (NULL == hog) {
|
||||
/* if all children dead and we are still too big,
|
||||
* then we must be the culprit - abort
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s NO CHILD: COMMITTING SUICIDE",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL);
|
||||
} else {
|
||||
/* report the problem */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&hog->name)));
|
||||
ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
/* since we have ordered someone to die, we've done enough for this
|
||||
* time around - don't check proc limits as well
|
||||
*/
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* check proc limits */
|
||||
if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
|
||||
"%s CHECKING PROC MEM",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* check my children first */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!child->alive) {
|
||||
continue;
|
||||
}
|
||||
if (0 == child->pid) {
|
||||
/* race condition */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
|
||||
"%s PROC %s AT VSIZE %f",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), st->vsize));
|
||||
if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
|
||||
/* report the problem */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Log one received resusage sample to the database.
 *
 * The sample holds, in order: the node name, one node-stats object, and
 * zero or more process-stats objects.  Node stats are written to the
 * "nodestats" log when log_node_stats is set; each process-stats object
 * is written to "procstats" when log_process_stats is set.  The first
 * database failure quietly disables logging for later calls.
 *
 * Fixes over the previous version:
 *  - the unpacked node name is freed on every path;
 *  - the "hostname" key is strdup'd like every other key instead of
 *    pointing at a string literal (NOTE(review): the values are
 *    destructed afterwards, which presumably frees the key - confirm);
 *  - all 13 node values are logged and destructed (previously 13 were
 *    constructed and filled but only 12 were passed on and destructed,
 *    dropping "la15").
 */
static void res_log(opal_buffer_t *sample)
{
    opal_pstats_t *st=NULL;
    opal_node_stats_t *nst=NULL;
    int rc, n, i, nkv;
    opal_value_t kv[14];
    char *node=NULL;

    if (!log_enabled) {
        return;
    }

    /* unpack the node name */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack the node stats */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        free(node);
        return;
    }

    if (mca_sensor_resusage_component.log_node_stats) {
        /* convert this into an array of opal_value_t's - no clean way
         * to do this, so have to just manually map each field
         */
        for (i=0; i < 13; i++) {
            OBJ_CONSTRUCT(&kv[i], opal_value_t);
        }
        i=0;
        kv[i].key = strdup("ctime");
        kv[i].type = OPAL_TIMEVAL;
        kv[i].data.tv.tv_sec = nst->sample_time.tv_sec;
        kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec;

        kv[i].key = strdup("hostname");
        kv[i].type = OPAL_STRING;
        kv[i++].data.string = strdup(node);

        kv[i].key = strdup("total_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->total_mem;

        kv[i].key = strdup("free_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->free_mem;

        kv[i].key = strdup("buffers");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->buffers;

        kv[i].key = strdup("cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->cached;

        kv[i].key = strdup("swap_total");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_total;

        kv[i].key = strdup("swap_free");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_free;

        kv[i].key = strdup("mapped");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->mapped;

        kv[i].key = strdup("swap_cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_cached;

        kv[i].key = strdup("la");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la;

        kv[i].key = strdup("la5");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la5;

        kv[i].key = strdup("la15");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la15;

        /* i now equals the number of values filled in (13) */
        nkv = i;

        /* store it */
        if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, nkv))) {
            /* don't bark about it - just quietly disable the log */
            log_enabled = false;
        }
        for (i=0; i < nkv; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
    }

    /* the node name was only needed for the hostname value above */
    free(node);
    OBJ_RELEASE(nst);

    if (mca_sensor_resusage_component.log_process_stats) {
        /* unpack all process stats */
        n=1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) {
            for (i=0; i < 14; i++) {
                OBJ_CONSTRUCT(&kv[i], opal_value_t);
            }
            kv[0].key = strdup("node");
            kv[0].type = OPAL_STRING;
            kv[0].data.string = strdup(st->node);
            kv[1].key = strdup("rank");
            kv[1].type = OPAL_INT32;
            kv[1].data.int32 = st->rank;
            kv[2].key = strdup("pid");
            kv[2].type = OPAL_PID;
            kv[2].data.pid = st->pid;
            kv[3].key = strdup("cmd");
            kv[3].type = OPAL_STRING;
            kv[3].data.string = strdup(st->cmd);
            /* state is copied as two characters plus a terminator */
            kv[4].key = strdup("state");
            kv[4].type = OPAL_STRING;
            kv[4].data.string = (char*)malloc(3 * sizeof(char));
            kv[4].data.string[0] = st->state[0];
            kv[4].data.string[1] = st->state[1];
            kv[4].data.string[2] = '\0';
            kv[5].key = strdup("time");
            kv[5].type = OPAL_TIMEVAL;
            kv[5].data.tv.tv_sec = st->time.tv_sec;
            kv[5].data.tv.tv_usec = st->time.tv_usec;
            kv[6].key = strdup("percent_cpu");
            kv[6].type = OPAL_FLOAT;
            kv[6].data.fval = st->percent_cpu;
            kv[7].key = strdup("priority");
            kv[7].type = OPAL_INT32;
            kv[7].data.int32 = st->priority;
            kv[8].key = strdup("num_threads");
            kv[8].type = OPAL_INT16;
            kv[8].data.int16 = st->num_threads;
            kv[9].key = strdup("vsize");
            kv[9].type = OPAL_FLOAT;
            kv[9].data.fval = st->vsize;
            kv[10].key = strdup("rss");
            kv[10].type = OPAL_FLOAT;
            kv[10].data.fval = st->rss;
            kv[11].key = strdup("peak_vsize");
            kv[11].type = OPAL_FLOAT;
            kv[11].data.fval = st->peak_vsize;
            kv[12].key = strdup("processor");
            kv[12].type = OPAL_INT16;
            kv[12].data.int16 = st->processor;
            kv[13].key = strdup("sample_time");
            kv[13].type = OPAL_TIMEVAL;
            kv[13].data.tv.tv_sec = st->sample_time.tv_sec;
            kv[13].data.tv.tv_usec = st->sample_time.tv_usec;
            /* store it */
            if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, 14))) {
                log_enabled = false;
            }
            for (i=0; i < 14; i++) {
                OBJ_DESTRUCT(&kv[i]);
            }
            OBJ_RELEASE(st);
            n=1;
        }
        /* running off the end of the sample is the normal loop exit */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }
}
|
@ -1,41 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Process Resource Utilization sensor
|
||||
*/
|
||||
#ifndef ORTE_SENSOR_RESUSAGE_H
|
||||
#define ORTE_SENSOR_RESUSAGE_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct orte_sensor_resusage_component_t {
|
||||
orte_sensor_base_component_t super;
|
||||
int sample_rate;
|
||||
float node_memory_limit;
|
||||
float proc_memory_limit;
|
||||
bool log_node_stats;
|
||||
bool log_process_stats;
|
||||
};
|
||||
typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component;
|
||||
extern orte_sensor_base_module_t orte_sensor_resusage_module;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,138 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "sensor_resusage.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_sensor_resusage_register (void);
|
||||
static int orte_sensor_resusage_open(void);
|
||||
static int orte_sensor_resusage_close(void);
|
||||
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
orte_sensor_resusage_component_t mca_sensor_resusage_component = {
|
||||
{
|
||||
{
|
||||
ORTE_SENSOR_BASE_VERSION_1_0_0,
|
||||
|
||||
"resusage", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_sensor_resusage_open, /* component open */
|
||||
orte_sensor_resusage_close, /* component close */
|
||||
orte_sensor_resusage_query, /* component query */
|
||||
orte_sensor_resusage_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
"procresource,noderesource"
|
||||
}
|
||||
};
|
||||
|
||||
/* integer storage for the registered limits; converted to the float
 * fields of the component by the register and open hooks
 */
static int node_memory_limit;   /* percentage of node memory */
static int proc_memory_limit;   /* MBytes of virtual memory */
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_sensor_resusage_register (void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version;
|
||||
|
||||
mca_sensor_resusage_component.sample_rate = 0;
|
||||
(void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.sample_rate);
|
||||
if (mca_sensor_resusage_component.sample_rate < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
node_memory_limit = 0;
|
||||
(void) mca_base_component_var_register (c, "node_memory_limit",
|
||||
"Percentage of total memory that can be in-use",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&node_memory_limit);
|
||||
mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0;
|
||||
|
||||
proc_memory_limit = 0;
|
||||
(void) mca_base_component_var_register (c, "proc_memory_limit",
|
||||
"Max virtual memory size in MBytes",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&proc_memory_limit);
|
||||
mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;
|
||||
|
||||
mca_sensor_resusage_component.log_node_stats = false;
|
||||
(void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.log_node_stats);
|
||||
|
||||
mca_sensor_resusage_component.log_process_stats = false;
|
||||
(void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_sensor_resusage_component.log_process_stats);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_sensor_resusage_open(void)
|
||||
{
|
||||
if (mca_sensor_resusage_component.sample_rate < 0) {
|
||||
opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0;
|
||||
mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Component query hook: report the resusage module and the priority it
 * should be given.
 *
 * Always returns ORTE_SUCCESS.
 */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority)
{
    /* hand back the module itself ... */
    *module = (mca_base_module_t *) &orte_sensor_resusage_module;
    /* ... and a priority chosen to rank ahead of heartbeat */
    *priority = 100;

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_sensor_resusage_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,107 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* @file:
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_SENSOR_H
|
||||
#define MCA_SENSOR_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* start collecting data */
|
||||
typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job);
|
||||
|
||||
/* stop collecting data */
|
||||
typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job);
|
||||
|
||||
/* API module */
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_API_module_1_0_0_t {
|
||||
orte_sensor_API_module_start_fn_t start;
|
||||
orte_sensor_API_module_stop_fn_t stop;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
|
||||
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
|
||||
|
||||
/* initialize the module */
|
||||
typedef int (*orte_sensor_base_module_init_fn_t)(void);
|
||||
|
||||
/* finalize the module */
|
||||
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
|
||||
|
||||
/* tell the module to sample its sensor */
|
||||
typedef void (*orte_sensor_base_module_sample_fn_t)(void);
|
||||
|
||||
/* pass a buffer to the module for logging */
|
||||
typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample);
|
||||
|
||||
/*
|
||||
* Component modules Ver 1.0
|
||||
*/
|
||||
struct orte_sensor_base_module_1_0_0_t {
|
||||
orte_sensor_base_module_init_fn_t init;
|
||||
orte_sensor_base_module_finalize_fn_t finalize;
|
||||
orte_sensor_API_module_start_fn_t start;
|
||||
orte_sensor_API_module_stop_fn_t stop;
|
||||
orte_sensor_base_module_sample_fn_t sample;
|
||||
orte_sensor_base_module_log_fn_t log;
|
||||
};
|
||||
|
||||
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
|
||||
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
struct orte_sensor_base_component_1_0_0_t {
|
||||
mca_base_component_t base_version;
|
||||
mca_base_component_data_t base_data;
|
||||
char *data_measured;
|
||||
};
|
||||
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
|
||||
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
|
||||
|
||||
|
||||
|
||||
/*
 * Version initializer for components of type sensor v1.0.0.
 *
 * Expands to the MCA v2.0 base version (which sensor v1.0 is chained to)
 * followed by the sensor v1.0 identification: "sensor", 1, 0, 0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
    MCA_BASE_VERSION_2_0_0, \
    "sensor", 1, 0, 0
|
||||
|
||||
/* Global structure for accessing sensor functions
|
||||
*/
|
||||
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_SENSOR_H */
|
@ -1,51 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_SENSOR_TYPES_H
|
||||
#define ORTE_MCA_SENSOR_TYPES_H
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
|
||||
/*
|
||||
* General SENSOR types - instanced in runtime/orte_globals.c
|
||||
*/
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
enum {
|
||||
ORTE_SENSOR_SCALE_LINEAR,
|
||||
ORTE_SENSOR_SCALE_LOG,
|
||||
ORTE_SENSOR_SCALE_SIGMOID
|
||||
};
|
||||
|
||||
/*
|
||||
* Structure for passing data from sensors
|
||||
*/
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
char *sensor;
|
||||
struct timeval timestamp;
|
||||
opal_byte_object_t data;
|
||||
} orte_sensor_data_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -2,6 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -85,27 +86,19 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
|
||||
ORTE_JOBID_PRINT(shadow->jobid), \
|
||||
orte_job_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
/* sanity check */ \
|
||||
if ((s) < 0) { \
|
||||
assert(0); \
|
||||
} \
|
||||
orte_state.activate_job_state(shadow, (s)); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
|
||||
do { \
|
||||
orte_process_name_t *shadow=(p); \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
opal_output_verbose(1, orte_state_base_framework.framework_output, \
|
||||
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
(NULL == shadow) ? "NULL" : \
|
||||
ORTE_NAME_PRINT(shadow), \
|
||||
orte_proc_state_to_str((s)), \
|
||||
__FILE__, __LINE__); \
|
||||
/* sanity check */ \
|
||||
if ((s) < 0) { \
|
||||
assert(0); \
|
||||
} \
|
||||
orte_state.activate_proc_state(shadow, (s)); \
|
||||
} while(0);
|
||||
|
||||
|
@ -1,13 +1,13 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
@ -102,7 +102,9 @@ static opal_pmix_server_module_t pmix_server = {
|
||||
.notify_event = pmix_server_notify_event,
|
||||
.query = pmix_server_query_fn,
|
||||
.tool_connected = pmix_tool_connected_fn,
|
||||
.log = pmix_server_log_fn
|
||||
.log = pmix_server_log_fn,
|
||||
.allocate = pmix_server_alloc_fn,
|
||||
.job_control = pmix_server_job_ctrl_fn
|
||||
};
|
||||
|
||||
void pmix_server_register_params(void)
|
||||
@ -265,6 +267,12 @@ int pmix_server_init(void)
|
||||
kv->type = OPAL_BOOL;
|
||||
kv->data.flag = true;
|
||||
opal_list_append(&info, &kv->super);
|
||||
/* tell the server to use its own internal monitoring */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_SERVER_ENABLE_MONITORING);
|
||||
kv->type = OPAL_BOOL;
|
||||
kv->data.flag = true;
|
||||
opal_list_append(&info, &kv->super);
|
||||
|
||||
/* setup the local server */
|
||||
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
|
||||
|
@ -511,3 +511,13 @@ int pmix_server_disconnect_fn(opal_list_t *procs, opal_list_t *info,
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int pmix_server_alloc_fn(const opal_process_name_t *requestor,
|
||||
opal_pmix_alloc_directive_t dir,
|
||||
opal_list_t *info,
|
||||
opal_pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
/* ORTE currently has no way of supporting allocation requests */
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
@ -40,10 +40,12 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/schizo/schizo.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
|
||||
#include "pmix_server_internal.h"
|
||||
@ -611,7 +613,15 @@ static void _query(int sd, short args, void *cbdata)
|
||||
* and ask directly for the info - if rank=wildcard, then
|
||||
* we need to xcast the request and collect the results */
|
||||
}
|
||||
|
||||
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_TIME_REMAINING)) {
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_TIME_REMAINING);
|
||||
kv->type = OPAL_UINT32;
|
||||
if (ORTE_SUCCESS != orte_schizo.get_remaining_time(&kv->data.uint32)) {
|
||||
OBJ_RELEASE(kv);
|
||||
} else {
|
||||
opal_list_append(results, &kv->super);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -813,3 +823,62 @@ void pmix_server_log_fn(opal_process_name_t *requestor,
|
||||
cbfunc(OPAL_SUCCESS, cbdata);
|
||||
}
|
||||
}
|
||||
|
||||
int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
|
||||
opal_list_t *targets,
|
||||
opal_list_t *info,
|
||||
opal_pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata)
|
||||
{
|
||||
opal_value_t *val;
|
||||
int rc, n;
|
||||
orte_proc_t *proc;
|
||||
opal_pointer_array_t parray, *ptrarray;
|
||||
opal_namelist_t *nm;
|
||||
|
||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||
"%s job control request from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(requestor));
|
||||
|
||||
OPAL_LIST_FOREACH(val, info, opal_value_t) {
|
||||
if (NULL == val->key) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (0 == strcmp(val->key, OPAL_PMIX_JOB_CTRL_KILL)) {
|
||||
/* convert the list of targets to a pointer array */
|
||||
if (NULL == targets) {
|
||||
ptrarray = NULL;
|
||||
} else {
|
||||
OBJ_CONSTRUCT(&parray, opal_pointer_array_t);
|
||||
OPAL_LIST_FOREACH(nm, targets, opal_namelist_t) {
|
||||
/* get the proc object for this proc */
|
||||
if (NULL == (proc = orte_get_proc_object(&nm->name))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
continue;
|
||||
}
|
||||
OBJ_RETAIN(proc);
|
||||
opal_pointer_array_add(&parray, proc);
|
||||
}
|
||||
ptrarray = &parray;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_procs(ptrarray))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (NULL != ptrarray) {
|
||||
/* cleanup the array */
|
||||
for (n=0; n < parray.size; n++) {
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(&parray, n))) {
|
||||
OBJ_RELEASE(proc);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&parray);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -206,6 +206,18 @@ extern void pmix_server_log_fn(opal_process_name_t *requestor,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
extern int pmix_server_alloc_fn(const opal_process_name_t *requestor,
|
||||
opal_pmix_alloc_directive_t dir,
|
||||
opal_list_t *info,
|
||||
opal_pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
extern int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
|
||||
opal_list_t *targets,
|
||||
opal_list_t *info,
|
||||
opal_pmix_info_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
|
||||
/* declare the RML recv functions for responses */
|
||||
extern void pmix_server_launch_resp(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
|
@ -705,7 +705,7 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
char *ndnames, *rmndr, **tmp;
|
||||
opal_list_t dids, slts, flgs;;
|
||||
opal_buffer_t *bptr=NULL;
|
||||
orte_topology_t *t;
|
||||
orte_topology_t *t2;
|
||||
orte_regex_range_t *rng, *drng, *srng, *frng;
|
||||
uint8_t ui8;
|
||||
|
||||
@ -978,14 +978,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
|
||||
/* if no topology info was passed, then everyone shares our topology */
|
||||
if (NULL == bptr) {
|
||||
orte_topology_t *t;
|
||||
/* our topology is first in the array */
|
||||
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
|
||||
t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
|
||||
for (n=0; n < orte_node_pool->size; n++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
|
||||
if (NULL == node->topology) {
|
||||
OBJ_RETAIN(t);
|
||||
node->topology = t;
|
||||
OBJ_RETAIN(t2);
|
||||
node->topology = t2;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1004,6 +1003,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
OBJ_RELEASE(bptr);
|
||||
goto cleanup;
|
||||
}
|
||||
if (NULL == sig) {
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_argv_free(tmp);
|
||||
OBJ_RELEASE(bptr);
|
||||
goto cleanup;
|
||||
}
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -1013,11 +1019,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
goto cleanup;
|
||||
}
|
||||
/* see if we already have this topology - could be an update */
|
||||
t2 = NULL;
|
||||
for (n=0; n < orte_node_topologies->size; n++) {
|
||||
if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
|
||||
if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(t->sig, sig)) {
|
||||
if (0 == strcmp(t2->sig, sig)) {
|
||||
/* found a match */
|
||||
free(sig);
|
||||
opal_hwloc_base_free_topology(topo);
|
||||
@ -1025,11 +1032,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL != sig) {
|
||||
if (NULL != sig || NULL == t2) {
|
||||
/* new topology - record it */
|
||||
t = OBJ_NEW(orte_topology_t);
|
||||
t->sig = sig;
|
||||
t->topo = topo;
|
||||
t2 = OBJ_NEW(orte_topology_t);
|
||||
t2->sig = sig;
|
||||
t2->topo = topo;
|
||||
opal_pointer_array_add(orte_node_topologies, t2);
|
||||
}
|
||||
/* point each of the nodes in the regex to this topology */
|
||||
start = strtoul(tmp[nn], &rmndr, 10);
|
||||
@ -1043,8 +1051,8 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
for (k=start; k <= endpt; k++) {
|
||||
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) {
|
||||
if (NULL == node->topology) {
|
||||
OBJ_RETAIN(t);
|
||||
node->topology = t;
|
||||
OBJ_RETAIN(t2);
|
||||
node->topology = t2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user