1
1

Merge pull request #3218 from rhc54/topic/pmix2

Update to include the PMIx 2.0 APIs for monitoring and job control.
Этот коммит содержится в:
Ralph Castain 2017-03-21 20:11:10 -07:00 коммит произвёл GitHub
родитель 10d401b6ec d645557fa0
Коммит ea84a53faa
83 изменённых файлов: 2709 добавлений и 2975 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -94,7 +94,9 @@ enum {
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66),
OPAL_ERR_HEARTBEAT_ALERT = (OPAL_ERR_BASE - 67),
OPAL_ERR_FILE_ALERT = (OPAL_ERR_BASE - 68)
};
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Mellanox Technologies, Inc.
@ -352,7 +352,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
if (NULL != chain->final_cbfunc) {
chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata);
}
OBJ_RELEASE(chain);
return;

Просмотреть файл

@ -473,6 +473,59 @@ pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive,
pmix_info_t *info, size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request a job control action. The targets array identifies the
* processes to which the requested job control action is to be applied.
* A NULL value can be used to indicate all processes in the caller's
* nspace. PMIX_RANK_WILDCARD can also be used to indicate
* that all processes in the given nspace are to be included.
*
* The directives are provided as pmix_info_t structs in the directives
* array. The callback function provides a status to indicate whether or
* not the request was granted, and to provide some information as to
* the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t
* structures. If non-NULL, then the specified release_fn must be called
* when the callback function completes - this will be used to release
* any provided pmix_info_t array.
*/
pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request that something be monitored - e.g., that the server monitor
* this process for periodic heartbeats as an indication that the process
* has not become "wedged". When a monitor detects the specified alarm
* condition, it will generate an event notification using the provided
* error code and passing along any available relevant information. It is
* up to the caller to register a corresponding event handler.
*
* Params:
*
* monitor: attribute indicating the type of monitor being requested - e.g.,
* PMIX_MONITOR_FILE to indicate that the requestor is asking that
* a file be monitored.
*
* error: the status code to be used when generating an event notification
* alerting that the monitor has been triggered. The range of the
* notification defaults to PMIX_RANGE_NAMESPACE - this can be
* changed by providing a PMIX_RANGE directive
*
* directives: characterize the monitoring request (e.g., monitor file size)
* and frequency of checking to be done
*
* cbfunc: provides a status to indicate whether or not the request was granted,
* and to provide some information as to the reason for any denial in
* the pmix_info_cbfunc_t array of pmix_info_t structures.
*
* Note: a process can send a heartbeat to the server using the PMIx_Heartbeat
* macro provided below*/
pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* define a special macro to simplify sending of a heartbeat */
/* NOTE(review): this expansion passes five arguments, while
 * PMIx_Process_monitor_nb is declared with six (monitor, error,
 * directives, ndirs, cbfunc, cbdata). In addition, PMIX_SEND_HEARTBEAT
 * is a string key, whereas the first parameter is a const pmix_info_t*.
 * As written, any use of this macro is not expected to compile cleanly -
 * TODO confirm the intended call and correct the argument list. */
#define PMIx_Heartbeat() \
PMIx_Process_monitor_nb(PMIX_SEND_HEARTBEAT, NULL, 0, NULL, NULL)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -123,6 +123,8 @@ typedef uint32_t pmix_rank_t;
// a local system-level PMIx server
#define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
#define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
/* identification attributes */
#define PMIX_USERID "pmix.euid" // (uint32_t) effective user id
@ -218,8 +220,9 @@ typedef uint32_t pmix_rank_t;
#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish
#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish
#define PMIX_RANGE "pmix.range" // (pmix_data_range_t) value for calls to publish/lookup/unpublish or for
// monitoring event notifications
#define PMIX_PERSISTENCE "pmix.persist" // (pmix_persistence_t) value for calls to publish
#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
// not request data from the server if not found
#define PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
@ -259,66 +262,72 @@ typedef uint32_t pmix_rank_t;
#define PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
/* attributes used to describe "spawn" attributes */
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
// job - i.e., not part of the "comm_world" of the job
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
// job - i.e., not part of the "comm_world" of the job
/* query attributes */
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
// procs in job on same node
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
// procs in job on same node
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // (bool) return operations tool is authorized to perform
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // (bool) return a comma-delimited list of supported spawn attributes
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // (bool) return a comma-delimited list of supported debug attributes
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // (bool) return info on memory usage for the procs indicated in the qualifiers
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // (bool) constrain the query to local information only
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // (bool) report average values
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // (bool) report minimum and maximum value
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
// for the specified nspace
/* log attributes */
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
#define PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
#define PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
#define PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
/* debugger attributes */
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
/* Resource Manager identification */
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
/* attributes for setting envars */
#define PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
@ -327,7 +336,6 @@ typedef uint32_t pmix_rank_t;
/* attributes relating to allocations */
#define PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
// which can later be used to query status of the request
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (uint32_t) get number of seconds remaining in allocation
#define PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
#define PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
#define PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
@ -343,6 +351,38 @@ typedef uint32_t pmix_rank_t;
#define PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
#define PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
/* job control attributes */
#define PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request
#define PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes
#define PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes
#define PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request
// (NULL => cancel all requests from this requestor)
#define PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup
#define PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID
#define PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it
#define PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint
#define PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint
#define PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete
// NOTE(review): the key string above is identical to the one used by
// PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, so the two attributes cannot be told
// apart by key - TODO confirm whether a distinct key was intended
#define PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes
#define PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned
#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
/* monitoring attributes */
#define PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
#define PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
#define PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
#define PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
// specified action
#define PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
#define PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
#define PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
#define PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
#define PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
#define PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
// specified action
/**** PROCESS STATE DEFINITIONS ****/
typedef uint8_t pmix_proc_state_t;
#define PMIX_PROC_STATE_UNDEF 0 /* undefined process state */
@ -455,7 +495,14 @@ typedef int pmix_status_t;
#define PMIX_ERR_LOST_CONNECTION_TO_CLIENT (PMIX_ERR_V2X_BASE - 3)
/* used by the query system */
#define PMIX_QUERY_PARTIAL_SUCCESS (PMIX_ERR_V2X_BASE - 4)
/* request responses */
#define PMIX_NOTIFY_ALLOC_COMPLETE (PMIX_ERR_V2X_BASE - 5)
/* job control */
#define PMIX_JCTRL_CHECKPOINT (PMIX_ERR_V2X_BASE - 6)
#define PMIX_JCTRL_PREEMPT_ALERT (PMIX_ERR_V2X_BASE - 7)
/* monitoring */
#define PMIX_MONITOR_HEARTBEAT_ALERT (PMIX_ERR_V2X_BASE - 8)
#define PMIX_MONITOR_FILE_ALERT (PMIX_ERR_V2X_BASE - 9)
/* define a starting point for operational error constants so
* we avoid renumbering when making additions */

Просмотреть файл

@ -328,6 +328,17 @@ typedef pmix_status_t (*pmix_server_alloc_fn_t)(const pmix_proc_t *client,
const pmix_info_t data[], size_t ndata,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Execute a job control action on behalf of a client */
typedef pmix_status_t (*pmix_server_job_control_fn_t)(const pmix_proc_t *requestor,
const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request that a client be monitored for activity */
typedef pmix_status_t (*pmix_server_monitor_fn_t)(const pmix_proc_t *requestor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
typedef struct pmix_server_module_2_0_0_t {
/* v1x interfaces */
pmix_server_client_connected_fn_t client_connected;
@ -350,12 +361,14 @@ typedef struct pmix_server_module_2_0_0_t {
pmix_server_tool_connection_fn_t tool_connected;
pmix_server_log_fn_t log;
pmix_server_alloc_fn_t allocate;
pmix_server_job_control_fn_t job_control;
pmix_server_monitor_fn_t monitor;
} pmix_server_module_t;
/**** SERVER SUPPORT INIT/FINALIZE FUNCTIONS ****/
/* Initialize the server support library, and provide a
* pointer to a pmix_server_module_t structure
* pointer to a pmix_server_module_t structure
* containing the caller's callback functions. The
* array of pmix_info_t structs is used to pass
* additional info that may be required by the server

Просмотреть файл

@ -1,6 +1,6 @@
# -*- makefile -*-
#
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
@ -13,4 +13,5 @@ sources += \
common/pmix_query.c \
common/pmix_strings.c \
common/pmix_log.c \
common/pmix_jobdata.c
common/pmix_jobdata.c \
common/pmix_control.c

Просмотреть файл

@ -0,0 +1,269 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include <src/include/pmix_stdint.h>
#include <src/include/pmix_socket_errno.h>
#include <pmix.h>
#include <pmix_common.h>
#include <pmix_server.h>
#include <pmix_rename.h>
#include "src/util/argv.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/buffer_ops/buffer_ops.h"
#include "src/mca/ptl/ptl.h"
#include "src/client/pmix_client_ops.h"
#include "src/server/pmix_server_ops.h"
#include "src/include/pmix_globals.h"
/* Release function handed to the user's info callback.
 *
 * cbdata: the pmix_shift_caddy_t that carried the reply; any info
 *         array attached to it is freed before the caddy itself is
 *         released.
 */
static void relcbfunc(void *cbdata)
{
    pmix_shift_caddy_t *caddy = (pmix_shift_caddy_t*)cbdata;

    pmix_output_verbose(2, pmix_globals.debug_output, "pmix:query release callback");

    /* only free the array when the reply actually carried one */
    if (NULL != caddy->info) {
        PMIX_INFO_FREE(caddy->info, caddy->ninfo);
    }

    PMIX_RELEASE(caddy);
}
/* Receive handler for the server's reply to a job-control or monitor
 * request.
 *
 * The reply is unpacked into a freshly created pmix_shift_caddy_t:
 * first the status, then (only if the status is PMIX_SUCCESS) the
 * number of returned pmix_info_t entries followed by the entries
 * themselves. The result is then delivered to the callback stored in
 * the pmix_query_caddy_t passed as cbdata.
 *
 * peer, hdr: not used by this handler
 * buf:       buffer holding the packed reply
 * cbdata:    the pmix_query_caddy_t created by the requesting call;
 *            released here once the reply has been delivered
 *
 * If any unpack step fails, the unpack error is reported to the caller
 * as the status of the request rather than whatever value the status
 * field held at that point.
 */
static void query_cbfunc(struct pmix_peer_t *peer,
                         pmix_ptl_hdr_t *hdr,
                         pmix_buffer_t *buf, void *cbdata)
{
    pmix_query_caddy_t *cd = (pmix_query_caddy_t*)cbdata;
    pmix_status_t rc;
    pmix_shift_caddy_t *results;
    int cnt;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query cback from server");

    results = PMIX_NEW(pmix_shift_caddy_t);

    /* unpack the status */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->status, &cnt, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        /* the reply could not be decoded - report that, not a stale status */
        results->status = rc;
        goto complete;
    }
    if (PMIX_SUCCESS != results->status) {
        /* the server refused the request - nothing more to unpack */
        goto complete;
    }

    /* unpack any returned data */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        results->status = rc;
        goto complete;
    }
    if (0 < results->ninfo) {
        PMIX_INFO_CREATE(results->info, results->ninfo);
        cnt = results->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, results->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            /* do not present a partially decoded array as a success */
            results->status = rc;
            goto complete;
        }
    }

  complete:
    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query cback from server releasing");
    /* release the caller */
    if (NULL != cd->cbfunc) {
        /* the callback owns results until it invokes relcbfunc */
        cd->cbfunc(results->status, results->info, results->ninfo, cd->cbdata, relcbfunc, results);
    } else {
        /* nobody will ever call the release function, so clean up here */
        relcbfunc(results);
    }
    PMIX_RELEASE(cd);
}
/* Request a job control action (non-blocking).
 *
 * targets, ntargets: processes the action applies to; targets may be
 *                    NULL with ntargets == 0
 * directives, ndirs: pmix_info_t array describing the requested action
 * cbfunc, cbdata:    callback (and its opaque argument) that receives
 *                    the outcome of the request
 *
 * When called by a server, the request is handed directly to the host's
 * job_control entry point and that function's return value is returned.
 * Otherwise the request is packed (command, targets, directives) and
 * sent to the local server; the reply is delivered to cbfunc through
 * query_cbfunc.
 *
 * Returns:
 *   PMIX_ERR_INIT          - the library has not been initialized
 *   PMIX_ERR_NOT_SUPPORTED - server with no host job_control function
 *   PMIX_ERR_UNREACH       - client that is not connected to a server
 *   PMIX_ERR_BAD_PARAM     - a non-zero count was given with a NULL array
 *   otherwise the status of the host call, the pack operations, or the
 *   send request
 */
PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
                                              const pmix_info_t directives[], size_t ndirs,
                                              pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_buffer_t *msg;
    pmix_cmd_t cmd = PMIX_JOB_CONTROL_CMD;
    pmix_status_t rc;
    pmix_query_caddy_t *cb;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: job control called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* if we are the server, then we just issue the request and
     * return the response */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.job_control) {
            /* nothing we can do */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:job_control handed to RM");
        rc = pmix_host_server.job_control(&pmix_globals.myid,
                                          targets, ntargets,
                                          directives, ndirs,
                                          cbfunc, cbdata);
        return rc;
    }

    /* if we are a client, then relay this request to the server */

    /* if we aren't connected, don't attempt to send */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    /* a non-zero count paired with a NULL array would have the pack
     * routines read through a NULL pointer - reject it up front */
    if ((0 < ntargets && NULL == targets) ||
        (0 < ndirs && NULL == directives)) {
        return PMIX_ERR_BAD_PARAM;
    }

    msg = PMIX_NEW(pmix_buffer_t);

    /* pack the cmd */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the number of targets */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ntargets, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    /* remember, the targets can be NULL to indicate that the operation
     * is to be done against all members of our nspace */
    if (0 < ntargets) {
        /* pack the targets */
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, targets, ntargets, PMIX_PROC))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* pack the directives */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    if (0 < ndirs) {
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = PMIX_NEW(pmix_query_caddy_t);
    cb->cbfunc = cbfunc;
    cb->cbdata = cbdata;

    /* push the message into our event base to send to the server */
    if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))) {
        /* the send was not queued, so the callback will never fire */
        PMIX_RELEASE(cb);
        goto release_msg;
    }
    return rc;

  release_msg:
    /* single exit for every failure after the buffer was created */
    PMIX_RELEASE(msg);
    return rc;
}
/* Request that something be monitored (non-blocking).
 *
 * monitor:           attribute identifying the type of monitor requested
 * error:             status code to use when the monitor generates an
 *                    event notification
 * directives, ndirs: pmix_info_t array characterizing the request
 * cbfunc, cbdata:    callback (and its opaque argument) that receives
 *                    the outcome of the request
 *
 * When called by a server, the request is handed directly to the host's
 * monitor entry point and that function's return value is returned.
 * Otherwise the request is packed (command, error code, directives) and
 * sent to the local server; the reply is delivered to cbfunc through
 * query_cbfunc.
 *
 * NOTE(review): the monitor argument is not referenced anywhere in this
 * function - it is neither passed to the host nor packed into the
 * message - TODO confirm how the server is expected to learn which
 * monitor is being requested.
 *
 * Returns:
 *   PMIX_ERR_INIT          - the library has not been initialized
 *   PMIX_ERR_NOT_SUPPORTED - server with no host monitor function
 *   PMIX_ERR_UNREACH       - client that is not connected to a server
 *   PMIX_ERR_BAD_PARAM     - ndirs is non-zero but directives is NULL
 *   otherwise the status of the host call, the pack operations, or the
 *   send request
 */
PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
                                                  const pmix_info_t directives[], size_t ndirs,
                                                  pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_buffer_t *msg;
    pmix_cmd_t cmd = PMIX_MONITOR_CMD;
    pmix_status_t rc;
    pmix_query_caddy_t *cb;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: monitor called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* if we are the server, then we just issue the request and
     * return the response */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.monitor) {
            /* nothing we can do */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:monitor handed to RM");
        rc = pmix_host_server.monitor(&pmix_globals.myid, error,
                                      directives, ndirs, cbfunc, cbdata);
        return rc;
    }

    /* if we are a client, then relay this request to the server */

    /* if we aren't connected, don't attempt to send */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    /* a non-zero count paired with a NULL array would have the pack
     * routine read through a NULL pointer - reject it up front */
    if (0 < ndirs && NULL == directives) {
        return PMIX_ERR_BAD_PARAM;
    }

    msg = PMIX_NEW(pmix_buffer_t);

    /* pack the cmd */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the error */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &error, 1, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the directives */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    if (0 < ndirs) {
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = PMIX_NEW(pmix_query_caddy_t);
    cb->cbfunc = cbfunc;
    cb->cbdata = cbdata;

    /* push the message into our event base to send to the server */
    if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))) {
        /* the send was not queued, so the callback will never fire */
        PMIX_RELEASE(cb);
        goto release_msg;
    }
    return rc;

  release_msg:
    /* single exit for every failure after the buffer was created */
    PMIX_RELEASE(msg);
    return rc;
}

Просмотреть файл

@ -257,6 +257,8 @@ static void qcon(pmix_query_caddy_t *p)
{
p->queries = NULL;
p->nqueries = 0;
p->targets = NULL;
p->ntargets = 0;
p->info = NULL;
p->ninfo = 0;
p->cbfunc = NULL;

Просмотреть файл

@ -72,7 +72,9 @@ typedef enum {
PMIX_DEREGEVENTS_CMD,
PMIX_QUERY_CMD,
PMIX_LOG_CMD,
PMIX_ALLOC_CMD
PMIX_ALLOC_CMD,
PMIX_JOB_CONTROL_CMD,
PMIX_MONITOR_CMD
} pmix_cmd_t;
/* provide a "pretty-print" function for cmds */
@ -214,6 +216,8 @@ typedef struct {
pmix_status_t status;
pmix_query_t *queries;
size_t nqueries;
pmix_proc_t *targets;
size_t ntargets;
pmix_info_t *info;
size_t ninfo;
pmix_info_cbfunc_t cbfunc;

Просмотреть файл

@ -256,4 +256,13 @@ typedef struct event pmix_event_t;
#define pmix_event_active(x, y, z) event_active((x), (y), (z))
#define pmix_event_evtimer_new(b, cb, arg) pmix_event_new((b), -1, 0, (cb), (arg))
#define pmix_event_evtimer_add(x, tv) pmix_event_add((x), (tv))
#define pmix_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg))
#define pmix_event_evtimer_del(x) pmix_event_del((x))
#endif /* PMIX_TYPES_H */

Просмотреть файл

@ -3,26 +3,27 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
AM_CPPFLAGS = $(LTDLINCL)
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
noinst_LTLIBRARIES = libmca_psensor.la
libmca_psensor_la_SOURCES =
# local files
headers = sensor.h \
sensor_types.h
headers = psensor.h
libmca_sensor_la_SOURCES += $(headers)
libmca_psensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(ompiincludedir)/$(subdir)
nobase_orte_HEADERS = $(headers)
pmixdir = $(pmixincludedir)/$(subdir)
nobase_pmix_HEADERS = $(headers)
endif
include base/Makefile.am

Просмотреть файл

@ -1,5 +1,5 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
@ -11,10 +11,9 @@
#
headers += \
base/base.h \
base/sensor_private.h
base/base.h
libmca_sensor_la_SOURCES += \
base/sensor_base_frame.c \
base/sensor_base_select.c \
base/sensor_base_fns.c
libmca_psensor_la_SOURCES += \
base/psensor_base_frame.c \
base/psensor_base_select.c \
base/psensor_base_stubs.c

Просмотреть файл

@ -0,0 +1,59 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef PMIX_PSENSOR_BASE_H_
#define PMIX_PSENSOR_BASE_H_
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/mca.h"
#include "src/mca/base/pmix_mca_base_framework.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*/
/* Framework object for the psensor framework */
PMIX_EXPORT extern pmix_mca_base_framework_t pmix_psensor_base_framework;
/* Query the framework's components and record the resulting modules;
 * returns a PMIx status code. */
PMIX_EXPORT int pmix_psensor_base_select(void);
/* define a struct to hold framework-global values */
typedef struct {
/* list of pmix_psensor_active_module_t entries */
pmix_list_t actives;
/* event base on which the sensor components schedule their events */
pmix_event_base_t *evbase;
} pmix_psensor_base_t;
/* One entry on the framework's list of active modules */
typedef struct {
/* list linkage; must stay first so the entry can be used as a list item */
pmix_list_item_t super;
/* component that provided the module */
pmix_psensor_base_component_t *component;
/* the module's function table */
pmix_psensor_base_module_t *module;
/* priority reported by the component when it was queried */
int priority;
} pmix_psensor_active_module_t;
PMIX_CLASS_DECLARATION(pmix_psensor_active_module_t);
/* Framework-global state (active module list and event base) */
PMIX_EXPORT extern pmix_psensor_base_t pmix_psensor_base;
/* Start a monitoring operation on behalf of requestor.
 *   requestor  - peer asking for the monitoring
 *   error      - status code supplied by the requestor for this operation
 *   monitor    - info whose key selects what is to be monitored
 *   directives - array of ndirs infos refining the request
 * Returns a PMIx status code. */
PMIX_EXPORT pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
/* Stop monitoring previously started by requestor; id names the
 * operation to stop. Returns a PMIx status code. */
PMIX_EXPORT pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
char *id);
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,103 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/mca/mca.h"
#include "src/mca/base/base.h"
#include "src/class/pmix_list.h"
#include "src/runtime/pmix_progress_threads.h"
#include "src/include/types.h"
#include "src/mca/psensor/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "src/mca/psensor/base/static-components.h"
/*
* Global variables
*/
/* Framework-level psensor API table, populated with the base start and
 * stop functions. The initializer is positional.
 * NOTE(review): this assumes the module struct declares start before
 * stop - confirm against psensor.h. */
pmix_psensor_base_module_t pmix_psensor = {
pmix_psensor_base_start,
pmix_psensor_base_stop
};
/* Framework-global state (active module list and event base), zeroed at
 * load time. A stray second ';' was removed: an empty declaration at
 * file scope is not valid ISO C and triggers pedantic warnings. */
pmix_psensor_base_t pmix_psensor_base = {{{0}}};
/* MCA parameter: when true, the framework runs its events on its own
 * progress thread instead of the library-wide event base. */
static bool use_separate_thread = false;

/**
 * Register the framework's MCA parameters.
 *
 * @param flags  registration flags handed in by the MCA base (not used)
 * @return PMIX_SUCCESS always; the outcome of the variable registration
 *         is deliberately discarded
 */
static int pmix_psensor_register(pmix_mca_base_register_flag_t flags)
{
    (void) pmix_mca_base_var_register("pmix", "psensor", "base",
                                      "use_separate_thread",
                                      "Use a separate thread for monitoring local procs",
                                      PMIX_MCA_BASE_VAR_TYPE_BOOL,
                                      NULL, 0, 0,
                                      PMIX_INFO_LVL_9,
                                      PMIX_MCA_BASE_VAR_SCOPE_READONLY,
                                      &use_separate_thread);

    return PMIX_SUCCESS;
}
/**
 * Framework close function.
 *
 * Destructs the list of active modules, stops the dedicated "PSENSOR"
 * progress thread when one was requested and an event base exists, and
 * finally closes the framework's components.
 *
 * @return the status of closing the framework components
 */
static int pmix_psensor_base_close(void)
{
    PMIX_LIST_DESTRUCT(&pmix_psensor_base.actives);

    /* only stop the thread if we were configured to use our own one */
    if (use_separate_thread) {
        if (NULL != pmix_psensor_base.evbase) {
            (void) pmix_progress_thread_stop("PSENSOR");
        }
    }

    /* Close all remaining available components */
    return pmix_mca_base_framework_components_close(&pmix_psensor_base_framework,
                                                    NULL);
}
/**
 * Framework open function.
 *
 * Constructs the list of active modules, chooses the event base the
 * sensors will use (a dedicated "PSENSOR" progress thread when
 * use_separate_thread is set, otherwise the library-wide base), and
 * then opens the framework's components.
 *
 * @param flags  open flags forwarded to the component open call
 * @return PMIX_ERROR if the progress thread could not be created,
 *         otherwise the status of opening the components
 */
static int pmix_psensor_base_open(pmix_mca_base_open_flag_t flags)
{
    /* construct the list of modules */
    PMIX_CONSTRUCT(&pmix_psensor_base.actives, pmix_list_t);

    if (!use_separate_thread) {
        /* share the library-wide event base */
        pmix_psensor_base.evbase = pmix_globals.evbase;
    } else {
        /* create an event base and progress thread of our own */
        pmix_psensor_base.evbase = pmix_progress_thread_init("PSENSOR");
        if (NULL == pmix_psensor_base.evbase) {
            return PMIX_ERROR;
        }
    }

    /* Open up all available components */
    return pmix_mca_base_framework_components_open(&pmix_psensor_base_framework,
                                                   flags);
}
/* Declare the psensor framework: project "pmix", framework "psensor",
 * its description, the register/open/close functions defined in this
 * file, and the static component table. */
PMIX_MCA_BASE_FRAMEWORK_DECLARE(pmix, psensor, "PMIx Monitoring Sensors",
pmix_psensor_register,
pmix_psensor_base_open, pmix_psensor_base_close,
mca_psensor_base_static_components, 0);
/* Class instance for active-module entries: derives from the list item
 * class and has no constructor or destructor. */
PMIX_CLASS_INSTANCE(pmix_psensor_active_module_t,
pmix_list_item_t,
NULL, NULL);

Просмотреть файл

@ -0,0 +1,94 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <string.h>
#include "src/mca/mca.h"
#include "src/mca/base/base.h"
#include "src/mca/psensor/base/base.h"
static bool selected = false;
/* Function for selecting a prioritized list of components
* from all those that are available. */
int pmix_psensor_base_select(void)
{
pmix_mca_base_component_list_item_t *cli = NULL;
pmix_psensor_base_component_t *component = NULL;
pmix_psensor_active_module_t *newactive, *active;
pmix_mca_base_module_t *mod;
int pri;
bool inserted;
if (selected) {
/* ensure we don't do this twice */
return PMIX_SUCCESS;
}
selected = true;
/* Query all available components and ask if they have a module */
PMIX_LIST_FOREACH(cli, &pmix_psensor_base_framework.framework_components, pmix_mca_base_component_list_item_t) {
component = (pmix_psensor_base_component_t *) cli->cli_component;
pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
"mca:psensor:select: checking available component %s",
component->base.pmix_mca_component_name);
/* get the module for this component */
if (PMIX_SUCCESS != component->base.pmix_mca_query_component(&mod, &pri)) {
continue;
}
/* add to our prioritized list of available actives */
newactive = PMIX_NEW(pmix_psensor_active_module_t);
newactive->priority = pri;
newactive->component = component;
newactive->module = (pmix_psensor_base_module_t*)mod;
/* maintain priority order */
inserted = false;
PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
if (newactive->priority > active->priority) {
pmix_list_insert_pos(&pmix_psensor_base.actives,
(pmix_list_item_t*)active, &newactive->super);
inserted = true;
break;
}
}
if (!inserted) {
/* must be lowest priority - add to end */
pmix_list_append(&pmix_psensor_base.actives, &newactive->super);
}
}
if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) {
pmix_output(0, "Final PSENSOR priorities");
/* show the prioritized list */
PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
pmix_output(0, "\tPSENSOR: %s Priority: %d",
active->component->base.pmix_mca_component_name, active->priority);
}
}
return PMIX_SUCCESS;;
}

Просмотреть файл

@ -0,0 +1,68 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/util/error.h"
#include "src/mca/psensor/base/base.h"
/**
 * Offer a monitoring request to every active psensor module.
 *
 * The modules are called in list (priority) order. A module that returns
 * PMIX_ERR_TAKE_NEXT_OPTION is treated as "not mine" and the next module
 * is tried; any other non-success status aborts the walk and is returned.
 *
 * @param requestor   peer asking for the monitoring
 * @param error       status code supplied by the requestor
 * @param monitor     info whose key selects what is to be monitored
 * @param directives  array of infos refining the request
 * @param ndirs       number of elements in directives
 * @return PMIX_SUCCESS, or the first hard error reported by a module
 */
pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
                                      const pmix_info_t *monitor,
                                      const pmix_info_t directives[], size_t ndirs)
{
    pmix_psensor_active_module_t *mod;
    pmix_status_t rc;

    /* this is PMIx code: use the PMIx output stream API, not OPAL's */
    pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                        "%s:%d sensor:base: starting sensors",
                        pmix_globals.myid.nspace, pmix_globals.myid.rank);

    /* call the start function of all modules in priority order */
    PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
        if (NULL != mod->module->start) {
            rc = mod->module->start(requestor, error, monitor, directives, ndirs);
            if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
                return rc;
            }
        }
    }

    return PMIX_SUCCESS;
}
/**
 * Ask every active psensor module to stop monitoring for a requestor.
 *
 * The modules are called in list (priority) order. A module that returns
 * PMIX_ERR_TAKE_NEXT_OPTION is skipped over; any other non-success status
 * aborts the walk and is returned.
 *
 * @param requestor  peer whose monitoring is to be stopped
 * @param id         identifier of the monitoring operation to stop
 * @return PMIX_SUCCESS, or the first hard error reported by a module
 */
pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
                                     char *id)
{
    pmix_psensor_active_module_t *mod;
    pmix_status_t rc;

    /* this is PMIx code: use the PMIx output stream API, not OPAL's */
    pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                        "%s:%d sensor:base: stopping sensors",
                        pmix_globals.myid.nspace, pmix_globals.myid.rank);

    /* call the stop function of all modules in priority order */
    PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
        if (NULL != mod->module->stop) {
            rc = mod->module->stop(requestor, id);
            if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
                return rc;
            }
        }
    }

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -1,37 +1,37 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
dist_ompidata_DATA = help-orte-sensor-file.txt
dist_pmixdata_DATA = help-pmix-psensor-file.txt
sources = \
sensor_file.c \
sensor_file.h \
sensor_file_component.c
psensor_file.c \
psensor_file.h \
psensor_file_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_file_DSO
if MCA_BUILD_pmix_psensor_file_DSO
component_noinst =
component_install = mca_sensor_file.la
component_install = mca_psensor_file.la
else
component_noinst = libmca_sensor_file.la
component_noinst = libmca_psensor_file.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponentdir = $(pmixlibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_file_la_SOURCES = $(sources)
mca_sensor_file_la_LDFLAGS = -module -avoid-version
mca_psensor_file_la_SOURCES = $(sources)
mca_psensor_file_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_file_la_SOURCES =$(sources)
libmca_sensor_file_la_LDFLAGS = -module -avoid-version
libmca_psensor_file_la_SOURCES =$(sources)
libmca_psensor_file_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -4,9 +4,9 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
# This is the US/English general help file for the file sensor

Просмотреть файл

@ -0,0 +1,352 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include <pmix_common.h>
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "src/class/pmix_list.h"
#include "src/include/pmix_globals.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/util/show_help.h"
#include "src/mca/psensor/base/base.h"
#include "psensor_file.h"
/* declare the API functions: start begins watching a file on behalf of a
 * requestor, stop schedules removal of that requestor's trackers */
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
static pmix_status_t stop(pmix_peer_t *requestor, char *id);
/* instantiate the module: the function table exported by the file component */
pmix_psensor_base_module_t pmix_psensor_file_module = {
.start = start,
.stop = stop
};
/* define a tracking object: one instance per monitored file */
typedef struct {
pmix_list_item_t super;
/* peer that asked for the monitoring; retained by start, released on destruct */
pmix_peer_t *requestor;
/* identifier compared against the id given to stop (freed on destruct) */
char *id;
/* true once the sampling timer ev has been set up and added */
bool event_active;
/* timer event that drives file_sample */
pmix_event_t ev;
/* one-shot event used to hand the tracker over to the event base */
pmix_event_t cdev;
/* sampling interval */
struct timeval tv;
/* initialized to 0; not otherwise used in this file */
int tick;
/* path of the file being watched (heap copy, freed on destruct) */
char *file;
/* which attribute of the file is watched */
bool file_size;
bool file_access;
bool file_mod;
/* most recently observed values of the watched attributes */
size_t last_size;
time_t last_access;
time_t last_mod;
/* number of consecutive unchanged samples at which an alert is raised */
uint32_t ndrops;
/* current count of consecutive unchanged samples */
uint32_t nmisses;
/* status code associated with this tracker; PMIX_SUCCESS by default */
pmix_status_t error;
/* range handed to PMIx_Notify_event when the alert is raised */
pmix_data_range_t range;
/* info array handed to PMIx_Notify_event; freed on destruct */
pmix_info_t *info;
size_t ninfo;
} file_tracker_t;
/**
 * Constructor for file_tracker_t: put every field the destructor or the
 * sampling code inspects into a known state.
 *
 * The file pointer is explicitly cleared as well, because the destructor
 * frees it when non-NULL and the object's memory is not guaranteed to be
 * zeroed on allocation.
 */
static void ft_constructor(file_tracker_t *ft)
{
    ft->requestor = NULL;
    ft->id = NULL;
    ft->event_active = false;
    ft->tv.tv_sec = 0;
    ft->tv.tv_usec = 0;
    ft->tick = 0;
    ft->file = NULL;
    ft->file_size = false;
    ft->file_access = false;
    ft->file_mod = false;
    ft->last_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->ndrops = 0;
    ft->nmisses = 0;
    ft->error = PMIX_SUCCESS;
    ft->range = PMIX_RANGE_NAMESPACE;
    ft->info = NULL;
    ft->ninfo = 0;
}
/**
 * Destructor for file_tracker_t: drop the reference on the requestor,
 * remove the sampling timer if it was armed, and free everything the
 * tracker owns.
 */
static void ft_destructor(file_tracker_t *ft)
{
    if (NULL != ft->requestor) {
        PMIX_RELEASE(ft->requestor);
    }
    if (NULL != ft->id) {
        free(ft->id);
    }
    /* the flag lives in the tracker; a bare "event_active" names nothing */
    if (ft->event_active) {
        pmix_event_del(&ft->ev);
    }
    if (NULL != ft->file) {
        free(ft->file);
    }
    if (NULL != ft->info) {
        PMIX_INFO_FREE(ft->info, ft->ninfo);
    }
}
/* Class instance for file trackers: a list item with the constructor and
 * destructor defined above. */
PMIX_CLASS_INSTANCE(file_tracker_t,
pmix_list_item_t,
ft_constructor, ft_destructor);
/* define a local caddy: carries a stop request into the event base */
typedef struct {
pmix_object_t super;
/* event used to run del_tracker from the event base */
pmix_event_t ev;
/* peer whose trackers are to be removed; retained by stop */
pmix_peer_t *requestor;
/* identifier of the tracker to remove; NULL matches every tracker of the requestor */
char *id;
} file_caddy_t;
/* Constructor for file_caddy_t: start with no requestor and no id so the
 * destructor can tell whether either was ever set. */
static void cd_con(file_caddy_t *p)
{
    p->id = NULL;
    p->requestor = NULL;
}
/* Destructor for file_caddy_t: release the requestor reference and free
 * the id copy, if present. */
static void cd_des(file_caddy_t *p)
{
    if (NULL != p->requestor) {
        PMIX_RELEASE(p->requestor);
    }

    if (NULL != p->id) {
        free(p->id);
    }
}
/* Class instance for the stop caddy: a plain object with the constructor
 * and destructor defined above. */
PMIX_CLASS_INSTANCE(file_caddy_t,
pmix_object_t,
cd_con, cd_des);
/* timer callback that samples a tracked file; defined further below */
static void file_sample(int sd, short args, void *cbdata);
/**
 * Event callback that puts a new tracker into service.
 *
 * Appends the tracker to the component's tracker list, then sets up and
 * arms its sampling timer on the framework event base.
 *
 * @param sd      unused
 * @param flags   unused
 * @param cbdata  the file_tracker_t to activate
 */
static void add_tracker(int sd, short flags, void *cbdata)
{
    file_tracker_t *trk = (file_tracker_t*)cbdata;

    /* add the tracker to our list */
    pmix_list_append(&mca_psensor_file_component.trackers, &trk->super);

    /* setup the timer event and arm it with the sampling interval */
    pmix_event_evtimer_set(pmix_psensor_base.evbase, &trk->ev,
                           file_sample, trk);
    pmix_event_evtimer_add(&trk->ev, &trk->tv);

    /* from here on the destructor must remove the timer */
    trk->event_active = true;
}
/**
 * Start monitoring a file on behalf of a requestor.
 *
 * The request is only handled when the monitor key is PMIX_MONITOR_FILE;
 * the monitor's string value names the file. The directives select which
 * attribute to watch (size, access time, modification time), how many
 * unchanged samples are tolerated, the sampling interval in seconds, and
 * the notification range. The tracker itself is put into service from
 * the framework event base.
 *
 * @param requestor   peer asking for the monitoring (retained here)
 * @param error       status code supplied by the requestor; stored on the tracker
 * @param monitor     info identifying the monitoring request
 * @param directives  array of infos refining the request
 * @param ndirs       number of elements in directives
 * @return PMIX_ERR_TAKE_NEXT_OPTION if this is not a file request,
 *         PMIX_ERR_BAD_PARAM if no file name, sampling interval or
 *         watched attribute was given, PMIX_ERR_NOMEM on allocation
 *         failure, PMIX_SUCCESS otherwise
 */
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
                           const pmix_info_t *monitor,
                           const pmix_info_t directives[], size_t ndirs)
{
    file_tracker_t *ft;
    size_t n;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] checking file monitoring for requestor %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         requestor->info->nptr->nspace, requestor->info->rank));

    /* if they didn't ask to monitor a file, then nothing for us to do */
    if (0 != strcmp(monitor->key, PMIX_MONITOR_FILE)) {
        return PMIX_ERR_TAKE_NEXT_OPTION;
    }

    /* without a file name there is nothing to watch (and nothing to copy) */
    if (NULL == monitor->value.data.string) {
        return PMIX_ERR_BAD_PARAM;
    }

    /* setup to track this monitoring operation */
    ft = PMIX_NEW(file_tracker_t);
    if (NULL == ft) {
        return PMIX_ERR_NOMEM;
    }
    PMIX_RETAIN(requestor);
    ft->requestor = requestor;
    ft->error = error;
    ft->file = strdup(monitor->value.data.string);
    if (NULL == ft->file) {
        PMIX_RELEASE(ft);
        return PMIX_ERR_NOMEM;
    }

    /* check the directives to see what they want monitored */
    for (n=0; n < ndirs; n++) {
        if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_SIZE)) {
            ft->file_size = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_ACCESS)) {
            ft->file_access = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_MODIFY)) {
            ft->file_mod = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_DROPS)) {
            ft->ndrops = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_CHECK_TIME)) {
            ft->tv.tv_sec = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_RANGE)) {
            ft->range = directives[n].value.data.range;
        }
    }

    if (0 == ft->tv.tv_sec ||
        (!ft->file_size && !ft->file_access && !ft->file_mod)) {
        /* didn't specify a sample rate, or what should be sampled */
        PMIX_RELEASE(ft);
        return PMIX_ERR_BAD_PARAM;
    }

    /* need to push into our event base to add this to our trackers */
    pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, add_tracker, ft);
    pmix_event_active(&ft->cdev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/**
 * Event callback that removes trackers in response to a stop request.
 *
 * Every tracker owned by the caddy's requestor is removed from the list
 * and released when either the caddy carries no id, or the tracker has an
 * id equal to the caddy's. The caddy itself is released afterwards.
 *
 * @param sd      unused
 * @param flags   unused
 * @param cbdata  the file_caddy_t describing the stop request
 */
static void del_tracker(int sd, short flags, void *cbdata)
{
    file_caddy_t *req = (file_caddy_t*)cbdata;
    file_tracker_t *trk, *following;
    bool wanted;

    /* remove the matching trackers from our list */
    PMIX_LIST_FOREACH_SAFE(trk, following, &mca_psensor_file_component.trackers, file_tracker_t) {
        /* only trackers belonging to this requestor are candidates */
        if (trk->requestor != req->requestor) {
            continue;
        }
        /* no id means "all of them"; otherwise the ids must agree */
        wanted = (NULL == req->id) ||
                 (NULL != trk->id && 0 == strcmp(trk->id, req->id));
        if (!wanted) {
            continue;
        }
        pmix_list_remove_item(&mca_psensor_file_component.trackers, &trk->super);
        PMIX_RELEASE(trk);
    }

    PMIX_RELEASE(req);
}
/**
 * Stop file monitoring for a requestor.
 *
 * The request is packaged into a caddy and handed to the framework event
 * base, where del_tracker removes the matching trackers.
 *
 * @param requestor  peer whose trackers are to be removed (retained here)
 * @param id         identifier of the tracker to remove, or NULL to remove
 *                   every tracker belonging to the requestor
 * @return PMIX_SUCCESS
 */
static pmix_status_t stop(pmix_peer_t *requestor, char *id)
{
    file_caddy_t *cd;

    cd = PMIX_NEW(file_caddy_t);
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;
    /* del_tracker treats a NULL id as "match everything", so a NULL id is
     * legal here - but strdup(NULL) is undefined, so only copy a real one */
    if (NULL != id) {
        cd->id = strdup(id);
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/* Completion callback handed to PMIx_Notify_event: the tracker passed as
 * cbdata is no longer needed once the notification is done, so drop the
 * reference to it. The status is not examined. */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
    file_tracker_t *trk = (file_tracker_t*)cbdata;

    PMIX_RELEASE(trk);
}
/* Render *when as text into out, always NUL-terminated. ctime() returns a
 * pointer to one shared static buffer, so a timestamp has to be copied out
 * before ctime() is called again. */
static void ft_format_time(const time_t *when, char *out, size_t outlen)
{
    const char *text = ctime(when);

    snprintf(out, outlen, "%s", (NULL != text) ? text : "(unknown)\n");
}

/**
 * Timer callback: sample the tracked file once.
 *
 * The file is stat'ed; if that fails the timer is simply re-armed in case
 * the file shows up later. Otherwise the watched attribute is compared
 * with the value seen on the previous sample: an unchanged value counts
 * as a miss, a changed value resets the miss count. Only the first enabled
 * criterion (size, then access time, then modification time) is evaluated.
 * When the miss count equals the tracker's drop limit the tracker is taken
 * off the list and a PMIX_MONITOR_FILE_ALERT event is generated; otherwise
 * the timer is re-armed.
 *
 * @param sd      unused
 * @param args    unused
 * @param cbdata  the file_tracker_t being sampled
 */
static void file_sample(int sd, short args, void *cbdata)
{
    file_tracker_t *ft = (file_tracker_t*)cbdata;
    struct stat buf;
    pmix_status_t rc;
    pmix_proc_t source;
    char atime_str[32];
    char mtime_str[32];

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sampling file %s",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->file));

    /* stat the file and get its info */
    if (0 > stat(ft->file, &buf)) {
        /* cannot stat file */
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] could not stat %s",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank,
                             ft->file));
        /* re-add the timer, in case this file shows up */
        pmix_event_evtimer_add(&ft->ev, &ft->tv);
        return;
    }

    /* format each timestamp into its own buffer so both are shown correctly */
    ft_format_time(&buf.st_atime, atime_str, sizeof(atime_str));
    ft_format_time(&buf.st_mtime, mtime_str, sizeof(mtime_str));
    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] size %lu access %s\tmod %s",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         (unsigned long)buf.st_size, atime_str, mtime_str));

    if (ft->file_size) {
        if (buf.st_size == ft->last_size) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_size = buf.st_size;
        }
    } else if (ft->file_access) {
        if (buf.st_atime == ft->last_access) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_access = buf.st_atime;
        }
    } else if (ft->file_mod) {
        if (buf.st_mtime == ft->last_mod) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_mod = buf.st_mtime;
        }
    }

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sampled file %s misses %d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->file, ft->nmisses));

    if (ft->nmisses == ft->ndrops) {
        if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) {
            ft_format_time(&ft->last_access, atime_str, sizeof(atime_str));
            ft_format_time(&ft->last_mod, mtime_str, sizeof(mtime_str));
            pmix_show_help("help-pmix-psensor-file.txt", "file-stalled", true,
                           ft->file, ft->last_size, atime_str, mtime_str);
        }
        /* stop monitoring this client */
        pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super);
        /* generate an event; zero the proc first so the namespace stays
         * NUL-terminated even if strncpy fills every byte it is allowed to */
        memset(&source, 0, sizeof(source));
        (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
        source.rank = ft->requestor->info->rank;
        rc = PMIx_Notify_event(PMIX_MONITOR_FILE_ALERT, &source,
                               ft->range, ft->info, ft->ninfo, opcbfunc, ft);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            /* the completion callback will not run for a failed request,
             * so drop the tracker here instead of leaking it */
            PMIX_RELEASE(ft);
        }
        return;
    }

    /* re-add the timer */
    pmix_event_evtimer_add(&ft->ev, &ft->tv);
}

Просмотреть файл

@ -0,0 +1,38 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef PMIX_PSENSOR_FILE_H
#define PMIX_PSENSOR_FILE_H
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/* Component structure for the file sensor */
typedef struct {
/* base psensor component; must stay first */
pmix_psensor_base_component_t super;
/* list of trackers, one per file currently being monitored */
pmix_list_t trackers;
} pmix_psensor_file_component_t;
/* the single instance of the file sensor component */
extern pmix_psensor_file_component_t mca_psensor_file_component;
/* the file sensor's module (start/stop function table) */
extern pmix_psensor_base_module_t pmix_psensor_file_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,69 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/class/pmix_list.h"
#include "src/mca/psensor/base/base.h"
#include "src/mca/psensor/file/psensor_file.h"
/*
* Local functions
*/
/* component lifecycle functions, defined below */
static int psensor_file_open(void);
static int psensor_file_close(void);
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority);
/* The file sensor component. Apart from the designated component name the
 * base initializer is positional, so the order of the entries matters. */
pmix_psensor_file_component_t mca_psensor_file_component = {
.super = {
.base = {
PMIX_PSENSOR_BASE_VERSION_1_0_0,
/* Component name and version */
.pmix_mca_component_name = "file",
PMIX_MCA_BASE_MAKE_VERSION(component,
PMIX_MAJOR_VERSION,
PMIX_MINOR_VERSION,
PMIX_RELEASE_VERSION),
/* Component open and close functions */
psensor_file_open, /* component open */
psensor_file_close, /* component close */
psensor_file_query /* component query */
},
}
};
/**
 * Component open: construct the (initially empty) list of file trackers.
 *
 * @return PMIX_SUCCESS
 */
static int psensor_file_open(void)
{
    PMIX_CONSTRUCT(&mca_psensor_file_component.trackers, pmix_list_t);

    return PMIX_SUCCESS;
}
/**
 * Component query: hand out the file sensor module.
 *
 * @param module    receives a pointer to pmix_psensor_file_module
 * @param priority  receives the component's priority (20)
 * @return PMIX_SUCCESS
 */
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority)
{
    *module = (pmix_mca_base_module_t *)&pmix_psensor_file_module;
    *priority = 20;  /* irrelevant */

    return PMIX_SUCCESS;
}
/**
 * Component close: destruct the list of file trackers.
 *
 * @return PMIX_SUCCESS
 */
static int psensor_file_close(void)
{
    PMIX_LIST_DESTRUCT(&mca_psensor_file_component.trackers);

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text distributed and installed with this component
dist_pmixdata_DATA = help-pmix-psensor-heartbeat.txt
# Source files that make up the heartbeat component
sources = \
psensor_heartbeat.c \
psensor_heartbeat.h \
psensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_pmix_psensor_heartbeat_DSO
component_noinst =
component_install = mca_psensor_heartbeat.la
else
component_noinst = libmca_psensor_heartbeat.la
component_install =
endif
# DSO build: the component is installed as a loadable module
mcacomponentdir = $(pmixlibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_psensor_heartbeat_la_SOURCES = $(sources)
mca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version
# Static build: the component is built as a non-installed convenience library
noinst_LTLIBRARIES = $(component_noinst)
libmca_psensor_heartbeat_la_SOURCES =$(sources)
libmca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -4,9 +4,9 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
@ -18,4 +18,3 @@ Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

Просмотреть файл

@ -0,0 +1,330 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/util/argv.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/util/show_help.h"
#include "src/include/pmix_globals.h"
#include "src/mca/ptl/ptl.h"
#include "src/mca/psensor/base/base.h"
#include "psensor_heartbeat.h"
/* declare the API functions: heartbeat_start begins heartbeat monitoring
 * for a requestor, heartbeat_stop schedules removal of its trackers */
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id);
/* instantiate the module: the function table exported by the heartbeat component */
pmix_psensor_base_module_t pmix_psensor_heartbeat_module = {
.start = heartbeat_start,
.stop = heartbeat_stop
};
/* tracker object: one instance per peer whose heartbeat is monitored */
typedef struct {
pmix_list_item_t super;
/* peer being monitored; retained by heartbeat_start, released on destruct */
pmix_peer_t *requestor;
/* identifier compared against the id given to heartbeat_stop (freed on destruct) */
char *id;
/* true once the check timer ev has been set up and added */
bool event_active;
/* timer event that drives check_heartbeat */
pmix_event_t ev;
/* one-shot event used to hand the tracker over to the event base */
pmix_event_t cdev;
/* length of one heartbeat window */
struct timeval tv;
/* beats received from the peer in the current window */
uint32_t nbeats;
/* value of the PMIX_MONITOR_HEARTBEAT_DROPS directive; stored but not
 * consulted by the check in this file */
uint32_t ndrops;
/* initialized to 0; not otherwise used in this file */
uint32_t nmissed;
/* status code supplied by the requestor when monitoring was started */
pmix_status_t error;
/* range handed to PMIx_Notify_event when the alert is raised */
pmix_data_range_t range;
/* info array handed to PMIx_Notify_event; freed on destruct */
pmix_info_t *info;
size_t ninfo;
} pmix_heartbeat_trkr_t;
/* Constructor for pmix_heartbeat_trkr_t: no requestor, no id, timer not
 * armed, zero interval and counters, default range, empty info array. */
static void ft_constructor(pmix_heartbeat_trkr_t *ft)
{
    /* identity */
    ft->requestor = NULL;
    ft->id = NULL;

    /* timer state */
    ft->event_active = false;
    ft->tv.tv_sec = 0;
    ft->tv.tv_usec = 0;

    /* counters */
    ft->nbeats = 0;
    ft->ndrops = 0;
    ft->nmissed = 0;

    /* notification parameters */
    ft->error = PMIX_SUCCESS;
    ft->range = PMIX_RANGE_NAMESPACE;
    ft->info = NULL;
    ft->ninfo = 0;
}
/**
 * Destructor for pmix_heartbeat_trkr_t: drop the reference on the
 * requestor, remove the check timer if it was armed, and free everything
 * the tracker owns.
 */
static void ft_destructor(pmix_heartbeat_trkr_t *ft)
{
    if (NULL != ft->requestor) {
        PMIX_RELEASE(ft->requestor);
    }
    if (NULL != ft->id) {
        free(ft->id);
    }
    /* the flag lives in the tracker; a bare "event_active" names nothing */
    if (ft->event_active) {
        pmix_event_del(&ft->ev);
    }
    if (NULL != ft->info) {
        PMIX_INFO_FREE(ft->info, ft->ninfo);
    }
}
/* Class instance for heartbeat trackers: a list item with the
 * constructor and destructor defined above. */
PMIX_CLASS_INSTANCE(pmix_heartbeat_trkr_t,
pmix_list_item_t,
ft_constructor, ft_destructor);
/* define a local caddy: carries a stop request into the event base */
typedef struct {
pmix_object_t super;
/* event used to run del_tracker from the event base */
pmix_event_t ev;
/* peer whose trackers are to be removed; retained by heartbeat_stop */
pmix_peer_t *requestor;
/* identifier of the tracker to remove; NULL matches every tracker of the requestor */
char *id;
} heartbeat_caddy_t;
/* Constructor for heartbeat_caddy_t: start with no requestor and no id so
 * the destructor can tell whether either was ever set. */
static void cd_con(heartbeat_caddy_t *p)
{
    p->id = NULL;
    p->requestor = NULL;
}
/* Destructor for heartbeat_caddy_t: release the requestor reference and
 * free the id copy, if present. */
static void cd_des(heartbeat_caddy_t *p)
{
    if (NULL != p->requestor) {
        PMIX_RELEASE(p->requestor);
    }

    if (NULL != p->id) {
        free(p->id);
    }
}
/* Class instance for the stop caddy: a plain object with the constructor
 * and destructor defined above. */
PMIX_CLASS_INSTANCE(heartbeat_caddy_t,
pmix_object_t,
cd_con, cd_des);
/* Carries one received heartbeat into the event base */
typedef struct {
pmix_object_t super;
/* event used to run add_beat from the event base */
pmix_event_t ev;
/* peer that sent the heartbeat; retained on receipt, released on destruct */
pmix_peer_t *peer;
} pmix_psensor_beat_t;
/* Constructor for pmix_psensor_beat_t: no peer attached yet. */
static void bcon(pmix_psensor_beat_t *p)
{
    p->peer = NULL;
}
/* Destructor for pmix_psensor_beat_t: give back the reference on the
 * peer, if one was attached. */
static void bdes(pmix_psensor_beat_t *p)
{
    if (NULL == p->peer) {
        return;
    }
    PMIX_RELEASE(p->peer);
}
/* Class instance for heartbeat carriers: a plain object with the
 * constructor and destructor defined above. */
PMIX_CLASS_INSTANCE(pmix_psensor_beat_t,
pmix_object_t,
bcon, bdes);
/* timer callback that checks a tracker's beat count; defined further below */
static void check_heartbeat(int fd, short dummy, void *arg);
/**
 * Event callback that puts a new heartbeat tracker into service.
 *
 * Appends the tracker to the component's tracker list, then sets up and
 * arms its check timer on the framework event base.
 *
 * @param sd      unused
 * @param flags   unused
 * @param cbdata  the pmix_heartbeat_trkr_t to activate
 */
static void add_tracker(int sd, short flags, void *cbdata)
{
    pmix_heartbeat_trkr_t *trk = (pmix_heartbeat_trkr_t*)cbdata;

    /* add the tracker to our list */
    pmix_list_append(&mca_psensor_heartbeat_component.trackers, &trk->super);

    /* setup the timer event and arm it with the heartbeat window */
    pmix_event_evtimer_set(pmix_psensor_base.evbase, &trk->ev,
                           check_heartbeat, trk);
    pmix_event_evtimer_add(&trk->ev, &trk->tv);

    /* from here on the destructor must remove the timer */
    trk->event_active = true;
}
/**
 * Start heartbeat monitoring on behalf of a requestor.
 *
 * The request is only handled when the monitor key is
 * PMIX_MONITOR_HEARTBEAT. The directives supply the heartbeat window in
 * seconds, the number of tolerated drops and the notification range. The
 * tracker itself is put into service from the framework event base.
 *
 * @param requestor   peer to be monitored (retained here)
 * @param error       status code supplied by the requestor; stored on the tracker
 * @param monitor     info identifying the monitoring request
 * @param directives  array of infos refining the request
 * @param ndirs       number of elements in directives
 * @return PMIX_ERR_TAKE_NEXT_OPTION if this is not a heartbeat request,
 *         PMIX_ERR_BAD_PARAM if no heartbeat window was given,
 *         PMIX_SUCCESS otherwise
 */
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
                                     const pmix_info_t *monitor,
                                     const pmix_info_t directives[], size_t ndirs)
{
    pmix_heartbeat_trkr_t *trk;
    const pmix_info_t *dir;
    size_t idx;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] checking heartbeat monitoring for requestor %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         requestor->info->nptr->nspace, requestor->info->rank));

    /* if they didn't ask for heartbeats, then nothing for us to do */
    if (0 != strcmp(monitor->key, PMIX_MONITOR_HEARTBEAT)) {
        return PMIX_ERR_TAKE_NEXT_OPTION;
    }

    /* setup to track this monitoring operation */
    trk = PMIX_NEW(pmix_heartbeat_trkr_t);
    PMIX_RETAIN(requestor);
    trk->requestor = requestor;
    trk->error = error;

    /* pick the settings we understand out of the directives */
    for (idx = 0; idx < ndirs; ++idx) {
        dir = &directives[idx];
        if (0 == strcmp(dir->key, PMIX_MONITOR_HEARTBEAT_TIME)) {
            trk->tv.tv_sec = dir->value.data.uint32;
        } else if (0 == strcmp(dir->key, PMIX_MONITOR_HEARTBEAT_DROPS)) {
            trk->ndrops = dir->value.data.uint32;
        } else if (0 == strcmp(dir->key, PMIX_RANGE)) {
            trk->range = dir->value.data.range;
        }
    }

    /* a heartbeat window is mandatory */
    if (0 == trk->tv.tv_sec) {
        PMIX_RELEASE(trk);
        return PMIX_ERR_BAD_PARAM;
    }

    /* need to push into our event base to add this to our trackers */
    pmix_event_assign(&trk->cdev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, add_tracker, trk);
    pmix_event_active(&trk->cdev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/**
 * Event callback that removes trackers in response to a stop request.
 *
 * Every tracker owned by the caddy's requestor is removed from the list
 * and released when either the caddy carries no id, or the tracker has an
 * id equal to the caddy's. The caddy itself is released afterwards.
 *
 * @param sd      unused
 * @param flags   unused
 * @param cbdata  the heartbeat_caddy_t describing the stop request
 */
static void del_tracker(int sd, short flags, void *cbdata)
{
    heartbeat_caddy_t *req = (heartbeat_caddy_t*)cbdata;
    pmix_heartbeat_trkr_t *trk, *following;
    bool wanted;

    /* remove the matching trackers from our list */
    PMIX_LIST_FOREACH_SAFE(trk, following, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
        /* only trackers belonging to this requestor are candidates */
        if (trk->requestor != req->requestor) {
            continue;
        }
        /* no id means "all of them"; otherwise the ids must agree */
        wanted = (NULL == req->id) ||
                 (NULL != trk->id && 0 == strcmp(trk->id, req->id));
        if (!wanted) {
            continue;
        }
        pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &trk->super);
        PMIX_RELEASE(trk);
    }

    PMIX_RELEASE(req);
}
/**
 * Stop heartbeat monitoring for a requestor.
 *
 * The request is packaged into a caddy and handed to the framework event
 * base, where del_tracker removes the matching trackers.
 *
 * @param requestor  peer whose trackers are to be removed (retained here)
 * @param id         identifier of the tracker to remove, or NULL to remove
 *                   every tracker belonging to the requestor
 * @return PMIX_SUCCESS
 */
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id)
{
    heartbeat_caddy_t *cd;

    cd = PMIX_NEW(heartbeat_caddy_t);
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;
    /* del_tracker treats a NULL id as "match everything", so a NULL id is
     * legal here - but strdup(NULL) is undefined, so only copy a real one */
    if (NULL != id) {
        cd->id = strdup(id);
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/* Completion callback handed to PMIx_Notify_event: the tracker passed as
 * cbdata is no longer needed once the notification is done, so drop the
 * reference to it. The status is not examined. */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
    pmix_heartbeat_trkr_t *trk = (pmix_heartbeat_trkr_t*)cbdata;

    PMIX_RELEASE(trk);
}
/**
 * Timer callback, invoked once per heartbeat window for each tracker.
 *
 * If no beat was counted during the window, the tracker is taken off the
 * list and a PMIX_MONITOR_HEARTBEAT_ALERT event is generated for the
 * monitored peer. Otherwise the beat count is reset and the timer is
 * re-armed for the next window.
 *
 * @param fd      unused
 * @param dummy   unused
 * @param cbdata  the pmix_heartbeat_trkr_t being checked
 */
static void check_heartbeat(int fd, short dummy, void *cbdata)
{
    pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
    pmix_status_t rc;
    pmix_proc_t source;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sensor:check_heartbeat for proc %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->requestor->info->nptr->nspace, ft->requestor->info->rank));

    if (0 == ft->nbeats) {
        /* no heartbeat recvd in last window */
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] sensor:check_heartbeat failed for proc %s:%d",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank,
                             ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
        /* stop monitoring this client */
        pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super);
        /* generate an event; zero the proc first so the namespace stays
         * NUL-terminated even if strncpy fills every byte it is allowed to */
        memset(&source, 0, sizeof(source));
        (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
        source.rank = ft->requestor->info->rank;
        rc = PMIx_Notify_event(PMIX_MONITOR_HEARTBEAT_ALERT, &source,
                               ft->range, ft->info, ft->ninfo, opcbfunc, ft);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            /* the completion callback will not run for a failed request,
             * so drop the tracker here instead of leaking it */
            PMIX_RELEASE(ft);
        }
        return;
    } else {
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] sensor:check_heartbeat detected %d beats for proc %s:%d",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank, ft->nbeats,
                             ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
    }

    /* reset for next period */
    ft->nbeats = 0;

    /* reset the timer */
    pmix_event_evtimer_add(&ft->ev, &ft->tv);
}
/**
 * Event callback that records one heartbeat.
 *
 * Bumps the beat count of the first tracker whose requestor is the peer
 * that sent the beat, then releases the carrier object. A beat from a
 * peer that has no tracker is silently dropped.
 *
 * @param sd      unused
 * @param args    unused
 * @param cbdata  the pmix_psensor_beat_t carrying the sending peer
 */
static void add_beat(int sd, short args, void *cbdata)
{
    pmix_psensor_beat_t *beat = (pmix_psensor_beat_t*)cbdata;
    pmix_heartbeat_trkr_t *trk;

    /* find this peer in our trackers */
    PMIX_LIST_FOREACH(trk, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
        if (trk->requestor != beat->peer) {
            continue;
        }
        /* increment the beat count and stop looking */
        trk->nbeats++;
        break;
    }

    PMIX_RELEASE(beat);
}
/**
 * Receive handler for heartbeat messages.
 *
 * Wraps the sending peer (retained) in a carrier object and activates an
 * event on the framework event base so that the beat is counted by
 * add_beat. The header, buffer and cbdata arguments are not examined.
 *
 * @param peer    peer that sent the heartbeat
 * @param hdr     message header (unused)
 * @param buf     message payload (unused)
 * @param cbdata  callback data (unused)
 */
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
                                       pmix_ptl_hdr_t *hdr,
                                       pmix_buffer_t *buf, void *cbdata)
{
    pmix_psensor_beat_t *beat = PMIX_NEW(pmix_psensor_beat_t);

    /* the carrier holds its own reference on the peer */
    PMIX_RETAIN(peer);
    beat->peer = peer;

    /* shift this to our thread for processing */
    pmix_event_assign(&beat->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, add_beat, beat);
    pmix_event_active(&beat->ev, EV_WRITE, 1);
}

Просмотреть файл

@ -0,0 +1,43 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef PMIX_PSENSOR_HEARTBEAT_H
#define PMIX_PSENSOR_HEARTBEAT_H
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include "src/class/pmix_list.h"
#include "src/include/pmix_globals.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/* Heartbeat sensor component: the standard psensor component header
 * plus the list of active heartbeat trackers. */
typedef struct {
pmix_psensor_base_component_t super;
/* list of pmix_heartbeat_trkr_t; constructed in the component open
 * function and destructed in the component close function */
pmix_list_t trackers;
} pmix_psensor_heartbeat_component_t;
/* the single instance of the heartbeat component */
PMIX_EXPORT extern pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component;
/* the module handed back by the component query function */
extern pmix_psensor_base_module_t pmix_psensor_heartbeat_module;
/* receive callback registered for PMIX_PTL_TAG_HEARTBEAT messages;
 * peer is the sender of the heartbeat */
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
pmix_ptl_hdr_t *hdr,
pmix_buffer_t *buf, void *cbdata);
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,81 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/mca/ptl/ptl.h"
#include "src/mca/psensor/base/base.h"
#include "src/mca/psensor/heartbeat/psensor_heartbeat.h"
/*
 * Local functions - the open/close/query entry points placed in the
 * component structure below
 */
static int heartbeat_open(void);
static int heartbeat_close(void);
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority);
/* Component instance for the "heartbeat" psensor. Only the base MCA
 * fields are initialized statically; the trackers list is constructed
 * at runtime by heartbeat_open. */
pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = {
.super = {
.base = {
PMIX_PSENSOR_BASE_VERSION_1_0_0,
/* Component name and version */
.pmix_mca_component_name = "heartbeat",
PMIX_MCA_BASE_MAKE_VERSION(component,
PMIX_MAJOR_VERSION,
PMIX_MINOR_VERSION,
PMIX_RELEASE_VERSION),
/* Component open and close functions */
heartbeat_open, /* component open */
heartbeat_close, /* component close */
heartbeat_query /* component query */
}
}
};
/**
 * Component open: construct the tracker list and register the
 * persistent receive for heartbeat messages.
 *
 * @return PMIX_SUCCESS, or the status returned by pmix_ptl.recv if the
 *         receive could not be registered.
 */
static int heartbeat_open(void)
{
    int rc;

    PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t);

    /* setup to receive heartbeats - without this recv no beat can ever
     * be counted, so report a failure instead of claiming success.
     * NOTE(review): the tracker list is left constructed on failure;
     * confirm whether the MCA framework invokes close after a failed open */
    rc = pmix_ptl.recv(pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT);
    if (PMIX_SUCCESS != rc) {
        return rc;
    }

    return PMIX_SUCCESS;
}
/* Component query: hand back the heartbeat module together with a
 * fixed priority. Always succeeds. */
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority)
{
    *module = (pmix_mca_base_module_t *)&pmix_psensor_heartbeat_module;
    *priority = 5;  // irrelevant

    return PMIX_SUCCESS;
}
/**
 * Component close: cancel the heartbeat receive registered in
 * heartbeat_open and tear down the tracker list.
 *
 * Always returns PMIX_SUCCESS; the status returned by pmix_ptl.cancel
 * is not examined.
 */
static int heartbeat_close(void)
{
/* cancel our persistent recv */
pmix_ptl.cancel(pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT);
/* destruct the list of trackers built up since open */
PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers);
return PMIX_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,86 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef PMIX_PSENSOR_H_
#define PMIX_PSENSOR_H_
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/mca.h"
#include "src/include/pmix_globals.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* start a sensor operation:
 *
 * requestor - the process requesting this operation
 *
 * error - a status code supplied with the request
 *         NOTE(review): presumably the code to be reported when the
 *         monitor triggers - confirm against the callers
 *
 * monitor - a PMIx attribute specifying what is to be monitored
 *
 * directives - an array of pmix_info_t specifying relevant limits on values, and action
 *              to be taken when limits exceeded. Can include
 *              user-provided "id" string
 *
 * ndirs - number of elements in the directives array */
typedef pmix_status_t (*pmix_psensor_base_module_start_fn_t)(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
/* stop a sensor operation:
 *
 * requestor - the process requesting this operation
 *
 * id - the "id" string provided by the user at the time the
 *      affected monitoring operation was started. A NULL indicates
 *      that all operations started by this requestor are to
 *      be terminated
 *
 * returns a pmix_status_t describing the outcome */
typedef pmix_status_t (*pmix_psensor_base_module_stop_fn_t)(pmix_peer_t *requestor,
char *id);
/* API module */
/*
 * Ver 1.0 - the function table each psensor module provides
 */
typedef struct pmix_psensor_base_module_1_0_0_t {
pmix_psensor_base_module_start_fn_t start;  /* begin a monitoring operation */
pmix_psensor_base_module_stop_fn_t stop;    /* end one, or all, operations of a requestor */
} pmix_psensor_base_module_t;
/*
 * the standard component data structure: the MCA base component
 * followed by the MCA component data
 */
typedef struct pmix_psensor_base_component_1_0_0_t {
pmix_mca_base_component_t base;
pmix_mca_base_component_data_t data;
} pmix_psensor_base_component_t;
/*
 * Macro for use in components that are of type psensor v1.0.0; it
 * expands to the MCA base version fields for the "psensor" framework
 * and is placed first in the component's base initializer
 */
#define PMIX_PSENSOR_BASE_VERSION_1_0_0 \
PMIX_MCA_BASE_VERSION_1_0_0("psensor", 1, 0, 0)
/* Global structure for accessing sensor functions (start/stop);
 * defined outside this header
 */
PMIX_EXPORT extern pmix_psensor_base_module_t pmix_psensor; /* holds API function pointers */
END_C_DECLS
#endif /* PMIX_PSENSOR_H_ */

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -73,6 +73,7 @@ struct pmix_ptl_globals_t {
pmix_list_t actives;
bool initialized;
pmix_list_t posted_recvs; // list of pmix_ptl_posted_recv_t
pmix_list_t unexpected_msgs;
int stop_thread[2];
bool listen_thread_active;
pmix_list_t listeners;
@ -93,6 +94,11 @@ PMIX_EXPORT pmix_status_t pmix_ptl_stub_send_oneway(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
PMIX_EXPORT pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
pmix_info_t info[], size_t ninfo);
/* register a persistent recv: cbfunc is invoked for messages
 * carrying the given tag */
PMIX_EXPORT pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
pmix_ptl_cbfunc_t cbfunc,
pmix_ptl_tag_t tag);
/* cancel a previously registered recv for the given tag */
PMIX_EXPORT pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
PMIX_EXPORT pmix_status_t pmix_ptl_base_start_listening(pmix_info_t *info, size_t ninfo);
PMIX_EXPORT void pmix_ptl_base_stop_listening(void);

Просмотреть файл

@ -61,6 +61,8 @@ pmix_ptl_API_t pmix_ptl = {
.send_recv = pmix_ptl_stub_send_recv,
.send_oneway = pmix_ptl_stub_send_oneway,
.connect_to_peer = pmix_ptl_stub_connect_to_peer,
.recv = pmix_ptl_stub_register_recv,
.cancel = pmix_ptl_stub_cancel_recv,
.start_listening = pmix_ptl_base_start_listening,
.stop_listening = pmix_ptl_base_stop_listening
};
@ -88,6 +90,7 @@ static pmix_status_t pmix_ptl_close(void)
/* the components will cleanup when closed */
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.actives);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.posted_recvs);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.unexpected_msgs);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.listeners);
return pmix_mca_base_framework_components_close(&pmix_ptl_base_framework, NULL);
@ -99,6 +102,7 @@ static pmix_status_t pmix_ptl_open(pmix_mca_base_open_flag_t flags)
pmix_ptl_globals.initialized = true;
PMIX_CONSTRUCT(&pmix_ptl_globals.actives, pmix_list_t);
PMIX_CONSTRUCT(&pmix_ptl_globals.posted_recvs, pmix_list_t);
PMIX_CONSTRUCT(&pmix_ptl_globals.unexpected_msgs, pmix_list_t);
pmix_ptl_globals.listen_thread_active = false;
PMIX_CONSTRUCT(&pmix_ptl_globals.listeners, pmix_list_t);
pmix_client_globals.myserver.sd = -1;

Просмотреть файл

@ -46,7 +46,7 @@
#include "src/mca/ptl/base/base.h"
static uint32_t current_tag = 1; // 0 is reserved for system purposes
static uint32_t current_tag = PMIX_PTL_TAG_DYNAMIC;
static void _notify_complete(pmix_status_t status, void *cbdata)
{
@ -162,7 +162,7 @@ static pmix_status_t send_msg(int sd, pmix_ptl_send_t *msg)
} else {
iov_count = 1;
}
retry:
retry:
rc = writev(sd, iov, iov_count);
if (PMIX_LIKELY(rc == remain)) {
/* we successfully sent the header and the msg data if any */
@ -521,16 +521,16 @@ void pmix_ptl_base_send_recv(int fd, short args, void *cbdata)
return;
}
/* set the tag */
tag = current_tag++;
/* take the next tag in the sequence */
current_tag++;
if (UINT32_MAX == current_tag ) {
current_tag = PMIX_PTL_TAG_DYNAMIC;
}
tag = current_tag;
if (NULL != ms->cbfunc) {
/* if a callback msg is expected, setup a recv for it */
req = PMIX_NEW(pmix_ptl_posted_recv_t);
/* take the next tag in the sequence */
if (UINT32_MAX == current_tag ) {
current_tag = 1;
}
req->tag = tag;
req->cbfunc = ms->cbfunc;
req->cbdata = ms->cbdata;
@ -597,23 +597,29 @@ void pmix_ptl_base_process_msg(int fd, short flags, void *cbdata)
buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used;
}
msg->data = NULL; // protect the data region
if (NULL != rcv->cbfunc) {
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
}
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
PMIX_DESTRUCT(&buf); // free's the msg data
/* also done with the recv, if not a wildcard or the error tag */
if (UINT32_MAX != rcv->tag && 0 != rcv->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
}
PMIX_RELEASE(msg);
return;
}
/* done with the recv if it is a dynamic tag */
if (PMIX_PTL_TAG_DYNAMIC <= rcv->tag && UINT_MAX != rcv->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
}
PMIX_RELEASE(msg);
return;
}
}
/* we get here if no matching recv was found - this is an error */
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
PMIX_RELEASE(msg);
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
/* if the tag in this message is above the dynamic marker, then
* that is an error */
if (PMIX_PTL_TAG_DYNAMIC <= msg->hdr.tag) {
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
PMIX_RELEASE(msg);
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
return;
}
/* it is possible that someone may post a recv for this message
* at some point, so we have to hold onto it */
pmix_list_append(&pmix_ptl_globals.unexpected_msgs, &msg->super);
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -105,3 +105,92 @@ pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
return PMIX_ERR_UNREACH;
}
/* Event handler that posts a receive request (cbdata) and then delivers
 * any already-queued unexpected messages that match it.
 *
 * A queued message matches when its header tag equals the request's tag,
 * or when the request carries the wildcard tag (UINT32_MAX). Every
 * matching message is removed from the unexpected queue and released;
 * it is passed to the request's callback first if one was given. */
static void post_recv(int fd, short args, void *cbdata)
{
    pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
    pmix_ptl_recv_t *msg, *nmsg;
    pmix_buffer_t buf;

    /* the tag is a uint32_t, so print it as unsigned */
    pmix_output_verbose(5, pmix_globals.debug_output,
                        "posting recv on tag %u", (unsigned int)req->tag);

    /* add it to the list of recvs */
    pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super);

    /* now check the unexpected msg queue to see if we already
     * recvd something for it */
    PMIX_LIST_FOREACH_SAFE(msg, nmsg, &pmix_ptl_globals.unexpected_msgs, pmix_ptl_recv_t) {
        /* tags are 32-bit, so the wildcard is UINT32_MAX as used
         * elsewhere in the ptl code */
        if (msg->hdr.tag == req->tag || UINT32_MAX == req->tag) {
            if (NULL != req->cbfunc) {
                /* construct and load the buffer */
                PMIX_CONSTRUCT(&buf, pmix_buffer_t);
                if (NULL != msg->data) {
                    buf.base_ptr = (char*)msg->data;
                    buf.bytes_allocated = buf.bytes_used = msg->hdr.nbytes;
                    buf.unpack_ptr = buf.base_ptr;
                    buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used;
                }
                msg->data = NULL;  // protect the data region
                req->cbfunc(msg->peer, &msg->hdr, &buf, req->cbdata);
                PMIX_DESTRUCT(&buf);  // free's the msg data
            }
            pmix_list_remove_item(&pmix_ptl_globals.unexpected_msgs, &msg->super);
            PMIX_RELEASE(msg);
        }
    }
}
/* Register a receive for the given tag.
 *
 * peer   - not referenced by this implementation
 * cbfunc - callback to invoke for messages carrying the tag
 * tag    - tag to listen for
 *
 * Returns PMIX_ERR_NOMEM if the request object cannot be created,
 * otherwise PMIX_SUCCESS. The request is only queued here; post_recv
 * adds it to the list of posted recvs when the event fires. */
pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
                                          pmix_ptl_cbfunc_t cbfunc,
                                          pmix_ptl_tag_t tag)
{
    pmix_ptl_posted_recv_t *posting = PMIX_NEW(pmix_ptl_posted_recv_t);

    if (NULL == posting) {
        return PMIX_ERR_NOMEM;
    }
    posting->tag = tag;
    posting->cbfunc = cbfunc;

    /* the list of posted recvs is modified from within an event,
     * so hand the request over via the event base */
    pmix_event_assign(&(posting->ev), pmix_globals.evbase, -1,
                      EV_WRITE, post_recv, posting);
    pmix_event_active(&(posting->ev), EV_WRITE, 1);

    return PMIX_SUCCESS;
}
static void cancel_recv(int fd, short args, void *cbdata)
{
pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
pmix_ptl_posted_recv_t *rcv;
PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) {
if (rcv->tag == req->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
PMIX_RELEASE(req);
return;
}
}
PMIX_RELEASE(req);
}
/* Cancel the recv posted for the given tag.
 *
 * peer - not referenced by this implementation
 * tag  - tag whose posted recv is to be removed
 *
 * Returns PMIX_ERR_NOMEM if the carrier object cannot be created,
 * otherwise PMIX_SUCCESS. The removal itself is performed later by
 * cancel_recv when the event fires. */
pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
                                        pmix_ptl_tag_t tag)
{
    /* a posted-recv object is used purely to carry the tag */
    pmix_ptl_posted_recv_t *carrier = PMIX_NEW(pmix_ptl_posted_recv_t);

    if (NULL == carrier) {
        return PMIX_ERR_NOMEM;
    }
    carrier->tag = tag;

    /* the list of posted recvs is modified from within an event,
     * so hand the request over via the event base */
    pmix_event_assign(&(carrier->ev), pmix_globals.evbase, -1,
                      EV_WRITE, cancel_recv, carrier);
    pmix_event_active(&(carrier->ev), EV_WRITE, 1);

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Mellanox Technologies, Inc.
@ -110,6 +110,15 @@ typedef pmix_status_t (*pmix_ptl_send_fn_t)(struct pmix_peer_t *peer,
pmix_buffer_t *bfr,
pmix_ptl_tag_t tag);
/* (ONE-WAY) register a persistent recv: cbfunc is to be called for
 * messages that arrive with the given tag */
typedef pmix_status_t (*pmix_ptl_recv_fn_t)(struct pmix_peer_t *peer,
pmix_ptl_cbfunc_t cbfunc,
pmix_ptl_tag_t tag);
/* Cancel a persistent recv previously registered for the given tag */
typedef pmix_status_t (*pmix_ptl_cancel_fn_t)(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
/* connect to a peer - this is a blocking function
* to establish a connection to a peer. It assigns
* the corresponding module to the peer's compat
@ -126,6 +135,8 @@ struct pmix_ptl_module_t {
pmix_ptl_finalize_fn_t finalize;
pmix_ptl_send_recv_fn_t send_recv;
pmix_ptl_send_fn_t send;
pmix_ptl_recv_fn_t recv;
pmix_ptl_cancel_fn_t cancel;
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
};
typedef struct pmix_ptl_module_t pmix_ptl_module_t;
@ -152,6 +163,8 @@ typedef struct {
pmix_ptl_get_available_modules_fn_t get_available_modules;
pmix_ptl_send_recv_fn_t send_recv;
pmix_ptl_send_fn_t send_oneway;
pmix_ptl_recv_fn_t recv;
pmix_ptl_cancel_fn_t cancel;
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
pmix_ptl_start_listening_fn_t start_listening;
pmix_ptl_stop_listening_fn_t stop_listening;

Просмотреть файл

@ -63,6 +63,16 @@ struct pmix_ptl_module_t;
/**** MESSAGING STRUCTURES ****/
typedef uint32_t pmix_ptl_tag_t;
/* define a range of "reserved" tags - these
 * are tags that are used for persistent recvs
 * within the system */
/* NOTE(review): presumably the tag for event notifications - confirm */
#define PMIX_PTL_TAG_NOTIFY 0
/* tag on which the heartbeat sensor registers its recv */
#define PMIX_PTL_TAG_HEARTBEAT 1
/* define the start of dynamic tags that are
 * assigned for send/recv operations - every
 * tag below this value is a reserved tag */
#define PMIX_PTL_TAG_DYNAMIC 100
/* header for messages */
typedef struct {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -13,6 +13,11 @@
#include "pmix_config.h"
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/include/types.h"
/**
* Initialize a progress thread name; if a progress thread is not
* already associated with that name, start a progress thread.

Просмотреть файл

@ -2345,6 +2345,18 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag,
return rc;
}
if (PMIX_JOB_CONTROL_CMD == cmd) {
PMIX_PEER_CADDY(cd, peer, tag);
rc = pmix_server_job_ctrl(peer, buf, query_cbfunc, cd);
return rc;
}
if (PMIX_MONITOR_CMD == cmd) {
PMIX_PEER_CADDY(cd, peer, tag);
rc = pmix_server_monitor(peer, buf, query_cbfunc, cd);
return rc;
}
return PMIX_ERR_NOT_SUPPORTED;
}

Просмотреть файл

@ -1562,6 +1562,134 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
return rc;
}
/* Handle a job control request received from a client.
 *
 * peer   - the requesting client
 * buf    - request payload; unpacked in order as: number of targets,
 *          the target procs, number of info, the info array
 * cbfunc - callback handed to the host for returning the result
 * cbdata - caller's data, stored in the query caddy that is passed to
 *          the host as its callback data
 *
 * Returns PMIX_SUCCESS once the host has accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host provides no job_control entry,
 * PMIX_ERR_NOMEM on allocation failure, or the unpack/host error. On
 * any failure the query caddy is released before returning. */
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
                                   pmix_buffer_t *buf,
                                   pmix_info_cbfunc_t cbfunc,
                                   void *cbdata)
{
    int32_t cnt;
    pmix_status_t rc;
    pmix_query_caddy_t *cd;
    pmix_proc_t proc;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "recvd job control request from client");

    if (NULL == pmix_host_server.job_control) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    cd = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cd) {
        return PMIX_ERR_NOMEM;
    }
    cd->cbdata = cbdata;

    /* unpack the number of targets */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ntargets, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    if (0 < cd->ntargets) {
        PMIX_PROC_CREATE(cd->targets, cd->ntargets);
        if (NULL == cd->targets) {
            /* nothing was allocated, so do not leave a stale count behind */
            cd->ntargets = 0;
            rc = PMIX_ERR_NOMEM;
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
        cnt = cd->ntargets;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->targets, &cnt, PMIX_PROC))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* unpack the number of info objects */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    /* unpack the info */
    if (0 < cd->ninfo) {
        PMIX_INFO_CREATE(cd->info, cd->ninfo);
        if (NULL == cd->info) {
            /* nothing was allocated, so do not leave a stale count behind */
            cd->ninfo = 0;
            rc = PMIX_ERR_NOMEM;
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
        cnt = cd->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* setup the requesting peer name - proc lives on the stack, so zero
     * it first: strncpy does not terminate the destination when the
     * source fills all PMIX_MAX_NSLEN bytes (assumes nspace has room
     * for PMIX_MAX_NSLEN+1 chars - confirm against pmix_proc_t) */
    memset(&proc, 0, sizeof(proc));
    (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
    proc.rank = peer->info->rank;

    /* ask the host to execute the request */
    if (PMIX_SUCCESS != (rc = pmix_host_server.job_control(&proc,
                                                           cd->targets, cd->ntargets,
                                                           cd->info, cd->ninfo,
                                                           cbfunc, cd))) {
        goto exit;
    }
    return PMIX_SUCCESS;

  exit:
    PMIX_RELEASE(cd);
    return rc;
}
/* Handle a monitoring request received from a client.
 *
 * peer   - the requesting client
 * buf    - request payload; unpacked in order as: a status code,
 *          number of directives, the directives array
 * cbfunc - callback handed to the host for returning the result
 * cbdata - caller's data, stored in the query caddy that is passed to
 *          the host as its callback data
 *
 * Returns PMIX_SUCCESS once the host has accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host provides no monitor entry,
 * PMIX_ERR_NOMEM on allocation failure, or the unpack/host error. On
 * any failure the query caddy is released before returning. */
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
                                  pmix_buffer_t *buf,
                                  pmix_info_cbfunc_t cbfunc,
                                  void *cbdata)
{
    int32_t cnt;
    pmix_status_t rc, error;
    pmix_query_caddy_t *cd;
    pmix_proc_t proc;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "recvd monitor request from client");

    if (NULL == pmix_host_server.monitor) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    cd = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cd) {
        return PMIX_ERR_NOMEM;
    }
    cd->cbdata = cbdata;

    /* unpack the error code */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &error, &cnt, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }

    /* unpack the number of directives */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    /* unpack the directives */
    if (0 < cd->ninfo) {
        PMIX_INFO_CREATE(cd->info, cd->ninfo);
        if (NULL == cd->info) {
            /* nothing was allocated, so do not leave a stale count behind */
            cd->ninfo = 0;
            rc = PMIX_ERR_NOMEM;
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
        cnt = cd->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* setup the requesting peer name - proc lives on the stack, so zero
     * it first: strncpy does not terminate the destination when the
     * source fills all PMIX_MAX_NSLEN bytes (assumes nspace has room
     * for PMIX_MAX_NSLEN+1 chars - confirm against pmix_proc_t) */
    memset(&proc, 0, sizeof(proc));
    (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
    proc.rank = peer->info->rank;

    /* ask the host to execute the request */
    if (PMIX_SUCCESS != (rc = pmix_host_server.monitor(&proc, error,
                                                       cd->info, cd->ninfo,
                                                       cbfunc, cd))) {
        goto exit;
    }
    return PMIX_SUCCESS;

  exit:
    PMIX_RELEASE(cd);
    return rc;
}
/***** INSTANCE SERVER LIBRARY CLASSES *****/
static void tcon(pmix_server_trkr_t *t)
{

Просмотреть файл

@ -218,6 +218,16 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* process a job control request from a client: unpack the targets and
 * directives from buf and pass them to the host's job_control function */
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* process a monitoring request from a client: unpack the status code and
 * directives from buf and pass them to the host's monitor function */
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_op_cbfunc_t cbfunc,

Просмотреть файл

@ -56,6 +56,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
return "INVALID-KEYVAL";
case PMIX_ERR_INVALID_NUM_PARSED:
return "INVALID-NUM-PARSED";
case PMIX_ERR_TAKE_NEXT_OPTION:
return "TAKE-NEXT-OPTION";
case PMIX_ERR_INVALID_ARGS:
return "INVALID-ARGS";
@ -157,6 +159,14 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
return "PMIX_ERR_WILDCARD";
case PMIX_NOTIFY_ALLOC_COMPLETE:
return "PMIX ALLOC OPERATION COMPLETE";
case PMIX_JCTRL_CHECKPOINT:
return "PMIX JOB CONTROL CHECKPOINT";
case PMIX_JCTRL_PREEMPT_ALERT:
return "PMIX PRE-EMPTION ALERT";
case PMIX_MONITOR_HEARTBEAT_ALERT:
return "PMIX HEARTBEAT ALERT";
case PMIX_MONITOR_FILE_ALERT:
return "PMIX FILE MONITOR ALERT";
case PMIX_SUCCESS:
return "SUCCESS";
default:

Просмотреть файл

@ -37,6 +37,7 @@
#define PMIX_ERR_NETWORK_NOT_PARSEABLE (PMIX_INTERNAL_ERR_BASE - 33)
#define PMIX_ERR_FILE_OPEN_FAILURE (PMIX_INTERNAL_ERR_BASE - 34)
#define PMIX_ERR_FILE_READ_FAILURE (PMIX_INTERNAL_ERR_BASE - 35)
#define PMIX_ERR_TAKE_NEXT_OPTION (PMIX_INTERNAL_ERR_BASE - 36)
#define PMIX_ERROR_LOG(r) \
do { \

Просмотреть файл

@ -493,6 +493,12 @@ int pmix2x_convert_rc(pmix_status_t rc)
case PMIX_QUERY_PARTIAL_SUCCESS:
return OPAL_ERR_PARTIAL_SUCCESS;
case PMIX_MONITOR_HEARTBEAT_ALERT:
return OPAL_ERR_HEARTBEAT_ALERT;
case PMIX_MONITOR_FILE_ALERT:
return OPAL_ERR_FILE_ALERT;
case PMIX_ERROR:
return OPAL_ERROR;
case PMIX_SUCCESS:
@ -1333,6 +1339,22 @@ static void pmix2x_log(opal_list_t *info,
OBJ_RELEASE(cd);
}
/* Translate a PMIx allocation directive into its OPAL counterpart.
 * Any directive without a known mapping yields OPAL_PMIX_ALLOC_UNDEF. */
opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir)
{
    opal_pmix_alloc_directive_t odir = OPAL_PMIX_ALLOC_UNDEF;

    if (PMIX_ALLOC_NEW == dir) {
        odir = OPAL_PMIX_ALLOC_NEW;
    } else if (PMIX_ALLOC_EXTEND == dir) {
        odir = OPAL_PMIX_ALLOC_EXTEND;
    } else if (PMIX_ALLOC_RELEASE == dir) {
        odir = OPAL_PMIX_ALLOC_RELEASE;
    } else if (PMIX_ALLOC_REAQUIRE == dir) {
        odir = OPAL_PMIX_ALLOC_REAQCUIRE;
    }

    return odir;
}
/**** INSTANTIATE INTERNAL CLASSES ****/
OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t,
opal_list_item_t,

Просмотреть файл

@ -279,6 +279,8 @@ OPAL_MODULE_DECLSPEC void pmix2x_value_load(pmix_value_t *v,
OPAL_MODULE_DECLSPEC int pmix2x_value_unload(opal_value_t *kv,
const pmix_value_t *v);
/* convert a PMIx allocation directive to the equivalent OPAL directive */
OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir);
END_C_DECLS
#endif /* MCA_PMIX_EXTERNAL_H */

Просмотреть файл

@ -45,63 +45,73 @@
/* These are the interfaces used by the embedded PMIx server
* to call up into ORTE for service requests */
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
int status, const char msg[],
pmix_proc_t procs[], size_t nprocs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
char *data, size_t ndata,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
int status, const char msg[],
pmix_proc_t procs[], size_t nprocs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
char *data, size_t ndata,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
const pmix_info_t info[], size_t ninfo,
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_notify_event(pmix_status_t code,
const pmix_proc_t *source,
pmix_data_range_t range,
pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_query(pmix_proc_t *proct,
pmix_query_t *queryies, size_t nqueries,
pmix_info_cbfunc_t cbfunc,
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_notify_event(pmix_status_t code,
const pmix_proc_t *source,
pmix_data_range_t range,
pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_query(pmix_proc_t *proct,
pmix_query_t *queryies, size_t nqueries,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
pmix_tool_connection_cbfunc_t cbfunc,
void *cbdata);
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
pmix_tool_connection_cbfunc_t cbfunc,
void *cbdata);
static void server_log(const pmix_proc_t *client,
const pmix_info_t data[], size_t ndata,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
pmix_server_module_t mymodule = {
static pmix_status_t server_allocate(const pmix_proc_t *client,
pmix_alloc_directive_t directive,
const pmix_info_t data[], size_t ndata,
pmix_info_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_job_control(const pmix_proc_t *requestor,
const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
pmix_server_module_t mymodule = {
.client_connected = server_client_connected_fn,
.client_finalized = server_client_finalized_fn,
.abort = server_abort_fn,
@ -118,7 +128,11 @@ static void server_log(const pmix_proc_t *client,
.notify_event = server_notify_event,
.query = server_query,
.tool_connected = server_tool_connection,
.log = server_log
.log = server_log,
.allocate = server_allocate,
.job_control = server_job_control
/* we do not support monitoring, but use the
* PMIx internal monitoring capability */
};
opal_pmix_server_module_t *host_module = NULL;
@ -1052,3 +1066,117 @@ static void server_log(const pmix_proc_t *proct,
&opalcaddy->apps,
opal_opcbfunc, opalcaddy);
}
/* Upcall for allocation requests: convert the PMIx arguments into their
 * OPAL equivalents and pass the request to the host module.
 *
 * proct     - the requesting process
 * directive - the allocation operation being requested
 * data      - array of ndata attributes describing the request
 * cbfunc    - PMIx callback to invoke with the result
 * cbdata    - data to return in that callback
 *
 * Returns PMIX_SUCCESS once the host has accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host has no allocate entry,
 * PMIX_ERR_NOMEM on allocation failure, or the converted OPAL error.
 * The caddy (and everything placed on its lists) is released on every
 * failure path. */
static pmix_status_t server_allocate(const pmix_proc_t *proct,
                                     pmix_alloc_directive_t directive,
                                     const pmix_info_t data[], size_t ndata,
                                     pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix2x_opalcaddy_t *opalcaddy;
    opal_process_name_t requestor;
    int rc;
    size_t n;
    opal_value_t *oinfo;
    opal_pmix_alloc_directive_t odir;

    if (NULL == host_module || NULL == host_module->allocate) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    /* setup the caddy */
    opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
    if (NULL == opalcaddy) {
        return PMIX_ERR_NOMEM;
    }
    opalcaddy->infocbfunc = cbfunc;
    opalcaddy->cbdata = cbdata;

    /* convert the requestor */
    if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }
    requestor.vpid = pmix2x_convert_rank(proct->rank);

    /* convert the directive */
    odir = pmix2x_convert_allocdir(directive);

    /* convert the data */
    for (n=0; n < ndata; n++) {
        oinfo = OBJ_NEW(opal_value_t);
        if (NULL == oinfo) {
            OBJ_RELEASE(opalcaddy);
            return PMIX_ERR_NOMEM;
        }
        opal_list_append(&opalcaddy->info, &oinfo->super);
        /* pmix2x_value_unload only receives the value, so the key must
         * be carried across here or the host cannot tell which
         * attribute each entry represents */
        oinfo->key = strdup(data[n].key);
        if (NULL == oinfo->key) {
            OBJ_RELEASE(opalcaddy);
            return PMIX_ERR_NOMEM;
        }
        if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &data[n].value))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
    }

    /* pass the call upwards */
    if (OPAL_SUCCESS != (rc = host_module->allocate(&requestor, odir,
                                                    &opalcaddy->info,
                                                    info_cbfunc, opalcaddy))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }

    return PMIX_SUCCESS;
}
/* Upcall for job control requests: convert the PMIx arguments into their
 * OPAL equivalents and pass the request to the host module.
 *
 * proct      - the requesting process
 * targets    - array of ntargets processes the action applies to
 * directives - array of ndirs attributes describing the action
 * cbfunc     - PMIx callback to invoke with the result
 * cbdata     - data to return in that callback
 *
 * Returns PMIX_SUCCESS once the host has accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host has no job_control entry,
 * PMIX_ERR_NOMEM on allocation failure, or the converted OPAL error.
 * The caddy (and everything placed on its lists) is released on every
 * failure path. */
static pmix_status_t server_job_control(const pmix_proc_t *proct,
                                        const pmix_proc_t targets[], size_t ntargets,
                                        const pmix_info_t directives[], size_t ndirs,
                                        pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix2x_opalcaddy_t *opalcaddy;
    opal_process_name_t requestor;
    int rc;
    size_t n;
    opal_value_t *oinfo;
    opal_namelist_t *nm;

    if (NULL == host_module || NULL == host_module->job_control) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    /* setup the caddy */
    opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
    if (NULL == opalcaddy) {
        return PMIX_ERR_NOMEM;
    }
    opalcaddy->infocbfunc = cbfunc;
    opalcaddy->cbdata = cbdata;

    /* convert the requestor */
    if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }
    requestor.vpid = pmix2x_convert_rank(proct->rank);

    /* convert the targets */
    for (n=0; n < ntargets; n++) {
        nm = OBJ_NEW(opal_namelist_t);
        if (NULL == nm) {
            OBJ_RELEASE(opalcaddy);
            return PMIX_ERR_NOMEM;
        }
        opal_list_append(&opalcaddy->procs, &nm->super);
        if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&nm->name.jobid, targets[n].nspace))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
        nm->name.vpid = pmix2x_convert_rank(targets[n].rank);
    }

    /* convert the directives */
    for (n=0; n < ndirs; n++) {
        oinfo = OBJ_NEW(opal_value_t);
        if (NULL == oinfo) {
            OBJ_RELEASE(opalcaddy);
            return PMIX_ERR_NOMEM;
        }
        opal_list_append(&opalcaddy->info, &oinfo->super);
        /* pmix2x_value_unload only receives the value, so the key must
         * be carried across here or the host cannot tell which
         * directive each entry represents */
        oinfo->key = strdup(directives[n].key);
        if (NULL == oinfo->key) {
            OBJ_RELEASE(opalcaddy);
            return PMIX_ERR_NOMEM;
        }
        if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &directives[n].value))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
    }

    /* pass the call upwards */
    if (OPAL_SUCCESS != (rc = host_module->job_control(&requestor,
                                                       &opalcaddy->procs,
                                                       &opalcaddy->info,
                                                       info_cbfunc, opalcaddy))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -230,6 +230,19 @@ typedef void (*opal_pmix_connection_cbfunc_t)(int incoming_sd);
typedef int (*opal_pmix_server_listener_fn_t)(int listening_sd,
opal_pmix_connection_cbfunc_t cbfunc);
/* Request allocation modifications on behalf of a client
 *
 * client    - the process making the request
 * directive - the allocation operation being requested
 * data      - list of opal_value_t describing the request
 * cbfunc    - callback for returning the result
 * cbdata    - data to pass back in that callback */
typedef int (*opal_pmix_server_alloc_fn_t)(const opal_process_name_t *client,
opal_pmix_alloc_directive_t directive,
opal_list_t *data,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Execute a job control action on behalf of a client
 *
 * requestor  - the process making the request
 * targets    - list of opal_namelist_t naming the affected processes
 * directives - list of opal_value_t describing the action
 * cbfunc     - callback for returning the result
 * cbdata     - data to pass back in that callback */
typedef int (*opal_pmix_server_job_control_fn_t)(const opal_process_name_t *requestor,
opal_list_t *targets, opal_list_t *directives,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* we do not provide a monitoring capability */
typedef struct opal_pmix_server_module_1_0_0_t {
opal_pmix_server_client_connected_fn_t client_connected;
opal_pmix_server_client_finalized_fn_t client_finalized;
@ -249,6 +262,8 @@ typedef struct opal_pmix_server_module_1_0_0_t {
opal_pmix_server_tool_connection_fn_t tool_connected;
opal_pmix_server_log_fn_t log;
opal_pmix_server_listener_fn_t listener;
opal_pmix_server_alloc_fn_t allocate;
opal_pmix_server_job_control_fn_t job_control;
} opal_pmix_server_module_t;

Просмотреть файл

@ -32,6 +32,11 @@ BEGIN_C_DECLS
* that key */
#define OPAL_PMIX_RANK_WILDCARD (UINT32_MAX-1)
/* NOTE: the special rank values are parenthesized so that they keep
 * their value when used inside a larger expression */
/* other special rank values will be used to define
 * groups of ranks for use in collectives */
#define OPAL_PMIX_RANK_LOCAL_NODE (UINT32_MAX-2) // all ranks on local node
/* define a set of "standard" attributes that can
* be queried. Implementations (and users) are free to extend as
* desired, so the get functions need to be capable
@ -55,12 +60,15 @@ BEGIN_C_DECLS
#define OPAL_PMIX_CONNECT_TO_SYSTEM "pmix.cnct.sys" // (bool) The requestor requires that a connection be made only to
// a local system-level PMIx server
#define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
/* identification attributes */
#define OPAL_PMIX_USERID "pmix.euid" // (uint32_t) effective user id
#define OPAL_PMIX_GRPID "pmix.egid" // (uint32_t) effective group id
/* attributes for the rendezvous socket */
#define OPAL_PMIX_USOCK_DISABLE "pmix.usock.disable" // (bool) disable legacy usock support
#define OPAL_PMIX_SOCKET_MODE "pmix.sockmode" // (uint32_t) POSIX mode_t (9 bits valid)
@ -76,6 +84,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_TCP_DISABLE_IPV4 "pmix.tcp.disipv4" // (bool) true to disable IPv4 family
#define OPAL_PMIX_TCP_DISABLE_IPV6 "pmix.tcp.disipv6" // (bool) true to disable IPv6 family
/* general proc-level attributes */
#define OPAL_PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch
#define OPAL_PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc
@ -89,6 +98,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
#define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories
/* information about relative ranks as assigned by the RM */
#define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier
#define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job
@ -104,25 +114,26 @@ BEGIN_C_DECLS
#define OPAL_PMIX_LOCALLDR "pmix.lldr" // (uint64_t) opal_identifier of lowest rank on this node within this job
#define OPAL_PMIX_APPLDR "pmix.aldr" // (uint32_t) lowest rank in this app within this job
#define OPAL_PMIX_PROC_PID "pmix.ppid" // (pid_t) pid of specified proc
/**** no PMIx equivalent ****/
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_SESSION_ID "pmix.session.id" // (uint32_t) session identifier
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of
// whether or not they currently host procs.
#define OPAL_PMIX_HOSTNAME "pmix.hname" // (char*) name of the host the specified proc is on
#define OPAL_PMIX_NODEID "pmix.nodeid" // (uint32_t) node identifier
#define OPAL_PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace
#define OPAL_PMIX_LOCAL_PROCS "pmix.lprocs" // (opal_list_t*) list of opal_namelist_t of procs on the specified node
#define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace
#define OPAL_PMIX_PROC_URI "opal.puri" // (char*) URI containing contact info for proc - NOTE: this is published by procs and
// thus cannot be prefixed with "pmix"
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
/* Memory info */
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon
#define OPAL_PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes
/* size info */
#define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace
#define OPAL_PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job
@ -133,11 +144,15 @@ BEGIN_C_DECLS
#define OPAL_PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job
#define OPAL_PMIX_NUM_NODES "pmix.num.nodes" // (uint32_t) #nodes in this nspace
/* topology info */
#define OPAL_PMIX_NET_TOPO "pmix.ntopo" // (char*) xml-representation of network topology
#define OPAL_PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) xml-representation of local node topology
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for this job
#define OPAL_PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) pointer to the PMIx client's internal topology object
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
/* request-related info */
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
@ -156,16 +171,19 @@ BEGIN_C_DECLS
#define OPAL_PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
// specified operation
/* attribute used by host server to pass data to the server convenience library - the
* data will then be parsed and provided to the local clients */
#define OPAL_PMIX_PROC_DATA "pmix.pdata" // (pmix_value_array_t) starts with rank, then contains more data
#define OPAL_PMIX_NODE_MAP "pmix.nmap" // (char*) regex of nodes containing procs for this job
#define OPAL_PMIX_PROC_MAP "pmix.pmap" // (char*) regex describing procs on each node within this job
/* attributes used internally to communicate data from the server to the client */
#define OPAL_PMIX_PROC_BLOB "pmix.pblob" // (pmix_byte_object_t) packed blob of process data
#define OPAL_PMIX_MAP_BLOB "pmix.mblob" // (pmix_byte_object_t) packed blob of process location
/* error handler registration and notification info keys */
#define OPAL_PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler
#define OPAL_PMIX_EVENT_JOB_LEVEL "pmix.evjob" // (bool) register for job-specific events only
@ -187,7 +205,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
/* attributes used to describe "spawm" attributes */
/* attributes used to describe "spawn" attributes */
#define OPAL_PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define OPAL_PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define OPAL_PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
@ -229,19 +247,89 @@ BEGIN_C_DECLS
#define OPAL_PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
#define OPAL_PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
#define OPAL_PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
#define OPAL_PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define OPAL_PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
// for the specified nspace
/* log attributes */
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define OPAL_PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
#define OPAL_PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
#define OPAL_PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
#define OPAL_PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
/* debugger attributes */
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
/* Resource Manager identification */
#define OPAL_PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define OPAL_PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
/* attributes for setting envars */
#define OPAL_PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
#define OPAL_PMIX_UNSET_ENVAR "pmix.unset.envar" // (char*) unset envar specified in string
/* attributes relating to allocations */
#define OPAL_PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
// which can later be used to query status of the request
#define OPAL_PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
#define OPAL_PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
#define OPAL_PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
#define OPAL_PMIX_ALLOC_NUM_CPU_LIST "pmix.alloc.ncpulist" // (char*) regex of #cpus for each node
#define OPAL_PMIX_ALLOC_CPU_LIST "pmix.alloc.cpulist" // (char*) regex of specific cpus indicating the cpus involved.
#define OPAL_PMIX_ALLOC_MEM_SIZE "pmix.alloc.msize" // (float) number of Mbytes
#define OPAL_PMIX_ALLOC_NETWORK "pmix.alloc.net" // (array) array of pmix_info_t describing network resources. If not
// given as part of an info struct that identifies the
// impacted nodes, then the description will be applied
// across all nodes in the requestor's allocation
#define OPAL_PMIX_ALLOC_NETWORK_ID "pmix.alloc.netid" // (char*) name of network
#define OPAL_PMIX_ALLOC_BANDWIDTH "pmix.alloc.bw" // (float) Mbits/sec
#define OPAL_PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
#define OPAL_PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
/* job control attributes */
#define OPAL_PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request
#define OPAL_PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes
#define OPAL_PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes
#define OPAL_PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request
// (NULL => cancel all requests from this requestor)
#define OPAL_PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup
#define OPAL_PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint
/* NOTE(review): the key string below is identical to the one defined for
 * OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, so the two attributes cannot be
 * told apart by key - confirm against the PMIx attribute definitions
 * before changing it */
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete
#define OPAL_PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes
#define OPAL_PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned
#define OPAL_PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
#define OPAL_PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
/* monitoring attributes */
#define OPAL_PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
#define OPAL_PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
#define OPAL_PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
#define OPAL_PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
// specified action
#define OPAL_PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
#define OPAL_PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
#define OPAL_PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
#define OPAL_PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
#define OPAL_PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
#define OPAL_PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
// specified action
/* define a scope for data "put" by PMI per the following:
@ -285,6 +373,16 @@ typedef enum {
} opal_pmix_persistence_t;
/* define allocation request flags */
/* Directive passed along with an allocation request; the enumerators
 * are numbered consecutively starting from OPAL_PMIX_ALLOC_UNDEF (0) */
typedef enum {
OPAL_PMIX_ALLOC_UNDEF = 0,
OPAL_PMIX_ALLOC_NEW,
OPAL_PMIX_ALLOC_EXTEND,
OPAL_PMIX_ALLOC_RELEASE,
/* NOTE(review): "REAQCUIRE" looks like a misspelling of "REACQUIRE";
 * the name is part of the public interface, so check all users
 * before renaming it */
OPAL_PMIX_ALLOC_REAQCUIRE
} opal_pmix_alloc_directive_t;
/**** PMIX INFO STRUCT ****/
/* NOTE: the pmix_info_t is essentially equivalent to the opal_value_t

Просмотреть файл

@ -292,6 +292,12 @@ opal_err2str(int errnum, const char **errmsg)
case OPAL_ERR_EVENT_REGISTRATION:
retval = "Event registration";
break;
case OPAL_ERR_HEARTBEAT_ALERT:
retval = "Heartbeat not received";
break;
case OPAL_ERR_FILE_ALERT:
retval = "File alert - proc may have stalled";
break;
default:
retval = "UNRECOGNIZED";
}

Просмотреть файл

@ -76,7 +76,7 @@ ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat,
orte_app_context_t *app,
char ***env);
ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void);
ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void);
ORTE_DECLSPEC int orte_schizo_base_get_remaining_time(uint32_t *timeleft);
ORTE_DECLSPEC void orte_schizo_base_finalize(void);
END_C_DECLS

Просмотреть файл

@ -162,20 +162,20 @@ orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void)
return ORTE_SCHIZO_UNDETERMINED;
}
long orte_schizo_base_get_remaining_time(void)
int orte_schizo_base_get_remaining_time(uint32_t *timeleft)
{
long rc;
int rc;
orte_schizo_base_active_module_t *mod;
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
if (NULL != mod->module->get_remaining_time) {
rc = mod->module->get_remaining_time();
rc = mod->module->get_remaining_time(timeleft);
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
return rc;
}
}
}
return -1;
return ORTE_ERR_NOT_SUPPORTED;
}
void orte_schizo_base_finalize(void)

Просмотреть файл

@ -118,7 +118,7 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void);
* and decides it cannot provide the info in the current situation,
* then it can return ORTE_ERR_TAKE_NEXT_OPTION to indicate that
* another module should be tried */
typedef long (*orte_schizo_base_module_get_rem_time_fn_t)(void);
typedef int (*orte_schizo_base_module_get_rem_time_fn_t)(uint32_t *timeleft);
/*
* schizo module version 1.3.0

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
* $COPYRIGHT$
*
@ -29,10 +29,12 @@
#include "schizo_slurm.h"
static orte_schizo_launch_environ_t check_launch_environment(void);
static int get_remaining_time(uint32_t *timeleft);
static void finalize(void);
orte_schizo_base_module_t orte_schizo_slurm_module = {
.check_launch_environment = check_launch_environment,
.get_remaining_time = get_remaining_time,
.finalize = finalize
};
@ -123,6 +125,58 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
return myenv;
}
/**
 * Ask Slurm how much time is left in the current allocation.
 *
 * Runs "squeue -h -j $SLURM_JOBID -o %L" and converts the first line
 * of its output into a number of seconds.
 *
 * @param timeleft  [out] seconds remaining. Preset to UINT32_MAX and
 *                  left at that value when squeue reports something
 *                  that is not a finite time (e.g. "UNLIMITED").
 *
 * @return ORTE_SUCCESS if a value was stored in timeleft,
 *         ORTE_ERR_TAKE_NEXT_OPTION if SLURM_JOBID is not set,
 *         ORTE_ERR_OUT_OF_RESOURCE if the command string cannot be built,
 *         ORTE_ERR_FILE_OPEN_FAILURE if squeue cannot be launched,
 *         ORTE_ERR_FILE_READ_FAILURE if squeue gave no usable output
 */
static int get_remaining_time(uint32_t *timeleft)
{
    char output[256], *cmd, *jobid, **res, *end;
    FILE *fp;
    uint32_t tleft;
    size_t cnt;
    long val;

    /* set the default */
    *timeleft = UINT32_MAX;

    if (NULL == (jobid = getenv("SLURM_JOBID"))) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (0 > asprintf(&cmd, "squeue -h -j %s -o %%L", jobid)) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    fp = popen(cmd, "r");
    /* popen does not retain the command string */
    free(cmd);
    if (NULL == fp) {
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }
    if (NULL == fgets(output, sizeof(output), fp)) {
        /* always reap the child, even when it printed nothing */
        pclose(fp);
        return ORTE_ERR_FILE_READ_FAILURE;
    }
    pclose(fp);

    /* the output is returned in a colon-delimited set of fields */
    res = opal_argv_split(output, ':');
    cnt = opal_argv_count(res);
    if (NULL == res || 0 == cnt) {
        /* nothing to parse - do not index into an empty array */
        opal_argv_free(res);
        return ORTE_ERR_FILE_READ_FAILURE;
    }

    /* the last field is seconds */
    val = strtol(res[cnt-1], &end, 10);
    if (end == res[cnt-1]) {
        /* no digits at all (squeue prints e.g. "UNLIMITED" or "NOT_SET"):
         * keep the "no limit" default instead of reporting zero seconds */
        opal_argv_free(res);
        return ORTE_SUCCESS;
    }
    tleft = val;
    /* the next field would be minutes */
    if (1 < cnt) {
        tleft += 60 * strtol(res[cnt-2], NULL, 10);
    }
    /* next field would be hours - squeue prefixes it with the number
     * of days as "days-hours" when at least one day remains */
    if (2 < cnt) {
        val = strtol(res[cnt-3], &end, 10);
        if ('-' == *end) {
            tleft += 24*3600 * val;
            tleft += 3600 * strtol(end + 1, NULL, 10);
        } else {
            tleft += 3600 * val;
        }
    }
    /* a further colon-delimited field is taken as days */
    if (3 < cnt) {
        tleft += 24*3600 * strtol(res[cnt-4], NULL, 10);
    }
    /* if there are more fields than that, then it is infinite */
    if (4 < cnt) {
        tleft = UINT32_MAX;
    }
    opal_argv_free(res);

    *timeleft = tleft;
    return ORTE_SUCCESS;
}
static void finalize(void)
{
int i;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -38,8 +38,8 @@ orte_schizo_base_component_t mca_schizo_slurm_component = {
static int component_query(mca_base_module_t **module, int *priority)
{
/* disqualify ourselves if we are not an app or under slurm */
if (!ORTE_PROC_IS_APP) {
/* disqualify ourselves if we are not under slurm */
if (NULL == getenv("SLURM_JOBID")) {
*priority = 0;
*module = NULL;
return OPAL_ERROR;
@ -49,4 +49,3 @@ static int component_query(mca_base_module_t **module, int *priority)
*priority = 50;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,39 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*/
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
/* select a component */
ORTE_DECLSPEC int orte_sensor_base_select(void);
END_C_DECLS
#endif

Просмотреть файл

@ -1,158 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
static bool mods_active = false;
/*
 * Start every active sensor module for the given job and, if the
 * sampling timer is not already running, create the sample buffer
 * and arm the timer. Does nothing unless the configured sample rate
 * is a positive number of seconds.
 */
void orte_sensor_base_start(orte_jobid_t job)
{
    orte_sensor_active_module_t *entry;
    int idx;

    /* a non-positive rate means there is nothing to start */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in order, skipping empty slots */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL != entry) {
            /* any populated slot marks the modules as active */
            mods_active = true;
            if (NULL != entry->module->start) {
                entry->module->start(job);
            }
        }
    }

    /* only arm the timer once, and only if there is a module to sample */
    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* buffer that will collect the samples */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    /* timer that wakes us up periodically for a data sample */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Stop sampling for the given job: cancel the sampling timer if it is
 * armed, then call the stop function of every module that has one.
 * Returns immediately when no module was ever started.
 */
void orte_sensor_base_stop(orte_jobid_t job)
{
    orte_sensor_active_module_t *entry;
    int idx;

    if (mods_active) {
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "%s sensor:base: stopping sensors",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        /* cancel the periodic sample event before stopping the modules */
        if (orte_sensor_base.active) {
            opal_event_del(&orte_sensor_base.sample_ev);
            orte_sensor_base.active = false;
        }

        /* walk the module array in order, skipping empty slots */
        for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
            entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
            if (NULL != entry && NULL != entry->module->stop) {
                entry->module->stop(job);
            }
        }
    }
}
/*
 * Timer callback: ask every module that has a sample function to take
 * a sample, then re-arm the timer for the next period. The fd, args
 * and cbdata arguments are not used.
 */
void orte_sensor_base_sample(int fd, short args, void *cbdata)
{
    orte_sensor_active_module_t *entry;
    int idx;

    /* nothing to do if no module was started or we were ordered to stop */
    if (!mods_active || !orte_sensor_base.active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: sampling sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* modules are sampled in array order - the heartbeat is expected
     * to come last so that it can send any collected data
     */
    for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL == entry || NULL == entry->module->sample) {
            continue;
        }
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "%s sensor:base: sampling component %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            entry->component->base_version.mca_component_name);
        entry->module->sample();
    }

    /* restart the timer */
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Hand a data buffer to the log function of the module whose component
 * name equals comp. Only the first module with a matching name is
 * considered; nothing happens if comp is NULL, no module matches, or
 * the matching module has no log function.
 */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *entry;
    int idx;

    if (NULL != comp) {
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "%s sensor:base: logging sensor %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

        /* look for the module that was named */
        for (idx = 0; idx < orte_sensor_base.modules.size; idx++) {
            entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
            if (NULL == entry ||
                0 != strcmp(comp, entry->component->base_version.mca_component_name)) {
                continue;
            }
            /* found it - the search ends here whether or not it can log */
            if (NULL != entry->module->log) {
                entry->module->log(data);
            }
            break;
        }
    }
}

Просмотреть файл

@ -1,133 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/*
* Global variables
*/
orte_sensor_base_API_module_t orte_sensor = {
orte_sensor_base_start,
orte_sensor_base_stop
};
orte_sensor_base_t orte_sensor_base = {{{0}}};
/*
* Local variables
*/
static int orte_sensor_base_sample_rate = 0;
/*
 * Register the framework-level MCA variables "sample_rate" and
 * "log_samples", each with a deprecated synonym that omits the "base"
 * component name. Both variables are reset to their defaults (0 and
 * false) before registration. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int rate_id, log_id;

    /* sample rate, in seconds */
    orte_sensor_base_sample_rate = 0;
    rate_id = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                    "Sample rate in seconds",
                                    MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                    OPAL_INFO_LVL_9,
                                    MCA_BASE_VAR_SCOPE_READONLY,
                                    &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(rate_id, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples are to be logged */
    orte_sensor_base.log_samples = false;
    log_id = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                   "Log samples to database",
                                   MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                   OPAL_INFO_LVL_9,
                                   MCA_BASE_VAR_SCOPE_READONLY,
                                   &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(log_id, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
/*
 * Framework close function: finalize every module that provides a
 * finalize function, destruct the module array, and close the
 * framework's components. Returns the result of closing the components.
 */
static int orte_sensor_base_close(void)
{
    orte_sensor_active_module_t *entry;
    int idx = 0;

    while (idx < orte_sensor_base.modules.size) {
        entry = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        /* empty slots and modules without a finalize hook are skipped */
        if (NULL != entry && NULL != entry->module->finalize) {
            entry->module->finalize();
        }
        idx++;
    }
    OBJ_DESTRUCT(&orte_sensor_base.modules);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
}
/**
 * Framework open function: reset the framework globals, create the
 * module array, and open all available MCA components (or the ones
 * specifically requested via an MCA parameter). Returns the result
 * of opening the components.
 */
static int orte_sensor_base_open(mca_base_open_flag_t flags)
{
    /* the sampling timer is not running yet */
    orte_sensor_base.active = false;

    /* sample period, taken from the registered sample rate in seconds */
    orte_sensor_base.rate.tv_usec = 0;
    orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;

    /* array that will hold the selected modules */
    OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
    opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
orte_sensor_base_register,
orte_sensor_base_open, orte_sensor_base_close,
mca_sensor_base_static_components, 0);
/* Constructor for orte_sensor_active_module_t: a newly created
 * active-module entry starts with its sampling flag set */
static void cons(orte_sensor_active_module_t *t)
{
    (*t).sampling = true;
}
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
opal_object_t,
cons, NULL);

Просмотреть файл

@ -1,219 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/* Set once selection has run so that repeated calls are no-ops. */
static bool selected = false;

/**
 * Select the sensor components that want to run.
 *
 * Every available component is queried for a module and a priority.
 * Components without a query function, or whose query yields no module
 * or a negative priority, are skipped.  When two components report the
 * same data_measured string only the higher-priority one is kept.
 * The surviving modules are appended to orte_sensor_base.modules in
 * descending priority order and their init functions are called.  A
 * module whose init fails is dropped, except on the HNP where it is
 * kept with sampling disabled so incoming data can still be logged.
 *
 * @return ORTE_SUCCESS in all cases; finding no modules is not an error.
 */
int orte_sensor_base_select(void)
{
    mca_base_component_list_item_t *cli = NULL;
    orte_sensor_base_component_t *component = NULL;
    mca_base_module_t *module = NULL;
    orte_sensor_active_module_t *i_module;
    int priority = 0, i, j, low_i;
    opal_pointer_array_t tmp_array;
    bool none_found;
    orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
    bool duplicate;

    if (selected) {
        return ORTE_SUCCESS;
    }
    selected = true;

    OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);

    opal_output_verbose(10, orte_sensor_base_framework.framework_output,
                        "sensor:base:select: Auto-selecting components");

    /*
     * Traverse the list of available components.
     * For each call their 'query' functions to determine relative priority.
     */
    none_found = true;
    OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
        component = (orte_sensor_base_component_t *) cli->cli_component;

        /*
         * If there is a query function then use it.
         */
        if (NULL == component->base_version.mca_query_component) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. It does not implement a query function",
                                component->base_version.mca_component_name );
            continue;
        }

        /*
         * Query this component for the module and priority.  Reset the
         * outputs first so a query that leaves them untouched cannot
         * inherit the previous component's answer.
         */
        module = NULL;
        priority = 0;
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Querying component [%s]",
                            component->base_version.mca_component_name);
        component->base_version.mca_query_component(&module, &priority);

        /*
         * If no module was returned or negative priority, then skip component
         */
        if (NULL == module || priority < 0) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. Query failed to return a module",
                                component->base_version.mca_component_name );
            continue;
        }

        /* check to see if we already have someone who senses the
         * same things - if so, take the higher priority one
         */
        duplicate = false;
        for (i=0; i < tmp_array.size; i++) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if (NULL == tmp_module) {
                continue;
            }
            /* a component may leave data_measured NULL (ft_tester does);
             * such a component cannot duplicate anyone, and passing
             * NULL to strcmp is undefined behavior
             */
            if (NULL == component->data_measured ||
                NULL == tmp_module->component->data_measured) {
                continue;
            }
            if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
                if (tmp_module->priority < priority) {
                    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                        "sensor:base:select Replacing component %s with %s - both measure %s",
                                        tmp_module->component->base_version.mca_component_name,
                                        component->base_version.mca_component_name,
                                        component->data_measured);
                    OBJ_RELEASE(tmp_module);
                    opal_pointer_array_set_item(&tmp_array, i, NULL);
                    break;
                } else {
                    duplicate = true;
                }
            }
        }
        if (duplicate) {
            /* ignore this component */
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
                                component->base_version.mca_component_name,
                                component->data_measured);
            continue;
        }

        /*
         * Append them to the temporary list, we will sort later
         */
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Query of component [%s] set priority to %d",
                            component->base_version.mca_component_name, priority);
        tmp_module = OBJ_NEW(orte_sensor_active_module_t);
        tmp_module->component = component;
        tmp_module->module = (orte_sensor_base_module_t*)module;
        tmp_module->priority = priority;

        opal_pointer_array_add(&tmp_array, (void*)tmp_module);
        none_found = false;
    }

    if (none_found) {
        /* okay for no modules to be found - but do not leave the
         * temporary array constructed
         */
        OBJ_DESTRUCT(&tmp_array);
        return ORTE_SUCCESS;
    }

    /*
     * Sort the list by descending priority: on each pass move the
     * highest-priority entry still in tmp_array to the module list.
     */
    priority = 0;
    for(j = 0; j < tmp_array.size; ++j) {
        tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
        if( NULL == tmp_module_sw ) {
            continue;
        }

        low_i = -1;
        priority = tmp_module_sw->priority;

        for(i = 0; i < tmp_array.size; ++i) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if( NULL == tmp_module ) {
                continue;
            }
            if( tmp_module->priority > priority ) {
                low_i = i;
                priority = tmp_module->priority;
            }
        }

        if( low_i >= 0 ) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
            opal_pointer_array_set_item(&tmp_array, low_i, NULL);
            j--; /* Try this entry again, if it is not the lowest */
        } else {
            tmp_module = tmp_module_sw;
            opal_pointer_array_set_item(&tmp_array, j, NULL);
        }
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Add module with priority [%s] %d",
                            tmp_module->component->base_version.mca_component_name, tmp_module->priority);
        opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
    }
    OBJ_DESTRUCT(&tmp_array);

    /*
     * Initialize each of the modules in priority order from
     * highest to lowest
     */
    for(i = 0; i < orte_sensor_base.modules.size; ++i) {
        i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
        if( NULL == i_module ) {
            continue;
        }
        if( NULL != i_module->module->init ) {
            if (ORTE_SUCCESS != i_module->module->init()) {
                /* can't sample - however, if we are the HNP,
                 * then we need this module
                 * anyway so we can log incoming data
                 */
                if (ORTE_PROC_IS_HNP) {
                    i_module->sampling = false;
                } else {
                    opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
                    OBJ_RELEASE(i_module);
                }
            }
        }
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,67 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/event/event.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/sensor.h"
/*
* Global functions for MCA overall collective open and close
*/
BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
/* active sensor modules (orte_sensor_active_module_t entries added by selection) */
opal_pointer_array_t modules;
/* when true, the heartbeat sensor copies the samples buffer into each heartbeat it sends */
bool log_samples;
/* NOTE(review): presumably whether periodic sampling is running - not set in the code visible here */
bool active;
/* sampling interval; tv_sec is set from orte_sensor_base_sample_rate and tv_usec to 0 */
struct timeval rate;
/* NOTE(review): presumably the timer event that drives sampling - confirm against the base code */
opal_event_t sample_ev;
/* buffer of collected samples; the heartbeat sensor releases and recreates it after copying */
opal_buffer_t *samples;
} orte_sensor_base_t;
/* Bookkeeping object for one selected sensor module. */
typedef struct {
opal_object_t super;
/* component that supplied the module */
orte_sensor_base_component_t *component;
/* module returned by the component's query function */
orte_sensor_base_module_t *module;
/* priority reported by the query; modules are ordered by descending priority */
int priority;
/* true by default; cleared on the HNP when the module's init fails */
bool sampling;
} orte_sensor_active_module_t;
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
/* framework-global state shared by the base and the components */
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
/* NOTE(review): presumably start/stop sensing for the given job - implementations are not visible here */
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
/* signature has the (fd, flags, cbdata) shape of an event callback - TODO confirm it is driven by sample_ev */
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
/* called by the heartbeat receiver with a component name and the sample buffer it was unpacked from */
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
END_C_DECLS
#endif

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_file_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component Makefile, then runs action-if-found only when
# the shell variable orte_want_sensors equals 1 and action-if-not-found
# otherwise.
AC_DEFUN([MCA_orte_sensor_file_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/file/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,354 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "opal_stdint.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_file.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void file_sample(void);
static void file_log(opal_buffer_t *sample);
/* instantiate the module */
/* The initializer is positional; the entries are presumably the init,
 * finalize, start, stop, sample and log slots of
 * orte_sensor_base_module_t, in that order - confirm against sensor.h. */
orte_sensor_base_module_t orte_sensor_file_module = {
init,
finalize,
start,
stop,
file_sample,
file_log
};
/* define a tracking object */
/* One instance is kept on the jobs list per monitored job. */
typedef struct {
opal_list_item_t super;
/* job whose file is being watched */
orte_jobid_t jobid;
/* not read or written by the code in this file */
orte_vpid_t vpid;
/* path of the monitored file; owned by the tracker and freed by its destructor */
char *file;
/* consecutive samples in which a checked attribute did not change */
int tick;
/* which attributes are compared on each sample */
bool check_size;
bool check_access;
bool check_mod;
/* values recorded by the last sample that saw a change */
/* NOTE(review): st_size is assigned into an int32_t, so sizes above 2 GiB will not round-trip */
int32_t file_size;
time_t last_access;
time_t last_mod;
/* tick count at which the file is reported as stalled */
int limit;
} file_tracker_t;
/* Constructor for file_tracker_t: put every field into a defined
 * state.  The check flags must start out false because start() only
 * assigns them when a value is configured, and file_sample() reads
 * them unconditionally. */
static void ft_constructor(file_tracker_t *ft)
{
    ft->jobid = ORTE_JOBID_INVALID;
    ft->vpid = ORTE_VPID_INVALID;
    ft->file = NULL;
    ft->tick = 0;
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;
    ft->file_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->limit = 0;
}
/* Destructor for file_tracker_t: give back the file name string.
 * free() accepts NULL, so no guard is needed. */
static void ft_destructor(file_tracker_t *tracker)
{
    free(tracker->file);
}

/* Register the tracker class as a list item with the
 * constructor/destructor pair above. */
OBJ_CLASS_INSTANCE(file_tracker_t,
                   opal_list_item_t,
                   ft_constructor, ft_destructor);
/* local globals */
/* list of file_tracker_t objects, one per monitored job */
static opal_list_t jobs;
/* Module init: construct the (empty) tracker list; always succeeds. */
static int init(void)
{
OBJ_CONSTRUCT(&jobs, opal_list_t);
return ORTE_SUCCESS;
}
/* Module finalize: release every tracker still on the list, then
 * destruct the list itself. */
static void finalize(void)
{
    opal_list_item_t *entry;

    for (entry = opal_list_remove_first(&jobs);
         NULL != entry;
         entry = opal_list_remove_first(&jobs)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&jobs);
}
/**
 * Look up a variable in an app context's environment.
 *
 * @param app      app context whose NULL-terminated env array of
 *                 "NAME=value" strings is searched
 * @param pattern  exact variable name to look for (without the '=')
 * @param value    if non-NULL, receives a strdup'd copy of the value
 *                 which the caller must free
 * @return true if the variable was found (and, when requested, its
 *         value could be copied); false otherwise
 */
static bool find_value(orte_app_context_t *app,
                       char *pattern, char **value)
{
    size_t len;
    int i;

    /* an app without an environment simply has no matches */
    if (NULL == app || NULL == app->env || NULL == pattern) {
        return false;
    }

    len = strlen(pattern);
    for (i = 0; NULL != app->env[i]; i++) {
        /* require the name to be followed directly by '=' so that a
         * longer variable sharing this prefix is not mistaken for it,
         * and so that an entry without '=' is never dereferenced
         */
        if (0 != strncmp(app->env[i], pattern, len) ||
            '=' != app->env[i][len]) {
            continue;
        }
        if (NULL != value) {
            *value = strdup(&app->env[i][len + 1]);
            if (NULL == *value) {
                /* out of memory: report "not found" rather than
                 * hand the caller a NULL value to parse
                 */
                return false;
            }
        }
        return true;
    }
    return false;
}
/*
* Start monitoring of local processes
*/
static void start(orte_jobid_t jobid)
{
orte_job_t *jobdat;
orte_app_context_t *app, *aptr;
int i;
char *filename;
file_tracker_t *ft;
char *ptr;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s starting file monitoring for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
/* get the local jobdat for this job */
if (NULL == (jobdat = orte_get_job_data_object(jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* must be at least one app_context, so use the first one found */
app = NULL;
for (i=0; i < jobdat->apps->size; i++) {
if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) {
app = aptr;
break;
}
}
if (NULL == app) {
/* got a problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* search the environ to get the filename */
if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) {
/* was a default file given */
if (NULL == mca_sensor_file_component.file) {
/* can't do anything without a file */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:file no file for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
return;
}
filename = mca_sensor_file_component.file;
}
/* create the tracking object */
ft = OBJ_NEW(file_tracker_t);
ft->jobid = jobid;
ft->file = strdup(filename);
/* search the environ to see what we are checking */
if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_size) {
ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
}
} else {
ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_access) {
ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
}
} else {
ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_mod) {
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
}
} else {
ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) {
ft->limit = mca_sensor_file_component.limit;
} else {
ft->limit = strtol(ptr, NULL, 10);
free(ptr);
}
opal_list_append(&jobs, &ft->super);
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s file %s monitored for %s%s%s with limit %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ft->file, ft->check_size ? "SIZE:" : " ",
ft->check_access ? "ACCESS TIME:" : " ",
ft->check_mod ? "MOD TIME" : " ", ft->limit));
return;
}
/* Stop monitoring: remove and release the tracker(s) belonging to the
 * given job, or every tracker when jobid is ORTE_JOBID_WILDCARD. */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item, *next;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = next) {
        /* fetch the successor before the item can be removed and
         * released - following the links of a released item would be
         * a use-after-free
         */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
    }
    return;
}
/* room for a ctime() string (26 bytes including the newline and NUL) */
enum { FT_TIME_STRLEN = 32 };

/* Copy the ctime() rendering of *when into out.  ctime() returns a
 * pointer to a buffer shared by all calls, so two results cannot be
 * passed to one formatting call without copying each one first. */
static void ft_format_time(const time_t *when, char *out, size_t len)
{
    const char *str = ctime(when);

    snprintf(out, len, "%s", (NULL != str) ? str : "(unknown)\n");
}

/* Sample every tracked file: stat it, compare the attributes selected
 * for checking against the values recorded earlier, and count
 * consecutive samples without change in tick.  When tick reaches the
 * tracker's limit a help message is shown and the job is moved to
 * ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED. */
static void file_sample(void)
{
    struct stat buf;
    opal_list_item_t *item;
    file_tracker_t *ft;
    orte_job_t *jdata;
    char atime_str[FT_TIME_STRLEN];
    char mtime_str[FT_TIME_STRLEN];

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sampling files",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ft = (file_tracker_t*)item;

        /* stat the file and get its size */
        if (0 > stat(ft->file, &buf)) {
            /* cannot stat file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s could not stat %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ft->file));
            continue;
        }

        ft_format_time(&buf.st_atime, atime_str, sizeof(atime_str));
        ft_format_time(&buf.st_mtime, mtime_str, sizeof(mtime_str));
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s size %lu access %s\tmod %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (unsigned long)buf.st_size, atime_str, mtime_str));

        /* the first checked attribute found unchanged counts as one
         * tick and ends the comparison; a changed attribute resets
         * the count and records the new value
         */
        if (ft->check_size) {
            if (buf.st_size == ft->file_size) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->file_size = buf.st_size;
            }
        }
        if (ft->check_access) {
            if (buf.st_atime == ft->last_access) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_access = buf.st_atime;
            }
        }
        if (ft->check_mod) {
            if (buf.st_mtime == ft->last_mod) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_mod = buf.st_mtime;
            }
        }

    CHECK:
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sampled file %s tick %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ft->file, ft->tick));

        if (ft->tick == ft->limit) {
            ft_format_time(&ft->last_access, atime_str, sizeof(atime_str));
            ft_format_time(&ft->last_mod, mtime_str, sizeof(mtime_str));
            orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
                           ft->file, ft->file_size, atime_str, mtime_str);
            jdata = orte_get_job_data_object(ft->jobid);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED);
        }
    }
}
/* Log hook of the file sensor: intentionally does nothing with the
 * sample it is handed. */
static void file_log(opal_buffer_t *sample)
{
    (void)sample;
}

Просмотреть файл

@ -1,42 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef ORTE_SENSOR_FILE_H
#define ORTE_SENSOR_FILE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* File sensor component: the base sensor component plus the
 * component-level defaults registered as MCA variables. */
struct orte_sensor_file_component_t {
orte_sensor_base_component_t super;
/* not referenced by the component or module code visible here */
int sample_rate;
/* default file to monitor ("filename" variable); NULL when none was given */
char *file;
/* default attributes to check ("check_size", "check_access", "check_mod" variables) */
bool check_size;
bool check_access;
bool check_mod;
/* default number of unchanged samples before an error is declared ("limit" variable, default 3) */
int limit;
};
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
/* the component instance and the module it hands out from its query function */
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
extern orte_sensor_base_module_t orte_sensor_file_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,120 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_file.h"
/*
* Local functions
*/
static int orte_sensor_file_register (void);
static int orte_sensor_file_open(void);
static int orte_sensor_file_close(void);
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
/* Component instance: MCA version/name header with the open, close,
 * query and register hooks, the component metadata, and the
 * description of what this sensor measures.  The remaining fields
 * (file, check flags, limit) are filled in by the register function. */
orte_sensor_file_component_t mca_sensor_file_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"file", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_file_open, /* component open */
orte_sensor_file_close, /* component close */
orte_sensor_file_query, /* component query */
orte_sensor_file_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"filemods" // data being sensed
}
};
/**
* component register/open/close/init function
*/
static int orte_sensor_file_register (void)
{
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
/* lookup parameters */
mca_sensor_file_component.file = NULL;
(void) mca_base_component_var_register (c, "filename", "File to be monitored",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.file);
mca_sensor_file_component.check_size = false;
(void) mca_base_component_var_register (c, "check_size", "Check the file size",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_size);
mca_sensor_file_component.check_access = false;
(void) mca_base_component_var_register (c, "check_access", "Check access time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_access);
mca_sensor_file_component.check_mod = false;
(void) mca_base_component_var_register (c, "check_mod", "Check modification time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_mod);
mca_sensor_file_component.limit = 3;
(void) mca_base_component_var_register (c, "limit",
"Number of times the sensor can detect no motion before declaring error (default=3)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.limit);
return ORTE_SUCCESS;
}
/* Component open hook: nothing to set up, always reports success. */
static int orte_sensor_file_open(void)
{
return ORTE_SUCCESS;
}
/* Component query hook: always offers the file sensor module, at
 * priority 20. */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_file_module;
    *priority = 20;  /* higher than heartbeat */

    return ORTE_SUCCESS;
}
/**
* Component close hook: nothing to tear down, always reports success.
*/
static int orte_sensor_file_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,36 +0,0 @@
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the ft_tester sensor component.
sources = \
sensor_ft_tester.c \
sensor_ft_tester.h \
sensor_ft_tester_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_ft_tester_DSO
component_noinst =
component_install = mca_sensor_ft_tester.la
else
component_noinst = libmca_sensor_ft_tester.la
component_install =
endif
# The DSO flavor is installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_ft_tester_la_SOURCES = $(sources)
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
# The static flavor is built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_ft_tester_la_SOURCES =$(sources)
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component Makefile, then runs action-if-found only when
# the shell variable orte_want_sensors equals 1 and action-if-not-found
# otherwise.
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,41 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process Resource Utilization sensor
*/
#ifndef ORTE_SENSOR_FT_TESTER_H
#define ORTE_SENSOR_FT_TESTER_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
#include "opal/util/alfg.h"
BEGIN_C_DECLS
/* ft_tester sensor component: the base sensor component plus the
 * failure-injection settings parsed from its MCA variables. */
struct orte_sensor_ft_tester_component_t {
orte_sensor_base_component_t super;
/* probability of killing a single executable; values given above 1.0 are divided by 100 at component open */
float fail_prob;
/* probability of killing a daemon; same percent handling as fail_prob */
float daemon_fail_prob;
/* "multi_allowed" variable: allow multiple executables to be killed at one time */
bool multi_fail;
};
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
/* random number generator state, seeded with the process id in the component query function */
extern opal_rng_buff_t orte_sensor_ft_rng_buff;
END_C_DECLS
#endif

Просмотреть файл

@ -1,141 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_ft_tester.h"
/*
* Local functions
*/
static int orte_sensor_ft_tester_register (void);
static int orte_sensor_ft_tester_open(void);
static int orte_sensor_ft_tester_close(void);
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
/* Component instance: MCA version/name header with the open, close,
 * query and register hooks, followed by the component metadata.
 * NOTE(review): the trailing NULL presumably fills the base
 * component's data_measured string (the file sensor passes "filemods"
 * in the same position) - confirm against sensor.h. */
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"ft_tester", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_ft_tester_open, /* component open */
orte_sensor_ft_tester_close, /* component close */
orte_sensor_ft_tester_query, /* component query */
orte_sensor_ft_tester_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
NULL
}
};
/* string storage for the two probability variables; converted to floats at component open */
static char *daemon_fail_prob = NULL;
static char *fail_prob = NULL;
/* random number generator state, seeded in the query function */
opal_rng_buff_t orte_sensor_ft_rng_buff;
/**
* component register/open/close/init function
*/
static int orte_sensor_ft_tester_register (void)
{
mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
fail_prob = NULL;
(void) mca_base_component_var_register (c, "fail_prob", "Probability of killing a single executable",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&fail_prob);
mca_sensor_ft_tester_component.multi_fail = false;
(void) mca_base_component_var_register (c, "multi_allowed", "Allow multiple executables to be killed at one time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_sensor_ft_tester_component.multi_fail);
daemon_fail_prob = NULL;
(void) mca_base_component_var_register (c, "daemon_fail_prob", "Probability of killing a daemon",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&daemon_fail_prob);
return ORTE_SUCCESS;
}
/* Convert a probability string to a float; a value above 1.0 is
 * taken to be a percentage and scaled down by 100. */
static float ft_tester_parse_prob(const char *text)
{
    float prob = strtof(text, NULL);

    if (1.0 < prob) {
        /* given in percent */
        prob /= 100.0;
    }
    return prob;
}

/* Component open hook: turn the registered probability strings into
 * the component's float settings; an unset string means 0.0. */
static int orte_sensor_ft_tester_open(void)
{
    /* lookup parameters */
    mca_sensor_ft_tester_component.fail_prob =
        (NULL != fail_prob) ? ft_tester_parse_prob(fail_prob) : 0.0;

    mca_sensor_ft_tester_component.daemon_fail_prob =
        (NULL != daemon_fail_prob) ? ft_tester_parse_prob(daemon_fail_prob) : 0.0;

    return ORTE_SUCCESS;
}
/* Component query hook: offer the ft_tester module (at priority 1)
 * only when at least one failure probability is positive; otherwise
 * return no module and ORTE_ERROR. */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    /* nothing to inject - decline */
    if (!(0.0 < mca_sensor_ft_tester_component.fail_prob) &&
        !(0.0 < mca_sensor_ft_tester_component.daemon_fail_prob)) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1;  /* at the bottom */
    *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;
    /* seed the RNG --- Not sure if we should assume all procs use
     * the same seed?
     */
    opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid());
    return ORTE_SUCCESS;
}
/**
* Component close hook: nothing to tear down, always reports success.
*/
static int orte_sensor_ft_tester_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text file distributed with the component.
dist_ompidata_DATA = help-orte-sensor-heartbeat.txt
# Files that make up the heartbeat sensor component.
sources = \
sensor_heartbeat.c \
sensor_heartbeat.h \
sensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_heartbeat_DSO
component_noinst =
component_install = mca_sensor_heartbeat.la
else
component_noinst = libmca_sensor_heartbeat.la
component_install =
endif
# The DSO flavor is installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
# The static flavor is built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES =$(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component Makefile, then runs action-if-found only when
# the shell variable orte_want_sensors equals 1 and action-if-not-found
# otherwise.
AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,279 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/event/event.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_heartbeat.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void sample(void);
/* instantiate the module */
/* The initializer is positional; the two NULL entries sit where the
 * file sensor module places its stop and log functions, so the
 * heartbeat sensor presumably provides neither - confirm against
 * sensor.h. */
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
init,
finalize,
start,
NULL,
sample,
NULL
};
/* declare the local functions */
static void check_heartbeat(int fd, short event, void *arg);
static void recv_beats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata);
/* local globals */
/* job object of this process's own job; only set on the HNP, NULL elsewhere */
static orte_job_t *daemons=NULL;
/* timer event that runs check_heartbeat */
static opal_event_t check_ev;
/* true while the check timer is armed */
static bool check_active = false;
/* check interval: three times the framework sample rate */
static struct timeval check_time;
/* Module init: on the HNP or an aggregator, post a persistent
 * receive for heartbeat messages; on the HNP additionally remember
 * the job object of our own job.  Always succeeds. */
static int init(void)
{
    bool is_hnp = ORTE_PROC_IS_HNP;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s initializing heartbeat recvs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup to receive heartbeats */
    if (is_hnp || ORTE_PROC_IS_AGGREGATOR) {
        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                ORTE_RML_TAG_HEARTBEAT,
                                ORTE_RML_PERSISTENT,
                                recv_beats, NULL);
    }

    /* only the HNP tracks the daemons */
    if (is_hnp) {
        daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    }

    return ORTE_SUCCESS;
}
/* Module finalize: cancel the heartbeat receive and, if the check
 * timer is armed, remove it. */
static void finalize(void)
{
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);

    if (!check_active) {
        return;
    }
    opal_event_del(&check_ev);
    check_active = false;
}
/* Module start: arm the heartbeat check timer once, and only where a
 * daemon job object is known.  The job argument is not used. */
static void start(orte_jobid_t job)
{
    if (check_active || NULL == daemons) {
        return;
    }

    /* setup the check event: fire every three sample periods */
    check_time.tv_usec = 0;
    check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec;
    opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev);
    opal_event_evtimer_add(&check_ev, &check_time);
    check_active = true;
}
/* Module sample: send one heartbeat message to our daemon (when
 * ORTE_PROC_IS_CM) or to the HNP.  When sample logging is enabled
 * the framework's sample buffer is copied into the heartbeat and then
 * replaced by a fresh buffer. */
static void sample(void)
{
    orte_process_name_t *target;
    opal_buffer_t *beat;
    int rc;

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* CM procs send to their daemon, everyone else to the HNP */
    target = ORTE_PROC_IS_CM ? ORTE_PROC_MY_DAEMON : ORTE_PROC_MY_HNP;

    /* if my target hasn't been defined yet, ignore - nobody listening yet */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "%s sensor:heartbeat: HNP is not defined",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sending heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we want sampled data included, point to the bucket */
    beat = OBJ_NEW(opal_buffer_t);
    if (orte_sensor_base.log_samples) {
        opal_dss.copy_payload(beat, orte_sensor_base.samples);
        OBJ_RELEASE(orte_sensor_base.samples);
        /* start a new sample bucket */
        orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    }

    /* send heartbeat; the buffer is released here only if the send
     * could not be posted
     */
    rc = orte_rml.send_buffer_nb(target, beat,
                                 ORTE_RML_TAG_HEARTBEAT,
                                 orte_rml_send_callback, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(beat);
    }
}
/* this function automatically gets periodically called
* by the event library so we can check on the state
* of the various orteds
*/
static void check_heartbeat(int fd, short dummy, void *arg)
{
int v;
orte_proc_t *proc;
opal_event_t *tmp = (opal_event_t*)arg;
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s IGNORING CHECK abnorm_term %s fin %s init %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_abnormal_term_ordered ? "TRUE" : "FALSE",
orte_finalizing ? "TRUE" : "FALSE",
orte_initialized ? "TRUE" : "FALSE"));
check_active = false;
return;
}
for (v=0; v < daemons->procs->size; v++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
continue;
}
/* ignore myself */
if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) {
continue;
}
if (ORTE_PROC_STATE_RUNNING != proc->state) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:heartbeat DAEMON %s IS NOT RUNNING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
continue;
}
if (0 == proc->beat) {
/* no heartbeat recvd in last window */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat FAILED for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), proc->beat));
}
/* reset for next period */
proc->beat = 0;
}
/* reset the timer */
opal_event_evtimer_add(tmp, &check_time);
}
/*
 * RML receive callback for heartbeat messages.
 *
 * Bumps the beat counter of the sending daemon (and returns it to the
 * RUNNING state if it had been marked HEARTBEAT_FAILED), then unpacks any
 * sampled-data buffers piggy-backed on the message and hands each one to
 * orte_sensor_base_log() under the component name stored at its front.
 *
 * status - delivery status supplied by the RML (not examined here)
 * sender - name of the daemon that sent the beat
 * buffer - message payload: zero or more packed OPAL_BUFFER entries
 * tag    - RML tag the message arrived on (not examined here)
 * cbdata - registration cookie (not examined here)
 */
static void recv_beats(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata)
{
    orte_proc_t *proc;
    int rc, n;
    char *component=NULL;
    opal_buffer_t *buf=NULL;

    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "%s received beat from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender));

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* get this daemon's object */
    if (NULL != daemons) {
        if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s marked beat from %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(sender)));
            proc->beat++;
            /* if this daemon has reappeared, reset things */
            if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) {
                proc->state = ORTE_PROC_STATE_RUNNING;
            }
        }
    }

    /* unload any sampled data */
    n=1;
    while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) {
        if (NULL != buf) {
            n=1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                /* the unpacked sample buffer is ours - do not leak it, and
                 * leave now so the same error is not reported twice */
                OBJ_RELEASE(buf);
                return;
            }
            orte_sensor_base_log(component, buf);
            OBJ_RELEASE(buf);
            free(component);
            component = NULL;
        }
        n=1;
    }
    /* running off the end of the message is the normal way out of the loop */
    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

Просмотреть файл

@ -1,32 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef ORTE_SENSOR_HEARTBEAT_H
#define ORTE_SENSOR_HEARTBEAT_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* MCA component descriptor for the heartbeat sensor */
ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component;
/* module (function table) of the heartbeat sensor */
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
END_C_DECLS
#endif /* ORTE_SENSOR_HEARTBEAT_H */

Просмотреть файл

@ -1,75 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_heartbeat.h"
/*
* Local functions
*/
static int orte_sensor_heartbeat_open(void);
static int orte_sensor_heartbeat_close(void);
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
/*
 * Heartbeat sensor component descriptor. The initializer is positional:
 * first the MCA base component (version macro, name, ORTE version numbers,
 * open/close/query entry points), then the component metadata, then the
 * string naming what this sensor measures.
 */
orte_sensor_base_component_t mca_sensor_heartbeat_component = {
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"heartbeat", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_heartbeat_open, /* component open */
orte_sensor_heartbeat_close, /* component close */
orte_sensor_heartbeat_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"heartbeat" /* name of the data this sensor measures */
};
/**
 * Component open hook. The heartbeat component has nothing to set up,
 * so this simply reports success.
 *
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_heartbeat_open(void)
{
    /* no component-level state to prepare */
    return ORTE_SUCCESS;
}
/**
 * Component query hook: hand back the heartbeat module together with
 * its selection priority.
 *
 * @param module   [out] receives the address of the heartbeat module
 * @param priority [out] receives the component priority (5)
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;
    /* stay below every other sampler so that their data gets included
     * in the heartbeat */
    *priority = 5;
    return ORTE_SUCCESS;
}
/**
 * Component close hook. Nothing was acquired in open, so there is
 * nothing to tear down.
 *
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_heartbeat_close(void)
{
    /* no component-level state to release */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help-message file for this component
dist_ompidata_DATA = help-orte-sensor-resusage.txt
# Source files of the resusage sensor component; the same list feeds both
# the DSO and the static library targets below
sources = \
sensor_resusage.c \
sensor_resusage.h \
sensor_resusage_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_resusage_DSO
component_noinst =
component_install = mca_sensor_resusage.la
else
component_noinst = libmca_sensor_resusage.la
component_install =
endif
# DSO flavor: listed under the mcacomponent directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_resusage_la_SOURCES = $(sources)
mca_sensor_resusage_la_LDFLAGS = -module -avoid-version
# static flavor: built as a noinst library
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_resusage_la_SOURCES =$(sources)
libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_resusage_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component Makefile and runs action-if-found only when
# $orte_want_sensors is "1"; otherwise runs action-if-not-found.
AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,21 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

Просмотреть файл

@ -1,478 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/db/db.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/orted/orted.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_resusage.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void sample(void);
static void res_log(opal_buffer_t *sample);
/* instantiate the module - entries are positional, in the order
 * init, finalize, start, stop, sample, log */
orte_sensor_base_module_t orte_sensor_resusage_module = {
init,
finalize,
NULL, /* start: not provided by this module */
NULL, /* stop: not provided by this module */
sample,
res_log /* log */
};
/* cleared by res_log() the first time a database add_log call fails;
 * once false, res_log() returns immediately */
static bool log_enabled = true;
/* node object whose stats ring buffer receives our node-level samples */
static orte_node_t *my_node;
/* proc object whose stats ring buffer receives this process's samples */
static orte_proc_t *my_proc;
static int init(void)
{
orte_job_t *jdata;
/* ensure my_proc and my_node are available on the global arrays */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
my_proc = OBJ_NEW(orte_proc_t);
my_node = OBJ_NEW(orte_node_t);
} else {
if (NULL == (my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (NULL == (my_node = my_proc->node)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* protect the objects */
OBJ_RETAIN(my_proc);
OBJ_RETAIN(my_node);
}
return ORTE_SUCCESS;
}
/*
 * Module finalize: drop the references taken (or the objects created) in
 * init(). The pointers are cleared afterwards so that a repeated call
 * cannot release the same reference twice.
 */
static void finalize(void)
{
    if (NULL != my_proc) {
        OBJ_RELEASE(my_proc);
        my_proc = NULL;
    }
    if (NULL != my_node) {
        OBJ_RELEASE(my_node);
        my_node = NULL;
    }
    return;
}
/*
 * Take one resource-usage sample.
 *
 * 1. Query process/node statistics for ourselves and process statistics for
 *    every live local child, push them onto the owning object's ring buffer
 *    and pack them into a local buffer prefixed with the component name
 *    "resusage" and our node name.
 * 2. Append that buffer to orte_sensor_base.samples for transmission.
 * 3. If a node memory limit is configured and exceeded, report the child
 *    with the largest vsize (or abort if there is none) and stop.
 * 4. Otherwise, if a per-process limit is configured, report every child
 *    whose vsize reaches it.
 *
 * Errors are logged and end the sample early; nothing is returned.
 */
static void sample(void)
{
    opal_pstats_t *stats, *st;
    opal_node_stats_t *nstats, *nst;
    int rc, i;
    orte_proc_t *child, *hog=NULL;
    float in_use, max_mem;
    opal_buffer_t buf, *bptr;
    char *comp;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "sample:resusage sampling resource usage"));

    /* setup a buffer for our stats */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* pack our name - the temporary copy is released whether or not
     * the pack succeeded */
    comp = strdup("resusage");
    rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING);
    free(comp);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* update stats on ourself and the node */
    stats = OBJ_NEW(opal_pstats_t);
    nstats = OBJ_NEW(opal_node_stats_t);
    if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
        ORTE_ERROR_LOG(rc);
        /* both objects came from OBJ_NEW, so both must be released */
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* the stats framework can't know nodename or rank */
    strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN - 1);
    stats->node[OPAL_PSTAT_MAX_STRING_LEN - 1] = '\0';
    stats->rank = ORTE_PROC_MY_NAME->vpid;

    /* locally save the stats */
    if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
        OBJ_RELEASE(st);
    }
    if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
        /* release the popped value */
        OBJ_RELEASE(nst);
    }

    /* pack them */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* loop through our children and update their stats */
    if (NULL != orte_local_children) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            stats = OBJ_NEW(opal_pstats_t);
            if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
                /* may hit a race condition where the process has
                 * terminated, so just ignore any error
                 */
                OBJ_RELEASE(stats);
                continue;
            }
            /* the stats framework can't know nodename or rank */
            strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN - 1);
            stats->node[OPAL_PSTAT_MAX_STRING_LEN - 1] = '\0';
            stats->rank = child->name.vpid;
            /* store it */
            if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
                OBJ_RELEASE(st);
            }
            /* pack them */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&buf);
                return;
            }
        }
    }

    /* xfer any data for transmission */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
    }
    OBJ_DESTRUCT(&buf);

    /* are there any issues with node-level usage? */
    nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
    if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING NODE MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* compute the fraction of node memory in-use */
        in_use = 1.0 - (nst->free_mem / nst->total_mem);
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s PERCENT USED: %f LIMIT: %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             in_use, mca_sensor_resusage_component.node_memory_limit));
        if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
            /* loop through our children and find the biggest hog; the
             * children array may not exist, in which case there is no hog */
            hog = NULL;
            max_mem = 0.0;
            for (i=0; NULL != orte_local_children && i < orte_local_children->size; i++) {
                if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (!child->alive) {
                    continue;
                }
                if (0 == child->pid) {
                    /* race condition */
                    continue;
                }
                if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                     "%s PROC %s AT VSIZE %f",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name), st->vsize));
                if (max_mem < st->vsize) {
                    hog = child;
                    max_mem = st->vsize;
                }
            }
            if (NULL == hog) {
                /* if all children dead and we are still too big,
                 * then we must be the culprit - abort
                 */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s NO CHILD: COMMITTING SUICIDE",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL);
            } else {
                /* report the problem */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&hog->name)));
                ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
            /* since we have ordered someone to die, we've done enough for this
             * time around - don't check proc limits as well
             */
            return;
        }
    }

    /* check proc limits */
    if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING PROC MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* check my children first */
        for (i=0; NULL != orte_local_children && i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                 "%s PROC %s AT VSIZE %f",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name), st->vsize));
            if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
                /* report the problem */
                ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
        }
    }
}
/*
 * Log one packed resusage sample to the database.
 *
 * sample - buffer laid out as: node name (string), node stats, then zero
 *          or more process stats entries, i.e. what sample() packs after
 *          the component name.
 *
 * Node stats are written to the "nodestats" log when log_node_stats is
 * set; process stats are written to "procstats" when log_process_stats is
 * set. The first add_log failure clears log_enabled, which turns every
 * later call into a no-op. Unpack errors are logged and end the call.
 */
static void res_log(opal_buffer_t *sample)
{
    /* number of node-stat values filled in vs. handed to the database */
    enum { NODE_KV_FILLED = 13, NODE_KV_LOGGED = 12, PROC_KV = 14 };
    opal_pstats_t *st=NULL;
    opal_node_stats_t *nst=NULL;
    int rc, n, i;
    opal_value_t kv[14];
    char *node=NULL;

    if (!log_enabled) {
        return;
    }

    /* unpack the node name */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    /* unpack the node stats */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        free(node);
        return;
    }

    if (mca_sensor_resusage_component.log_node_stats) {
        /* convert this into an array of opal_value_t's - no clean way
         * to do this, so have to just manually map each field
         */
        for (i=0; i < NODE_KV_FILLED; i++) {
            OBJ_CONSTRUCT(&kv[i], opal_value_t);
        }
        i=0;
        kv[i].key = strdup("ctime");
        kv[i].type = OPAL_TIMEVAL;
        kv[i].data.tv.tv_sec = nst->sample_time.tv_sec;
        kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec;
        /* every key is heap-allocated like its neighbours, so a destructor
         * that frees the key never sees a string literal */
        kv[i].key = strdup("hostname");
        kv[i].type = OPAL_STRING;
        kv[i++].data.string = strdup(node);
        kv[i].key = strdup("total_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->total_mem;
        kv[i].key = strdup("free_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->free_mem;
        kv[i].key = strdup("buffers");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->buffers;
        kv[i].key = strdup("cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->cached;
        kv[i].key = strdup("swap_total");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_total;
        kv[i].key = strdup("swap_free");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_free;
        kv[i].key = strdup("mapped");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->mapped;
        kv[i].key = strdup("swap_cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_cached;
        kv[i].key = strdup("la");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la;
        kv[i].key = strdup("la5");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la5;
        kv[i].key = strdup("la15");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la15;
        /* store it.
         * NOTE(review): only the first 12 values are logged, so "la15" is
         * filled in but never stored - confirm whether that is intended
         * before raising the count. */
        if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, NODE_KV_LOGGED))) {
            /* don't bark about it - just quietly disable the log */
            log_enabled = false;
        }
        /* tear down everything that was constructed, not just what was logged */
        for (i=0; i < NODE_KV_FILLED; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
    }
    OBJ_RELEASE(nst);
    /* the unpacked node name is ours to free */
    free(node);

    if (mca_sensor_resusage_component.log_process_stats) {
        /* unpack all process stats */
        n=1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) {
            for (i=0; i < PROC_KV; i++) {
                OBJ_CONSTRUCT(&kv[i], opal_value_t);
            }
            kv[0].key = strdup("node");
            kv[0].type = OPAL_STRING;
            kv[0].data.string = strdup(st->node);
            kv[1].key = strdup("rank");
            kv[1].type = OPAL_INT32;
            kv[1].data.int32 = st->rank;
            kv[2].key = strdup("pid");
            kv[2].type = OPAL_PID;
            kv[2].data.pid = st->pid;
            kv[3].key = strdup("cmd");
            kv[3].type = OPAL_STRING;
            kv[3].data.string = strdup(st->cmd);
            kv[4].key = strdup("state");
            kv[4].type = OPAL_STRING;
            /* two state characters plus the terminator */
            kv[4].data.string = (char*)malloc(3 * sizeof(char));
            kv[4].data.string[0] = st->state[0];
            kv[4].data.string[1] = st->state[1];
            kv[4].data.string[2] = '\0';
            kv[5].key = strdup("time");
            kv[5].type = OPAL_TIMEVAL;
            kv[5].data.tv.tv_sec = st->time.tv_sec;
            kv[5].data.tv.tv_usec = st->time.tv_usec;
            kv[6].key = strdup("percent_cpu");
            kv[6].type = OPAL_FLOAT;
            kv[6].data.fval = st->percent_cpu;
            kv[7].key = strdup("priority");
            kv[7].type = OPAL_INT32;
            kv[7].data.int32 = st->priority;
            kv[8].key = strdup("num_threads");
            kv[8].type = OPAL_INT16;
            kv[8].data.int16 = st->num_threads;
            kv[9].key = strdup("vsize");
            kv[9].type = OPAL_FLOAT;
            kv[9].data.fval = st->vsize;
            kv[10].key = strdup("rss");
            kv[10].type = OPAL_FLOAT;
            kv[10].data.fval = st->rss;
            kv[11].key = strdup("peak_vsize");
            kv[11].type = OPAL_FLOAT;
            kv[11].data.fval = st->peak_vsize;
            kv[12].key = strdup("processor");
            kv[12].type = OPAL_INT16;
            kv[12].data.int16 = st->processor;
            kv[13].key = strdup("sample_time");
            kv[13].type = OPAL_TIMEVAL;
            kv[13].data.tv.tv_sec = st->sample_time.tv_sec;
            kv[13].data.tv.tv_usec = st->sample_time.tv_usec;
            /* store it */
            if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, PROC_KV))) {
                log_enabled = false;
            }
            for (i=0; i < PROC_KV; i++) {
                OBJ_DESTRUCT(&kv[i]);
            }
            OBJ_RELEASE(st);
            n=1;
        }
        /* running off the end of the sample is the normal way out */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }
}

Просмотреть файл

@ -1,41 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process Resource Utilization sensor
*/
#ifndef ORTE_SENSOR_RESUSAGE_H
#define ORTE_SENSOR_RESUSAGE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
 * Resusage component: the generic sensor component followed by this
 * component's configuration values.
 */
struct orte_sensor_resusage_component_t {
/* generic sensor component; must stay the first member */
orte_sensor_base_component_t super;
/* sample rate in seconds */
int sample_rate;
/* fraction (0..1) of node memory that may be in use; 0 disables the check */
float node_memory_limit;
/* max virtual memory size per process, in MBytes; 0 disables the check */
float proc_memory_limit;
/* log node-level stats to the database */
bool log_node_stats;
/* log per-process stats to the database */
bool log_process_stats;
};
typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t;
/* the single component instance */
ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component;
/* module (function table) of the resusage sensor */
extern orte_sensor_base_module_t orte_sensor_resusage_module;
END_C_DECLS
#endif /* ORTE_SENSOR_RESUSAGE_H */

Просмотреть файл

@ -1,138 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_resusage.h"
/*
* Local functions
*/
static int orte_sensor_resusage_register (void);
static int orte_sensor_resusage_open(void);
static int orte_sensor_resusage_close(void);
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority);
/*
 * Resusage sensor component instance. Only the embedded generic sensor
 * component is initialized here (positionally); the remaining members are
 * set by the register function.
 */
orte_sensor_resusage_component_t mca_sensor_resusage_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"resusage", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_resusage_open, /* component open */
orte_sensor_resusage_close, /* component close */
orte_sensor_resusage_query, /* component query */
orte_sensor_resusage_register /* component register */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"procresource,noderesource" /* names of the data this sensor measures */
}
};
/* integer storage for the MCA variables; converted to the float members of
 * mca_sensor_resusage_component in the register and open functions */
static int node_memory_limit; /* percentage of total node memory */
static int proc_memory_limit; /* MBytes of virtual memory per process */
/**
 * Component register function: declare the component's MCA variables and
 * seed the component structure with their values.
 *
 * Variables registered: sample_rate, node_memory_limit (percent),
 * proc_memory_limit (MBytes), log_node_stats, log_process_stats.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if sample_rate is negative
 */
static int orte_sensor_resusage_register (void)
{
    mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version;

    mca_sensor_resusage_component.sample_rate = 0;
    (void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.sample_rate);
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is the default and is accepted - only negatives are rejected */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_BAD_PARAM;
    }

    node_memory_limit = 0;
    (void) mca_base_component_var_register (c, "node_memory_limit",
                                            "Percentage of total memory that can be in-use",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &node_memory_limit);
    /* the component holds the limit as a fraction, not a percentage */
    mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0;

    proc_memory_limit = 0;
    (void) mca_base_component_var_register (c, "proc_memory_limit",
                                            "Max virtual memory size in MBytes",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &proc_memory_limit);
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    mca_sensor_resusage_component.log_node_stats = false;
    (void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_node_stats);

    mca_sensor_resusage_component.log_process_stats = false;
    (void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_process_stats);

    return ORTE_SUCCESS;
}
/**
 * Component open function: re-validate the sample rate and refresh the
 * float limits in the component structure from the integer MCA variable
 * storage.
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL if sample_rate is negative
 */
static int orte_sensor_resusage_open(void)
{
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is accepted - only negatives are rejected */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_FATAL;
    }

    /* percentage -> fraction */
    mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0;
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    return ORTE_SUCCESS;
}
/**
 * Component query function: hand back the resusage module together with
 * its selection priority.
 *
 * @param module   [out] receives the address of the resusage module
 * @param priority [out] receives the component priority (100)
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_resusage_module;
    /* must rank ahead of the heartbeat component */
    *priority = 100;
    return ORTE_SUCCESS;
}
/**
 * Component close function. The component holds no resources of its
 * own, so there is nothing to tear down.
 *
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_resusage_close(void)
{
    /* nothing to release */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,107 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_SENSOR_H
#define MCA_SENSOR_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* start collecting data for the given job */
typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job);
/* stop collecting data for the given job */
typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job);
/* API module */
/*
 * Ver 1.0 - the public sensor interface: just the start/stop pair
 */
struct orte_sensor_base_API_module_1_0_0_t {
orte_sensor_API_module_start_fn_t start;
orte_sensor_API_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
/* unversioned alias for the current API module type */
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
/* initialize the module; returns an ORTE status code */
typedef int (*orte_sensor_base_module_init_fn_t)(void);
/* finalize the module */
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
/* tell the module to sample its sensor */
typedef void (*orte_sensor_base_module_sample_fn_t)(void);
/* pass a buffer to the module for logging */
typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample);
/*
 * Component modules Ver 1.0
 *
 * Function table each sensor component supplies. Module initializers in
 * the components are positional, so the member order below is part of the
 * contract. The resusage module sets start and stop to NULL.
 */
struct orte_sensor_base_module_1_0_0_t {
orte_sensor_base_module_init_fn_t init;
orte_sensor_base_module_finalize_fn_t finalize;
orte_sensor_API_module_start_fn_t start;
orte_sensor_API_module_stop_fn_t stop;
orte_sensor_base_module_sample_fn_t sample;
orte_sensor_base_module_log_fn_t log;
};
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
/* unversioned alias for the current module type */
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
/*
 * the standard component data structure
 */
struct orte_sensor_base_component_1_0_0_t {
/* MCA base component: version info, name, open/close/query hooks */
mca_base_component_t base_version;
/* MCA component metadata */
mca_base_component_data_t base_data;
/* name(s) of the data this sensor measures, e.g. "heartbeat" or
 * "procresource,noderesource" */
char *data_measured;
};
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
/* unversioned alias for the current component type */
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
/*
 * Macro for use in components that are of type sensor v1.0.0
 *
 * Expands to the leading entries of a component's base_version
 * initializer: the MCA base version followed by the framework name
 * "sensor" and its version numbers 1, 0, 0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
/* sensor v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* sensor v1.0 */ \
"sensor", 1, 0, 0
/* Global structure for accessing sensor functions
*/
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
END_C_DECLS
#endif /* MCA_SENSOR_H */

Просмотреть файл

@ -1,51 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_SENSOR_TYPES_H
#define ORTE_MCA_SENSOR_TYPES_H
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/dss/dss_types.h"
/*
* General SENSOR types - instanced in runtime/orte_globals.c
*/
BEGIN_C_DECLS
/* scale selectors for sensor data; presumably name how a reading is
 * scaled (linear, logarithmic, sigmoid) - TODO confirm against users */
enum {
ORTE_SENSOR_SCALE_LINEAR,
ORTE_SENSOR_SCALE_LOG,
ORTE_SENSOR_SCALE_SIGMOID
};
/*
 * Structure for passing data from sensors
 */
typedef struct {
/* OPAL object base; must stay the first member */
opal_object_t super;
/* presumably the name of the sensor that produced the data - TODO confirm */
char *sensor;
/* presumably the time the reading was taken - TODO confirm */
struct timeval timestamp;
/* the reading itself, as an opaque byte object */
opal_byte_object_t data;
} orte_sensor_data_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
END_C_DECLS
#endif

Просмотреть файл

@ -2,6 +2,7 @@
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -85,27 +86,19 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
ORTE_JOBID_PRINT(shadow->jobid), \
orte_job_state_to_str((s)), \
__FILE__, __LINE__); \
/* sanity check */ \
if ((s) < 0) { \
assert(0); \
} \
orte_state.activate_job_state(shadow, (s)); \
} while(0);
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
do { \
orte_process_name_t *shadow=(p); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_NAME_PRINT(shadow), \
orte_proc_state_to_str((s)), \
__FILE__, __LINE__); \
/* sanity check */ \
if ((s) < 0) { \
assert(0); \
} \
orte_state.activate_proc_state(shadow, (s)); \
} while(0);

Просмотреть файл

@ -1,13 +1,13 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/

Просмотреть файл

@ -102,7 +102,9 @@ static opal_pmix_server_module_t pmix_server = {
.notify_event = pmix_server_notify_event,
.query = pmix_server_query_fn,
.tool_connected = pmix_tool_connected_fn,
.log = pmix_server_log_fn
.log = pmix_server_log_fn,
.allocate = pmix_server_alloc_fn,
.job_control = pmix_server_job_ctrl_fn
};
void pmix_server_register_params(void)
@ -265,6 +267,12 @@ int pmix_server_init(void)
kv->type = OPAL_BOOL;
kv->data.flag = true;
opal_list_append(&info, &kv->super);
/* tell the server to use its own internal monitoring */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_ENABLE_MONITORING);
kv->type = OPAL_BOOL;
kv->data.flag = true;
opal_list_append(&info, &kv->super);
/* setup the local server */
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {

Просмотреть файл

@ -511,3 +511,13 @@ int pmix_server_disconnect_fn(opal_list_t *procs, opal_list_t *info,
return rc;
}
/*
 * Server upcall for allocation requests.
 *
 * None of the arguments are examined and the callback is never invoked:
 * the request is always refused.
 *
 * Returns ORTE_ERR_NOT_SUPPORTED.
 */
int pmix_server_alloc_fn(const opal_process_name_t *requestor,
                         opal_pmix_alloc_directive_t dir,
                         opal_list_t *info,
                         opal_pmix_info_cbfunc_t cbfunc,
                         void *cbdata)
{
    /* arguments are intentionally ignored */
    (void)requestor;
    (void)dir;
    (void)info;
    (void)cbfunc;
    (void)cbdata;

    /* ORTE currently has no way of supporting allocation requests */
    return ORTE_ERR_NOT_SUPPORTED;
}

Просмотреть файл

@ -40,10 +40,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "pmix_server_internal.h"
@ -611,7 +613,15 @@ static void _query(int sd, short args, void *cbdata)
* and ask directly for the info - if rank=wildcard, then
* we need to xcast the request and collect the results */
}
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_TIME_REMAINING)) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_TIME_REMAINING);
kv->type = OPAL_UINT32;
if (ORTE_SUCCESS != orte_schizo.get_remaining_time(&kv->data.uint32)) {
OBJ_RELEASE(kv);
} else {
opal_list_append(results, &kv->super);
}
}
}
}
@ -813,3 +823,62 @@ void pmix_server_log_fn(opal_process_name_t *requestor,
cbfunc(OPAL_SUCCESS, cbdata);
}
}
/*
 * Server upcall for job control requests.
 *
 * requestor - process that issued the request (used for logging only)
 * targets   - list of opal_namelist_t naming the procs to act on; NULL is
 *             forwarded to the PLM as a NULL proc array
 * info      - list of opal_value_t directives
 * cbfunc, cbdata - completion callback and its cookie; not used in this
 *             function body
 *
 * Only the OPAL_PMIX_JOB_CTRL_KILL directive is acted upon: the targets
 * are converted to an array of proc objects and passed to
 * orte_plm.terminate_procs(). Directives without a key, targets with no
 * proc object, and a failing terminate call are logged and skipped.
 *
 * Always returns ORTE_SUCCESS.
 */
int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
                            opal_list_t *targets,
                            opal_list_t *info,
                            opal_pmix_info_cbfunc_t cbfunc,
                            void *cbdata)
{
    opal_pointer_array_t procs, *victims;
    opal_namelist_t *target;
    opal_value_t *directive;
    orte_proc_t *pobj;
    int ret, idx;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s job control request from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(requestor));

    OPAL_LIST_FOREACH(directive, info, opal_value_t) {
        if (NULL == directive->key) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            continue;
        }
        if (0 != strcmp(directive->key, OPAL_PMIX_JOB_CTRL_KILL)) {
            /* no other directive is handled here */
            continue;
        }

        /* turn the target list into the pointer array the PLM expects */
        victims = NULL;
        if (NULL != targets) {
            OBJ_CONSTRUCT(&procs, opal_pointer_array_t);
            OPAL_LIST_FOREACH(target, targets, opal_namelist_t) {
                pobj = orte_get_proc_object(&target->name);
                if (NULL == pobj) {
                    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                    continue;
                }
                /* hold a reference for as long as it sits in the array */
                OBJ_RETAIN(pobj);
                opal_pointer_array_add(&procs, pobj);
            }
            victims = &procs;
        }

        ret = orte_plm.terminate_procs(victims);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
        }

        if (NULL != victims) {
            /* give back the references and tear the array down */
            for (idx = 0; idx < procs.size; ++idx) {
                pobj = (orte_proc_t*)opal_pointer_array_get_item(&procs, idx);
                if (NULL != pobj) {
                    OBJ_RELEASE(pobj);
                }
            }
            OBJ_DESTRUCT(&procs);
        }
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -206,6 +206,18 @@ extern void pmix_server_log_fn(opal_process_name_t *requestor,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
/* Allocation request handler.
 * NOTE(review): the definition is not visible from this declaration;
 * presumably `dir` selects the allocation operation, `info` carries the
 * accompanying opal_value_t directives and the outcome is delivered
 * through cbfunc/cbdata - confirm against the implementation. */
extern int pmix_server_alloc_fn(const opal_process_name_t *requestor,
opal_pmix_alloc_directive_t dir,
opal_list_t *info,
opal_pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* Job control request handler: scans the opal_value_t entries in `info`
 * and, for each OPAL_PMIX_JOB_CTRL_KILL directive, asks the PLM to
 * terminate the procs named in `targets` (a list of opal_namelist_t;
 * a NULL list is passed on as a NULL array). Returns ORTE_SUCCESS. */
extern int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
opal_list_t *targets,
opal_list_t *info,
opal_pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* declare the RML recv functions for responses */
extern void pmix_server_launch_resp(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,

Просмотреть файл

@ -705,7 +705,7 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
char *ndnames, *rmndr, **tmp;
opal_list_t dids, slts, flgs;;
opal_buffer_t *bptr=NULL;
orte_topology_t *t;
orte_topology_t *t2;
orte_regex_range_t *rng, *drng, *srng, *frng;
uint8_t ui8;
@ -978,14 +978,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
/* if no topology info was passed, then everyone shares our topology */
if (NULL == bptr) {
orte_topology_t *t;
/* our topology is first in the array */
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
for (n=0; n < orte_node_pool->size; n++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
if (NULL == node->topology) {
OBJ_RETAIN(t);
node->topology = t;
OBJ_RETAIN(t2);
node->topology = t2;
}
}
}
@ -1004,6 +1003,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
OBJ_RELEASE(bptr);
goto cleanup;
}
if (NULL == sig) {
rc = ORTE_ERR_BAD_PARAM;
ORTE_ERROR_LOG(rc);
opal_argv_free(tmp);
OBJ_RELEASE(bptr);
goto cleanup;
}
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
@ -1013,11 +1019,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
goto cleanup;
}
/* see if we already have this topology - could be an update */
t2 = NULL;
for (n=0; n < orte_node_topologies->size; n++) {
if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
continue;
}
if (0 == strcmp(t->sig, sig)) {
if (0 == strcmp(t2->sig, sig)) {
/* found a match */
free(sig);
opal_hwloc_base_free_topology(topo);
@ -1025,11 +1032,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
break;
}
}
if (NULL != sig) {
if (NULL != sig || NULL == t2) {
/* new topology - record it */
t = OBJ_NEW(orte_topology_t);
t->sig = sig;
t->topo = topo;
t2 = OBJ_NEW(orte_topology_t);
t2->sig = sig;
t2->topo = topo;
opal_pointer_array_add(orte_node_topologies, t2);
}
/* point each of the nodes in the regex to this topology */
start = strtoul(tmp[nn], &rmndr, 10);
@ -1043,8 +1051,8 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
for (k=start; k <= endpt; k++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) {
if (NULL == node->topology) {
OBJ_RETAIN(t);
node->topology = t;
OBJ_RETAIN(t2);
node->topology = t2;
}
}
}