1
1

Update to include the PMIx 2.0 APIs for monitoring and job control. Include required integration, but leave the monitors off for now. Move the sensor framework out of ORTE as it is being absorbed into PMIx

Fix typo and silence warnings

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-03-14 21:44:05 -07:00
родитель 20bf0dd7c6
Коммит d645557fa0
83 изменённых файлов: 2709 добавлений и 2975 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -94,7 +94,9 @@ enum {
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66),
OPAL_ERR_HEARTBEAT_ALERT = (OPAL_ERR_BASE - 67),
OPAL_ERR_FILE_ALERT = (OPAL_ERR_BASE - 68)
};
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Mellanox Technologies, Inc.
@ -352,7 +352,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
if (NULL != chain->final_cbfunc) {
chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata);
}
OBJ_RELEASE(chain);
return;

Просмотреть файл

@ -473,6 +473,59 @@ pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive,
pmix_info_t *info, size_t ninfo,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request a job control action. The targets array identifies the
* processes to which the requested job control action is to be applied.
* A NULL value can be used to indicate all processes in the caller's
* nspace. PMIX_RANK_WILDCARD can also be used to indicate
* that all processes in the given nspace are to be included.
*
* The directives are provided as pmix_info_t structs in the directives
* array. The callback function provides a status to indicate whether or
* not the request was granted, and to provide some information as to
* the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t
* structures. If non-NULL, then the specified release_fn must be called
* when the callback function completes - this will be used to release
* any provided pmix_info_t array.
*/
pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request that something be monitored - e.g., that the server monitor
* this process for periodic heartbeats as an indication that the process
* has not become "wedged". When a monitor detects the specified alarm
* condition, it will generate an event notification using the provided
* error code and passing along any available relevant information. It is
* up to the caller to register a corresponding event handler.
*
* Params:
*
* monitor: attribute indicating the type of monitor being requested - e.g.,
* PMIX_MONITOR_FILE to indicate that the requestor is asking that
* a file be monitored.
*
* error: the status code to be used when generating an event notification
* alerting that the monitor has been triggered. The range of the
* notification defaults to PMIX_RANGE_NAMESPACE - this can be
* changed by providing a PMIX_RANGE directive
*
* directives: characterize the monitoring request (e.g., monitor file size)
* and frequency of checking to be done
*
* cbfunc: provides a status to indicate whether or not the request was granted,
* and to provide some information as to the reason for any denial in
* the pmix_info_cbfunc_t array of pmix_info_t structures.
*
* Note: a process can send a heartbeat to the server using the PMIx_Heartbeat
* macro provided below. */
pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Define a special macro to simplify sending of a heartbeat.
 *
 * PMIx_Process_monitor_nb takes six arguments and expects its first
 * argument to be a pmix_info_t, so the PMIX_SEND_HEARTBEAT key is wrapped
 * in a temporary info struct and a status code is supplied. No callback is
 * registered, so the result of the request is not reported to the caller.
 *
 * The macro expands to a statement (do/while), not an expression.
 * NOTE(review): relies on PMIX_INFO_CONSTRUCT / PMIX_INFO_LOAD /
 * PMIX_INFO_DESTRUCT and PMIX_POINTER from pmix_common.h - confirm they are
 * visible wherever this header is included. */
#define PMIx_Heartbeat()                                                        \
    do {                                                                        \
        pmix_info_t _pmix_hb_info;                                              \
        PMIX_INFO_CONSTRUCT(&_pmix_hb_info);                                    \
        PMIX_INFO_LOAD(&_pmix_hb_info, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER);\
        (void)PMIx_Process_monitor_nb(&_pmix_hb_info, PMIX_SUCCESS,             \
                                      NULL, 0, NULL, NULL);                     \
        PMIX_INFO_DESTRUCT(&_pmix_hb_info);                                     \
    } while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -123,6 +123,8 @@ typedef uint32_t pmix_rank_t;
// a local system-level PMIx server
#define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
#define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
/* identification attributes */
#define PMIX_USERID "pmix.euid" // (uint32_t) effective user id
@ -218,8 +220,9 @@ typedef uint32_t pmix_rank_t;
#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish
#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish
#define PMIX_RANGE "pmix.range" // (pmix_data_range_t) value for calls to publish/lookup/unpublish or for
// monitoring event notifications
#define PMIX_PERSISTENCE "pmix.persist" // (pmix_persistence_t) value for calls to publish
#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
// not request data from the server if not found
#define PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
@ -259,66 +262,72 @@ typedef uint32_t pmix_rank_t;
#define PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
/* attributes used to describe "spawn" attributes */
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
// job - i.e., not part of the "comm_world" of the job
#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation
#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation
#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs
#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs
#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs
#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn
#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource
#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy
#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy
#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy
#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries
#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position
#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init
#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin
#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc
#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me
#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me
#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons
#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected
// job - i.e., not part of the "comm_world" of the job
/* query attributes */
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
// procs in job on same node
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job
#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues
#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue
#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
// procs in job on same node
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // (bool) return operations tool is authorized to perform
#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // (bool) return a comma-delimited list of supported spawn attributes
#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // (bool) return a comma-delimited list of supported debug attributes
#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // (bool) return info on memory usage for the procs indicated in the qualifiers
#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // (bool) constrain the query to local information only
#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // (bool) report average values
#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // (bool) report minimum and maximum value
#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) input nspace of the job whose allocation is being queried
// returns (uint32_t) number of seconds remaining in that allocation
/* log attributes */
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
#define PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
#define PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
#define PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
/* debugger attributes */
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
/* Resource Manager identification */
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
#define PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
/* attributes for setting envars */
#define PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
@ -327,7 +336,6 @@ typedef uint32_t pmix_rank_t;
/* attributes relating to allocations */
#define PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
// which can later be used to query status of the request
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (uint32_t) get number of seconds remaining in allocation
#define PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
#define PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
#define PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
@ -343,6 +351,38 @@ typedef uint32_t pmix_rank_t;
#define PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
#define PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
/* job control attributes */
#define PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request
#define PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes
#define PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes
#define PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request
// (NULL => cancel all requests from this requestor)
#define PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup
#define PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID
#define PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it
#define PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint
#define PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint
#define PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT "pmix.jctrl.ckptto" // (int) time in seconds to wait for checkpoint to complete
// (key must stay distinct from PMIX_JOB_CTRL_CHECKPOINT_SIGNAL)
#define PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes
#define PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned
#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
/* monitoring attributes */
#define PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
#define PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
#define PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
#define PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
// specified action
#define PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
#define PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
#define PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
#define PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
#define PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
#define PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
// specified action
/**** PROCESS STATE DEFINITIONS ****/
typedef uint8_t pmix_proc_state_t;
#define PMIX_PROC_STATE_UNDEF 0 /* undefined process state */
@ -455,7 +495,14 @@ typedef int pmix_status_t;
#define PMIX_ERR_LOST_CONNECTION_TO_CLIENT (PMIX_ERR_V2X_BASE - 3)
/* used by the query system */
#define PMIX_QUERY_PARTIAL_SUCCESS (PMIX_ERR_V2X_BASE - 4)
/* request responses */
#define PMIX_NOTIFY_ALLOC_COMPLETE (PMIX_ERR_V2X_BASE - 5)
/* job control */
#define PMIX_JCTRL_CHECKPOINT (PMIX_ERR_V2X_BASE - 6)
#define PMIX_JCTRL_PREEMPT_ALERT (PMIX_ERR_V2X_BASE - 7)
/* monitoring */
#define PMIX_MONITOR_HEARTBEAT_ALERT (PMIX_ERR_V2X_BASE - 8)
#define PMIX_MONITOR_FILE_ALERT (PMIX_ERR_V2X_BASE - 9)
/* define a starting point for operational error constants so
* we avoid renumbering when making additions */

Просмотреть файл

@ -328,6 +328,17 @@ typedef pmix_status_t (*pmix_server_alloc_fn_t)(const pmix_proc_t *client,
const pmix_info_t data[], size_t ndata,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Execute a job control action on behalf of a client.
 *
 * requestor:  the process on whose behalf the action is requested
 * targets:    array of ntargets processes the action applies to
 * directives: array of ndirs pmix_info_t structs describing the action
 * cbfunc:     callback to be given the status of the request along with
 *             any returned pmix_info_t array; cbdata is passed back to it
 *
 * Returns a pmix_status_t for the request itself. */
typedef pmix_status_t (*pmix_server_job_control_fn_t)(const pmix_proc_t *requestor,
const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Request that a client be monitored for activity.
 *
 * requestor:  the process asking to be monitored
 * error:      status code to use when generating the event notification
 *             that reports the monitor has triggered
 * directives: array of ndirs pmix_info_t structs characterizing the request
 * cbfunc:     callback to be given the status of the request along with
 *             any returned pmix_info_t array; cbdata is passed back to it
 *
 * NOTE(review): unlike PMIx_Process_monitor_nb, this signature has no
 * parameter carrying the "monitor" attribute itself - confirm how the host
 * is expected to learn which kind of monitor is being requested. */
typedef pmix_status_t (*pmix_server_monitor_fn_t)(const pmix_proc_t *requestor, pmix_status_t error,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
typedef struct pmix_server_module_2_0_0_t {
/* v1x interfaces */
pmix_server_client_connected_fn_t client_connected;
@ -350,12 +361,14 @@ typedef struct pmix_server_module_2_0_0_t {
pmix_server_tool_connection_fn_t tool_connected;
pmix_server_log_fn_t log;
pmix_server_alloc_fn_t allocate;
pmix_server_job_control_fn_t job_control;
pmix_server_monitor_fn_t monitor;
} pmix_server_module_t;
/**** SERVER SUPPORT INIT/FINALIZE FUNCTIONS ****/
/* Initialize the server support library, and provide a
* pointer to a pmix_server_module_t structure
* pointer to a pmix_server_module_t structure
* containing the caller's callback functions. The
* array of pmix_info_t structs is used to pass
* additional info that may be required by the server

Просмотреть файл

@ -1,6 +1,6 @@
# -*- makefile -*-
#
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
@ -13,4 +13,5 @@ sources += \
common/pmix_query.c \
common/pmix_strings.c \
common/pmix_log.c \
common/pmix_jobdata.c
common/pmix_jobdata.c \
common/pmix_control.c

Просмотреть файл

@ -0,0 +1,269 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include <src/include/pmix_stdint.h>
#include <src/include/pmix_socket_errno.h>
#include <pmix.h>
#include <pmix_common.h>
#include <pmix_server.h>
#include <pmix_rename.h>
#include "src/util/argv.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/buffer_ops/buffer_ops.h"
#include "src/mca/ptl/ptl.h"
#include "src/client/pmix_client_ops.h"
#include "src/server/pmix_server_ops.h"
#include "src/include/pmix_globals.h"
/* Release function handed to the user's callback by query_cbfunc: frees the
 * returned info array (if there is one) and drops the caddy holding it. */
static void relcbfunc(void *cbdata)
{
    pmix_shift_caddy_t *results = (pmix_shift_caddy_t*)cbdata;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query release callback");

    /* only an allocated array needs to be freed */
    if (NULL != results->info) {
        PMIX_INFO_FREE(results->info, results->ninfo);
    }

    PMIX_RELEASE(results);
}
/* Receive callback for the reply to a job-control or monitor request.
 *
 * Unpacks the status from the reply and, when that status is PMIX_SUCCESS,
 * the number of returned pmix_info_t structs followed by the structs
 * themselves. The outcome is delivered to the callback stored in the
 * pmix_query_caddy_t passed as cbdata, together with relcbfunc so the
 * recipient can release the results when it is done with them.
 *
 * peer, hdr: not used by this handler
 * buf:       buffer holding the packed reply
 * cbdata:    the pmix_query_caddy_t created when the request was sent;
 *            it is released here
 */
static void query_cbfunc(struct pmix_peer_t *peer,
                         pmix_ptl_hdr_t *hdr,
                         pmix_buffer_t *buf, void *cbdata)
{
    pmix_query_caddy_t *cd = (pmix_query_caddy_t*)cbdata;
    pmix_status_t rc;
    pmix_shift_caddy_t *results;
    int cnt;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query cback from server");

    results = PMIX_NEW(pmix_shift_caddy_t);
    if (NULL == results) {
        /* we cannot carry a reply, but the caller must still be released */
        if (NULL != cd->cbfunc) {
            cd->cbfunc(PMIX_ERR_NOMEM, NULL, 0, cd->cbdata, NULL, NULL);
        }
        PMIX_RELEASE(cd);
        return;
    }

    /* unpack the status */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->status, &cnt, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        /* report the unpack failure instead of whatever the field holds */
        results->status = rc;
        goto complete;
    }
    if (PMIX_SUCCESS != results->status) {
        goto complete;
    }

    /* unpack any returned data */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        results->status = rc;
        goto complete;
    }
    if (0 < results->ninfo) {
        PMIX_INFO_CREATE(results->info, results->ninfo);
        cnt = (int)results->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, results->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            /* do not let a failed unpack be reported as success */
            results->status = rc;
            goto complete;
        }
    }

  complete:
    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query cback from server releasing");
    /* release the caller */
    if (NULL != cd->cbfunc) {
        /* the recipient releases the results by calling relcbfunc */
        cd->cbfunc(results->status, results->info, results->ninfo, cd->cbdata, relcbfunc, results);
    } else {
        /* nobody will call the release function, so clean up here */
        relcbfunc(results);
    }
    PMIX_RELEASE(cd);
}
/* Request a job control action (non-blocking).
 *
 * targets/ntargets:  processes the action applies to; targets may be NULL
 *                    (with ntargets of zero) to address all members of the
 *                    caller's nspace
 * directives/ndirs:  pmix_info_t structs describing the requested action
 * cbfunc/cbdata:     callback (and its data) to receive the result
 *
 * When called by a server, the request is handed directly to the host's
 * job_control entry point. Otherwise the request is packed and sent to the
 * server, and the reply is delivered through query_cbfunc.
 *
 * Returns PMIX_ERR_INIT if the library is not initialized,
 * PMIX_ERR_BAD_PARAM if a non-zero count is given with a NULL array,
 * PMIX_ERR_NOT_SUPPORTED if the host provides no job_control function,
 * PMIX_ERR_UNREACH if not connected to a server, PMIX_ERR_NOMEM if an
 * internal object cannot be allocated, or the status of the host call,
 * pack, or send operation.
 */
PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets,
                                              const pmix_info_t directives[], size_t ndirs,
                                              pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_buffer_t *msg;
    pmix_cmd_t cmd = PMIX_JOB_CONTROL_CMD;
    pmix_status_t rc;
    pmix_query_caddy_t *cb;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: job control called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* a non-zero count without an array behind it can neither be packed
     * nor handed to the host */
    if ((NULL == targets && 0 < ntargets) ||
        (NULL == directives && 0 < ndirs)) {
        return PMIX_ERR_BAD_PARAM;
    }

    /* if we are the server, then we just issue the request and
     * return the response */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.job_control) {
            /* nothing we can do */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:job_control handed to RM");
        rc = pmix_host_server.job_control(&pmix_globals.myid,
                                          targets, ntargets,
                                          directives, ndirs,
                                          cbfunc, cbdata);
        return rc;
    }

    /* if we are a client, then relay this request to the server */

    /* if we aren't connected, don't attempt to send */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    msg = PMIX_NEW(pmix_buffer_t);
    if (NULL == msg) {
        return PMIX_ERR_NOMEM;
    }

    /* pack the cmd */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the number of targets */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ntargets, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    /* remember, the targets can be NULL to indicate that the operation
     * is to be done against all members of our nspace */
    if (0 < ntargets) {
        /* pack the targets */
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, targets, ntargets, PMIX_PROC))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* pack the directives */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    if (0 < ndirs) {
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cb) {
        rc = PMIX_ERR_NOMEM;
        goto release_msg;
    }
    cb->cbfunc = cbfunc;
    cb->cbdata = cbdata;

    /* push the message into our event base to send to the server */
    if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))) {
        /* the send never happened, so both objects are still ours */
        PMIX_RELEASE(cb);
        goto release_msg;
    }
    return rc;

  release_msg:
    PMIX_RELEASE(msg);
    return rc;
}
/* Request that something be monitored (non-blocking).
 *
 * monitor:          attribute identifying the type of monitor requested
 * error:            status code to be used for the event notification
 *                   generated when the monitor triggers
 * directives/ndirs: pmix_info_t structs characterizing the request
 * cbfunc/cbdata:    callback (and its data) to receive the result
 *
 * When called by a server, the request is handed directly to the host's
 * monitor entry point. Otherwise the error code and directives are packed
 * and sent to the server, and the reply is delivered through query_cbfunc.
 *
 * NOTE(review): the monitor argument is neither passed to the host
 * function nor packed into the message sent to the server, so the
 * recipient only sees the error code and directives. Forwarding it needs a
 * matching change to the host interface and the server-side unpack -
 * confirm the intended design.
 *
 * Returns PMIX_ERR_INIT if the library is not initialized,
 * PMIX_ERR_BAD_PARAM if ndirs is non-zero but directives is NULL,
 * PMIX_ERR_NOT_SUPPORTED if the host provides no monitor function,
 * PMIX_ERR_UNREACH if not connected to a server, PMIX_ERR_NOMEM if an
 * internal object cannot be allocated, or the status of the host call,
 * pack, or send operation.
 */
PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error,
                                                  const pmix_info_t directives[], size_t ndirs,
                                                  pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix_buffer_t *msg;
    pmix_cmd_t cmd = PMIX_MONITOR_CMD;
    pmix_status_t rc;
    pmix_query_caddy_t *cb;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix: monitor called");

    if (pmix_globals.init_cntr <= 0) {
        return PMIX_ERR_INIT;
    }

    /* a non-zero count without an array behind it can neither be packed
     * nor handed to the host */
    if (NULL == directives && 0 < ndirs) {
        return PMIX_ERR_BAD_PARAM;
    }

    /* if we are the server, then we just issue the request and
     * return the response */
    if (PMIX_PROC_SERVER == pmix_globals.proc_type) {
        if (NULL == pmix_host_server.monitor) {
            /* nothing we can do */
            return PMIX_ERR_NOT_SUPPORTED;
        }
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix:monitor handed to RM");
        rc = pmix_host_server.monitor(&pmix_globals.myid, error,
                                      directives, ndirs, cbfunc, cbdata);
        return rc;
    }

    /* if we are a client, then relay this request to the server */

    /* if we aren't connected, don't attempt to send */
    if (!pmix_globals.connected) {
        return PMIX_ERR_UNREACH;
    }

    msg = PMIX_NEW(pmix_buffer_t);
    if (NULL == msg) {
        return PMIX_ERR_NOMEM;
    }

    /* pack the cmd */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the error */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &error, 1, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }

    /* pack the directives */
    if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto release_msg;
    }
    if (0 < ndirs) {
        if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto release_msg;
        }
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cb) {
        rc = PMIX_ERR_NOMEM;
        goto release_msg;
    }
    cb->cbfunc = cbfunc;
    cb->cbdata = cbdata;

    /* push the message into our event base to send to the server */
    if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))) {
        /* the send never happened, so both objects are still ours */
        PMIX_RELEASE(cb);
        goto release_msg;
    }
    return rc;

  release_msg:
    PMIX_RELEASE(msg);
    return rc;
}

Просмотреть файл

@ -257,6 +257,8 @@ static void qcon(pmix_query_caddy_t *p)
{
p->queries = NULL;
p->nqueries = 0;
p->targets = NULL;
p->ntargets = 0;
p->info = NULL;
p->ninfo = 0;
p->cbfunc = NULL;

Просмотреть файл

@ -72,7 +72,9 @@ typedef enum {
PMIX_DEREGEVENTS_CMD,
PMIX_QUERY_CMD,
PMIX_LOG_CMD,
PMIX_ALLOC_CMD
PMIX_ALLOC_CMD,
PMIX_JOB_CONTROL_CMD,
PMIX_MONITOR_CMD
} pmix_cmd_t;
/* provide a "pretty-print" function for cmds */
@ -214,6 +216,8 @@ typedef struct {
pmix_status_t status;
pmix_query_t *queries;
size_t nqueries;
pmix_proc_t *targets;
size_t ntargets;
pmix_info_t *info;
size_t ninfo;
pmix_info_cbfunc_t cbfunc;

Просмотреть файл

@ -256,4 +256,13 @@ typedef struct event pmix_event_t;
#define pmix_event_active(x, y, z) event_active((x), (y), (z))
#define pmix_event_evtimer_new(b, cb, arg) pmix_event_new((b), -1, 0, (cb), (arg))
#define pmix_event_evtimer_add(x, tv) pmix_event_add((x), (tv))
#define pmix_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg))
#define pmix_event_evtimer_del(x) pmix_event_del((x))
#endif /* PMIX_TYPES_H */

Просмотреть файл

@ -3,26 +3,27 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
AM_CPPFLAGS = $(LTDLINCL)
# main library setup
noinst_LTLIBRARIES = libmca_sensor.la
libmca_sensor_la_SOURCES =
noinst_LTLIBRARIES = libmca_psensor.la
libmca_psensor_la_SOURCES =
# local files
headers = sensor.h \
sensor_types.h
headers = psensor.h
libmca_sensor_la_SOURCES += $(headers)
libmca_psensor_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(ompiincludedir)/$(subdir)
nobase_orte_HEADERS = $(headers)
pmixdir = $(pmixincludedir)/$(subdir)
nobase_pmix_HEADERS = $(headers)
endif
include base/Makefile.am

Просмотреть файл

@ -1,5 +1,5 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
@ -11,10 +11,9 @@
#
headers += \
base/base.h \
base/sensor_private.h
base/base.h
libmca_sensor_la_SOURCES += \
base/sensor_base_frame.c \
base/sensor_base_select.c \
base/sensor_base_fns.c
libmca_psensor_la_SOURCES += \
base/psensor_base_frame.c \
base/psensor_base_select.c \
base/psensor_base_stubs.c

Просмотреть файл

@ -0,0 +1,59 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef PMIX_PSENSOR_BASE_H_
#define PMIX_PSENSOR_BASE_H_
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/mca.h"
#include "src/mca/base/pmix_mca_base_framework.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*/
PMIX_EXPORT extern pmix_mca_base_framework_t pmix_psensor_base_framework;
PMIX_EXPORT int pmix_psensor_base_select(void);
/* Framework-global state shared by the psensor base and its components */
typedef struct {
/* pmix_psensor_active_module_t entries, filled in priority order by
 * pmix_psensor_base_select() */
pmix_list_t actives;
/* event base on which the sensors schedule their timers and callbacks */
pmix_event_base_t *evbase;
} pmix_psensor_base_t;
/* One component/module pair that answered the selection query; instances
 * live on pmix_psensor_base.actives */
typedef struct {
pmix_list_item_t super;
/* component that supplied the module */
pmix_psensor_base_component_t *component;
/* module function table (start/stop) returned by the component */
pmix_psensor_base_module_t *module;
/* priority reported by the component's query function */
int priority;
} pmix_psensor_active_module_t;
PMIX_CLASS_DECLARATION(pmix_psensor_active_module_t);
/* the single instance of the framework-global state */
PMIX_EXPORT extern pmix_psensor_base_t pmix_psensor_base;
/* Offer a monitoring request to every active module.
 *
 * requestor  - peer asking for the monitoring
 * error      - status value handed through to each module's start function
 * monitor    - info whose key names the kind of monitoring requested
 * directives - array of ndirs infos qualifying the request
 */
PMIX_EXPORT pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
/* Ask every active module to stop monitoring on behalf of requestor;
 * id is handed through to each module's stop function */
PMIX_EXPORT pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
char *id);
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,103 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/mca/mca.h"
#include "src/mca/base/base.h"
#include "src/class/pmix_list.h"
#include "src/runtime/pmix_progress_threads.h"
#include "src/include/types.h"
#include "src/mca/psensor/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "src/mca/psensor/base/static-components.h"
/*
* Global variables
*/
/* Public entry points of the framework: both are the base-level functions */
pmix_psensor_base_module_t pmix_psensor = {
    pmix_psensor_base_start,
    pmix_psensor_base_stop
};

/* Framework-global state, zero-filled at load time; the list and event base
 * are set up when the framework is opened */
pmix_psensor_base_t pmix_psensor_base = {{{0}}};

/* Value of the "use_separate_thread" MCA variable registered below */
static bool use_separate_thread = false;
/* Framework registration hook: registers the read-only boolean MCA variable
 * "use_separate_thread" (backed by the static of the same name). The result
 * of the registration call is deliberately ignored and PMIX_SUCCESS is
 * always returned. */
static int pmix_psensor_register(pmix_mca_base_register_flag_t flags)
{
(void) pmix_mca_base_var_register("pmix", "psensor", "base", "use_separate_thread",
"Use a separate thread for monitoring local procs",
PMIX_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
PMIX_INFO_LVL_9,
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
&use_separate_thread);
return PMIX_SUCCESS;
}
/* Framework close hook: tears down the list of active modules, stops the
 * "PSENSOR" progress thread if one was started by the open hook, and then
 * closes the components. Returns the result of the component close call. */
static int pmix_psensor_base_close(void)
{
PMIX_LIST_DESTRUCT(&pmix_psensor_base.actives);
/* only stop the thread when we own it; otherwise evbase is the shared
 * pmix_globals.evbase assigned in the open hook */
if (use_separate_thread && NULL != pmix_psensor_base.evbase) {
(void)pmix_progress_thread_stop("PSENSOR");
}
/* Close all remaining available components */
return pmix_mca_base_framework_components_close(&pmix_psensor_base_framework, NULL);
}
/**
 * Framework open hook.
 *
 * Constructs the list of active modules, picks the event base the sensors
 * will run on (a dedicated "PSENSOR" progress thread when the
 * use_separate_thread variable is set, the global PMIx event base
 * otherwise) and then opens the framework's components.
 *
 * Returns PMIX_ERROR if the progress thread cannot be created, otherwise
 * the result of opening the components.
 */
static int pmix_psensor_base_open(pmix_mca_base_open_flag_t flags)
{
    /* the list must exist before any component can be selected */
    PMIX_CONSTRUCT(&pmix_psensor_base.actives, pmix_list_t);

    if (!use_separate_thread) {
        /* share the library-wide event base */
        pmix_psensor_base.evbase = pmix_globals.evbase;
    } else {
        /* create an event base and progress thread of our own */
        pmix_psensor_base.evbase = pmix_progress_thread_init("PSENSOR");
        if (NULL == pmix_psensor_base.evbase) {
            return PMIX_ERROR;
        }
    }

    /* open up all available components */
    return pmix_mca_base_framework_components_open(&pmix_psensor_base_framework, flags);
}
/* Declare the psensor framework, wiring in the register/open/close hooks
 * defined in this file and the configure-generated static component list */
PMIX_MCA_BASE_FRAMEWORK_DECLARE(pmix, psensor, "PMIx Monitoring Sensors",
pmix_psensor_register,
pmix_psensor_base_open, pmix_psensor_base_close,
mca_psensor_base_static_components, 0);
/* Class instance for the active-module list entries; no constructor or
 * destructor is supplied */
PMIX_CLASS_INSTANCE(pmix_psensor_active_module_t,
pmix_list_item_t,
NULL, NULL);

Просмотреть файл

@ -0,0 +1,94 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <string.h>
#include "src/mca/mca.h"
#include "src/mca/base/base.h"
#include "src/mca/psensor/base/base.h"
/* set on the first call so that later calls become no-ops */
static bool selected = false;

/*
 * Build the priority-ordered list of active psensor modules.
 *
 * Every opened component is queried for a module; those that answer with
 * PMIX_SUCCESS are wrapped in a pmix_psensor_active_module_t and placed on
 * pmix_psensor_base.actives ahead of the first entry with a strictly lower
 * priority (or at the tail if there is none). When the framework verbosity
 * is above 4 the final ordering is printed.
 *
 * Always returns PMIX_SUCCESS.
 */
int pmix_psensor_base_select(void)
{
    pmix_mca_base_component_list_item_t *item = NULL;
    pmix_psensor_base_component_t *comp = NULL;
    pmix_psensor_active_module_t *entry, *cur;
    pmix_mca_base_module_t *module;
    int priority;
    bool placed;

    /* ensure we don't do this twice */
    if (selected) {
        return PMIX_SUCCESS;
    }
    selected = true;

    PMIX_LIST_FOREACH(item, &pmix_psensor_base_framework.framework_components, pmix_mca_base_component_list_item_t) {
        comp = (pmix_psensor_base_component_t *) item->cli_component;

        pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                            "mca:psensor:select: checking available component %s",
                            comp->base.pmix_mca_component_name);

        /* components that decline to provide a module are skipped */
        if (PMIX_SUCCESS != comp->base.pmix_mca_query_component(&module, &priority)) {
            continue;
        }

        /* wrap the module so it can live on the actives list */
        entry = PMIX_NEW(pmix_psensor_active_module_t);
        entry->component = comp;
        entry->module = (pmix_psensor_base_module_t*)module;
        entry->priority = priority;

        /* insert ahead of the first entry that has a lower priority */
        placed = false;
        PMIX_LIST_FOREACH(cur, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
            if (cur->priority < entry->priority) {
                pmix_list_insert_pos(&pmix_psensor_base.actives,
                                     (pmix_list_item_t*)cur, &entry->super);
                placed = true;
                break;
            }
        }
        /* nothing ranked lower - this one goes at the end */
        if (!placed) {
            pmix_list_append(&pmix_psensor_base.actives, &entry->super);
        }
    }

    /* at high verbosity, show the prioritized list */
    if (pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output) > 4) {
        pmix_output(0, "Final PSENSOR priorities");
        PMIX_LIST_FOREACH(cur, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
            pmix_output(0, "\tPSENSOR: %s Priority: %d",
                        cur->component->base.pmix_mca_component_name, cur->priority);
        }
    }

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,68 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/util/error.h"
#include "src/mca/psensor/base/base.h"
static bool mods_active = false;
/*
 * Offer a monitoring request to every active sensor module.
 *
 * The modules on pmix_psensor_base.actives are visited in list order and
 * each one that provides a start function is called with the arguments
 * unchanged. A module may return PMIX_ERR_TAKE_NEXT_OPTION to indicate the
 * request is not for it; any other non-success status aborts the walk and
 * is returned to the caller.
 *
 * Returns PMIX_SUCCESS when no module reported a hard error.
 */
pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error,
                                      const pmix_info_t *monitor,
                                      const pmix_info_t directives[], size_t ndirs)
{
    pmix_psensor_active_module_t *mod;
    pmix_status_t rc;

    /* use the PMIx output stream API - the OPAL one does not belong here */
    pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                        "%s:%d sensor:base: starting sensors",
                        pmix_globals.myid.nspace, pmix_globals.myid.rank);

    /* call the start function of all modules in priority order */
    PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
        if (NULL != mod->module->start) {
            rc = mod->module->start(requestor, error, monitor, directives, ndirs);
            if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
                return rc;
            }
        }
    }

    return PMIX_SUCCESS;
}
/*
 * Ask every active sensor module to stop monitoring for a requestor.
 *
 * The modules on pmix_psensor_base.actives are visited in list order and
 * each one that provides a stop function is called with the requestor and
 * id unchanged. PMIX_ERR_TAKE_NEXT_OPTION from a module is tolerated; any
 * other non-success status aborts the walk and is returned.
 *
 * Returns PMIX_SUCCESS when no module reported a hard error.
 */
pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor,
                                     char *id)
{
    pmix_psensor_active_module_t *mod;
    pmix_status_t rc;

    /* use the PMIx output stream API - the OPAL one does not belong here */
    pmix_output_verbose(5, pmix_psensor_base_framework.framework_output,
                        "%s:%d sensor:base: stopping sensors",
                        pmix_globals.myid.nspace, pmix_globals.myid.rank);

    /* call the stop function of all modules in priority order */
    PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) {
        if (NULL != mod->module->stop) {
            rc = mod->module->stop(requestor, id);
            if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) {
                return rc;
            }
        }
    }

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -1,37 +1,37 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
dist_ompidata_DATA = help-orte-sensor-file.txt
dist_pmixdata_DATA = help-pmix-psensor-file.txt
sources = \
sensor_file.c \
sensor_file.h \
sensor_file_component.c
psensor_file.c \
psensor_file.h \
psensor_file_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_file_DSO
if MCA_BUILD_pmix_psensor_file_DSO
component_noinst =
component_install = mca_sensor_file.la
component_install = mca_psensor_file.la
else
component_noinst = libmca_sensor_file.la
component_noinst = libmca_psensor_file.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponentdir = $(pmixlibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_file_la_SOURCES = $(sources)
mca_sensor_file_la_LDFLAGS = -module -avoid-version
mca_psensor_file_la_SOURCES = $(sources)
mca_psensor_file_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_file_la_SOURCES =$(sources)
libmca_sensor_file_la_LDFLAGS = -module -avoid-version
libmca_psensor_file_la_SOURCES =$(sources)
libmca_psensor_file_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -4,9 +4,9 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
# This is the US/English general help file for the file sensor

Просмотреть файл

@ -0,0 +1,352 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include <pmix_common.h>
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "src/class/pmix_list.h"
#include "src/include/pmix_globals.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/util/show_help.h"
#include "src/mca/psensor/base/base.h"
#include "psensor_file.h"
/* declare the API functions */
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
static pmix_status_t stop(pmix_peer_t *requestor, char *id);
/* instantiate the module: the start/stop function table that the file
 * component's query function hands back to the framework */
pmix_psensor_base_module_t pmix_psensor_file_module = {
.start = start,
.stop = stop
};
/* Tracking object: one per file-monitoring request accepted by start() */
typedef struct {
pmix_list_item_t super;
/* peer that asked for the monitoring; retained for the tracker's lifetime */
pmix_peer_t *requestor;
/* identifier matched against the id given to stop(); nothing in this file
 * assigns it, so it stays NULL unless set elsewhere */
char *id;
/* true once the sampling timer has been armed by add_tracker() */
bool event_active;
/* periodic sampling timer (callback: file_sample) */
pmix_event_t ev;
/* one-shot event used by start() to run add_tracker() in the event base */
pmix_event_t cdev;
/* sampling interval; tv_sec comes from PMIX_MONITOR_FILE_CHECK_TIME */
struct timeval tv;
/* zeroed by the constructor; not otherwise referenced in this file */
int tick;
/* path of the monitored file (heap copy of the monitor value) */
char *file;
/* which attribute to watch: size, access time, modification time */
bool file_size;
bool file_access;
bool file_mod;
/* values seen at the previous sample */
size_t last_size;
time_t last_access;
time_t last_mod;
/* number of unchanged samples at which an alert is raised */
uint32_t ndrops;
/* consecutive samples in which the watched attribute did not change */
uint32_t nmisses;
/* initialized to PMIX_SUCCESS by the constructor */
pmix_status_t error;
/* range passed to PMIx_Notify_event (PMIX_RANGE directive) */
pmix_data_range_t range;
/* info array passed to PMIx_Notify_event; freed by the destructor */
pmix_info_t *info;
size_t ninfo;
} file_tracker_t;
/*
 * Constructor for file_tracker_t: put every field the destructor or the
 * sampling code inspects into a known state.
 */
static void ft_constructor(file_tracker_t *ft)
{
    ft->requestor = NULL;
    ft->id = NULL;
    ft->event_active = false;
    ft->tv.tv_sec = 0;
    ft->tv.tv_usec = 0;
    ft->tick = 0;
    /* the destructor frees ft->file when it is non-NULL, so it must not be
     * left holding whatever the allocator returned */
    ft->file = NULL;
    ft->file_size = false;
    ft->file_access = false;
    ft->file_mod = false;
    ft->last_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->ndrops = 0;
    ft->nmisses = 0;
    ft->error = PMIX_SUCCESS;
    ft->range = PMIX_RANGE_NAMESPACE;
    ft->info = NULL;
    ft->ninfo = 0;
}
/*
 * Destructor for file_tracker_t: drop the reference on the requestor,
 * cancel the sampling timer if it was armed, and free the owned strings
 * and info array.
 */
static void ft_destructor(file_tracker_t *ft)
{
    if (NULL != ft->requestor) {
        PMIX_RELEASE(ft->requestor);
    }
    if (NULL != ft->id) {
        free(ft->id);
    }
    /* the flag is a member of the tracker, not a free-standing variable */
    if (ft->event_active) {
        pmix_event_del(&ft->ev);
    }
    if (NULL != ft->file) {
        free(ft->file);
    }
    if (NULL != ft->info) {
        PMIX_INFO_FREE(ft->info, ft->ninfo);
    }
}
PMIX_CLASS_INSTANCE(file_tracker_t,
                    pmix_list_item_t,
                    ft_constructor, ft_destructor);
/* Local caddy: carries a stop() request into the event base, where
 * del_tracker() consumes and releases it */
typedef struct {
pmix_object_t super;
/* event used to schedule del_tracker() */
pmix_event_t ev;
/* peer whose trackers are to be removed (retained by stop()) */
pmix_peer_t *requestor;
/* heap copy of the id given to stop(); NULL matches every tracker of
 * the requestor in del_tracker() */
char *id;
} file_caddy_t;
/* constructor: start with nothing owned */
static void cd_con(file_caddy_t *p)
{
p->requestor = NULL;
p->id = NULL;
}
/* destructor: drop the peer reference and free the id copy */
static void cd_des(file_caddy_t *p)
{
if (NULL != (p->requestor)) {
PMIX_RELEASE(p->requestor);
}
if (NULL != p->id) {
free(p->id);
}
}
PMIX_CLASS_INSTANCE(file_caddy_t,
pmix_object_t,
cd_con, cd_des);
static void file_sample(int sd, short args, void *cbdata);
/* Event callback scheduled by start(): registers the tracker passed in
 * cbdata and arms its periodic sampling timer. sd and flags are unused. */
static void add_tracker(int sd, short flags, void *cbdata)
{
file_tracker_t *ft = (file_tracker_t*)cbdata;
/* add the tracker to our list */
pmix_list_append(&mca_psensor_file_component.trackers, &ft->super);
/* setup the timer event */
pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev,
file_sample, ft);
pmix_event_evtimer_add(&ft->ev, &ft->tv);
/* lets the destructor know there is a timer to cancel */
ft->event_active = true;
}
/*
 * Start monitoring a file on behalf of a requestor.
 *
 * requestor  - peer asking for the monitoring; a reference is retained
 * error      - status recorded in the tracker
 * monitor    - must carry the PMIX_MONITOR_FILE key; its string value is
 *              the path to watch
 * directives - ndirs infos selecting what to watch (size / access /
 *              modify), the number of unchanged samples that triggers an
 *              alert, the sampling interval in seconds, and the
 *              notification range
 *
 * Returns PMIX_ERR_TAKE_NEXT_OPTION when the request is not for a file,
 * PMIX_ERR_BAD_PARAM when no interval or no attribute was specified, and
 * PMIX_SUCCESS once the tracker has been handed to the event base.
 */
static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error,
                           const pmix_info_t *monitor,
                           const pmix_info_t directives[], size_t ndirs)
{
    file_tracker_t *ft;
    size_t n;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] checking file monitoring for requestor %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         requestor->info->nptr->nspace, requestor->info->rank));

    /* if they didn't ask to monitor a file, then nothing for us to do */
    if (0 != strcmp(monitor->key, PMIX_MONITOR_FILE)) {
        return PMIX_ERR_TAKE_NEXT_OPTION;
    }

    /* setup to track this monitoring operation */
    ft = PMIX_NEW(file_tracker_t);
    PMIX_RETAIN(requestor);
    ft->requestor = requestor;
    /* remember the status the caller associated with this request */
    ft->error = error;
    ft->file = strdup(monitor->value.data.string);

    /* check the directives to see what they want monitored */
    for (n=0; n < ndirs; n++) {
        if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_SIZE)) {
            ft->file_size = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_ACCESS)) {
            ft->file_access = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_MODIFY)) {
            ft->file_mod = directives[n].value.data.flag;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_DROPS)) {
            ft->ndrops = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_CHECK_TIME)) {
            ft->tv.tv_sec = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_RANGE)) {
            ft->range = directives[n].value.data.range;
        }
    }

    if (0 == ft->tv.tv_sec ||
        (!ft->file_size && !ft->file_access && !ft->file_mod)) {
        /* didn't specify a sample rate, or what should be sampled */
        PMIX_RELEASE(ft);
        return PMIX_ERR_BAD_PARAM;
    }

    /* need to push into our event base to add this to our trackers */
    pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, add_tracker, ft);
    pmix_event_active(&ft->cdev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/*
 * Event callback scheduled by stop(): remove and release every tracker
 * that belongs to the caddy's requestor and, when the caddy carries an id,
 * whose id matches it. The caddy itself is released before returning.
 * sd and flags are unused.
 */
static void del_tracker(int sd, short flags, void *cbdata)
{
    file_caddy_t *req = (file_caddy_t*)cbdata;
    file_tracker_t *trk, *nxt;

    PMIX_LIST_FOREACH_SAFE(trk, nxt, &mca_psensor_file_component.trackers, file_tracker_t) {
        /* only trackers owned by this requestor are candidates */
        if (trk->requestor != req->requestor) {
            continue;
        }
        /* with an id given, the tracker must carry the same id */
        if (NULL != req->id &&
            (NULL == trk->id || 0 != strcmp(trk->id, req->id))) {
            continue;
        }
        pmix_list_remove_item(&mca_psensor_file_component.trackers, &trk->super);
        PMIX_RELEASE(trk);
    }

    PMIX_RELEASE(req);
}
/*
 * Stop file monitoring for a requestor.
 *
 * requestor - peer whose trackers are to be removed; a reference is
 *             retained until the request has been processed
 * id        - identifier of the monitoring operation to stop, or NULL to
 *             stop every operation of the requestor
 *
 * The removal itself happens in del_tracker(), which is scheduled on the
 * psensor event base. Always returns PMIX_SUCCESS.
 */
static pmix_status_t stop(pmix_peer_t *requestor, char *id)
{
    file_caddy_t *cd;

    cd = PMIX_NEW(file_caddy_t);
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;
    /* del_tracker() treats a NULL id as "all trackers", so NULL is a legal
     * argument here - and strdup(NULL) is undefined behavior */
    if (NULL != id) {
        cd->id = strdup(id);
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/* Completion callback given to PMIx_Notify_event by file_sample(): cbdata
 * is the tracker that raised the alert, which is released here. The status
 * argument is ignored. */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
file_tracker_t *ft = (file_tracker_t*)cbdata;
PMIX_RELEASE(ft);
}
/*
 * Timer callback: sample the monitored file and raise an alert when the
 * watched attribute has stopped changing.
 *
 * cbdata is the file_tracker_t being sampled; sd and args are unused.
 *
 * If the file cannot be stat'ed the timer is simply re-armed. Otherwise
 * the first enabled attribute (size, then access time, then modification
 * time) is compared with the previous sample: an unchanged value bumps
 * nmisses, a changed value resets it. When nmisses equals ndrops the
 * tracker is taken off the list and a PMIX_MONITOR_FILE_ALERT event is
 * generated; the tracker is released in opcbfunc(). Otherwise the timer is
 * re-armed for the next sample.
 */
static void file_sample(int sd, short args, void *cbdata)
{
    file_tracker_t *ft = (file_tracker_t*)cbdata;
    struct stat buf;
    pmix_status_t rc;
    pmix_proc_t source;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sampling file %s",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->file));

    /* stat the file and get its info */
    if (0 > stat(ft->file, &buf)) {
        /* cannot stat file */
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] could not stat %s",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank,
                             ft->file));
        /* re-add the timer, in case this file shows up */
        pmix_event_evtimer_add(&ft->ev, &ft->tv);
        return;
    }

    /* NOTE(review): ctime() returns a pointer to static storage, so the two
     * calls in one argument list will print the same timestamp - confirm
     * whether that matters for this debug output */
    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] size %lu access %s\tmod %s",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         (unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime)));

    /* only the first enabled attribute is examined */
    if (ft->file_size) {
        if (buf.st_size == ft->last_size) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_size = buf.st_size;
        }
    } else if (ft->file_access) {
        if (buf.st_atime == ft->last_access) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_access = buf.st_atime;
        }
    } else if (ft->file_mod) {
        if (buf.st_mtime == ft->last_mod) {
            ft->nmisses++;
        } else {
            ft->nmisses = 0;
            ft->last_mod = buf.st_mtime;
        }
    }

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sampled file %s misses %d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->file, ft->nmisses));

    if (ft->nmisses == ft->ndrops) {
        if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) {
            pmix_show_help("help-pmix-psensor-file.txt", "file-stalled", true,
                           ft->file, ft->last_size, ctime(&ft->last_access), ctime(&ft->last_mod));
        }
        /* stop monitoring this client */
        pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super);
        /* generate an event; zero the proc first so that nspace ends in a
         * NUL even when strncpy copies the full PMIX_MAX_NSLEN characters
         * (assumes nspace has room for PMIX_MAX_NSLEN+1 bytes - TODO confirm) */
        memset(&source, 0, sizeof(source));
        (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
        source.rank = ft->requestor->info->rank;
        rc = PMIx_Notify_event(PMIX_MONITOR_FILE_ALERT, &source,
                               ft->range, ft->info, ft->ninfo, opcbfunc, ft);
        if (PMIX_SUCCESS != rc) {
            /* NOTE(review): the tracker is already off the list; confirm
             * whether opcbfunc still runs on failure or ft leaks here */
            PMIX_ERROR_LOG(rc);
        }
        return;
    }

    /* re-add the timer */
    pmix_event_evtimer_add(&ft->ev, &ft->tv);
}

Просмотреть файл

@ -0,0 +1,38 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef PMIX_PSENSOR_FILE_H
#define PMIX_PSENSOR_FILE_H
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/* File sensor component: the generic psensor component extended with the
 * list of trackers for the files currently being monitored */
typedef struct {
pmix_psensor_base_component_t super;
/* file_tracker_t entries, one per accepted monitoring request */
pmix_list_t trackers;
} pmix_psensor_file_component_t;
/* the component instance and the module (start/stop) it offers */
extern pmix_psensor_file_component_t mca_psensor_file_component;
extern pmix_psensor_base_module_t pmix_psensor_file_module;
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,69 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/class/pmix_list.h"
#include "src/mca/psensor/base/base.h"
#include "src/mca/psensor/file/psensor_file.h"
/*
* Local functions
*/
static int psensor_file_open(void);
static int psensor_file_close(void);
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority);
/* The file sensor component instance. Only the base component descriptor
 * is initialized here; the trackers list is constructed in
 * psensor_file_open(). */
pmix_psensor_file_component_t mca_psensor_file_component = {
.super = {
.base = {
PMIX_PSENSOR_BASE_VERSION_1_0_0,
/* Component name and version */
.pmix_mca_component_name = "file",
PMIX_MCA_BASE_MAKE_VERSION(component,
PMIX_MAJOR_VERSION,
PMIX_MINOR_VERSION,
PMIX_RELEASE_VERSION),
/* Component open and close functions */
psensor_file_open, /* component open */
psensor_file_close, /* component close */
psensor_file_query /* component query */
},
}
};
/* Component open: construct the (initially empty) list of file trackers.
 * Always returns PMIX_SUCCESS. */
static int psensor_file_open(void)
{
PMIX_CONSTRUCT(&mca_psensor_file_component.trackers, pmix_list_t);
return PMIX_SUCCESS;
}
/* Component query: always offers the file module, at a fixed priority of
 * 20. Returns PMIX_SUCCESS. */
static int psensor_file_query(pmix_mca_base_module_t **module, int *priority)
{
*priority = 20; /* irrelevant */
*module = (pmix_mca_base_module_t *)&pmix_psensor_file_module;
return PMIX_SUCCESS;
}
/**
 * Component close: destruct the list of file trackers built up while the
 * component was open. Always returns PMIX_SUCCESS.
 */
static int psensor_file_close(void)
{
PMIX_LIST_DESTRUCT(&mca_psensor_file_component.trackers);
return PMIX_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pmixdata_DATA = help-pmix-psensor-heartbeat.txt
sources = \
psensor_heartbeat.c \
psensor_heartbeat.h \
psensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_pmix_psensor_heartbeat_DSO
component_noinst =
component_install = mca_psensor_heartbeat.la
else
component_noinst = libmca_psensor_heartbeat.la
component_install =
endif
mcacomponentdir = $(pmixlibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_psensor_heartbeat_la_SOURCES = $(sources)
mca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_psensor_heartbeat_la_SOURCES =$(sources)
libmca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -4,9 +4,9 @@
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
@ -18,4 +18,3 @@ Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

Просмотреть файл

@ -0,0 +1,330 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/util/argv.h"
#include "src/util/error.h"
#include "src/util/output.h"
#include "src/util/show_help.h"
#include "src/include/pmix_globals.h"
#include "src/mca/ptl/ptl.h"
#include "src/mca/psensor/base/base.h"
#include "psensor_heartbeat.h"
/* declare the API functions */
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id);
/* instantiate the module: the start/stop function table of the heartbeat
 * sensor */
pmix_psensor_base_module_t pmix_psensor_heartbeat_module = {
.start = heartbeat_start,
.stop = heartbeat_stop
};
/* tracker object: one per heartbeat-monitoring request accepted by
 * heartbeat_start() */
typedef struct {
pmix_list_item_t super;
/* peer being monitored; retained for the tracker's lifetime */
pmix_peer_t *requestor;
/* identifier matched against the id given to heartbeat_stop(); nothing
 * in this file assigns it */
char *id;
/* true once the checking timer has been armed by add_tracker() */
bool event_active;
/* periodic timer (callback: check_heartbeat) */
pmix_event_t ev;
/* one-shot event used to run add_tracker() in the event base */
pmix_event_t cdev;
/* length of one checking window; tv_sec comes from
 * PMIX_MONITOR_HEARTBEAT_TIME */
struct timeval tv;
/* heartbeats counted by add_beat() in the current window */
uint32_t nbeats;
/* value of the PMIX_MONITOR_HEARTBEAT_DROPS directive; stored but not
 * consulted by check_heartbeat() */
uint32_t ndrops;
/* zeroed by the constructor; not otherwise referenced in this file */
uint32_t nmissed;
/* status supplied by the caller of heartbeat_start() */
pmix_status_t error;
/* range passed to PMIx_Notify_event (PMIX_RANGE directive) */
pmix_data_range_t range;
/* info array passed to PMIx_Notify_event; freed by the destructor */
pmix_info_t *info;
size_t ninfo;
} pmix_heartbeat_trkr_t;
/*
 * Constructor for pmix_heartbeat_trkr_t: nothing owned, no timer armed,
 * all counters at zero, namespace-wide notification range.
 */
static void ft_constructor(pmix_heartbeat_trkr_t *ft)
{
    /* owned resources */
    ft->requestor = NULL;
    ft->id = NULL;
    ft->info = NULL;
    ft->ninfo = 0;

    /* timer state */
    ft->event_active = false;
    ft->tv.tv_usec = 0;
    ft->tv.tv_sec = 0;

    /* counters */
    ft->nmissed = 0;
    ft->ndrops = 0;
    ft->nbeats = 0;

    /* notification defaults */
    ft->range = PMIX_RANGE_NAMESPACE;
    ft->error = PMIX_SUCCESS;
}
/*
 * Destructor for pmix_heartbeat_trkr_t: drop the reference on the
 * requestor, cancel the checking timer if it was armed, and free the id
 * and info array.
 */
static void ft_destructor(pmix_heartbeat_trkr_t *ft)
{
    if (NULL != ft->requestor) {
        PMIX_RELEASE(ft->requestor);
    }
    if (NULL != ft->id) {
        free(ft->id);
    }
    /* the flag is a member of the tracker, not a free-standing variable */
    if (ft->event_active) {
        pmix_event_del(&ft->ev);
    }
    if (NULL != ft->info) {
        PMIX_INFO_FREE(ft->info, ft->ninfo);
    }
}
PMIX_CLASS_INSTANCE(pmix_heartbeat_trkr_t,
                    pmix_list_item_t,
                    ft_constructor, ft_destructor);
/* Local caddy: carries a heartbeat_stop() request into the event base,
 * where del_tracker() consumes and releases it */
typedef struct {
pmix_object_t super;
/* event used to schedule del_tracker() */
pmix_event_t ev;
/* peer whose trackers are to be removed (retained by heartbeat_stop()) */
pmix_peer_t *requestor;
/* heap copy of the id given to heartbeat_stop(); NULL matches every
 * tracker of the requestor in del_tracker() */
char *id;
} heartbeat_caddy_t;
/* constructor: start with nothing owned */
static void cd_con(heartbeat_caddy_t *p)
{
p->requestor = NULL;
p->id = NULL;
}
/* destructor: drop the peer reference and free the id copy */
static void cd_des(heartbeat_caddy_t *p)
{
if (NULL != (p->requestor)) {
PMIX_RELEASE(p->requestor);
}
if (NULL != p->id) {
free(p->id);
}
}
PMIX_CLASS_INSTANCE(heartbeat_caddy_t,
pmix_object_t,
cd_con, cd_des);
/* Carrier for one received heartbeat: created by
 * pmix_psensor_heartbeat_recv_beats() and consumed by add_beat() */
typedef struct {
pmix_object_t super;
/* event used to schedule add_beat() */
pmix_event_t ev;
/* peer the heartbeat came from (retained by the receiver) */
pmix_peer_t *peer;
} pmix_psensor_beat_t;
/* constructor: no peer yet */
static void bcon(pmix_psensor_beat_t *p)
{
p->peer = NULL;
}
/* destructor: drop the reference on the peer, if one was set */
static void bdes(pmix_psensor_beat_t *p)
{
if (NULL != p->peer) {
PMIX_RELEASE(p->peer);
}
}
PMIX_CLASS_INSTANCE(pmix_psensor_beat_t,
pmix_object_t,
bcon, bdes);
static void check_heartbeat(int fd, short dummy, void *arg);
/* Event callback scheduled by heartbeat_start(): registers the tracker
 * passed in cbdata and arms its periodic checking timer. sd and flags are
 * unused. */
static void add_tracker(int sd, short flags, void *cbdata)
{
pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
/* add the tracker to our list */
pmix_list_append(&mca_psensor_heartbeat_component.trackers, &ft->super);
/* setup the timer event */
pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev,
check_heartbeat, ft);
pmix_event_evtimer_add(&ft->ev, &ft->tv);
/* lets the destructor know there is a timer to cancel */
ft->event_active = true;
}
/*
 * Start heartbeat monitoring of a requestor.
 *
 * requestor  - peer to be monitored; a reference is retained
 * error      - status recorded in the tracker
 * monitor    - must carry the PMIX_MONITOR_HEARTBEAT key
 * directives - ndirs infos giving the checking interval in seconds, the
 *              drop count and the notification range
 *
 * Returns PMIX_ERR_TAKE_NEXT_OPTION when the request is not for
 * heartbeats, PMIX_ERR_BAD_PARAM when no interval was given, and
 * PMIX_SUCCESS once the tracker has been handed to the event base.
 */
static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error,
                                     const pmix_info_t *monitor,
                                     const pmix_info_t directives[], size_t ndirs)
{
    pmix_heartbeat_trkr_t *ft;
    size_t n;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] checking heartbeat monitoring for requestor %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         requestor->info->nptr->nspace, requestor->info->rank));

    /* if they didn't ask for heartbeats, then nothing for us to do */
    if (0 != strcmp(monitor->key, PMIX_MONITOR_HEARTBEAT)) {
        return PMIX_ERR_TAKE_NEXT_OPTION;
    }

    /* setup to track this monitoring operation */
    ft = PMIX_NEW(pmix_heartbeat_trkr_t);
    PMIX_RETAIN(requestor);
    ft->requestor = requestor;
    ft->error = error;

    /* check the directives to see what they want monitored */
    for (n=0; n < ndirs; n++) {
        if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_TIME)) {
            ft->tv.tv_sec = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_DROPS)) {
            ft->ndrops = directives[n].value.data.uint32;
        } else if (0 == strcmp(directives[n].key, PMIX_RANGE)) {
            ft->range = directives[n].value.data.range;
        }
    }

    if (0 == ft->tv.tv_sec) {
        /* no heartbeat interval was specified */
        PMIX_RELEASE(ft);
        return PMIX_ERR_BAD_PARAM;
    }

    /* need to push into our event base to add this to our trackers */
    pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, add_tracker, ft);
    pmix_event_active(&ft->cdev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/*
 * Event callback scheduled by heartbeat_stop(): remove and release every
 * tracker that belongs to the caddy's requestor and, when the caddy
 * carries an id, whose id matches it. The caddy itself is released before
 * returning. sd and flags are unused.
 */
static void del_tracker(int sd, short flags, void *cbdata)
{
    heartbeat_caddy_t *req = (heartbeat_caddy_t*)cbdata;
    pmix_heartbeat_trkr_t *trk, *nxt;

    PMIX_LIST_FOREACH_SAFE(trk, nxt, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
        /* only trackers owned by this requestor are candidates */
        if (trk->requestor != req->requestor) {
            continue;
        }
        /* with an id given, the tracker must carry the same id */
        if (NULL != req->id &&
            (NULL == trk->id || 0 != strcmp(trk->id, req->id))) {
            continue;
        }
        pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &trk->super);
        PMIX_RELEASE(trk);
    }

    PMIX_RELEASE(req);
}
/*
 * Stop heartbeat monitoring for a requestor.
 *
 * requestor - peer whose trackers are to be removed; a reference is
 *             retained until the request has been processed
 * id        - identifier of the monitoring operation to stop, or NULL to
 *             stop every operation of the requestor
 *
 * The removal itself happens in del_tracker(), which is scheduled on the
 * psensor event base. Always returns PMIX_SUCCESS.
 */
static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id)
{
    heartbeat_caddy_t *cd;

    cd = PMIX_NEW(heartbeat_caddy_t);
    PMIX_RETAIN(requestor);
    cd->requestor = requestor;
    /* del_tracker() treats a NULL id as "all trackers", so NULL is a legal
     * argument here - and strdup(NULL) is undefined behavior */
    if (NULL != id) {
        cd->id = strdup(id);
    }

    /* need to push into our event base to remove this from our trackers */
    pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1,
                      EV_WRITE, del_tracker, cd);
    pmix_event_active(&cd->ev, EV_WRITE, 1);

    return PMIX_SUCCESS;
}
/* Completion callback given to PMIx_Notify_event by check_heartbeat():
 * cbdata is the tracker that raised the alert, which is released here. The
 * status argument is ignored. */
static void opcbfunc(pmix_status_t status, void *cbdata)
{
pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
PMIX_RELEASE(ft);
}
/* this function automatically gets periodically called
 * by the event library so we can check on the state
 * of the various procs we are monitoring
 *
 * cbdata is the pmix_heartbeat_trkr_t whose window just expired; fd and
 * dummy are unused. If no beat was counted during the window the tracker
 * is taken off the list and a PMIX_MONITOR_HEARTBEAT_ALERT event is
 * generated (the tracker is released in opcbfunc()); otherwise the beat
 * count is reset and the timer re-armed.
 *
 * NOTE(review): a single empty window triggers the alert - the tracker's
 * ndrops value is not consulted here; confirm that is intended.
 */
static void check_heartbeat(int fd, short dummy, void *cbdata)
{
    pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata;
    pmix_status_t rc;
    pmix_proc_t source;

    PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                         "[%s:%d] sensor:check_heartbeat for proc %s:%d",
                         pmix_globals.myid.nspace, pmix_globals.myid.rank,
                         ft->requestor->info->nptr->nspace, ft->requestor->info->rank));

    if (0 == ft->nbeats) {
        /* no heartbeat recvd in last window */
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] sensor:check_heartbeat failed for proc %s:%d",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank,
                             ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
        /* stop monitoring this client */
        pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super);
        /* generate an event; zero the proc first so that nspace ends in a
         * NUL even when strncpy copies the full PMIX_MAX_NSLEN characters
         * (assumes nspace has room for PMIX_MAX_NSLEN+1 bytes - TODO confirm) */
        memset(&source, 0, sizeof(source));
        (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN);
        source.rank = ft->requestor->info->rank;
        rc = PMIx_Notify_event(PMIX_MONITOR_HEARTBEAT_ALERT, &source,
                               ft->range, ft->info, ft->ninfo, opcbfunc, ft);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
        }
        return;
    } else {
        PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output,
                             "[%s:%d] sensor:check_heartbeat detected %d beats for proc %s:%d",
                             pmix_globals.myid.nspace, pmix_globals.myid.rank, ft->nbeats,
                             ft->requestor->info->nptr->nspace, ft->requestor->info->rank));
    }

    /* reset for next period */
    ft->nbeats = 0;
    /* reset the timer */
    pmix_event_evtimer_add(&ft->ev, &ft->tv);
}
/*
 * Event callback scheduled by pmix_psensor_heartbeat_recv_beats(): credit
 * one heartbeat to the first tracker whose requestor is the peer carried
 * in cbdata, then release the carrier. Beats from peers without a tracker
 * are silently dropped. sd and args are unused.
 */
static void add_beat(int sd, short args, void *cbdata)
{
    pmix_psensor_beat_t *beat = (pmix_psensor_beat_t*)cbdata;
    pmix_heartbeat_trkr_t *trk;

    PMIX_LIST_FOREACH(trk, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) {
        if (trk->requestor != beat->peer) {
            continue;
        }
        /* found the peer - count the beat and stop looking */
        trk->nbeats++;
        break;
    }

    PMIX_RELEASE(beat);
}
/* Receive callback for heartbeat messages. Wraps the sending peer (with an
 * added reference) in a pmix_psensor_beat_t and schedules add_beat() on
 * the psensor event base, where the beat is counted. hdr, buf and cbdata
 * are not examined. */
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
pmix_ptl_hdr_t *hdr,
pmix_buffer_t *buf, void *cbdata)
{
pmix_psensor_beat_t *b;
b = PMIX_NEW(pmix_psensor_beat_t);
/* the carrier's destructor drops this reference */
PMIX_RETAIN(peer);
b->peer = peer;
/* shift this to our thread for processing */
pmix_event_assign(&b->ev, pmix_psensor_base.evbase, -1,
EV_WRITE, add_beat, b);
pmix_event_active(&b->ev, EV_WRITE, 1);
}

Просмотреть файл

@ -0,0 +1,43 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef PMIX_PSENSOR_HEARTBEAT_H
#define PMIX_PSENSOR_HEARTBEAT_H
#include <src/include/pmix_config.h>
#include <src/include/types.h>
#include "src/class/pmix_list.h"
#include "src/include/pmix_globals.h"
#include "src/mca/psensor/psensor.h"
BEGIN_C_DECLS
/* Heartbeat sensor component: the generic psensor component extended
 * with the list of heartbeat trackers. */
typedef struct {
pmix_psensor_base_component_t super; /* generic psensor component - must be first */
pmix_list_t trackers; /* trackers searched when a heartbeat arrives; constructed in
                       * the component open function, destructed in close */
} pmix_psensor_heartbeat_component_t;
/* the single instance of the heartbeat component */
PMIX_EXPORT extern pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component;
/* the module handed back by the component's query function */
extern pmix_psensor_base_module_t pmix_psensor_heartbeat_module;
/* PTL receive callback registered by the component for the
 * PMIX_PTL_TAG_HEARTBEAT tag */
void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer,
pmix_ptl_hdr_t *hdr,
pmix_buffer_t *buf, void *cbdata);
END_C_DECLS
#endif

Просмотреть файл

@ -0,0 +1,81 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include <src/include/pmix_config.h>
#include <pmix_common.h>
#include "src/mca/ptl/ptl.h"
#include "src/mca/psensor/base/base.h"
#include "src/mca/psensor/heartbeat/psensor_heartbeat.h"
/*
* Local functions
*/
static int heartbeat_open(void);
static int heartbeat_close(void);
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority);
/* The heartbeat component instance. Only the embedded base component is
 * initialized statically; the trackers list is constructed at run time by
 * heartbeat_open(). */
pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = {
.super = {
.base = {
PMIX_PSENSOR_BASE_VERSION_1_0_0,
/* Component name and version */
.pmix_mca_component_name = "heartbeat",
PMIX_MCA_BASE_MAKE_VERSION(component,
PMIX_MAJOR_VERSION,
PMIX_MINOR_VERSION,
PMIX_RELEASE_VERSION),
/* Component open and close functions */
heartbeat_open, /* component open */
heartbeat_close, /* component close */
heartbeat_query /* component query */
}
}
};
/**
* component open/close/init function
*/
static int heartbeat_open(void)
{
PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t);
/* setup to receive heartbeats */
pmix_ptl.recv(pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT);
return PMIX_SUCCESS;
}
/* Component query: hand back the heartbeat module together with its
 * priority. Always succeeds. */
static int heartbeat_query(pmix_mca_base_module_t **module, int *priority)
{
    *module = (pmix_mca_base_module_t *)&pmix_psensor_heartbeat_module;
    /* the original author marks the actual priority value as irrelevant */
    *priority = 5;

    return PMIX_SUCCESS;
}
/**
* Close all subsystems.
*/
static int heartbeat_close(void)
{
/* cancel our persistent recv */
pmix_ptl.cancel(pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT);
PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers);
return PMIX_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,86 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef PMIX_PSENSOR_H_
#define PMIX_PSENSOR_H_
#include <src/include/pmix_config.h>
#include "src/class/pmix_list.h"
#include "src/mca/mca.h"
#include "src/include/pmix_globals.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* start a sensor operation:
 *
 * requestor - the process requesting this operation
 *
 * error - a PMIx status code accompanying the request
 *         NOTE(review): presumably the code to be reported when the
 *         monitor trips - confirm against the module implementations
 *
 * monitor - a PMIx attribute specifying what is to be monitored
 *
 * directives - an array of pmix_info_t specifying relevant limits on values, and action
 *              to be taken when limits exceeded. Can include
 *              user-provided "id" string
 *
 * ndirs - number of elements in the directives array */
typedef pmix_status_t (*pmix_psensor_base_module_start_fn_t)(pmix_peer_t *requestor, pmix_status_t error,
const pmix_info_t *monitor,
const pmix_info_t directives[], size_t ndirs);
/* stop a sensor operation:
 *
 * requestor - the process requesting this operation
 *
 * id - the "id" string provided by the user at the time the
 *      affected monitoring operation was started. A NULL indicates
 *      that all operations started by this requestor are to
 *      be terminated */
typedef pmix_status_t (*pmix_psensor_base_module_stop_fn_t)(pmix_peer_t *requestor,
char *id);
/* API module */
/*
 * Ver 1.0 - the function table every psensor module provides
 */
typedef struct pmix_psensor_base_module_1_0_0_t {
pmix_psensor_base_module_start_fn_t start; /* begin a monitoring operation */
pmix_psensor_base_module_stop_fn_t stop; /* end one, or all, monitoring operations of a requestor */
} pmix_psensor_base_module_t;
/*
 * the standard component data structure: the MCA base component
 * followed by the MCA component data block
 */
typedef struct pmix_psensor_base_component_1_0_0_t {
pmix_mca_base_component_t base;
pmix_mca_base_component_data_t data;
} pmix_psensor_base_component_t;
/*
* Macro for use in components that are of type sensor v1.0.0
*/
#define PMIX_PSENSOR_BASE_VERSION_1_0_0 \
PMIX_MCA_BASE_VERSION_1_0_0("psensor", 1, 0, 0)
/* Global structure for accessing sensor functions
 */
PMIX_EXPORT extern pmix_psensor_base_module_t pmix_psensor; /* holds API function pointers */
END_C_DECLS
#endif /* PMIX_PSENSOR_H_ */

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -73,6 +73,7 @@ struct pmix_ptl_globals_t {
pmix_list_t actives;
bool initialized;
pmix_list_t posted_recvs; // list of pmix_ptl_posted_recv_t
pmix_list_t unexpected_msgs;
int stop_thread[2];
bool listen_thread_active;
pmix_list_t listeners;
@ -93,6 +94,11 @@ PMIX_EXPORT pmix_status_t pmix_ptl_stub_send_oneway(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
PMIX_EXPORT pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
pmix_info_t info[], size_t ninfo);
PMIX_EXPORT pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
pmix_ptl_cbfunc_t cbfunc,
pmix_ptl_tag_t tag);
PMIX_EXPORT pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
PMIX_EXPORT pmix_status_t pmix_ptl_base_start_listening(pmix_info_t *info, size_t ninfo);
PMIX_EXPORT void pmix_ptl_base_stop_listening(void);

Просмотреть файл

@ -61,6 +61,8 @@ pmix_ptl_API_t pmix_ptl = {
.send_recv = pmix_ptl_stub_send_recv,
.send_oneway = pmix_ptl_stub_send_oneway,
.connect_to_peer = pmix_ptl_stub_connect_to_peer,
.recv = pmix_ptl_stub_register_recv,
.cancel = pmix_ptl_stub_cancel_recv,
.start_listening = pmix_ptl_base_start_listening,
.stop_listening = pmix_ptl_base_stop_listening
};
@ -88,6 +90,7 @@ static pmix_status_t pmix_ptl_close(void)
/* the components will cleanup when closed */
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.actives);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.posted_recvs);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.unexpected_msgs);
PMIX_LIST_DESTRUCT(&pmix_ptl_globals.listeners);
return pmix_mca_base_framework_components_close(&pmix_ptl_base_framework, NULL);
@ -99,6 +102,7 @@ static pmix_status_t pmix_ptl_open(pmix_mca_base_open_flag_t flags)
pmix_ptl_globals.initialized = true;
PMIX_CONSTRUCT(&pmix_ptl_globals.actives, pmix_list_t);
PMIX_CONSTRUCT(&pmix_ptl_globals.posted_recvs, pmix_list_t);
PMIX_CONSTRUCT(&pmix_ptl_globals.unexpected_msgs, pmix_list_t);
pmix_ptl_globals.listen_thread_active = false;
PMIX_CONSTRUCT(&pmix_ptl_globals.listeners, pmix_list_t);
pmix_client_globals.myserver.sd = -1;

Просмотреть файл

@ -46,7 +46,7 @@
#include "src/mca/ptl/base/base.h"
static uint32_t current_tag = 1; // 0 is reserved for system purposes
static uint32_t current_tag = PMIX_PTL_TAG_DYNAMIC;
static void _notify_complete(pmix_status_t status, void *cbdata)
{
@ -162,7 +162,7 @@ static pmix_status_t send_msg(int sd, pmix_ptl_send_t *msg)
} else {
iov_count = 1;
}
retry:
retry:
rc = writev(sd, iov, iov_count);
if (PMIX_LIKELY(rc == remain)) {
/* we successfully sent the header and the msg data if any */
@ -521,16 +521,16 @@ void pmix_ptl_base_send_recv(int fd, short args, void *cbdata)
return;
}
/* set the tag */
tag = current_tag++;
/* take the next tag in the sequence */
current_tag++;
if (UINT32_MAX == current_tag ) {
current_tag = PMIX_PTL_TAG_DYNAMIC;
}
tag = current_tag;
if (NULL != ms->cbfunc) {
/* if a callback msg is expected, setup a recv for it */
req = PMIX_NEW(pmix_ptl_posted_recv_t);
/* take the next tag in the sequence */
if (UINT32_MAX == current_tag ) {
current_tag = 1;
}
req->tag = tag;
req->cbfunc = ms->cbfunc;
req->cbdata = ms->cbdata;
@ -597,23 +597,29 @@ void pmix_ptl_base_process_msg(int fd, short flags, void *cbdata)
buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used;
}
msg->data = NULL; // protect the data region
if (NULL != rcv->cbfunc) {
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
}
rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata);
PMIX_DESTRUCT(&buf); // free's the msg data
/* also done with the recv, if not a wildcard or the error tag */
if (UINT32_MAX != rcv->tag && 0 != rcv->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
}
PMIX_RELEASE(msg);
return;
}
/* done with the recv if it is a dynamic tag */
if (PMIX_PTL_TAG_DYNAMIC <= rcv->tag && UINT_MAX != rcv->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
}
PMIX_RELEASE(msg);
return;
}
}
/* we get here if no matching recv was found - this is an error */
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
PMIX_RELEASE(msg);
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
/* if the tag in this message is above the dynamic marker, then
* that is an error */
if (PMIX_PTL_TAG_DYNAMIC <= msg->hdr.tag) {
pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag);
PMIX_RELEASE(msg);
PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete);
return;
}
/* it is possible that someone may post a recv for this message
* at some point, so we have to hold onto it */
pmix_list_append(&pmix_ptl_globals.unexpected_msgs, &msg->super);
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -105,3 +105,92 @@ pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer,
return PMIX_ERR_UNREACH;
}
/* Event callback that installs a posted recv.
 *
 * cbdata is the pmix_ptl_posted_recv_t to install. It is appended to the
 * list of posted recvs, and every message already sitting on the
 * unexpected-message list whose tag matches (or every message, when the
 * request carries the UINT_MAX tag) is delivered to the request's
 * callback and then removed from that list and released. A matching
 * message is removed even when the request has no callback. */
static void post_recv(int fd, short args, void *cbdata)
{
    pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
    pmix_ptl_recv_t *pending, *upnext;
    pmix_buffer_t payload;

    pmix_output_verbose(5, pmix_globals.debug_output,
                        "posting recv on tag %d", req->tag);

    /* make the request visible to future arrivals */
    pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super);

    /* then drain anything that arrived before the request was posted */
    PMIX_LIST_FOREACH_SAFE(pending, upnext, &pmix_ptl_globals.unexpected_msgs, pmix_ptl_recv_t) {
        if (UINT_MAX != req->tag && pending->hdr.tag != req->tag) {
            /* not for this request - leave it queued */
            continue;
        }

        if (NULL != req->cbfunc) {
            /* wrap the message data in a buffer without copying it */
            PMIX_CONSTRUCT(&payload, pmix_buffer_t);
            if (NULL != pending->data) {
                payload.base_ptr = (char*)pending->data;
                payload.bytes_allocated = payload.bytes_used = pending->hdr.nbytes;
                payload.unpack_ptr = payload.base_ptr;
                payload.pack_ptr = ((char*)payload.base_ptr) + payload.bytes_used;
            }
            /* the buffer now refers to the data region, so detach it from the message */
            pending->data = NULL;
            req->cbfunc(pending->peer, &pending->hdr, &payload, req->cbdata);
            PMIX_DESTRUCT(&payload);  // free's the msg data
        }

        pmix_list_remove_item(&pmix_ptl_globals.unexpected_msgs, &pending->super);
        PMIX_RELEASE(pending);
    }
}
/* Register a recv for the given tag.
 *
 * peer   - not used by this implementation
 * cbfunc - function to invoke for each message received on the tag
 * tag    - the tag to listen on
 *
 * The request is only created here; it is added to the list of posted
 * recvs by post_recv(), which is activated on the pmix_globals event base.
 *
 * Returns PMIX_SUCCESS once the request has been queued, or
 * PMIX_ERR_NOMEM if it could not be allocated. */
pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer,
                                          pmix_ptl_cbfunc_t cbfunc,
                                          pmix_ptl_tag_t tag)
{
    pmix_ptl_posted_recv_t *posting = PMIX_NEW(pmix_ptl_posted_recv_t);

    if (NULL != posting) {
        posting->tag = tag;
        posting->cbfunc = cbfunc;
        /* the recv list is modified only from within an event, so hand
         * the request over rather than touching the list here */
        pmix_event_assign(&(posting->ev), pmix_globals.evbase, -1,
                          EV_WRITE, post_recv, posting);
        pmix_event_active(&(posting->ev), EV_WRITE, 1);
        return PMIX_SUCCESS;
    }

    return PMIX_ERR_NOMEM;
}
static void cancel_recv(int fd, short args, void *cbdata)
{
pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata;
pmix_ptl_posted_recv_t *rcv;
PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) {
if (rcv->tag == req->tag) {
pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super);
PMIX_RELEASE(rcv);
PMIX_RELEASE(req);
return;
}
}
PMIX_RELEASE(req);
}
/* Cancel the recv posted for the given tag.
 *
 * peer - not used by this implementation
 * tag  - the tag whose posted recv is to be removed
 *
 * The removal itself is done by cancel_recv(), which is activated on the
 * pmix_globals event base; this function only queues the request.
 *
 * Returns PMIX_SUCCESS once the request has been queued, or
 * PMIX_ERR_NOMEM if the carrier object could not be allocated. */
pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer,
                                        pmix_ptl_tag_t tag)
{
    pmix_ptl_posted_recv_t *carrier = PMIX_NEW(pmix_ptl_posted_recv_t);

    if (NULL != carrier) {
        carrier->tag = tag;
        /* the recv list is modified only from within an event */
        pmix_event_assign(&(carrier->ev), pmix_globals.evbase, -1,
                          EV_WRITE, cancel_recv, carrier);
        pmix_event_active(&(carrier->ev), EV_WRITE, 1);
        return PMIX_SUCCESS;
    }

    return PMIX_ERR_NOMEM;
}

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Mellanox Technologies, Inc.
@ -110,6 +110,15 @@ typedef pmix_status_t (*pmix_ptl_send_fn_t)(struct pmix_peer_t *peer,
pmix_buffer_t *bfr,
pmix_ptl_tag_t tag);
/* (ONE-WAY) register a persistent recv */
typedef pmix_status_t (*pmix_ptl_recv_fn_t)(struct pmix_peer_t *peer,
pmix_ptl_cbfunc_t cbfunc,
pmix_ptl_tag_t tag);
/* Cancel a persistent recv */
typedef pmix_status_t (*pmix_ptl_cancel_fn_t)(struct pmix_peer_t *peer,
pmix_ptl_tag_t tag);
/* connect to a peer - this is a blocking function
* to establish a connection to a peer. It assigns
* the corresponding module to the peer's compat
@ -126,6 +135,8 @@ struct pmix_ptl_module_t {
pmix_ptl_finalize_fn_t finalize;
pmix_ptl_send_recv_fn_t send_recv;
pmix_ptl_send_fn_t send;
pmix_ptl_recv_fn_t recv;
pmix_ptl_cancel_fn_t cancel;
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
};
typedef struct pmix_ptl_module_t pmix_ptl_module_t;
@ -152,6 +163,8 @@ typedef struct {
pmix_ptl_get_available_modules_fn_t get_available_modules;
pmix_ptl_send_recv_fn_t send_recv;
pmix_ptl_send_fn_t send_oneway;
pmix_ptl_recv_fn_t recv;
pmix_ptl_cancel_fn_t cancel;
pmix_ptl_connect_to_peer_fn_t connect_to_peer;
pmix_ptl_start_listening_fn_t start_listening;
pmix_ptl_stop_listening_fn_t stop_listening;

Просмотреть файл

@ -63,6 +63,16 @@ struct pmix_ptl_module_t;
/**** MESSAGING STRUCTURES ****/
typedef uint32_t pmix_ptl_tag_t;
/* define a range of "reserved" tags - these
* are tags that are used for persistent recvs
* within the system */
#define PMIX_PTL_TAG_NOTIFY 0
#define PMIX_PTL_TAG_HEARTBEAT 1
/* define the start of dynamic tags that are
* assigned for send/recv operations */
#define PMIX_PTL_TAG_DYNAMIC 100
/* header for messages */
typedef struct {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -13,6 +13,11 @@
#include "pmix_config.h"
#include <pthread.h>
#include PMIX_EVENT_HEADER
#include "src/include/types.h"
/**
* Initialize a progress thread name; if a progress thread is not
* already associated with that name, start a progress thread.

Просмотреть файл

@ -2345,6 +2345,18 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag,
return rc;
}
if (PMIX_JOB_CONTROL_CMD == cmd) {
PMIX_PEER_CADDY(cd, peer, tag);
rc = pmix_server_job_ctrl(peer, buf, query_cbfunc, cd);
return rc;
}
if (PMIX_MONITOR_CMD == cmd) {
PMIX_PEER_CADDY(cd, peer, tag);
rc = pmix_server_monitor(peer, buf, query_cbfunc, cd);
return rc;
}
return PMIX_ERR_NOT_SUPPORTED;
}

Просмотреть файл

@ -1562,6 +1562,134 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
return rc;
}
/* Service a job control request received from a client.
 *
 * peer   - the client that sent the request
 * buf    - packed request: number of targets, the targets, number of
 *          info directives, the directives
 * cbfunc - callback handed to the host for delivering the result
 * cbdata - caller's data, stored in the query caddy that is passed to
 *          the host as the callback data
 *
 * Returns PMIX_SUCCESS when the host accepted the request (the caddy is
 * then left with the host), PMIX_ERR_NOT_SUPPORTED when the host provides
 * no job_control entry point, PMIX_ERR_NOMEM when the caddy cannot be
 * allocated, or the unpack/host error otherwise (the caddy is released
 * in that case). */
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
                                   pmix_buffer_t *buf,
                                   pmix_info_cbfunc_t cbfunc,
                                   void *cbdata)
{
    int32_t cnt;
    pmix_status_t rc;
    pmix_query_caddy_t *cd;
    pmix_proc_t proc;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "recvd job control request from client");

    if (NULL == pmix_host_server.job_control) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    cd = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cd) {
        return PMIX_ERR_NOMEM;
    }
    cd->cbdata = cbdata;

    /* unpack the number of targets */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ntargets, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    if (0 < cd->ntargets) {
        PMIX_PROC_CREATE(cd->targets, cd->ntargets);
        cnt = cd->ntargets;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->targets, &cnt, PMIX_PROC))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* unpack the number of info objects */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    /* unpack the info */
    if (0 < cd->ninfo) {
        PMIX_INFO_CREATE(cd->info, cd->ninfo);
        cnt = cd->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* setup the requesting peer name - proc lives on the stack and strncpy
     * does not terminate the copy when the source fills all PMIX_MAX_NSLEN
     * characters, so terminate it explicitly */
    (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
    proc.nspace[PMIX_MAX_NSLEN] = '\0';
    proc.rank = peer->info->rank;

    /* ask the host to execute the request */
    if (PMIX_SUCCESS != (rc = pmix_host_server.job_control(&proc,
                                                           cd->targets, cd->ntargets,
                                                           cd->info, cd->ninfo,
                                                           cbfunc, cd))) {
        goto exit;
    }
    return PMIX_SUCCESS;

  exit:
    PMIX_RELEASE(cd);
    return rc;
}
/* Service a monitoring request received from a client.
 *
 * peer   - the client that sent the request
 * buf    - packed request: a status code, number of directives, the
 *          directives
 * cbfunc - callback handed to the host for delivering the result
 * cbdata - caller's data, stored in the query caddy that is passed to
 *          the host as the callback data
 *
 * Returns PMIX_SUCCESS when the host accepted the request (the caddy is
 * then left with the host), PMIX_ERR_NOT_SUPPORTED when the host provides
 * no monitor entry point, PMIX_ERR_NOMEM when the caddy cannot be
 * allocated, or the unpack/host error otherwise (the caddy is released
 * in that case). */
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
                                  pmix_buffer_t *buf,
                                  pmix_info_cbfunc_t cbfunc,
                                  void *cbdata)
{
    int32_t cnt;
    pmix_status_t rc, error;
    pmix_query_caddy_t *cd;
    pmix_proc_t proc;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "recvd monitor request from client");

    if (NULL == pmix_host_server.monitor) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    cd = PMIX_NEW(pmix_query_caddy_t);
    if (NULL == cd) {
        return PMIX_ERR_NOMEM;
    }
    cd->cbdata = cbdata;

    /* unpack the error code */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &error, &cnt, PMIX_STATUS))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }

    /* unpack the number of directives */
    cnt = 1;
    if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) {
        PMIX_ERROR_LOG(rc);
        goto exit;
    }
    /* unpack the directives */
    if (0 < cd->ninfo) {
        PMIX_INFO_CREATE(cd->info, cd->ninfo);
        cnt = cd->ninfo;
        if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) {
            PMIX_ERROR_LOG(rc);
            goto exit;
        }
    }

    /* setup the requesting peer name - proc lives on the stack and strncpy
     * does not terminate the copy when the source fills all PMIX_MAX_NSLEN
     * characters, so terminate it explicitly */
    (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN);
    proc.nspace[PMIX_MAX_NSLEN] = '\0';
    proc.rank = peer->info->rank;

    /* ask the host to execute the request */
    if (PMIX_SUCCESS != (rc = pmix_host_server.monitor(&proc, error,
                                                       cd->info, cd->ninfo,
                                                       cbfunc, cd))) {
        goto exit;
    }
    return PMIX_SUCCESS;

  exit:
    PMIX_RELEASE(cd);
    return rc;
}
/***** INSTANCE SERVER LIBRARY CLASSES *****/
static void tcon(pmix_server_trkr_t *t)
{

Просмотреть файл

@ -218,6 +218,16 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
pmix_status_t pmix_server_monitor(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer,
pmix_buffer_t *buf,
pmix_op_cbfunc_t cbfunc,

Просмотреть файл

@ -56,6 +56,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
return "INVALID-KEYVAL";
case PMIX_ERR_INVALID_NUM_PARSED:
return "INVALID-NUM-PARSED";
case PMIX_ERR_TAKE_NEXT_OPTION:
return "TAKE-NEXT-OPTION";
case PMIX_ERR_INVALID_ARGS:
return "INVALID-ARGS";
@ -157,6 +159,14 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum)
return "PMIX_ERR_WILDCARD";
case PMIX_NOTIFY_ALLOC_COMPLETE:
return "PMIX ALLOC OPERATION COMPLETE";
case PMIX_JCTRL_CHECKPOINT:
return "PMIX JOB CONTROL CHECKPOINT";
case PMIX_JCTRL_PREEMPT_ALERT:
return "PMIX PRE-EMPTION ALERT";
case PMIX_MONITOR_HEARTBEAT_ALERT:
return "PMIX HEARTBEAT ALERT";
case PMIX_MONITOR_FILE_ALERT:
return "PMIX FILE MONITOR ALERT";
case PMIX_SUCCESS:
return "SUCCESS";
default:

Просмотреть файл

@ -37,6 +37,7 @@
#define PMIX_ERR_NETWORK_NOT_PARSEABLE (PMIX_INTERNAL_ERR_BASE - 33)
#define PMIX_ERR_FILE_OPEN_FAILURE (PMIX_INTERNAL_ERR_BASE - 34)
#define PMIX_ERR_FILE_READ_FAILURE (PMIX_INTERNAL_ERR_BASE - 35)
#define PMIX_ERR_TAKE_NEXT_OPTION (PMIX_INTERNAL_ERR_BASE - 36)
#define PMIX_ERROR_LOG(r) \
do { \

Просмотреть файл

@ -493,6 +493,12 @@ int pmix2x_convert_rc(pmix_status_t rc)
case PMIX_QUERY_PARTIAL_SUCCESS:
return OPAL_ERR_PARTIAL_SUCCESS;
case PMIX_MONITOR_HEARTBEAT_ALERT:
return OPAL_ERR_HEARTBEAT_ALERT;
case PMIX_MONITOR_FILE_ALERT:
return OPAL_ERR_FILE_ALERT;
case PMIX_ERROR:
return OPAL_ERROR;
case PMIX_SUCCESS:
@ -1333,6 +1339,22 @@ static void pmix2x_log(opal_list_t *info,
OBJ_RELEASE(cd);
}
/* Translate a PMIx allocation directive into its OPAL counterpart.
 * Any directive that has no OPAL equivalent maps to OPAL_PMIX_ALLOC_UNDEF. */
opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir)
{
    if (PMIX_ALLOC_NEW == dir) {
        return OPAL_PMIX_ALLOC_NEW;
    }
    if (PMIX_ALLOC_EXTEND == dir) {
        return OPAL_PMIX_ALLOC_EXTEND;
    }
    if (PMIX_ALLOC_RELEASE == dir) {
        return OPAL_PMIX_ALLOC_RELEASE;
    }
    if (PMIX_ALLOC_REAQUIRE == dir) {
        return OPAL_PMIX_ALLOC_REAQCUIRE;
    }
    return OPAL_PMIX_ALLOC_UNDEF;
}
/**** INSTANTIATE INTERNAL CLASSES ****/
OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t,
opal_list_item_t,

Просмотреть файл

@ -279,6 +279,8 @@ OPAL_MODULE_DECLSPEC void pmix2x_value_load(pmix_value_t *v,
OPAL_MODULE_DECLSPEC int pmix2x_value_unload(opal_value_t *kv,
const pmix_value_t *v);
OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir);
END_C_DECLS
#endif /* MCA_PMIX_EXTERNAL_H */

Просмотреть файл

@ -45,63 +45,73 @@
/* These are the interfaces used by the embedded PMIx server
* to call up into ORTE for service requests */
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
int status, const char msg[],
pmix_proc_t procs[], size_t nprocs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
char *data, size_t ndata,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object,
int status, const char msg[],
pmix_proc_t procs[], size_t nprocs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
char *data, size_t ndata,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_modex_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_publish_fn(const pmix_proc_t *proc,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys,
const pmix_info_t info[], size_t ninfo,
pmix_lookup_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_spawn_fn(const pmix_proc_t *proc,
const pmix_info_t job_info[], size_t ninfo,
const pmix_app_t apps[], size_t napps,
pmix_spawn_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_notify_event(pmix_status_t code,
const pmix_proc_t *source,
pmix_data_range_t range,
pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_query(pmix_proc_t *proct,
pmix_query_t *queryies, size_t nqueries,
pmix_info_cbfunc_t cbfunc,
static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes,
const pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_notify_event(pmix_status_t code,
const pmix_proc_t *source,
pmix_data_range_t range,
pmix_info_t info[], size_t ninfo,
pmix_op_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_query(pmix_proc_t *proct,
pmix_query_t *queryies, size_t nqueries,
pmix_info_cbfunc_t cbfunc,
void *cbdata);
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
pmix_tool_connection_cbfunc_t cbfunc,
void *cbdata);
static void server_tool_connection(pmix_info_t *info, size_t ninfo,
pmix_tool_connection_cbfunc_t cbfunc,
void *cbdata);
static void server_log(const pmix_proc_t *client,
const pmix_info_t data[], size_t ndata,
const pmix_info_t directives[], size_t ndirs,
pmix_op_cbfunc_t cbfunc, void *cbdata);
pmix_server_module_t mymodule = {
static pmix_status_t server_allocate(const pmix_proc_t *client,
pmix_alloc_directive_t directive,
const pmix_info_t data[], size_t ndata,
pmix_info_cbfunc_t cbfunc, void *cbdata);
static pmix_status_t server_job_control(const pmix_proc_t *requestor,
const pmix_proc_t targets[], size_t ntargets,
const pmix_info_t directives[], size_t ndirs,
pmix_info_cbfunc_t cbfunc, void *cbdata);
pmix_server_module_t mymodule = {
.client_connected = server_client_connected_fn,
.client_finalized = server_client_finalized_fn,
.abort = server_abort_fn,
@ -118,7 +128,11 @@ static void server_log(const pmix_proc_t *client,
.notify_event = server_notify_event,
.query = server_query,
.tool_connected = server_tool_connection,
.log = server_log
.log = server_log,
.allocate = server_allocate,
.job_control = server_job_control
/* we do not support monitoring, but use the
* PMIx internal monitoring capability */
};
opal_pmix_server_module_t *host_module = NULL;
@ -1052,3 +1066,117 @@ static void server_log(const pmix_proc_t *proct,
&opalcaddy->apps,
opal_opcbfunc, opalcaddy);
}
/* Upcall from the embedded PMIx server: a client requests a change to
 * its allocation.
 *
 * proct     - the requesting client
 * directive - the kind of allocation change requested
 * data      - array of info describing the request
 * ndata     - number of elements in data
 * cbfunc    - PMIx callback to be invoked with the result
 * cbdata    - data to pass back to cbfunc
 *
 * The arguments are converted to their OPAL forms, stored in a caddy and
 * passed to the host module together with info_cbfunc.
 *
 * Returns PMIX_SUCCESS if the host accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host has no allocate entry point, or the
 * converted OPAL error otherwise (the caddy is released in that case). */
static pmix_status_t server_allocate(const pmix_proc_t *proct,
                                     pmix_alloc_directive_t directive,
                                     const pmix_info_t data[], size_t ndata,
                                     pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix2x_opalcaddy_t *opalcaddy;
    opal_process_name_t requestor;
    int rc;
    size_t n;
    opal_value_t *oinfo;
    opal_pmix_alloc_directive_t odir;

    if (NULL == host_module || NULL == host_module->allocate) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    /* setup the caddy */
    opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
    opalcaddy->infocbfunc = cbfunc;
    opalcaddy->cbdata = cbdata;

    /* convert the requestor */
    if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }
    requestor.vpid = pmix2x_convert_rank(proct->rank);

    /* convert the directive */
    odir = pmix2x_convert_allocdir(directive);

    /* convert the data */
    for (n=0; n < ndata; n++) {
        oinfo = OBJ_NEW(opal_value_t);
        opal_list_append(&opalcaddy->info, &oinfo->super);
        /* the unload below only sees the pmix_value_t, so the key has to
         * be carried across separately or the host cannot tell what each
         * entry means */
        oinfo->key = strdup(data[n].key);
        if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &data[n].value))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
    }

    /* pass the call upwards */
    if (OPAL_SUCCESS != (rc = host_module->allocate(&requestor, odir,
                                                    &opalcaddy->info,
                                                    info_cbfunc, opalcaddy))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }

    return PMIX_SUCCESS;
}
/* Upcall from the embedded PMIx server: a client requests a job control
 * action.
 *
 * proct      - the requesting client
 * targets    - array of procs the action applies to
 * ntargets   - number of elements in targets
 * directives - array of info describing the action
 * ndirs      - number of elements in directives
 * cbfunc     - PMIx callback to be invoked with the result
 * cbdata     - data to pass back to cbfunc
 *
 * The arguments are converted to their OPAL forms, stored in a caddy and
 * passed to the host module together with info_cbfunc.
 *
 * Returns PMIX_SUCCESS if the host accepted the request,
 * PMIX_ERR_NOT_SUPPORTED if the host has no job_control entry point, or
 * the converted OPAL error otherwise (the caddy is released in that
 * case). */
static pmix_status_t server_job_control(const pmix_proc_t *proct,
                                        const pmix_proc_t targets[], size_t ntargets,
                                        const pmix_info_t directives[], size_t ndirs,
                                        pmix_info_cbfunc_t cbfunc, void *cbdata)
{
    pmix2x_opalcaddy_t *opalcaddy;
    opal_process_name_t requestor;
    int rc;
    size_t n;
    opal_value_t *oinfo;
    opal_namelist_t *nm;

    if (NULL == host_module || NULL == host_module->job_control) {
        return PMIX_ERR_NOT_SUPPORTED;
    }

    /* setup the caddy */
    opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t);
    opalcaddy->infocbfunc = cbfunc;
    opalcaddy->cbdata = cbdata;

    /* convert the requestor */
    if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }
    requestor.vpid = pmix2x_convert_rank(proct->rank);

    /* convert the targets */
    for (n=0; n < ntargets; n++) {
        nm = OBJ_NEW(opal_namelist_t);
        opal_list_append(&opalcaddy->procs, &nm->super);
        if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&nm->name.jobid, targets[n].nspace))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
        nm->name.vpid = pmix2x_convert_rank(targets[n].rank);
    }

    /* convert the directives */
    for (n=0; n < ndirs; n++) {
        oinfo = OBJ_NEW(opal_value_t);
        opal_list_append(&opalcaddy->info, &oinfo->super);
        /* the unload below only sees the pmix_value_t, so the key has to
         * be carried across separately or the host cannot tell which
         * directive each entry represents */
        oinfo->key = strdup(directives[n].key);
        if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &directives[n].value))) {
            OBJ_RELEASE(opalcaddy);
            return pmix2x_convert_opalrc(rc);
        }
    }

    /* pass the call upwards */
    if (OPAL_SUCCESS != (rc = host_module->job_control(&requestor,
                                                       &opalcaddy->procs,
                                                       &opalcaddy->info,
                                                       info_cbfunc, opalcaddy))) {
        OBJ_RELEASE(opalcaddy);
        return pmix2x_convert_opalrc(rc);
    }

    return PMIX_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -230,6 +230,19 @@ typedef void (*opal_pmix_connection_cbfunc_t)(int incoming_sd);
typedef int (*opal_pmix_server_listener_fn_t)(int listening_sd,
opal_pmix_connection_cbfunc_t cbfunc);
/* Request allocation modifications on behalf of a client */
typedef int (*opal_pmix_server_alloc_fn_t)(const opal_process_name_t *client,
opal_pmix_alloc_directive_t directive,
opal_list_t *data,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* Execute a job control action on behalf of a client */
typedef int (*opal_pmix_server_job_control_fn_t)(const opal_process_name_t *requestor,
opal_list_t *targets, opal_list_t *directives,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* we do not provide a monitoring capability */
typedef struct opal_pmix_server_module_1_0_0_t {
opal_pmix_server_client_connected_fn_t client_connected;
opal_pmix_server_client_finalized_fn_t client_finalized;
@ -249,6 +262,8 @@ typedef struct opal_pmix_server_module_1_0_0_t {
opal_pmix_server_tool_connection_fn_t tool_connected;
opal_pmix_server_log_fn_t log;
opal_pmix_server_listener_fn_t listener;
opal_pmix_server_alloc_fn_t allocate;
opal_pmix_server_job_control_fn_t job_control;
} opal_pmix_server_module_t;

Просмотреть файл

@ -32,6 +32,11 @@ BEGIN_C_DECLS
* that key */
/* The special rank values are expressions, so each is parenthesized to
 * stay a single operand wherever the macro is expanded (for example
 * "x - OPAL_PMIX_RANK_WILDCARD" or "2 * OPAL_PMIX_RANK_LOCAL_NODE"). */
#define OPAL_PMIX_RANK_WILDCARD     (UINT32_MAX-1)

/* other special rank values will be used to define
 * groups of ranks for use in collectives */
#define OPAL_PMIX_RANK_LOCAL_NODE   (UINT32_MAX-2)    // all ranks on local node
/* define a set of "standard" attributes that can
* be queried. Implementations (and users) are free to extend as
* desired, so the get functions need to be capable
@ -55,12 +60,15 @@ BEGIN_C_DECLS
#define OPAL_PMIX_CONNECT_TO_SYSTEM "pmix.cnct.sys" // (bool) The requestor requires that a connection be made only to
// a local system-level PMIx server
#define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
/* identification attributes */
#define OPAL_PMIX_USERID "pmix.euid" // (uint32_t) effective user id
#define OPAL_PMIX_GRPID "pmix.egid" // (uint32_t) effective group id
/* attributes for the rendezvous socket */
#define OPAL_PMIX_USOCK_DISABLE "pmix.usock.disable" // (bool) disable legacy usock support
#define OPAL_PMIX_SOCKET_MODE "pmix.sockmode" // (uint32_t) POSIX mode_t (9 bits valid)
@ -76,6 +84,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_TCP_DISABLE_IPV4 "pmix.tcp.disipv4" // (bool) true to disable IPv4 family
#define OPAL_PMIX_TCP_DISABLE_IPV6 "pmix.tcp.disipv6" // (bool) true to disable IPv6 family
/* general proc-level attributes */
#define OPAL_PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch
#define OPAL_PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc
@ -89,6 +98,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
#define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories
/* information about relative ranks as assigned by the RM */
#define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier
#define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job
@ -104,25 +114,26 @@ BEGIN_C_DECLS
#define OPAL_PMIX_LOCALLDR "pmix.lldr" // (uint64_t) opal_identifier of lowest rank on this node within this job
#define OPAL_PMIX_APPLDR "pmix.aldr" // (uint32_t) lowest rank in this app within this job
#define OPAL_PMIX_PROC_PID "pmix.ppid" // (pid_t) pid of specified proc
/**** no PMIx equivalent ****/
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_SESSION_ID "pmix.session.id" // (uint32_t) session identifier
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of
// whether or not they currently host procs.
#define OPAL_PMIX_HOSTNAME "pmix.hname" // (char*) name of the host the specified proc is on
#define OPAL_PMIX_NODEID "pmix.nodeid" // (uint32_t) node identifier
#define OPAL_PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace
#define OPAL_PMIX_LOCAL_PROCS "pmix.lprocs" // (opal_list_t*) list of opal_namelist_t of procs on the specified node
#define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace
#define OPAL_PMIX_PROC_URI "opal.puri" // (char*) URI containing contact info for proc - NOTE: this is published by procs and
// thus cannot be prefixed with "pmix"
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
/* Memory info */
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon
#define OPAL_PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes
/* size info */
#define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace
#define OPAL_PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job
@ -133,11 +144,15 @@ BEGIN_C_DECLS
#define OPAL_PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job
#define OPAL_PMIX_NUM_NODES "pmix.num.nodes" // (uint32_t) #nodes in this nspace
/* topology info */
#define OPAL_PMIX_NET_TOPO "pmix.ntopo" // (char*) xml-representation of network topology
#define OPAL_PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) xml-representation of local node topology
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for this job
#define OPAL_PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) pointer to the PMIx client's internal topology object
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
/* request-related info */
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
@ -156,16 +171,19 @@ BEGIN_C_DECLS
#define OPAL_PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the
// specified operation
/* attribute used by host server to pass data to the server convenience library - the
* data will then be parsed and provided to the local clients */
#define OPAL_PMIX_PROC_DATA "pmix.pdata" // (pmix_value_array_t) starts with rank, then contains more data
#define OPAL_PMIX_NODE_MAP "pmix.nmap" // (char*) regex of nodes containing procs for this job
#define OPAL_PMIX_PROC_MAP "pmix.pmap" // (char*) regex describing procs on each node within this job
/* attributes used internally to communicate data from the server to the client */
#define OPAL_PMIX_PROC_BLOB "pmix.pblob" // (pmix_byte_object_t) packed blob of process data
#define OPAL_PMIX_MAP_BLOB "pmix.mblob" // (pmix_byte_object_t) packed blob of process location
/* error handler registration and notification info keys */
#define OPAL_PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler
#define OPAL_PMIX_EVENT_JOB_LEVEL "pmix.evjob" // (bool) register for job-specific events only
@ -187,7 +205,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response
/* attributes used to describe "spawm" attributes */
/* attributes used to describe "spawn" attributes */
#define OPAL_PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use
#define OPAL_PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs
#define OPAL_PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs
@ -229,19 +247,89 @@ BEGIN_C_DECLS
#define OPAL_PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only
#define OPAL_PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values
#define OPAL_PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value
#define OPAL_PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status
// is being requested
#define OPAL_PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
// for the specified nspace
/* log attributes */
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless
#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere
#define OPAL_PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives
#define OPAL_PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg
#define OPAL_PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email
#define OPAL_PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email
/* debugger attributes */
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start
#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init
#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification
#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are
#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release
/* Resource Manager identification */
#define OPAL_PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager
#define OPAL_PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string
/* attributes for setting envars */
#define OPAL_PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment
#define OPAL_PMIX_UNSET_ENVAR "pmix.unset.envar" // (char*) unset envar specified in string
/* attributes relating to allocations */
#define OPAL_PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request
// which can later be used to query status of the request
#define OPAL_PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes
#define OPAL_PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes
#define OPAL_PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus
#define OPAL_PMIX_ALLOC_NUM_CPU_LIST "pmix.alloc.ncpulist" // (char*) regex of #cpus for each node
#define OPAL_PMIX_ALLOC_CPU_LIST "pmix.alloc.cpulist" // (char*) regex of specific cpus indicating the cpus involved.
#define OPAL_PMIX_ALLOC_MEM_SIZE "pmix.alloc.msize" // (float) number of Mbytes
#define OPAL_PMIX_ALLOC_NETWORK "pmix.alloc.net" // (array) array of pmix_info_t describing network resources. If not
// given as part of an info struct that identifies the
// impacted nodes, then the description will be applied
// across all nodes in the requestor's allocation
#define OPAL_PMIX_ALLOC_NETWORK_ID "pmix.alloc.netid" // (char*) name of network
#define OPAL_PMIX_ALLOC_BANDWIDTH "pmix.alloc.bw" // (float) Mbits/sec
#define OPAL_PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level
#define OPAL_PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds
/* job control attributes
 *
 * Each key string must be unique: attributes are matched by comparing
 * these strings, so two names sharing one string are indistinguishable.
 * NOTE(review): the strings are expected to match the keys used by the
 * PMIx library in use - keep them in sync with its pmix_common.h. */
#define OPAL_PMIX_JOB_CTRL_ID                   "pmix.jctrl.id"         // (char*) provide a string identifier for this request
#define OPAL_PMIX_JOB_CTRL_PAUSE                "pmix.jctrl.pause"      // (bool) pause the specified processes
#define OPAL_PMIX_JOB_CTRL_RESUME               "pmix.jctrl.resume"     // (bool) "un-pause" the specified processes
#define OPAL_PMIX_JOB_CTRL_CANCEL               "pmix.jctrl.cancel"     // (char*) cancel the specified request
                                                                        //         (NULL => cancel all requests from this requestor)
#define OPAL_PMIX_JOB_CTRL_KILL                 "pmix.jctrl.kill"       // (bool) forcibly terminate the specified processes and cleanup
#define OPAL_PMIX_JOB_CTRL_RESTART              "pmix.jctrl.restart"    // (char*) restart the specified processes using the given checkpoint ID
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT           "pmix.jctrl.ckpt"       // (char*) checkpoint the specified processes and assign the given ID to it
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_EVENT     "pmix.jctrl.ckptev"     // (bool) use event notification to trigger process checkpoint
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL    "pmix.jctrl.ckptsig"    // (int) use the given signal to trigger process checkpoint
/* the timeout previously reused the ".ckptsig" key of the signal attribute */
#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT   "pmix.jctrl.ckmout"     // (int) time in seconds to wait for checkpoint to complete
#define OPAL_PMIX_JOB_CTRL_SIGNAL               "pmix.jctrl.sig"        // (int) send given signal to specified processes
#define OPAL_PMIX_JOB_CTRL_PROVISION            "pmix.jctrl.pvn"        // (char*) regex identifying nodes that are to be provisioned
#define OPAL_PMIX_JOB_CTRL_PROVISION_IMAGE      "pmix.jctrl.pvnimg"     // (char*) name of the image that is to be provisioned
#define OPAL_PMIX_JOB_CTRL_PREEMPTIBLE          "pmix.jctrl.preempt"    // (bool) job can be pre-empted
/* monitoring attributes */
#define OPAL_PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats
#define OPAL_PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server
#define OPAL_PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed
#define OPAL_PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking
// specified action
#define OPAL_PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life
#define OPAL_PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running
#define OPAL_PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running
#define OPAL_PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running
#define OPAL_PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file
#define OPAL_PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking
// specified action
/* define a scope for data "put" by PMI per the following:
@ -285,6 +373,16 @@ typedef enum {
} opal_pmix_persistence_t;
/* define allocation request flags
 *
 * OPAL_PMIX_ALLOC_UNDEF is the zero value, so a zero-initialized
 * directive reads as "undefined"; the remaining values follow in order. */
typedef enum {
    OPAL_PMIX_ALLOC_UNDEF = 0,
    OPAL_PMIX_ALLOC_NEW,
    OPAL_PMIX_ALLOC_EXTEND,
    OPAL_PMIX_ALLOC_RELEASE,
    OPAL_PMIX_ALLOC_REAQCUIRE,
    /* correctly spelled alias; the misspelled name above is kept so that
     * existing users continue to compile and both share one value */
    OPAL_PMIX_ALLOC_REACQUIRE = OPAL_PMIX_ALLOC_REAQCUIRE
} opal_pmix_alloc_directive_t;
/**** PMIX INFO STRUCT ****/
/* NOTE: the pmix_info_t is essentially equivalent to the opal_value_t

Просмотреть файл

@ -292,6 +292,12 @@ opal_err2str(int errnum, const char **errmsg)
case OPAL_ERR_EVENT_REGISTRATION:
retval = "Event registration";
break;
case OPAL_ERR_HEARTBEAT_ALERT:
retval = "Heartbeat not received";
break;
case OPAL_ERR_FILE_ALERT:
retval = "File alert - proc may have stalled";
break;
default:
retval = "UNRECOGNIZED";
}

Просмотреть файл

@ -76,7 +76,7 @@ ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat,
orte_app_context_t *app,
char ***env);
ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void);
ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void);
ORTE_DECLSPEC int orte_schizo_base_get_remaining_time(uint32_t *timeleft);
ORTE_DECLSPEC void orte_schizo_base_finalize(void);
END_C_DECLS

Просмотреть файл

@ -162,20 +162,20 @@ orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void)
return ORTE_SCHIZO_UNDETERMINED;
}
/* Ask the active schizo modules, in list order, for the time remaining.
 * Modules that do not implement get_remaining_time are skipped. The
 * first result other than ORTE_ERR_TAKE_NEXT_OPTION is returned as-is;
 * if every module declines, the final return statement reports that no
 * module could answer. */
long orte_schizo_base_get_remaining_time(void)
int orte_schizo_base_get_remaining_time(uint32_t *timeleft)
{
long rc;
int rc;
orte_schizo_base_active_module_t *mod;
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
if (NULL != mod->module->get_remaining_time) {
rc = mod->module->get_remaining_time();
rc = mod->module->get_remaining_time(timeleft);
/* TAKE_NEXT_OPTION means "this module cannot tell" - keep looking */
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
return rc;
}
}
}
return -1;
return ORTE_ERR_NOT_SUPPORTED;
}
void orte_schizo_base_finalize(void)

Просмотреть файл

@ -118,7 +118,7 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void);
* and decides it cannot provide the info in the current situation,
* then it can return ORTE_ERR_TAKE_NEXT_OPTION to indicate that
* another module should be tried */
typedef long (*orte_schizo_base_module_get_rem_time_fn_t)(void);
typedef int (*orte_schizo_base_module_get_rem_time_fn_t)(uint32_t *timeleft);
/*
* schizo module version 1.3.0

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
* $COPYRIGHT$
*
@ -29,10 +29,12 @@
#include "schizo_slurm.h"
static orte_schizo_launch_environ_t check_launch_environment(void);
static int get_remaining_time(uint32_t *timeleft);
static void finalize(void);
/* Slurm schizo module. Only the entry points implemented in this file
 * are named; members not listed in a designated initializer are
 * zero-initialized, i.e. left as NULL function pointers. */
orte_schizo_base_module_t orte_schizo_slurm_module = {
.check_launch_environment = check_launch_environment,
.get_remaining_time = get_remaining_time,
.finalize = finalize
};
@ -123,6 +125,58 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
return myenv;
}
/*
 * Ask Slurm how much time is left in the current allocation.
 *
 * Runs "squeue -h -j $SLURM_JOBID -o %L" and converts the reported
 * time-left string into seconds.
 *
 * timeleft - output; preset to UINT32_MAX so that every error path, and
 *            a non-numeric answer from squeue, reports "no known limit"
 *
 * Returns ORTE_SUCCESS when squeue produced an answer,
 * ORTE_ERR_TAKE_NEXT_OPTION when SLURM_JOBID is not set (so another
 * module can be tried), or an ORTE error code when the command could
 * not be built, started, or read.
 */
static int get_remaining_time(uint32_t *timeleft)
{
    char output[256], *cmd, *jobid, **res, *end;
    FILE *fp;
    uint32_t tleft;
    long lead;
    size_t cnt;

    /* set the default */
    *timeleft = UINT32_MAX;

    if (NULL == (jobid = getenv("SLURM_JOBID"))) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (0 > asprintf(&cmd, "squeue -h -j %s -o %%L", jobid)) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    fp = popen(cmd, "r");
    /* the command string is no longer needed once popen has been tried */
    free(cmd);
    if (NULL == fp) {
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }
    if (NULL == fgets(output, sizeof(output), fp)) {
        /* a popen'd stream must be released with pclose on every path */
        pclose(fp);
        return ORTE_ERR_FILE_READ_FAILURE;
    }
    pclose(fp);

    /* the output is returned in a colon-delimited set of fields */
    res = opal_argv_split(output, ':');
    if (NULL == res) {
        return ORTE_ERR_FILE_READ_FAILURE;
    }
    cnt = opal_argv_count(res);
    if (0 == cnt) {
        /* nothing to index - res[cnt-1] would be out of bounds */
        opal_argv_free(res);
        return ORTE_ERR_FILE_READ_FAILURE;
    }

    /* seconds are always the last field; if no digits were consumed the
     * answer is a keyword (e.g. UNLIMITED or NOT_SET), so keep the default */
    tleft = strtol(res[cnt-1], &end, 10);
    if (end == res[cnt-1]) {
        opal_argv_free(res);
        return ORTE_SUCCESS;
    }
    /* the next field would be minutes */
    if (1 < cnt) {
        tleft += 60 * strtol(res[cnt-2], NULL, 10);
    }
    /* next field would be hours - squeue prefixes it with "days-" once
     * the remaining time exceeds one day, so split on the dash if present */
    if (2 < cnt) {
        lead = strtol(res[cnt-3], &end, 10);
        if ('-' == *end) {
            tleft += 24*3600 * lead;
            tleft += 3600 * strtol(end + 1, NULL, 10);
        } else {
            tleft += 3600 * lead;
        }
    }
    /* a separate colon-delimited days field is still honored */
    if (3 < cnt) {
        tleft += 24*3600 * strtol(res[cnt-4], NULL, 10);
    }
    /* if there are more fields than that, then it is infinite */
    if (4 < cnt) {
        tleft = UINT32_MAX;
    }
    opal_argv_free(res);

    *timeleft = tleft;
    return ORTE_SUCCESS;
}
static void finalize(void)
{
int i;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -38,8 +38,8 @@ orte_schizo_base_component_t mca_schizo_slurm_component = {
static int component_query(mca_base_module_t **module, int *priority)
{
/* disqualify ourselves if we are not an app or under slurm */
if (!ORTE_PROC_IS_APP) {
/* disqualify ourselves if we are not under slurm */
if (NULL == getenv("SLURM_JOBID")) {
*priority = 0;
*module = NULL;
return OPAL_ERROR;
@ -49,4 +49,3 @@ static int component_query(mca_base_module_t **module, int *priority)
*priority = 50;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,39 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_BASE_H
#define MCA_SENSOR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
* MCA Framework
*/
ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework;
/* select a component */
ORTE_DECLSPEC int orte_sensor_base_select(void);
END_C_DECLS
#endif

Просмотреть файл

@ -1,158 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
static bool mods_active = false;
/*
 * Start every registered sensor module for the given job and, if the
 * sampling timer is not already running, create the sample buffer and
 * arm the timer. Does nothing unless a positive sample rate is set.
 */
void orte_sensor_base_start(orte_jobid_t job)
{
    orte_sensor_active_module_t *active;
    int idx;

    /* without a positive rate there is nothing to sample */
    if (orte_sensor_base.rate.tv_sec <= 0) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: starting sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* walk the module array in priority order, skipping empty slots */
    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL == active) {
            continue;
        }
        mods_active = true;
        if (NULL != active->module->start) {
            active->module->start(job);
        }
    }

    /* nothing more to do if no module exists or the timer already runs */
    if (!mods_active || orte_sensor_base.active) {
        return;
    }

    /* buffer that will collect the samples */
    orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    /* periodic wakeup for taking a data sample */
    orte_sensor_base.active = true;
    opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev,
                           orte_sensor_base_sample, NULL);
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Stop sampling for the given job: cancel the sample timer if it is
 * armed, then invoke the stop entry point of every registered module.
 * A no-op when no module was ever started.
 */
void orte_sensor_base_stop(orte_jobid_t job)
{
    orte_sensor_active_module_t *active;
    int idx;

    if (!mods_active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: stopping sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* cancel the pending sample event before stopping the modules */
    if (orte_sensor_base.active) {
        opal_event_del(&orte_sensor_base.sample_ev);
        orte_sensor_base.active = false;
    }

    /* priority order; empty slots and modules without stop are skipped */
    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL != active && NULL != active->module->stop) {
            active->module->stop(job);
        }
    }
}
/*
 * Timer callback: take one sample from every registered module that
 * provides a sample entry point, then re-arm the timer. Returns
 * without re-arming when no module is active or sampling was stopped.
 * The fd, args and cbdata arguments are not used.
 */
void orte_sensor_base_sample(int fd, short args, void *cbdata)
{
    orte_sensor_active_module_t *active;
    int idx;

    /* no modules, or we were ordered to stop */
    if (!mods_active || !orte_sensor_base.active) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: sampling sensors",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* highest to lowest priority - the heartbeat should always be the
     * lowest priority, so it will send any collected data */
    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        if (NULL == active || NULL == active->module->sample) {
            continue;
        }
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "%s sensor:base: sampling component %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            active->component->base_version.mca_component_name);
        active->module->sample();
    }

    /* restart the timer */
    opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate);
}
/*
 * Hand a data buffer to the log entry point of the module whose
 * component name equals comp. Only the first matching module is
 * considered; if it has no log entry point nothing is logged.
 * A NULL comp is ignored.
 */
void orte_sensor_base_log(char *comp, opal_buffer_t *data)
{
    orte_sensor_active_module_t *candidate;
    int idx;

    /* nothing we can do without a component name */
    if (NULL == comp) {
        return;
    }

    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                        "%s sensor:base: logging sensor %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp);

    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        candidate = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        /* skip empty slots and modules belonging to other components */
        if (NULL == candidate ||
            0 != strcmp(comp, candidate->component->base_version.mca_component_name)) {
            continue;
        }
        if (NULL != candidate->module->log) {
            candidate->module->log(data);
        }
        /* the search ends at the first name match */
        return;
    }
}

Просмотреть файл

@ -1,133 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_pointer_array.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/sensor/base/static-components.h"
/*
* Global variables
*/
orte_sensor_base_API_module_t orte_sensor = {
orte_sensor_base_start,
orte_sensor_base_stop
};
orte_sensor_base_t orte_sensor_base = {{{0}}};
/*
* Local variables
*/
static int orte_sensor_base_sample_rate = 0;
/*
 * Register the framework-level MCA variables: the sample rate (in
 * seconds, default 0) and the log_samples switch (default false).
 * Each is also registered under a deprecated synonym that omits the
 * "base" component name. Always returns ORTE_SUCCESS.
 */
static int orte_sensor_base_register(mca_base_register_flag_t flags)
{
    int rate_var, log_var;

    /* sample rate */
    orte_sensor_base_sample_rate = 0;
    rate_var = mca_base_var_register("orte", "sensor", "base", "sample_rate",
                                     "Sample rate in seconds",
                                     MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                     OPAL_INFO_LVL_9,
                                     MCA_BASE_VAR_SCOPE_READONLY,
                                     &orte_sensor_base_sample_rate);
    mca_base_var_register_synonym(rate_var, "orte", "sensor", NULL, "sample_rate",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* whether samples are to be logged */
    orte_sensor_base.log_samples = false;
    log_var = mca_base_var_register("orte", "sensor", "base", "log_samples",
                                    "Log samples to database",
                                    MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                    OPAL_INFO_LVL_9,
                                    MCA_BASE_VAR_SCOPE_READONLY,
                                    &orte_sensor_base.log_samples);
    mca_base_var_register_synonym(log_var, "orte", "sensor", NULL, "log_samples",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    return ORTE_SUCCESS;
}
/*
 * Framework close: call finalize on every registered module that
 * provides one, destruct the module array, then close the remaining
 * components and return that call's status.
 */
static int orte_sensor_base_close(void)
{
    orte_sensor_active_module_t *active;
    int idx;

    for (idx = 0; idx < orte_sensor_base.modules.size; ++idx) {
        active = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, idx);
        /* empty slots and modules without finalize are skipped */
        if (NULL != active && NULL != active->module->finalize) {
            active->module->finalize();
        }
    }
    OBJ_DESTRUCT(&orte_sensor_base.modules);

    /* Close all remaining available components */
    return mca_base_framework_components_close(&orte_sensor_base_framework, NULL);
}
/**
 * Framework open: reset the framework state, build the module array,
 * load the configured sample rate, and open either all MCA components
 * or the one specifically requested via an MCA parameter.
 */
static int orte_sensor_base_open(mca_base_open_flag_t flags)
{
    /* sampling starts out inactive */
    orte_sensor_base.active = false;

    /* sample period: whole seconds taken from the registered MCA variable */
    orte_sensor_base.rate.tv_usec = 0;
    orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate;

    /* array that will hold the selected modules */
    OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t);
    opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_sensor_base_framework, flags);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors",
orte_sensor_base_register,
orte_sensor_base_open, orte_sensor_base_close,
mca_sensor_base_static_components, 0);
/* Constructor for orte_sensor_active_module_t: a newly created active
 * module starts with its sampling flag set. */
static void cons(orte_sensor_active_module_t *t)
{
t->sampling = true;
}
OBJ_CLASS_INSTANCE(orte_sensor_active_module_t,
opal_object_t,
cons, NULL);

Просмотреть файл

@ -1,219 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
static bool selected = false;
/**
* Function for weeding out sensor components that don't want to run.
*
* Call the init function on all available components to find out if
* they want to run. Select all components that don't fail. Failing
* components will be closed and unloaded. The selected modules will
* be returned to the caller in a opal_list_t.
*/
/*
 * Select the sensor modules that will be active in this process.
 *
 * Every component of the framework is queried for a module and a
 * priority. When two components report the same "data_measured"
 * string only the higher-priority one is kept. The survivors are
 * appended to orte_sensor_base.modules in descending priority order
 * and their init functions are then called.
 *
 * The function is a one-shot: repeated calls return immediately.
 *
 * Returns ORTE_SUCCESS in all cases (finding no module is not an error).
 */
int orte_sensor_base_select(void)
{
    mca_base_component_list_item_t *cli = NULL;
    orte_sensor_base_component_t *component = NULL;
    mca_base_module_t *module = NULL;
    orte_sensor_active_module_t *i_module;
    int priority = 0, i, j, low_i;
    opal_pointer_array_t tmp_array;
    bool none_found;
    orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
    bool duplicate;

    if (selected) {
        return ORTE_SUCCESS;
    }
    selected = true;

    OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);

    opal_output_verbose(10, orte_sensor_base_framework.framework_output,
                        "sensor:base:select: Auto-selecting components");

    /*
     * Traverse the list of available components.
     * For each call their 'query' functions to determine relative priority.
     */
    none_found = true;
    OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) {
        component = (orte_sensor_base_component_t *) cli->cli_component;

        /*
         * If there is a query function then use it.
         */
        if (NULL == component->base_version.mca_query_component) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. It does not implement a query function",
                                component->base_version.mca_component_name );
            continue;
        }

        /*
         * Query this component for the module and priority
         */
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Querying component [%s]",
                            component->base_version.mca_component_name);

        /* start from a clean slate so that a component which fails
         * without writing its outputs cannot inherit the previous
         * component's module or priority */
        module = NULL;
        priority = 0;
        component->base_version.mca_query_component(&module, &priority);

        /*
         * If no module was returned or negative priority, then skip component
         */
        if (NULL == module || priority < 0) {
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Skipping component [%s]. Query failed to return a module",
                                component->base_version.mca_component_name );
            continue;
        }

        /* check to see if we already have someone who senses the
         * same things - if so, take the higher priority one
         */
        duplicate = false;
        for (i = 0; i < tmp_array.size; i++) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if (NULL == tmp_module) {
                continue;
            }
            /* a component may leave data_measured unset (NULL); such a
             * component cannot be a duplicate, and strcmp() must not
             * be handed a NULL pointer */
            if (NULL == component->data_measured ||
                NULL == tmp_module->component->data_measured) {
                continue;
            }
            if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) {
                if (tmp_module->priority < priority) {
                    opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                        "sensor:base:select Replacing component %s with %s - both measure %s",
                                        tmp_module->component->base_version.mca_component_name,
                                        component->base_version.mca_component_name,
                                        component->data_measured);
                    OBJ_RELEASE(tmp_module);
                    opal_pointer_array_set_item(&tmp_array, i, NULL);
                    break;
                } else {
                    duplicate = true;
                }
            }
        }
        if (duplicate) {
            /* ignore this component */
            opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                                "sensor:base:select Ignoring component %s - duplicate with higher priority measures %s",
                                component->base_version.mca_component_name,
                                component->data_measured);
            continue;
        }

        /*
         * Append them to the temporary list, we will sort later
         */
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Query of component [%s] set priority to %d",
                            component->base_version.mca_component_name, priority);
        tmp_module = OBJ_NEW(orte_sensor_active_module_t);
        tmp_module->component = component;
        tmp_module->module = (orte_sensor_base_module_t*)module;
        tmp_module->priority = priority;

        opal_pointer_array_add(&tmp_array, (void*)tmp_module);
        none_found = false;
    }

    if (none_found) {
        /* okay for no modules to be found - but do not leak the
         * temporary array on the way out */
        OBJ_DESTRUCT(&tmp_array);
        return ORTE_SUCCESS;
    }

    /*
     * Sort the list by descending priority: for each remaining entry,
     * pull out the highest-priority entry still in tmp_array and
     * append it to the framework's module array.
     */
    priority = 0;
    for (j = 0; j < tmp_array.size; ++j) {
        tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j);
        if (NULL == tmp_module_sw) {
            continue;
        }

        low_i = -1;
        priority = tmp_module_sw->priority;

        for (i = 0; i < tmp_array.size; ++i) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i);
            if (NULL == tmp_module) {
                continue;
            }
            if (tmp_module->priority > priority) {
                low_i = i;
                priority = tmp_module->priority;
            }
        }

        if (low_i >= 0) {
            tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
            opal_pointer_array_set_item(&tmp_array, low_i, NULL);
            j--; /* Try this entry again, if it is not the lowest */
        } else {
            tmp_module = tmp_module_sw;
            opal_pointer_array_set_item(&tmp_array, j, NULL);
        }
        opal_output_verbose(5, orte_sensor_base_framework.framework_output,
                            "sensor:base:select Add module with priority [%s] %d",
                            tmp_module->component->base_version.mca_component_name, tmp_module->priority);
        opal_pointer_array_add(&orte_sensor_base.modules, tmp_module);
    }
    OBJ_DESTRUCT(&tmp_array);

    /*
     * Initialize each of the modules in priority order from
     * highest to lowest
     */
    for (i = 0; i < orte_sensor_base.modules.size; ++i) {
        i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i);
        if (NULL == i_module) {
            continue;
        }
        if (NULL != i_module->module->init) {
            if (ORTE_SUCCESS != i_module->module->init()) {
                /* can't sample - however, if we are the HNP,
                 * then we need this module
                 * anyway so we can log incoming data
                 */
                if (ORTE_PROC_IS_HNP) {
                    i_module->sampling = false;
                } else {
                    opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL);
                    OBJ_RELEASE(i_module);
                }
            }
        }
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,67 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_SENSOR_PRIVATE_H
#define MCA_SENSOR_PRIVATE_H
/*
* includes
*/
#include "orte_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/event/event.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/sensor.h"
/*
* Framework-private types, globals and functions shared by the
* sensor base and the sensor components
*/
BEGIN_C_DECLS
/* define a struct to hold framework-global values */
typedef struct {
/* array of orte_sensor_active_module_t* entries for the selected modules */
opal_pointer_array_t modules;
/* when true, the heartbeat sensor ships the collected samples along
 * with each heartbeat */
bool log_samples;
/* NOTE(review): presumably "periodic sampling is running" - the code
 * that sets it is not visible here, confirm */
bool active;
/* sampling period; tv_sec comes from the sample-rate parameter */
struct timeval rate;
/* NOTE(review): presumably the timer event that drives sampling - confirm */
opal_event_t sample_ev;
/* bucket of samples collected since the last heartbeat was sent */
opal_buffer_t *samples;
} orte_sensor_base_t;
/* One selected sensor module, as stored in orte_sensor_base.modules */
typedef struct {
/* base class - instances are reference counted OPAL objects */
opal_object_t super;
/* component that provided the module */
orte_sensor_base_component_t *component;
/* the module's function table */
orte_sensor_base_module_t *module;
/* priority reported by the component's query function */
int priority;
/* true after construction; cleared on the HNP when the module's
 * init function fails */
bool sampling;
} orte_sensor_active_module_t;
OBJ_CLASS_DECLARATION(orte_sensor_active_module_t);
/* the single instance of the framework-global values */
ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base;
/* Base-level entry points. Their definitions are not part of this
 * header; NOTE(review): presumably start/stop/sample fan out to every
 * active module and log hands a received sample to the named
 * component - confirm against the base implementation. */
ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job);
ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job);
/* signature of an event callback (fd, flags, user data) */
ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data);
END_C_DECLS
#endif

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_file_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_file_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/file/Makefile])
# Build this component only when sensors were requested
# (orte_want_sensors is 1); otherwise take the not-found action
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,354 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <stdio.h>
#include <stddef.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include "opal_stdint.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_file.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void stop(orte_jobid_t job);
static void file_sample(void);
static void file_log(opal_buffer_t *sample);
/* instantiate the module: the entries are positional, in the order
 * init, finalize, start, stop, sample, log */
orte_sensor_base_module_t orte_sensor_file_module = {
init,
finalize,
start,
stop,
file_sample,
file_log
};
/* define a tracking object: one entry per monitored job/file */
typedef struct {
/* base class so the tracker can live on an opal_list_t */
opal_list_item_t super;
/* job whose file is being watched */
orte_jobid_t jobid;
/* NOTE(review): never written or read in this file */
orte_vpid_t vpid;
/* path of the monitored file (owned by the tracker, freed in the destructor) */
char *file;
/* number of consecutive samples in which no change was seen */
int tick;
/* which attributes of the file are compared between samples */
bool check_size;
bool check_access;
bool check_mod;
/* values seen at the last sample where a change was detected.
 * NOTE(review): file_size is 32-bit while st_size is off_t, so sizes
 * beyond 2 GiB do not fit - confirm whether that matters */
int32_t file_size;
time_t last_access;
time_t last_mod;
/* tick count at which the file is reported as stalled */
int limit;
} file_tracker_t;
/* Constructor for file_tracker_t: put every field that is later read
 * without an unconditional assignment into a defined state. */
static void ft_constructor(file_tracker_t *ft)
{
    ft->file = NULL;
    ft->tick = 0;
    /* start() only assigns the check flags when a value was supplied,
     * so they must default to "off" here rather than hold whatever the
     * allocator left behind */
    ft->check_size = false;
    ft->check_access = false;
    ft->check_mod = false;
    ft->file_size = 0;
    ft->last_access = 0;
    ft->last_mod = 0;
    ft->limit = 0;
}
/* Destructor for file_tracker_t: give back the file name string. */
static void ft_destructor(file_tracker_t *ft)
{
    /* free() accepts NULL, so no guard is required */
    free(ft->file);
}
/* Class instance: list-item subclass with the constructor and
 * destructor defined above */
OBJ_CLASS_INSTANCE(file_tracker_t,
opal_list_item_t,
ft_constructor, ft_destructor);
/* local globals */
/* list of file_tracker_t entries, one per job being monitored;
 * constructed in init() and emptied in finalize() */
static opal_list_t jobs;
/* Module init: set up the (empty) list of tracked jobs.
 * Always returns ORTE_SUCCESS. */
static int init(void)
{
OBJ_CONSTRUCT(&jobs, opal_list_t);
return ORTE_SUCCESS;
}
/* Module finalize: release every tracker still on the list, then
 * tear down the list itself. */
static void finalize(void)
{
    opal_list_item_t *entry;

    for (entry = opal_list_remove_first(&jobs);
         NULL != entry;
         entry = opal_list_remove_first(&jobs)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&jobs);
}
/*
 * Look up an environment variable in an app context's environment.
 *
 * app     - app context whose NULL-terminated env array is searched
 * pattern - name of the variable (without the '=')
 * value   - if non-NULL, receives a strdup'd copy of the text after
 *           the '='; the caller must free() it. Untouched when the
 *           variable is not found.
 *
 * Returns true when an entry of the form "<pattern>=<text>" exists.
 * The name must match exactly: an entry whose name merely begins with
 * pattern, or one without an '=' directly after the name, is not a
 * match.
 */
static bool find_value(orte_app_context_t *app,
                       char *pattern, char **value)
{
    size_t len;
    int i;

    if (NULL == app || NULL == app->env || NULL == pattern) {
        return false;
    }

    len = strlen(pattern);
    for (i = 0; NULL != app->env[i]; i++) {
        /* strncmp() matching len characters guarantees that index len
         * is inside the string (at worst it is the terminator) */
        if (0 != strncmp(app->env[i], pattern, len) ||
            '=' != app->env[i][len]) {
            continue;
        }
        if (NULL != value) {
            *value = strdup(&app->env[i][len + 1]);
        }
        return true;
    }
    return false;
}
/*
* Start monitoring of local processes
*/
static void start(orte_jobid_t jobid)
{
orte_job_t *jobdat;
orte_app_context_t *app, *aptr;
int i;
char *filename;
file_tracker_t *ft;
char *ptr;
/* cannot monitor my own job */
if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s starting file monitoring for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
/* get the local jobdat for this job */
if (NULL == (jobdat = orte_get_job_data_object(jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* must be at least one app_context, so use the first one found */
app = NULL;
for (i=0; i < jobdat->apps->size; i++) {
if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) {
app = aptr;
break;
}
}
if (NULL == app) {
/* got a problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return;
}
/* search the environ to get the filename */
if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) {
/* was a default file given */
if (NULL == mca_sensor_file_component.file) {
/* can't do anything without a file */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:file no file for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobid)));
return;
}
filename = mca_sensor_file_component.file;
}
/* create the tracking object */
ft = OBJ_NEW(file_tracker_t);
ft->jobid = jobid;
ft->file = strdup(filename);
/* search the environ to see what we are checking */
if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_size) {
ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
}
} else {
ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_access) {
ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
}
} else {
ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) {
/* was a default value given */
if (0 < mca_sensor_file_component.check_mod) {
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
}
} else {
ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10));
free(ptr);
}
if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) {
ft->limit = mca_sensor_file_component.limit;
} else {
ft->limit = strtol(ptr, NULL, 10);
free(ptr);
}
opal_list_append(&jobs, &ft->super);
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s file %s monitored for %s%s%s with limit %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ft->file, ft->check_size ? "SIZE:" : " ",
ft->check_access ? "ACCESS TIME:" : " ",
ft->check_mod ? "MOD TIME" : " ", ft->limit));
return;
}
/*
 * Stop monitoring for a job: remove and release every tracker that
 * belongs to the given job, or all trackers when the wildcard jobid
 * is passed.
 */
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item, *next;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = next) {
        /* fetch the successor first: the current item may be released
         * below, after which its links must not be read */
        next = opal_list_get_next(item);
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
    }
    return;
}
/* Render a timestamp the way ctime() does, but into a caller-owned
 * buffer so that several timestamps can be printed by one call. */
static void ft_time_string(const time_t *when, char *out, size_t outlen)
{
    const char *txt = ctime(when);

    snprintf(out, outlen, "%s", (NULL != txt) ? txt : "(unknown)\n");
}

/*
 * Sample every tracked file once.
 *
 * For each tracker the file is stat()ed and the enabled attributes
 * (size, access time, modification time) are compared with the values
 * remembered from earlier samples. An unchanged attribute bumps the
 * tracker's tick counter; a changed one resets it and records the new
 * value. When the counter reaches the tracker's limit the stall is
 * reported and the job is moved to the SENSOR_BOUND_EXCEEDED state.
 */
static void file_sample(void)
{
    struct stat buf;
    opal_list_item_t *item;
    file_tracker_t *ft;
    orte_job_t *jdata;
    /* ctime() keeps its result in one static buffer, so two calls in
     * the same argument list would print the same text twice - format
     * each timestamp into its own buffer instead */
    char access_str[64];
    char mod_str[64];

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sampling files",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ft = (file_tracker_t*)item;

        /* stat the file and get its size */
        if (0 > stat(ft->file, &buf)) {
            /* cannot stat file */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s could not stat %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ft->file));
            continue;
        }

        ft_time_string(&buf.st_atime, access_str, sizeof(access_str));
        ft_time_string(&buf.st_mtime, mod_str, sizeof(mod_str));
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s size %lu access %s\tmod %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (unsigned long)buf.st_size, access_str, mod_str));

        /* the first unchanged attribute counts a tick and skips the
         * remaining checks */
        if (ft->check_size) {
            if (buf.st_size == ft->file_size) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->file_size = buf.st_size;
            }
        }
        if (ft->check_access) {
            if (buf.st_atime == ft->last_access) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_access = buf.st_atime;
            }
        }
        if (ft->check_mod) {
            if (buf.st_mtime == ft->last_mod) {
                ft->tick++;
                goto CHECK;
            } else {
                ft->tick = 0;
                ft->last_mod = buf.st_mtime;
            }
        }

    CHECK:
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sampled file %s tick %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ft->file, ft->tick));

        if (ft->tick == ft->limit) {
            ft_time_string(&ft->last_access, access_str, sizeof(access_str));
            ft_time_string(&ft->last_mod, mod_str, sizeof(mod_str));
            orte_show_help("help-orte-sensor-file.txt", "file-stalled", true,
                           ft->file, ft->file_size, access_str, mod_str);
            jdata = orte_get_job_data_object(ft->jobid);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED);
        }
    }
}
/* Log slot of the module: intentionally a no-op, the sample buffer is
 * ignored. */
static void file_log(opal_buffer_t *sample)
{
}

Просмотреть файл

@ -1,42 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* File movement sensor
*/
#ifndef ORTE_SENSOR_FILE_H
#define ORTE_SENSOR_FILE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/* Component structure of the file sensor: the generic sensor component
 * followed by this component's parameter values. */
struct orte_sensor_file_component_t {
/* generic sensor component - must come first */
orte_sensor_base_component_t super;
/* NOTE(review): not referenced by the component or module sources
 * in view - confirm whether it is still needed */
int sample_rate;
/* default file to monitor (NULL when none was given) */
char *file;
/* default choice of attributes to compare between samples */
bool check_size;
bool check_access;
bool check_mod;
/* default number of unchanged samples before a stall is declared */
int limit;
};
typedef struct orte_sensor_file_component_t orte_sensor_file_component_t;
/* the component instance and the module it hands out from its query function */
ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component;
extern orte_sensor_base_module_t orte_sensor_file_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,120 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_file.h"
/*
* Local functions
*/
static int orte_sensor_file_register (void);
static int orte_sensor_file_open(void);
static int orte_sensor_file_close(void);
static int orte_sensor_file_query(mca_base_module_t **module, int *priority);
/* The component instance. Only the embedded base component is
 * initialized here; the parameter fields that follow it in the struct
 * are filled in by orte_sensor_file_register(). */
orte_sensor_file_component_t mca_sensor_file_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"file", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_file_open, /* component open */
orte_sensor_file_close, /* component close */
orte_sensor_file_query, /* component query */
orte_sensor_file_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
"filemods" // data being sensed
}
};
/**
* component register/open/close/init function
*/
static int orte_sensor_file_register (void)
{
mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
/* lookup parameters */
mca_sensor_file_component.file = NULL;
(void) mca_base_component_var_register (c, "filename", "File to be monitored",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.file);
mca_sensor_file_component.check_size = false;
(void) mca_base_component_var_register (c, "check_size", "Check the file size",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_size);
mca_sensor_file_component.check_access = false;
(void) mca_base_component_var_register (c, "check_access", "Check access time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_access);
mca_sensor_file_component.check_mod = false;
(void) mca_base_component_var_register (c, "check_mod", "Check modification time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.check_mod);
mca_sensor_file_component.limit = 3;
(void) mca_base_component_var_register (c, "limit",
"Number of times the sensor can detect no motion before declaring error (default=3)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sensor_file_component.limit);
return ORTE_SUCCESS;
}
/* Component open: nothing to do, always succeeds. */
static int orte_sensor_file_open(void)
{
return ORTE_SUCCESS;
}
/* Component query: always offers the file sensor module at a fixed
 * priority of 20 and returns ORTE_SUCCESS. */
static int orte_sensor_file_query(mca_base_module_t **module, int *priority)
{
*priority = 20; /* higher than heartbeat */
*module = (mca_base_module_t *)&orte_sensor_file_module;
return ORTE_SUCCESS;
}
/**
* Close all subsystems.
*/
/* Component close: nothing to release, always succeeds. */
static int orte_sensor_file_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,36 +0,0 @@
#
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Source files that make up the ft_tester sensor component
sources = \
sensor_ft_tester.c \
sensor_ft_tester.h \
sensor_ft_tester_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_ft_tester_DSO
component_noinst =
component_install = mca_sensor_ft_tester.la
else
component_noinst = libmca_sensor_ft_tester.la
component_install =
endif
# DSO build: the installable module goes into the component directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_ft_tester_la_SOURCES = $(sources)
mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version
# Static build: a convenience library that is not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_ft_tester_la_SOURCES =$(sources)
libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile])
# Build this component only when sensors were requested
# (orte_want_sensors is 1); otherwise take the not-found action
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,41 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* FT tester sensor: kills executables and/or daemons with a
* configurable probability
*/
#ifndef ORTE_SENSOR_FT_TESTER_H
#define ORTE_SENSOR_FT_TESTER_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
#include "opal/util/alfg.h"
BEGIN_C_DECLS
/* Component structure of the ft_tester sensor: the generic sensor
 * component followed by this component's settings. */
struct orte_sensor_ft_tester_component_t {
/* generic sensor component - must come first */
orte_sensor_base_component_t super;
/* probability of killing a single executable, as a fraction
 * (values given above 1 are treated as percent and divided by 100) */
float fail_prob;
/* probability of killing a daemon, same convention as fail_prob */
float daemon_fail_prob;
/* allow multiple executables to be killed at one time */
bool multi_fail;
};
typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t;
/* the component instance and the module it offers from its query function */
ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component;
extern orte_sensor_base_module_t orte_sensor_ft_tester_module;
/* random number generator state, seeded in the component's query function */
extern opal_rng_buff_t orte_sensor_ft_rng_buff;
END_C_DECLS
#endif

Просмотреть файл

@ -1,141 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_ft_tester.h"
/*
* Local functions
*/
static int orte_sensor_ft_tester_register (void);
static int orte_sensor_ft_tester_open(void);
static int orte_sensor_ft_tester_close(void);
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority);
/* The component instance. Only the embedded base component is
 * initialized here; the probability and multi_fail fields are set by
 * the register and open functions. */
orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = {
{
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"ft_tester", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_ft_tester_open, /* component open */
orte_sensor_ft_tester_close, /* component close */
orte_sensor_ft_tester_query, /* component query */
orte_sensor_ft_tester_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* this component does not name the data it measures */
NULL
}
};
/* raw string values of the two probability parameters; converted to
 * floats when the component is opened */
static char *daemon_fail_prob = NULL;
static char *fail_prob = NULL;
/* random number generator state, seeded in the query function */
opal_rng_buff_t orte_sensor_ft_rng_buff;
/**
* component register/open/close/init function
*/
static int orte_sensor_ft_tester_register (void)
{
mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version;
fail_prob = NULL;
(void) mca_base_component_var_register (c, "fail_prob", "Probability of killing a single executable",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&fail_prob);
mca_sensor_ft_tester_component.multi_fail = false;
(void) mca_base_component_var_register (c, "multi_allowed", "Allow multiple executables to be killed at one time",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_sensor_ft_tester_component.multi_fail);
daemon_fail_prob = NULL;
(void) mca_base_component_var_register (c, "daemon_fail_prob", "Probability of killing a daemon",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&daemon_fail_prob);
return ORTE_SUCCESS;
}
/* Convert a probability parameter string into a fraction: an unset
 * parameter yields 0, and a value above 1 is taken to be a percentage. */
static float ft_tester_parse_prob(const char *text)
{
    float prob;

    if (NULL == text) {
        return 0.0;
    }
    prob = strtof(text, NULL);
    if (1.0 < prob) {
        /* given in percent */
        prob /= 100.0;
    }
    return prob;
}

/* Component open: turn the registered probability strings into the
 * float values stored in the component. Always returns ORTE_SUCCESS. */
static int orte_sensor_ft_tester_open(void)
{
    /* lookup parameters */
    mca_sensor_ft_tester_component.fail_prob = ft_tester_parse_prob(fail_prob);
    mca_sensor_ft_tester_component.daemon_fail_prob = ft_tester_parse_prob(daemon_fail_prob);

    return ORTE_SUCCESS;
}
/* Component query: the module is only offered when at least one of
 * the two kill probabilities is positive; otherwise no module is
 * returned and ORTE_ERROR is reported. */
static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority)
{
    /* nothing to do when neither probability is positive */
    if (!(0.0 < mca_sensor_ft_tester_component.fail_prob) &&
        !(0.0 < mca_sensor_ft_tester_component.daemon_fail_prob)) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    *priority = 1; /* at the bottom */
    *module = (mca_base_module_t *)&orte_sensor_ft_tester_module;

    /* seed the RNG --- Not sure if we should assume all procs use
     * the same seed?
     */
    opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid());
    return ORTE_SUCCESS;
}
/**
* Close all subsystems.
*/
/* Component close: nothing to release, always succeeds. */
static int orte_sensor_ft_tester_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped with the heartbeat component
dist_ompidata_DATA = help-orte-sensor-heartbeat.txt
# Source files that make up the heartbeat sensor component
sources = \
sensor_heartbeat.c \
sensor_heartbeat.h \
sensor_heartbeat_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_sensor_heartbeat_DSO
component_noinst =
component_install = mca_sensor_heartbeat.la
else
component_noinst = libmca_sensor_heartbeat.la
component_install =
endif
# DSO build: the installable module goes into the component directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_heartbeat_la_SOURCES = $(sources)
mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version
# Static build: a convenience library that is not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_heartbeat_la_SOURCES =$(sources)
libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile])
# Build this component only when sensors were requested
# (orte_want_sensors is 1); otherwise take the not-found action
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,279 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/event/event.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_heartbeat.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void start(orte_jobid_t job);
static void sample(void);
/* instantiate the module: the entries are positional; the two NULL
 * slots are functions this module does not provide.
 * NOTE(review): going by the file sensor's table these are the stop
 * and log slots - confirm against orte_sensor_base_module_t */
orte_sensor_base_module_t orte_sensor_heartbeat_module = {
init,
finalize,
start,
NULL,
sample,
NULL
};
/* declare the local functions */
/* timer callback that looks for daemons whose beat count is zero */
static void check_heartbeat(int fd, short event, void *arg);
/* receive callback for messages on the heartbeat tag */
static void recv_beats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata);
/* local globals */
/* job object of the daemons; only set on the HNP, NULL elsewhere */
static orte_job_t *daemons=NULL;
/* timer event that runs check_heartbeat */
static opal_event_t check_ev;
/* true while check_ev is armed */
static bool check_active = false;
/* interval between checks: three sample periods */
static struct timeval check_time;
/* Module init: post the persistent receive for heartbeat messages on
 * the HNP and on aggregators, and - on the HNP only - remember the job
 * object of our own job so the daemons can be checked later.
 * Always returns ORTE_SUCCESS. */
static int init(void)
{
    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s initializing heartbeat recvs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup to receive heartbeats */
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) {
        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                ORTE_RML_TAG_HEARTBEAT,
                                ORTE_RML_PERSISTENT,
                                recv_beats, NULL);
    }

    /* only the HNP tracks the daemons */
    if (ORTE_PROC_IS_HNP) {
        daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    }

    return ORTE_SUCCESS;
}
/* Module finalize: cancel the heartbeat receive and disarm the check
 * timer if it is still pending. */
static void finalize(void)
{
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT);

    if (!check_active) {
        return;
    }
    opal_event_del(&check_ev);
    check_active = false;
}
/* Module start: arm the periodic daemon check. Does nothing when the
 * check is already running or when this process does not track
 * daemons. The job argument is not used. */
static void start(orte_jobid_t job)
{
    if (check_active || NULL == daemons) {
        return;
    }

    /* setup the check event - it fires after three sample periods */
    check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec;
    check_time.tv_usec = 0;
    opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev);
    opal_event_evtimer_add(&check_ev, &check_time);
    check_active = true;
}
/* Module sample: send one heartbeat message to our daemon (when
 * ORTE_PROC_IS_CM is set) or to the HNP (otherwise). When sample
 * logging is enabled the samples collected so far travel with the
 * beat and a fresh sample bucket is started. */
static void sample(void)
{
    orte_process_name_t *tgt;
    opal_buffer_t *buf;
    int rc;

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* a CM process sends to its daemon, everyone else to the HNP */
    tgt = ORTE_PROC_MY_HNP;
    if (ORTE_PROC_IS_CM) {
        tgt = ORTE_PROC_MY_DAEMON;
    }

    /* if my target hasn't been defined yet, ignore - nobody listening yet */
    if (ORTE_JOBID_INVALID == tgt->jobid ||
        ORTE_VPID_INVALID == tgt->vpid) {
        opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                            "%s sensor:heartbeat: HNP is not defined",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sending heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we want sampled data included, point to the bucket */
    buf = OBJ_NEW(opal_buffer_t);
    if (orte_sensor_base.log_samples) {
        opal_dss.copy_payload(buf, orte_sensor_base.samples);
        OBJ_RELEASE(orte_sensor_base.samples);
        /* start a new sample bucket */
        orte_sensor_base.samples = OBJ_NEW(opal_buffer_t);
    }

    /* send heartbeat - the buffer is only ours to release when the
     * send could not be posted */
    rc = orte_rml.send_buffer_nb(tgt, buf,
                                 ORTE_RML_TAG_HEARTBEAT,
                                 orte_rml_send_callback, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
    }
}
/* this function automatically gets periodically called
* by the event library so we can check on the state
* of the various orteds
*/
static void check_heartbeat(int fd, short dummy, void *arg)
{
int v;
orte_proc_t *proc;
opal_event_t *tmp = (opal_event_t*)arg;
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* if we are aborting or shutting down, ignore this */
if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output,
"%s IGNORING CHECK abnorm_term %s fin %s init %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_abnormal_term_ordered ? "TRUE" : "FALSE",
orte_finalizing ? "TRUE" : "FALSE",
orte_initialized ? "TRUE" : "FALSE"));
check_active = false;
return;
}
for (v=0; v < daemons->procs->size; v++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
continue;
}
/* ignore myself */
if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) {
continue;
}
if (ORTE_PROC_STATE_RUNNING != proc->state) {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:heartbeat DAEMON %s IS NOT RUNNING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
continue;
}
if (0 == proc->beat) {
/* no heartbeat recvd in last window */
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s sensor:check_heartbeat FAILED for daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
"%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), proc->beat));
}
/* reset for next period */
proc->beat = 0;
}
/* reset the timer */
opal_event_evtimer_add(tmp, &check_time);
}
/*
 * RML receive callback for heartbeat messages.
 *
 * Counts the beat against the sending daemon (indexed by the sender's
 * vpid in the daemon job's proc array), restores a daemon that was
 * previously marked HEARTBEAT_FAILED to RUNNING, and then hands every
 * sample buffer piggy-backed on the message to orte_sensor_base_log().
 *
 * status, tag, cbdata - unused
 * sender              - name of the daemon that sent the beat
 * buffer              - message payload: zero or more packed
 *                       OPAL_BUFFER entries, each starting with the
 *                       name of the component that produced it
 */
static void recv_beats(int status, orte_process_name_t* sender,
                       opal_buffer_t *buffer,
                       orte_rml_tag_t tag, void *cbdata)
{
    orte_proc_t *proc;
    int rc, n;
    char *component = NULL;
    opal_buffer_t *buf = NULL;

    opal_output_verbose(1, orte_sensor_base_framework.framework_output,
                        "%s received beat from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender));

    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) {
        return;
    }

    /* get this daemon's object */
    if (NULL != daemons) {
        if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s marked beat from %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(sender)));
            proc->beat++;
            /* if this daemon has reappeared, reset things */
            if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) {
                proc->state = ORTE_PROC_STATE_RUNNING;
            }
        }
    }

    /* unload any sampled data */
    n = 1;
    while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) {
        if (NULL != buf) {
            n = 1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                /* we own the unpacked buffer - release it before
                 * bailing out, and return so that the failure is
                 * not reported a second time below
                 */
                OBJ_RELEASE(buf);
                return;
            }
            orte_sensor_base_log(component, buf);
            OBJ_RELEASE(buf);
            free(component);
            component = NULL;
        }
        n = 1;
    }
    /* running off the end of the message is the normal way out */
    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
    }
}

Просмотреть файл

@ -1,32 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Heartbeat sensor
*/
#ifndef ORTE_SENSOR_HEARTBEAT_H
#define ORTE_SENSOR_HEARTBEAT_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component;
extern orte_sensor_base_module_t orte_sensor_heartbeat_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,75 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_heartbeat.h"
/*
* Local functions
*/
static int orte_sensor_heartbeat_open(void);
static int orte_sensor_heartbeat_close(void);
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor for the heartbeat sensor. The three top-level
 * initializers are given positionally: the MCA base version/entry
 * points, the MCA component metadata, and a string naming the data
 * this sensor measures.
 */
orte_sensor_base_component_t mca_sensor_heartbeat_component = {
/* MCA base component: version info and open/close/query entry points */
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"heartbeat", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_heartbeat_open, /* component open */
orte_sensor_heartbeat_close, /* component close */
orte_sensor_heartbeat_query /* component query */
},
/* MCA component metadata */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* name of the data measured by this sensor */
"heartbeat"
};
/**
 * Component open: the heartbeat component has nothing to set up,
 * so this always reports success.
 */
static int orte_sensor_heartbeat_open(void)
{
    const int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * Component query: hand back the heartbeat module together with its
 * selection priority.
 *
 * @param module    (OUT) set to the heartbeat sensor module
 * @param priority  (OUT) set to 5
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_heartbeat_module;

    /* lower than all other samplers so that their data gets
     * included in heartbeat
     */
    *priority = 5;

    return ORTE_SUCCESS;
}
/**
 * Component close: the heartbeat component holds no component-level
 * resources, so there is nothing to tear down.
 */
static int orte_sensor_heartbeat_close(void)
{
    const int rc = ORTE_SUCCESS;

    return rc;
}

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped with the resusage sensor component
dist_ompidata_DATA = help-orte-sensor-resusage.txt
# Source files that make up the component
sources = \
sensor_resusage.c \
sensor_resusage.h \
sensor_resusage_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on whether this component is built as a DSO.
if MCA_BUILD_orte_sensor_resusage_DSO
component_noinst =
component_install = mca_sensor_resusage.la
else
component_noinst = libmca_sensor_resusage.la
component_install =
endif
# DSO build: the module is installed into the component directory
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_sensor_resusage_la_SOURCES = $(sources)
mca_sensor_resusage_la_LDFLAGS = -module -avoid-version
# Static build: the library is built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_sensor_resusage_la_SOURCES =$(sources)
libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2013 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_sensor_resusage_CONFIG(action-if-found, action-if-not-found)
# -----------------------------------------------------------
# Registers the component Makefile and then runs the first argument
# when the shell variable orte_want_sensors equals "1", otherwise
# the second argument.
AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [
AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile])
# if we don't want sensors, don't compile
# this component
AS_IF([test "$orte_want_sensors" = "1"],
[$1], [$2])
])dnl

Просмотреть файл

@ -1,21 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the memory usage sensor
#
[mem-limit-exceeded]
A process has exceeded the specified limit on memory usage:
Node: %s
Process rank: %s
Memory used: %luGbytes
Memory limit: %luGbytes

Просмотреть файл

@ -1,478 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <stdio.h>
#include "opal_stdint.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_ring_buffer.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/db/db.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/orted/orted.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/base/sensor_private.h"
#include "sensor_resusage.h"
/* declare the API functions */
static int init(void);
static void finalize(void);
static void sample(void);
static void res_log(opal_buffer_t *sample);
/* instantiate the module - entries are positional and follow the
 * field order of orte_sensor_base_module_t:
 * init, finalize, start, stop, sample, log
 */
orte_sensor_base_module_t orte_sensor_resusage_module = {
init,
finalize,
NULL, /* start - not provided by this module */
NULL, /* stop - not provided by this module */
sample,
res_log
};
static bool log_enabled = true;
static orte_node_t *my_node;
static orte_proc_t *my_proc;
static int init(void)
{
orte_job_t *jdata;
/* ensure my_proc and my_node are available on the global arrays */
if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
my_proc = OBJ_NEW(orte_proc_t);
my_node = OBJ_NEW(orte_node_t);
} else {
if (NULL == (my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (NULL == (my_node = my_proc->node)) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* protect the objects */
OBJ_RETAIN(my_proc);
OBJ_RETAIN(my_node);
}
return ORTE_SUCCESS;
}
/*
 * Module finalize: drop the references taken (or the objects
 * created) by init().
 *
 * The pointers are cleared explicitly afterwards. The objects
 * retained in init() are shared with the job data, so releasing our
 * reference does not necessarily destroy them, and the stale
 * pointers must not be released again by a second finalize().
 * NOTE(review): relies on OBJ_RELEASE only resetting its argument
 * when the reference count reaches zero - confirm against
 * opal_object.h.
 */
static void finalize(void)
{
    if (NULL != my_proc) {
        OBJ_RELEASE(my_proc);
        my_proc = NULL;
    }
    if (NULL != my_node) {
        OBJ_RELEASE(my_node);
        my_node = NULL;
    }
    return;
}
/*
 * Take one resource-usage sample.
 *
 * 1. Query process statistics for ourselves plus node statistics,
 *    store them on my_proc/my_node and pack them into a local buffer
 *    (component name, node name, node stats, our process stats).
 * 2. Do the same (process stats only) for every live local child.
 * 3. Append the local buffer to orte_sensor_base.samples for
 *    transmission.
 * 4. If a node memory limit is set and exceeded, report the child
 *    with the largest vsize to the state machine - or abort
 *    ourselves if there is no such child - and stop.
 * 5. Otherwise, if a per-process memory limit is set, report every
 *    child whose vsize has reached it.
 *
 * Stats objects pushed onto a ring buffer are owned by that ring
 * buffer; only the entry it pushes out (if any) is released here.
 */
static void sample(void)
{
    opal_pstats_t *stats, *st;
    opal_node_stats_t *nstats, *nst;
    int rc, i;
    orte_proc_t *child, *hog=NULL;
    float in_use, max_mem;
    opal_buffer_t buf, *bptr;
    char *comp;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "sample:resusage sampling resource usage"));

    /* setup a buffer for our stats */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* pack our name - the temporary copy is freed before the result
     * is examined so that it cannot leak on the error path
     */
    comp = strdup("resusage");
    rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING);
    free(comp);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* update stats on ourself and the node */
    stats = OBJ_NEW(opal_pstats_t);
    nstats = OBJ_NEW(opal_node_stats_t);
    if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
        ORTE_ERROR_LOG(rc);
        /* both objects came from OBJ_NEW, so both must be released
         * (not merely destructed) to return their memory
         */
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* the stats framework can't know nodename or rank; strncpy does
     * not terminate on truncation, so terminate explicitly
     */
    strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN - 1);
    stats->node[OPAL_PSTAT_MAX_STRING_LEN - 1] = '\0';
    stats->rank = ORTE_PROC_MY_NAME->vpid;

    /* locally save the stats */
    if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
        OBJ_RELEASE(st);
    }
    if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
        /* release the popped value */
        OBJ_RELEASE(nst);
    }

    /* pack them */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* loop through our children and update their stats */
    if (NULL != orte_local_children) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            stats = OBJ_NEW(opal_pstats_t);
            if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
                /* may hit a race condition where the process has
                 * terminated, so just ignore any error
                 */
                OBJ_RELEASE(stats);
                continue;
            }
            /* the stats framework can't know nodename or rank */
            strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN - 1);
            stats->node[OPAL_PSTAT_MAX_STRING_LEN - 1] = '\0';
            stats->rank = child->name.vpid;
            /* store it */
            if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
                OBJ_RELEASE(st);
            }
            /* pack them */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&buf);
                return;
            }
        }
    }

    /* xfer any data for transmission */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
    }
    OBJ_DESTRUCT(&buf);

    /* are there any issues with node-level usage? */
    nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
    if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING NODE MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* compute the fraction of node memory in-use */
        in_use = 1.0 - (nst->free_mem / nst->total_mem);
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s PERCENT USED: %f LIMIT: %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             in_use, mca_sensor_resusage_component.node_memory_limit));
        if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
            /* loop through our children and find the biggest hog;
             * a missing child array is treated like "no children"
             */
            hog = NULL;
            max_mem = 0.0;
            for (i=0; NULL != orte_local_children && i < orte_local_children->size; i++) {
                if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (!child->alive) {
                    continue;
                }
                if (0 == child->pid) {
                    /* race condition */
                    continue;
                }
                if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                     "%s PROC %s AT VSIZE %f",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name), st->vsize));
                if (max_mem < st->vsize) {
                    hog = child;
                    max_mem = st->vsize;
                }
            }
            if (NULL == hog) {
                /* if all children dead and we are still too big,
                 * then we must be the culprit - abort
                 */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s NO CHILD: COMMITTING SUICIDE",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL);
            } else {
                /* report the problem */
                OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                                     "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&hog->name)));
                ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
            /* since we have ordered someone to die, we've done enough for this
             * time around - don't check proc limits as well
             */
            return;
        }
    }

    /* check proc limits */
    if (0.0 < mca_sensor_resusage_component.proc_memory_limit &&
        NULL != orte_local_children) {
        OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output,
                             "%s CHECKING PROC MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* check my children first */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!child->alive) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output,
                                 "%s PROC %s AT VSIZE %f",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name), st->vsize));
            if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
                /* report the problem */
                ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
        }
    }
}
/*
 * Log one resusage sample buffer to the database.
 *
 * The buffer is expected to hold, in order, the node name, the node
 * statistics and then any number of process statistics - the layout
 * produced by sample() once the leading component name has been
 * removed by the caller.
 *
 * Node statistics are written to the "nodestats" log when
 * log_node_stats is set; process statistics are written to the
 * "procstats" log when log_process_stats is set. The first database
 * failure quietly disables all further logging.
 *
 * Every key and string value stored in a kv entry is heap allocated,
 * because OBJ_DESTRUCT is applied to each entry afterwards.
 * NOTE(review): this assumes the opal_value_t destructor frees the
 * key and string data - confirm against the opal_value_t class.
 */
static void res_log(opal_buffer_t *sample)
{
    /* number of key/value pairs logged per node / per process */
    enum { NODE_NKV = 13, PROC_NKV = 14 };

    opal_pstats_t *st=NULL;
    opal_node_stats_t *nst=NULL;
    int rc, n, i;
    opal_value_t kv[PROC_NKV];
    char *node=NULL;

    if (!log_enabled) {
        return;
    }

    /* unpack the node name */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    /* unpack the node stats */
    n=1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        free(node);
        return;
    }

    if (mca_sensor_resusage_component.log_node_stats) {
        /* convert this into an array of opal_value_t's - no clean way
         * to do this, so have to just manually map each field
         */
        for (i=0; i < NODE_NKV; i++) {
            OBJ_CONSTRUCT(&kv[i], opal_value_t);
        }
        i=0;
        kv[i].key = strdup("ctime");
        kv[i].type = OPAL_TIMEVAL;
        kv[i].data.tv.tv_sec = nst->sample_time.tv_sec;
        kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec;
        /* the key must be a heap copy like all the others */
        kv[i].key = strdup("hostname");
        kv[i].type = OPAL_STRING;
        kv[i++].data.string = (NULL != node) ? strdup(node) : NULL;
        kv[i].key = strdup("total_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->total_mem;
        kv[i].key = strdup("free_mem");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->free_mem;
        kv[i].key = strdup("buffers");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->buffers;
        kv[i].key = strdup("cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->cached;
        kv[i].key = strdup("swap_total");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_total;
        kv[i].key = strdup("swap_free");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_free;
        kv[i].key = strdup("mapped");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->mapped;
        kv[i].key = strdup("swap_cached");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->swap_cached;
        kv[i].key = strdup("la");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la;
        kv[i].key = strdup("la5");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la5;
        kv[i].key = strdup("la15");
        kv[i].type = OPAL_FLOAT;
        kv[i++].data.fval = nst->la15;

        /* store it - all NODE_NKV entries, so that the last one
         * (la15) is both logged and cleaned up
         */
        if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, NODE_NKV))) {
            /* don't bark about it - just quietly disable the log */
            log_enabled = false;
        }
        for (i=0; i < NODE_NKV; i++) {
            OBJ_DESTRUCT(&kv[i]);
        }
    }
    OBJ_RELEASE(nst);
    /* the unpacked node name belongs to us */
    free(node);
    node = NULL;

    if (mca_sensor_resusage_component.log_process_stats) {
        /* unpack all process stats */
        n=1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) {
            for (i=0; i < PROC_NKV; i++) {
                OBJ_CONSTRUCT(&kv[i], opal_value_t);
            }
            kv[0].key = strdup("node");
            kv[0].type = OPAL_STRING;
            kv[0].data.string = strdup(st->node);
            kv[1].key = strdup("rank");
            kv[1].type = OPAL_INT32;
            kv[1].data.int32 = st->rank;
            kv[2].key = strdup("pid");
            kv[2].type = OPAL_PID;
            kv[2].data.pid = st->pid;
            kv[3].key = strdup("cmd");
            kv[3].type = OPAL_STRING;
            kv[3].data.string = strdup(st->cmd);
            kv[4].key = strdup("state");
            kv[4].type = OPAL_STRING;
            /* two state characters plus the terminator */
            kv[4].data.string = (char*)malloc(3 * sizeof(char));
            if (NULL != kv[4].data.string) {
                kv[4].data.string[0] = st->state[0];
                kv[4].data.string[1] = st->state[1];
                kv[4].data.string[2] = '\0';
            }
            kv[5].key = strdup("time");
            kv[5].type = OPAL_TIMEVAL;
            kv[5].data.tv.tv_sec = st->time.tv_sec;
            kv[5].data.tv.tv_usec = st->time.tv_usec;
            kv[6].key = strdup("percent_cpu");
            kv[6].type = OPAL_FLOAT;
            kv[6].data.fval = st->percent_cpu;
            kv[7].key = strdup("priority");
            kv[7].type = OPAL_INT32;
            kv[7].data.int32 = st->priority;
            kv[8].key = strdup("num_threads");
            kv[8].type = OPAL_INT16;
            kv[8].data.int16 = st->num_threads;
            kv[9].key = strdup("vsize");
            kv[9].type = OPAL_FLOAT;
            kv[9].data.fval = st->vsize;
            kv[10].key = strdup("rss");
            kv[10].type = OPAL_FLOAT;
            kv[10].data.fval = st->rss;
            kv[11].key = strdup("peak_vsize");
            kv[11].type = OPAL_FLOAT;
            kv[11].data.fval = st->peak_vsize;
            kv[12].key = strdup("processor");
            kv[12].type = OPAL_INT16;
            kv[12].data.int16 = st->processor;
            kv[13].key = strdup("sample_time");
            kv[13].type = OPAL_TIMEVAL;
            kv[13].data.tv.tv_sec = st->sample_time.tv_sec;
            kv[13].data.tv.tv_usec = st->sample_time.tv_usec;
            /* store it */
            if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, PROC_NKV))) {
                log_enabled = false;
            }
            for (i=0; i < PROC_NKV; i++) {
                OBJ_DESTRUCT(&kv[i]);
            }
            OBJ_RELEASE(st);
            n=1;
        }
        /* running off the end of the buffer is the normal way out */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }
}

Просмотреть файл

@ -1,41 +0,0 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process Resource Utilization sensor
*/
#ifndef ORTE_SENSOR_RESUSAGE_H
#define ORTE_SENSOR_RESUSAGE_H
#include "orte_config.h"
#include "orte/mca/sensor/sensor.h"
BEGIN_C_DECLS
/*
 * Resusage component: the generic sensor component extended with
 * the values of this component's MCA parameters.
 */
struct orte_sensor_resusage_component_t {
/* generic sensor component - must come first */
orte_sensor_base_component_t super;
/* "sample_rate" MCA parameter: sample rate in seconds */
int sample_rate;
/* "node_memory_limit" MCA parameter (a percentage) divided by 100 */
float node_memory_limit;
/* "proc_memory_limit" MCA parameter: max virtual memory size in MBytes */
float proc_memory_limit;
/* "log_node_stats" MCA parameter: log the node stats */
bool log_node_stats;
/* "log_process_stats" MCA parameter: log the process stats */
bool log_process_stats;
};
typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t;
ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component;
extern orte_sensor_base_module_t orte_sensor_resusage_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,138 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "sensor_resusage.h"
/*
* Local functions
*/
static int orte_sensor_resusage_register (void);
static int orte_sensor_resusage_open(void);
static int orte_sensor_resusage_close(void);
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor for the resusage sensor. Only the embedded
 * generic sensor component ("super") is initialized here; the
 * remaining fields are filled in when the MCA parameters are
 * registered.
 */
orte_sensor_resusage_component_t mca_sensor_resusage_component = {
/* super: the generic sensor component */
{
/* MCA base component: version info and entry points */
{
ORTE_SENSOR_BASE_VERSION_1_0_0,
"resusage", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_sensor_resusage_open, /* component open */
orte_sensor_resusage_close, /* component close */
orte_sensor_resusage_query, /* component query */
orte_sensor_resusage_register
},
/* MCA component metadata */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* names of the data measured by this sensor */
"procresource,noderesource"
}
};
static int node_memory_limit;
static int proc_memory_limit;
/**
 * Component register function: registers the MCA parameters of the
 * resusage sensor and stores their values in the component.
 *
 *   sample_rate        sample rate in seconds; 0 (the default) is
 *                      accepted, negative values are rejected
 *   node_memory_limit  percentage of total node memory that may be
 *                      in use; stored as a fraction (value / 100)
 *   proc_memory_limit  max virtual memory size in MBytes
 *   log_node_stats     log the node stats
 *   log_process_stats  log the process stats
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM for a negative
 *         sample rate
 */
static int orte_sensor_resusage_register (void)
{
    mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version;

    mca_sensor_resusage_component.sample_rate = 0;
    (void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.sample_rate);
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is the default and is legal - only negative values
         * are rejected, so say so
         */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_BAD_PARAM;
    }

    /* the limits are registered as integers and converted afterwards */
    node_memory_limit = 0;
    (void) mca_base_component_var_register (c, "node_memory_limit",
                                            "Percentage of total memory that can be in-use",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &node_memory_limit);
    mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0;

    proc_memory_limit = 0;
    (void) mca_base_component_var_register (c, "proc_memory_limit",
                                            "Max virtual memory size in MBytes",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &proc_memory_limit);
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    mca_sensor_resusage_component.log_node_stats = false;
    (void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_node_stats);

    mca_sensor_resusage_component.log_process_stats = false;
    (void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats",
                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                            OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_sensor_resusage_component.log_process_stats);

    return ORTE_SUCCESS;
}
/**
 * Component open: validates the sample rate and converts the
 * integer memory-limit parameters into the float values stored in
 * the component (node limit as a fraction, proc limit unchanged).
 *
 * @return ORTE_SUCCESS, or ORTE_ERR_FATAL for a negative sample rate
 */
static int orte_sensor_resusage_open(void)
{
    if (mca_sensor_resusage_component.sample_rate < 0) {
        /* zero is legal - only negative values are rejected */
        opal_output(0, "Illegal value %d - must be >= 0", mca_sensor_resusage_component.sample_rate);
        return ORTE_ERR_FATAL;
    }

    mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0;
    mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit;

    return ORTE_SUCCESS;
}
/**
 * Component query: hand back the resusage module together with its
 * selection priority.
 *
 * @param module    (OUT) set to the resusage sensor module
 * @param priority  (OUT) set to 100
 * @return ORTE_SUCCESS always
 */
static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *)&orte_sensor_resusage_module;

    /* ahead of heartbeat */
    *priority = 100;

    return ORTE_SUCCESS;
}
/**
 * Component close: the resusage component holds no component-level
 * resources, so there is nothing to tear down.
 */
static int orte_sensor_resusage_close(void)
{
    const int rc = ORTE_SUCCESS;

    return rc;
}

Просмотреть файл

@ -1,107 +0,0 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file:
*
*/
#ifndef MCA_SENSOR_H
#define MCA_SENSOR_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* start collecting data for the given job */
typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job);
/* stop collecting data for the given job */
typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job);
/*
 * API module, Ver 1.0: the function table reached through the
 * global "orte_sensor" structure declared at the end of this header.
 */
struct orte_sensor_base_API_module_1_0_0_t {
orte_sensor_API_module_start_fn_t start;
orte_sensor_API_module_stop_fn_t stop;
};
typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t;
typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t;
/* initialize the module; returns a status code */
typedef int (*orte_sensor_base_module_init_fn_t)(void);
/* finalize the module */
typedef void (*orte_sensor_base_module_finalize_fn_t)(void);
/* tell the module to sample its sensor */
typedef void (*orte_sensor_base_module_sample_fn_t)(void);
/* pass a buffer to the module for logging */
typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample);
/*
 * Component modules Ver 1.0: the function table each sensor
 * component provides. Module instances initialize these fields
 * positionally, so the field order must not change.
 */
struct orte_sensor_base_module_1_0_0_t {
orte_sensor_base_module_init_fn_t init;
orte_sensor_base_module_finalize_fn_t finalize;
orte_sensor_API_module_start_fn_t start;
orte_sensor_API_module_stop_fn_t stop;
orte_sensor_base_module_sample_fn_t sample;
orte_sensor_base_module_log_fn_t log;
};
typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t;
typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t;
/*
 * the standard component data structure: the MCA base component
 * and metadata, followed by a sensor-specific string
 */
struct orte_sensor_base_component_1_0_0_t {
/* MCA base component (version info and entry points) */
mca_base_component_t base_version;
/* MCA component metadata */
mca_base_component_data_t base_data;
/* name(s) of the data this sensor measures */
char *data_measured;
};
typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t;
typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t;
/*
 * Macro for use in components that are of type sensor v1.0.0.
 * It expands to the leading entries of a component initializer:
 * the MCA base version followed by the framework name "sensor"
 * and the framework version 1, 0, 0.
 */
#define ORTE_SENSOR_BASE_VERSION_1_0_0 \
/* sensor v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* sensor v1.0 */ \
"sensor", 1, 0, 0
/* Global structure for accessing sensor functions
*/
ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */
END_C_DECLS
#endif /* MCA_SENSOR_H */

Просмотреть файл

@ -1,51 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef ORTE_MCA_SENSOR_TYPES_H
#define ORTE_MCA_SENSOR_TYPES_H
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include "opal/dss/dss_types.h"
/*
* General SENSOR types - instanced in runtime/orte_globals.c
*/
BEGIN_C_DECLS
/* Scale identifiers for sensor data: linear, logarithmic or sigmoid.
 * NOTE(review): no user of these values is visible in this header -
 * confirm how they are consumed before relying on their numbering.
 */
enum {
ORTE_SENSOR_SCALE_LINEAR,
ORTE_SENSOR_SCALE_LOG,
ORTE_SENSOR_SCALE_SIGMOID
};
/*
 * Structure for passing data from sensors
 */
typedef struct {
/* base object, so instances are handled by the OPAL object system */
opal_object_t super;
/* presumably the name of the sensor that produced the data - confirm */
char *sensor;
/* presumably the time the data was taken - confirm */
struct timeval timestamp;
/* the data itself, as an opaque byte object */
opal_byte_object_t data;
} orte_sensor_data_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t);
END_C_DECLS
#endif

Просмотреть файл

@ -2,6 +2,7 @@
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -85,27 +86,19 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework;
ORTE_JOBID_PRINT(shadow->jobid), \
orte_job_state_to_str((s)), \
__FILE__, __LINE__); \
/* sanity check */ \
if ((s) < 0) { \
assert(0); \
} \
orte_state.activate_job_state(shadow, (s)); \
} while(0);
#define ORTE_ACTIVATE_PROC_STATE(p, s) \
do { \
orte_process_name_t *shadow=(p); \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
opal_output_verbose(1, orte_state_base_framework.framework_output, \
"%s ACTIVATE PROC %s STATE %s AT %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
(NULL == shadow) ? "NULL" : \
ORTE_NAME_PRINT(shadow), \
orte_proc_state_to_str((s)), \
__FILE__, __LINE__); \
/* sanity check */ \
if ((s) < 0) { \
assert(0); \
} \
orte_state.activate_proc_state(shadow, (s)); \
} while(0);

Просмотреть файл

@ -1,13 +1,13 @@
/*
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/

Просмотреть файл

@ -102,7 +102,9 @@ static opal_pmix_server_module_t pmix_server = {
.notify_event = pmix_server_notify_event,
.query = pmix_server_query_fn,
.tool_connected = pmix_tool_connected_fn,
.log = pmix_server_log_fn
.log = pmix_server_log_fn,
.allocate = pmix_server_alloc_fn,
.job_control = pmix_server_job_ctrl_fn
};
void pmix_server_register_params(void)
@ -265,6 +267,12 @@ int pmix_server_init(void)
kv->type = OPAL_BOOL;
kv->data.flag = true;
opal_list_append(&info, &kv->super);
/* tell the server to use its own internal monitoring */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_ENABLE_MONITORING);
kv->type = OPAL_BOOL;
kv->data.flag = true;
opal_list_append(&info, &kv->super);
/* setup the local server */
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {

Просмотреть файл

@ -511,3 +511,13 @@ int pmix_server_disconnect_fn(opal_list_t *procs, opal_list_t *info,
return rc;
}
/*
 * Server upcall for allocation requests.
 *
 * All arguments are ignored: ORTE currently has no way of
 * supporting allocation requests, so every request is refused.
 *
 * Returns ORTE_ERR_NOT_SUPPORTED; the callback function is never
 * invoked.
 */
int pmix_server_alloc_fn(const opal_process_name_t *requestor,
                         opal_pmix_alloc_directive_t dir,
                         opal_list_t *info,
                         opal_pmix_info_cbfunc_t cbfunc,
                         void *cbdata)
{
    const int rc = ORTE_ERR_NOT_SUPPORTED;

    return rc;
}

Просмотреть файл

@ -40,10 +40,12 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "pmix_server_internal.h"
@ -611,7 +613,15 @@ static void _query(int sd, short args, void *cbdata)
* and ask directly for the info - if rank=wildcard, then
* we need to xcast the request and collect the results */
}
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_TIME_REMAINING)) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_TIME_REMAINING);
kv->type = OPAL_UINT32;
if (ORTE_SUCCESS != orte_schizo.get_remaining_time(&kv->data.uint32)) {
OBJ_RELEASE(kv);
} else {
opal_list_append(results, &kv->super);
}
}
}
}
@ -813,3 +823,62 @@ void pmix_server_log_fn(opal_process_name_t *requestor,
cbfunc(OPAL_SUCCESS, cbdata);
}
}
/*
 * Server upcall for job control requests.
 *
 * Walks the directives in "info" and acts on the ones it knows
 * about. Currently only OPAL_PMIX_JOB_CTRL_KILL is handled: the
 * target names are translated into proc objects and handed to
 * orte_plm.terminate_procs(). A NULL target list is passed on as a
 * NULL array.
 *
 * requestor - process that issued the request (used for logging)
 * targets   - list of opal_namelist_t naming the procs to act on,
 *             or NULL
 * info      - list of opal_value_t directives
 * cbfunc    - completion callback
 * cbdata    - opaque data for the callback
 *
 * Returns ORTE_ERR_BAD_PARAM if no directive list was given,
 * otherwise ORTE_SUCCESS. Failures while processing a directive are
 * logged but not returned.
 *
 * NOTE(review): cbfunc is never invoked here although ORTE_SUCCESS
 * is returned - confirm what the caller expects in that case.
 */
int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
                            opal_list_t *targets,
                            opal_list_t *info,
                            opal_pmix_info_cbfunc_t cbfunc,
                            void *cbdata)
{
    opal_value_t *val;
    int rc, n;
    orte_proc_t *proc;
    opal_pointer_array_t parray, *ptrarray;
    opal_namelist_t *nm;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s job control request from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(requestor));

    /* without any directives there is nothing we could do, and the
     * list walk below must not be handed a NULL list
     */
    if (NULL == info) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_LIST_FOREACH(val, info, opal_value_t) {
        if (NULL == val->key) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            continue;
        }
        if (0 == strcmp(val->key, OPAL_PMIX_JOB_CTRL_KILL)) {
            /* convert the list of targets to a pointer array */
            if (NULL == targets) {
                ptrarray = NULL;
            } else {
                OBJ_CONSTRUCT(&parray, opal_pointer_array_t);
                OPAL_LIST_FOREACH(nm, targets, opal_namelist_t) {
                    /* get the proc object for this proc */
                    if (NULL == (proc = orte_get_proc_object(&nm->name))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        continue;
                    }
                    OBJ_RETAIN(proc);
                    if (0 > (rc = opal_pointer_array_add(&parray, proc))) {
                        /* the array did not take the proc, so the
                         * cleanup loop below will never see it -
                         * give back the reference right away
                         */
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(proc);
                    }
                }
                ptrarray = &parray;
            }
            if (ORTE_SUCCESS != (rc = orte_plm.terminate_procs(ptrarray))) {
                ORTE_ERROR_LOG(rc);
            }
            if (NULL != ptrarray) {
                /* cleanup the array */
                for (n=0; n < parray.size; n++) {
                    if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(&parray, n))) {
                        OBJ_RELEASE(proc);
                    }
                }
                OBJ_DESTRUCT(&parray);
            }
            continue;
        }
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -206,6 +206,18 @@ extern void pmix_server_log_fn(opal_process_name_t *requestor,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
/* presumably the handler for allocation requests, taking the
 * requestor's name, an allocation directive, a list of info
 * values and a completion callback with its data - the
 * implementation is not visible here, TODO confirm */
extern int pmix_server_alloc_fn(const opal_process_name_t *requestor,
opal_pmix_alloc_directive_t dir,
opal_list_t *info,
opal_pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* handler for job control requests: scans the info list for
 * directives and, for OPAL_PMIX_JOB_CTRL_KILL, asks the PLM to
 * terminate the procs named in the targets list (a NULL targets
 * list is passed to the PLM as a NULL array). Returns an ORTE
 * status code */
extern int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor,
opal_list_t *targets,
opal_list_t *info,
opal_pmix_info_cbfunc_t cbfunc,
void *cbdata);
/* declare the RML recv functions for responses */
extern void pmix_server_launch_resp(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,

Просмотреть файл

@ -705,7 +705,7 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
char *ndnames, *rmndr, **tmp;
opal_list_t dids, slts, flgs;;
opal_buffer_t *bptr=NULL;
orte_topology_t *t;
orte_topology_t *t2;
orte_regex_range_t *rng, *drng, *srng, *frng;
uint8_t ui8;
@ -978,14 +978,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
/* if no topology info was passed, then everyone shares our topology */
if (NULL == bptr) {
orte_topology_t *t;
/* our topology is first in the array */
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
for (n=0; n < orte_node_pool->size; n++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
if (NULL == node->topology) {
OBJ_RETAIN(t);
node->topology = t;
OBJ_RETAIN(t2);
node->topology = t2;
}
}
}
@ -1004,6 +1003,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
OBJ_RELEASE(bptr);
goto cleanup;
}
if (NULL == sig) {
rc = ORTE_ERR_BAD_PARAM;
ORTE_ERROR_LOG(rc);
opal_argv_free(tmp);
OBJ_RELEASE(bptr);
goto cleanup;
}
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) {
ORTE_ERROR_LOG(rc);
@ -1013,11 +1019,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
goto cleanup;
}
/* see if we already have this topology - could be an update */
t2 = NULL;
for (n=0; n < orte_node_topologies->size; n++) {
if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) {
continue;
}
if (0 == strcmp(t->sig, sig)) {
if (0 == strcmp(t2->sig, sig)) {
/* found a match */
free(sig);
opal_hwloc_base_free_topology(topo);
@ -1025,11 +1032,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
break;
}
}
if (NULL != sig) {
if (NULL != sig || NULL == t2) {
/* new topology - record it */
t = OBJ_NEW(orte_topology_t);
t->sig = sig;
t->topo = topo;
t2 = OBJ_NEW(orte_topology_t);
t2->sig = sig;
t2->topo = topo;
opal_pointer_array_add(orte_node_topologies, t2);
}
/* point each of the nodes in the regex to this topology */
start = strtoul(tmp[nn], &rmndr, 10);
@ -1043,8 +1051,8 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
for (k=start; k <= endpt; k++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) {
if (NULL == node->topology) {
OBJ_RETAIN(t);
node->topology = t;
OBJ_RETAIN(t2);
node->topology = t2;
}
}
}