1
1

Per the earlier RFC and some discussion at the Dec ORTE design meeting, add the ompi-top tool and all its supporting infrastructure. This includes a new OPAL pstat framework and data type, currently with rather weak support for Mac OSX and pretty complete support for Linux. The Sun team promised to add Solaris support as well.

Also, per chat with Jeff, modified the Makefile.am's of a few orte tools so that they were consistent in the way we generate the ompi-equivalent cmds.

This commit was SVN r20165.
Этот коммит содержится в:
Ralph Castain 2008-12-22 20:23:05 +00:00
родитель 9f99f9c63d
Коммит 7787f84540
44 изменённых файлов: 3045 добавлений и 138 удалений

Просмотреть файл

@ -1345,6 +1345,7 @@ AC_CONFIG_FILES([
orte/tools/orte-restart/Makefile
orte/tools/orte-ps/Makefile
orte/tools/orte-clean/Makefile
orte/tools/orte-top/Makefile
ompi/Makefile
ompi/etc/Makefile

Просмотреть файл

@ -29,12 +29,20 @@ install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f mpiexec$(EXEEXT); $(LN_S) orterun$(EXEEXT) mpiexec$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-ps$(EXEEXT); $(LN_S) orte-ps$(EXEEXT) ompi-ps$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-clean$(EXEEXT); $(LN_S) orte-clean$(EXEEXT) ompi-clean$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-iof$(EXEEXT); $(LN_S) orte-iof$(EXEEXT) ompi-iof$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
# Remove the convenience names this directory places in $(bindir).
# Keep this list in sync with the links created by install-exec-hook:
# every link made there needs a matching entry here, and every line
# except the last must end in a continuation backslash.
uninstall-local:
	rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
		$(DESTDIR)$(bindir)/mpiexec$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-ps$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-iof$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-restart$(EXEEXT) \
		$(DESTDIR)$(bindir)/ompi-top$(EXEEXT)
endif # !ORTE_DISABLE_FULL_SUPPORT
endif # OMPI_INSTALL_BINARIES
@ -60,5 +68,29 @@ $(top_builddir)/orte/tools/orte-clean/orte-clean.1:
ompi-clean.1: $(top_builddir)/orte/tools/orte-clean/orte-clean.1
cp -f $(top_builddir)/orte/tools/orte-clean/orte-clean.1 ompi-clean.1
$(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1:
(cd $(top_builddir)/orte/tools/orte-checkpoint && $(MAKE) $(AM_MAKEFLAGS) orte-checkpoint.1)
# Publish the orte-checkpoint man page under its ompi- name.  The copy
# source must be the same file that is listed as the prerequisite (it
# lives in orte-checkpoint/, not orte-clean/).
ompi-checkpoint.1: $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1
	cp -f $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 ompi-checkpoint.1
# Each ompi-<tool>.1 man page is a plain copy of the matching
# orte-<tool>.1 page.  For every tool there are two rules: the first
# asks the tool's own directory to generate its man page, the second
# copies the result into this directory under the ompi- name.

# orte-iof -> ompi-iof
$(top_builddir)/orte/tools/orte-iof/orte-iof.1:
(cd $(top_builddir)/orte/tools/orte-iof && $(MAKE) $(AM_MAKEFLAGS) orte-iof.1)
ompi-iof.1: $(top_builddir)/orte/tools/orte-iof/orte-iof.1
cp -f $(top_builddir)/orte/tools/orte-iof/orte-iof.1 ompi-iof.1
# orte-restart -> ompi-restart
$(top_builddir)/orte/tools/orte-restart/orte-restart.1:
(cd $(top_builddir)/orte/tools/orte-restart && $(MAKE) $(AM_MAKEFLAGS) orte-restart.1)
ompi-restart.1: $(top_builddir)/orte/tools/orte-restart/orte-restart.1
cp -f $(top_builddir)/orte/tools/orte-restart/orte-restart.1 ompi-restart.1
# orte-top -> ompi-top
$(top_builddir)/orte/tools/orte-top/orte-top.1:
(cd $(top_builddir)/orte/tools/orte-top && $(MAKE) $(AM_MAKEFLAGS) orte-top.1)
ompi-top.1: $(top_builddir)/orte/tools/orte-top/orte-top.1
cp -f $(top_builddir)/orte/tools/orte-top/orte-top.1 ompi-top.1
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -253,3 +253,9 @@ int opal_dss_compare_byte_object(opal_byte_object_t *value1, opal_byte_object_t
return OPAL_EQUAL; /* sum of both value's bytes was identical */
}
/* OPAL_PSTAT */
/*
 * Compare two process-statistics objects.
 *
 * Placeholder implementation: neither argument is inspected and every
 * pair is reported as OPAL_EQUAL.  A field-by-field comparison is still
 * to be written.
 */
int opal_dss_compare_pstat(opal_pstats_t *value1, opal_pstats_t *value2, opal_data_type_t type)
{
    int verdict = OPAL_EQUAL;   /* eventually compare field to field */

    return verdict;
}

Просмотреть файл

@ -197,3 +197,34 @@ int opal_dss_copy_byte_object(opal_byte_object_t **dest, opal_byte_object_t *src
return OPAL_SUCCESS;
}
/* OPAL_PSTAT */
/*
 * Create a new opal_pstats_t in *dest and copy every statistics field
 * of src into it.
 *
 * @param dest  receives the newly created object
 * @param src   object to duplicate
 * @param type  data type id (not used here)
 *
 * @retval OPAL_SUCCESS              copy created
 * @retval OPAL_ERR_OUT_OF_RESOURCE  the new object could not be created
 */
int opal_dss_copy_pstat(opal_pstats_t **dest, opal_pstats_t *src,
                        opal_data_type_t type)
{
    opal_pstats_t *p;

    /* create the new object */
    *dest = OBJ_NEW(opal_pstats_t);
    if (NULL == *dest) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    p = *dest;

    /* copy the individual fields; each fixed-size buffer is sized by
     * its own array so the two copies stay correct even if the node
     * and cmd capacities ever diverge */
    memcpy(p->node, src->node, sizeof(src->node));
    p->rank = src->rank;
    p->pid = src->pid;
    memcpy(p->cmd, src->cmd, sizeof(src->cmd));
    p->state = src->state;
    p->time = src->time;
    p->priority = src->priority;
    p->num_threads = src->num_threads;
    p->vsize = src->vsize;
    p->rss = src->rss;
    p->peak_vsize = src->peak_vsize;
    p->shared_size = src->shared_size;
    p->processor = src->processor;

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -308,6 +308,9 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_pack_byte_object(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
int opal_dss_pack_pstat(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
/*
* Internal unpack functions
*/
@ -347,6 +350,9 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_unpack_byte_object(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);
int opal_dss_unpack_pstat(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);
/*
* Internal copy functions
*/
@ -362,6 +368,10 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_copy_data_value(opal_dss_value_t **dest, opal_dss_value_t *src,
opal_data_type_t type);
int opal_dss_copy_pstat(opal_pstats_t **dest, opal_pstats_t *src,
opal_data_type_t type);
/*
* Internal compare functions
*/
@ -399,6 +409,8 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_compare_byte_object(opal_byte_object_t *value1, opal_byte_object_t *value2, opal_data_type_t type);
int opal_dss_compare_pstat(opal_pstats_t *value1, opal_pstats_t *value2, opal_data_type_t type);
/*
* Internal size functions
*/
@ -410,6 +422,8 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_size_byte_object(size_t *size, opal_byte_object_t *src, opal_data_type_t type);
int opal_dss_size_pstat(size_t *size, opal_pstats_t *src, opal_data_type_t type);
/*
* Internal print functions
*/
@ -439,6 +453,7 @@ OPAL_DECLSPEC int opal_dss_unpack_buffer(opal_buffer_t *buffer, void *dst,
int opal_dss_print_data_type(char **output, char *prefix, opal_data_type_t *src, opal_data_type_t type);
int opal_dss_print_data_value(char **output, char *prefix, opal_dss_value_t *src, opal_data_type_t type);
int opal_dss_print_byte_object(char **output, char *prefix, opal_byte_object_t *src, opal_data_type_t type);
int opal_dss_print_pstat(char **output, char *prefix, opal_pstats_t *src, opal_data_type_t type);
/*

Просмотреть файл

@ -133,6 +133,26 @@ OBJ_CLASS_INSTANCE(opal_dss_type_info_t, opal_object_t,
opal_dss_type_info_destruct);
/*
 * Constructor for opal_pstats_t: put every field into a well-defined
 * "nothing collected yet" state.
 *
 * The string buffers are zero-filled, the state is set to 'U', the
 * counters are zeroed, and priority / num_threads / processor are set
 * to -1.  rank and pid are zeroed as well so that copy, pack and print
 * never read indeterminate values from a freshly created object.
 */
static void opal_pstat_construct(opal_pstats_t *obj)
{
    memset(obj->node, 0, sizeof(obj->node));
    obj->rank = 0;
    obj->pid = 0;
    memset(obj->cmd, 0, sizeof(obj->cmd));
    obj->state = 'U';
    obj->time = 0;
    obj->priority = -1;
    obj->num_threads = -1;
    obj->vsize = 0;
    obj->rss = 0;
    obj->peak_vsize = 0;
    obj->shared_size = 0;
    obj->processor = -1;
}
OBJ_CLASS_INSTANCE(opal_pstats_t, opal_list_item_t,
opal_pstat_construct,
NULL);
int opal_dss_open(void)
{
char *enviro_val;
@ -418,6 +438,18 @@ int opal_dss_open(void)
return rc;
}
tmp = OPAL_PSTAT;
if (OPAL_SUCCESS != (rc = opal_dss.register_type(opal_dss_pack_pstat,
opal_dss_unpack_pstat,
(opal_dss_copy_fn_t)opal_dss_copy_pstat,
(opal_dss_compare_fn_t)opal_dss_compare_pstat,
(opal_dss_size_fn_t)opal_dss_size_pstat,
(opal_dss_print_fn_t)opal_dss_print_pstat,
(opal_dss_release_fn_t)opal_dss_std_obj_release,
OPAL_DSS_STRUCTURED,
"OPAL_PSTAT", &tmp))) {
return rc;
}
/* All done */
return OPAL_SUCCESS;

Просмотреть файл

@ -417,3 +417,64 @@ int opal_dss_pack_byte_object(opal_buffer_t *buffer, const void *src, int32_t nu
return OPAL_SUCCESS;
}
/*
* OPAL_PSTAT
*/
/*
 * Pack num_vals process-statistics objects into the buffer.
 *
 * src is treated as an array of opal_pstats_t pointers.  For each
 * object the fields are written in a fixed order (node, rank, pid,
 * cmd, state, time, priority, num_threads, vsize, rss, peak_vsize,
 * shared_size, processor); opal_dss_unpack_pstat reads them back in
 * the same order.  The first packing failure is returned immediately.
 */
int opal_dss_pack_pstat(opal_buffer_t *buffer, const void *src,
                        int32_t num_vals, opal_data_type_t type)
{
    opal_pstats_t **stats = (opal_pstats_t **) src;
    opal_pstats_t *ps;
    char *str;
    int32_t idx;
    int rc;

    for (idx = 0; idx < num_vals; ++idx) {
        ps = stats[idx];

        /* the fixed-size char arrays go out as ordinary strings */
        str = ps->node;
        rc = opal_dss_pack_buffer(buffer, &str, 1, OPAL_STRING);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->rank, 1, OPAL_INT32);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->pid, 1, OPAL_PID);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        str = ps->cmd;
        rc = opal_dss_pack_buffer(buffer, &str, 1, OPAL_STRING);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->state, 1, OPAL_BYTE);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->time, 1, OPAL_UINT64);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->priority, 1, OPAL_INT32);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->num_threads, 1, OPAL_INT16);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->vsize, 1, OPAL_UINT64);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->rss, 1, OPAL_UINT64);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->peak_vsize, 1, OPAL_UINT64);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->shared_size, 1, OPAL_UINT64);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        rc = opal_dss_pack_buffer(buffer, &ps->processor, 1, OPAL_INT16);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -459,3 +459,28 @@ int opal_dss_print_byte_object(char **output, char *prefix, opal_byte_object_t *
return OPAL_SUCCESS;
}
/*
* OPAL_PSTAT
*/
/*
 * Render a process-statistics object as a newly allocated string.
 *
 * @param output  receives the allocated string (caller frees); set to
 *                NULL if formatting fails
 * @param prefix  text prepended to each output line; NULL selects a
 *                single-space default
 * @param src     object to print; NULL produces a "NULL pointer" note
 * @param type    data type id (not used here)
 *
 * @retval OPAL_SUCCESS              string produced
 * @retval OPAL_ERR_OUT_OF_RESOURCE  asprintf could not allocate
 */
int opal_dss_print_pstat(char **output, char *prefix, opal_pstats_t *src, opal_data_type_t type)
{
    char *prefx;
    int rc;

    /* deal with NULL prefix */
    if (NULL == prefix) {
        if (0 > asprintf(&prefx, " ")) {
            *output = NULL;
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
    } else {
        prefx = prefix;
    }

    if (NULL == src) {
        /* if src is NULL, just print data type and return */
        rc = asprintf(output, "%sData type: OPAL_PSTATS\tValue: NULL pointer", prefx);
    } else {
        /* The 64-bit counters are cast to unsigned long long and
         * printed with %llu: uint64_t is not "unsigned long" on every
         * platform, so %lu would be undefined behaviour there. */
        rc = asprintf(output, "%snode: %s rank: %d pid: %d cmd: %s state: %c pri: %d #threads: %d Processor: %d\n"
                      "%s\ttime: %llu VMsize: %llu PeakVMSize: %llu RSS: %llu Share: %llu\n",
                      prefx, src->node, (int) src->rank, (int) src->pid, src->cmd, src->state,
                      (int) src->priority, (int) src->num_threads, (int) src->processor,
                      prefx, (unsigned long long) src->time, (unsigned long long) src->vsize,
                      (unsigned long long) src->peak_vsize, (unsigned long long) src->rss,
                      (unsigned long long) src->shared_size);
    }

    /* release the default prefix if we were the ones who allocated it */
    if (prefx != prefix) {
        free(prefx);
    }

    if (0 > rc) {
        /* the contents of *output are undefined after a failed asprintf */
        *output = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -152,3 +152,13 @@ int opal_dss_size_byte_object(size_t *size, opal_byte_object_t *src, opal_data_t
return OPAL_SUCCESS;
}
/*
* OPAL_PSTAT
*/
/*
 * Report the size of a process-statistics object.
 *
 * The result is always sizeof(opal_pstats_t); src is not examined.
 */
int opal_dss_size_pstat(size_t *size, opal_pstats_t *src, opal_data_type_t type)
{
    const size_t nbytes = sizeof(opal_pstats_t);

    *size = nbytes;

    return OPAL_SUCCESS;
}

Просмотреть файл

@ -30,6 +30,7 @@
#include "opal/types.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_list.h"
BEGIN_C_DECLS
@ -69,8 +70,9 @@ typedef struct {
#define OPAL_DATA_TYPE (opal_data_type_t) 17 /**< data type */
#define OPAL_NULL (opal_data_type_t) 18 /**< don't interpret data type */
#define OPAL_DATA_VALUE (opal_data_type_t) 19 /**< data value */
#define OPAL_PSTAT (opal_data_type_t) 20 /**< process statistics */
/* Keep this above the last predefined type id listed above; it is
 * defined exactly once, and the stale value 20 (which would collide
 * with OPAL_PSTAT) has been dropped. */
#define OPAL_DSS_ID_DYNAMIC (opal_data_type_t) 30
/* define the results values for comparisons so we can change them in only one place */
#define OPAL_VALUE1_GREATER +1
@ -87,6 +89,26 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_dss_value_t);
#define OPAL_DATA_VALUE_EMPTY { OPAL_OBJ_STATIC_INIT(opal_dss_value_t), OPAL_UNDEF, NULL}
/* Process statistics object */
/* Capacity, in bytes, of the fixed-size node and cmd buffers below. */
#define OPAL_PSTAT_MAX_STRING_LEN 32
/*
 * One record of statistics for a single process.  It derives from
 * opal_list_item_t so records can be kept on an opal_list_t, and it is
 * registered with the DSS as OPAL_PSTAT so it can be packed, unpacked,
 * copied and printed.
 */
typedef struct {
opal_list_item_t super; /* required for this to be on a list */
/* packed and printed as a string; presumably the host name - confirm */
char node[OPAL_PSTAT_MAX_STRING_LEN];
/* packed as OPAL_INT32; presumably the process rank within its job - confirm */
int32_t rank;
/* process id; query() takes the pid of the process to inspect */
pid_t pid;
/* packed and printed as a string; presumably the command name - confirm */
char cmd[OPAL_PSTAT_MAX_STRING_LEN];
/* single-character state code; the constructor sets it to 'U' */
char state;
/* NOTE(review): units of time are not visible here - confirm with the components */
uint64_t time;
/* the constructor sets priority, num_threads and processor to -1 */
int32_t priority;
int16_t num_threads;
uint64_t vsize; /* in kBytes */
uint64_t rss; /* in kBytes */
uint64_t peak_vsize; /* in kBytes */
uint64_t shared_size; /* in kBytes */
int16_t processor;
} opal_pstats_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_pstats_t);
/* structured-unstructured data flags */
#define OPAL_DSS_STRUCTURED true
#define OPAL_DSS_UNSTRUCTURED false

Просмотреть файл

@ -509,3 +509,84 @@ int opal_dss_unpack_byte_object(opal_buffer_t *buffer, void *dest, int32_t *num,
return OPAL_SUCCESS;
}
/*
* OPAL_PSTAT
*/
/*
 * Unpack one string from the buffer into a fixed-size character array.
 *
 * The unpacked string comes from the message and may be longer than
 * the destination, so the copy is bounded by dstlen and the result is
 * always NUL-terminated (over-long strings are truncated).  A NULL
 * result from the unpack leaves dst untouched.
 */
static int opal_dss_unpack_pstat_string(opal_buffer_t *buffer, char *dst, size_t dstlen)
{
    char *tmp = NULL;
    int32_t cnt = 1;
    int ret;

    if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &tmp, &cnt, OPAL_STRING))) {
        return ret;
    }
    if (NULL != tmp) {
        strncpy(dst, tmp, dstlen - 1);
        dst[dstlen - 1] = '\0';
        free(tmp);
    }
    return OPAL_SUCCESS;
}

/*
 * OPAL_PSTAT
 *
 * Unpack *num_vals process-statistics objects from the buffer.
 *
 * dest is treated as an array of opal_pstats_t pointers; a new object
 * is created for each entry and its fields are read in the order in
 * which opal_dss_pack_pstat wrote them.  The first failure is returned
 * immediately; objects created up to that point remain in dest.
 *
 * @retval OPAL_SUCCESS              all objects unpacked
 * @retval OPAL_ERR_OUT_OF_RESOURCE  an object could not be created
 * @retval other                     error from opal_dss_unpack_buffer
 */
int opal_dss_unpack_pstat(opal_buffer_t *buffer, void *dest,
                          int32_t *num_vals, opal_data_type_t type)
{
    opal_pstats_t **ptr;
    int32_t i, n, m;
    int ret;

    ptr = (opal_pstats_t **) dest;
    n = *num_vals;

    for (i = 0; i < n; ++i) {
        /* allocate the new object */
        ptr[i] = OBJ_NEW(opal_pstats_t);
        if (NULL == ptr[i]) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_pstat_string(buffer, ptr[i]->node,
                                                                sizeof(ptr[i]->node)))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->rank, &m, OPAL_INT32))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->pid, &m, OPAL_PID))) {
            return ret;
        }
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_pstat_string(buffer, ptr[i]->cmd,
                                                                sizeof(ptr[i]->cmd)))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->state, &m, OPAL_BYTE))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->time, &m, OPAL_UINT64))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->priority, &m, OPAL_INT32))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->num_threads, &m, OPAL_INT16))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->vsize, &m, OPAL_UINT64))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->rss, &m, OPAL_UINT64))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->peak_vsize, &m, OPAL_UINT64))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->shared_size, &m, OPAL_UINT64))) {
            return ret;
        }
        m=1;
        if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->processor, &m, OPAL_INT16))) {
            return ret;
        }
    }

    return OPAL_SUCCESS;
}

41
opal/mca/pstat/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_pstat.la
libmca_pstat_la_SOURCES =
# header setup
nobase_opal_HEADERS =
# local files
headers = pstat.h
libmca_pstat_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_opal_HEADERS += $(headers)
opaldir = $(includedir)/openmpi/opal/mca/pstat
else
opaldir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

25
opal/mca/pstat/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,25 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_pstat_la_SOURCES += \
base/pstat_base_close.c \
base/pstat_base_select.c \
base/pstat_base_open.c

82
opal/mca/pstat/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,82 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#ifndef OPAL_PSTAT_BASE_H
#define OPAL_PSTAT_BASE_H
#include "opal_config.h"
#include "opal/mca/pstat/pstat.h"
/*
* Global functions for MCA overall pstat open and close
*/
BEGIN_C_DECLS
/**
* Initialize the pstat MCA framework
*
* @retval OPAL_SUCCESS Upon success
* @retval OPAL_ERROR Upon failure
*
* This must be the first function invoked in the pstat MCA
* framework. It initializes the pstat MCA framework, finds
* and opens pstat components, etc.
*
* This function is invoked during opal_init().
*/
OPAL_DECLSPEC int opal_pstat_base_open(void);
/**
* Close the pstat MCA framework
*
* @retval OPAL_SUCCESS Upon success
* @retval OPAL_ERROR Upon failure
*
* This must be the last function invoked in the pstat MCA
* framework.
*
* This function is invoked during opal_finalize().
*/
OPAL_DECLSPEC int opal_pstat_base_close(void);
/**
* Select an available component.
*
* @return OPAL_SUCCESS Upon success.
* @return OPAL_NOT_FOUND If no component can be selected.
* @return OPAL_ERROR Upon other failure.
*
* At the end of this process, we'll either have a single
* component that is selected and initialized, or no component was
* selected. If no component was selected, subsequent invocation
* of the pstat functions will return an error indicating no data
* could be obtained
*/
OPAL_DECLSPEC int opal_pstat_base_select(void);
OPAL_DECLSPEC extern int opal_pstat_base_output;
OPAL_DECLSPEC extern opal_list_t opal_pstat_base_components_opened;
OPAL_DECLSPEC extern opal_pstat_base_component_t *opal_pstat_base_component;
END_C_DECLS
#endif /* OPAL_PSTAT_BASE_H */

40
opal/mca/pstat/base/pstat_base_close.c Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/pstat/base/base.h"
int opal_pstat_base_close(void)
{
/* Close all components that are still open (this should only
happen during ompi_info). */
mca_base_components_close(opal_pstat_base_output,
&opal_pstat_base_components_opened, NULL);
OBJ_DESTRUCT(&opal_pstat_base_components_opened);
/* All done */
return OPAL_SUCCESS;
}

94
opal/mca/pstat/base/pstat_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,94 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/pstat/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "opal/mca/pstat/base/static-components.h"
/* unsupported functions */
static int opal_pstat_base_unsupported_init(void);
static int opal_pstat_base_unsupported_query(pid_t pid, opal_pstats_t *stats);
static int opal_pstat_base_unsupported_finalize(void);
/*
* Globals
*/
int opal_pstat_base_output = -1;
opal_list_t opal_pstat_base_components_opened;
opal_pstat_base_component_t *opal_pstat_base_component = NULL;
opal_pstat_base_module_t opal_pstat = {
opal_pstat_base_unsupported_init,
opal_pstat_base_unsupported_query,
opal_pstat_base_unsupported_finalize
};
/*
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
*/
/*
 * Open the pstat framework: obtain an output stream, construct the
 * list of opened components and ask the MCA base to open every
 * available pstat component onto it.
 *
 * Returns OPAL_SUCCESS when the components were opened, OPAL_ERROR
 * otherwise.
 */
int opal_pstat_base_open(void)
{
    int rc;

    opal_pstat_base_output = opal_output_open(NULL);

    /* the list must exist before components can be opened onto it */
    OBJ_CONSTRUCT(&opal_pstat_base_components_opened, opal_list_t);

    rc = mca_base_components_open("pstat", opal_pstat_base_output,
                                  mca_pstat_base_static_components,
                                  &opal_pstat_base_components_opened,
                                  true);

    return (OPAL_SUCCESS == rc) ? OPAL_SUCCESS : OPAL_ERROR;
}
/*
 * Default module entry points.  They populate the global opal_pstat
 * module until opal_pstat_base_select() replaces it with a real
 * component's module; each one simply reports that the operation is
 * not supported.
 */
static int opal_pstat_base_unsupported_init(void)
{
    const int rc = OPAL_ERR_NOT_SUPPORTED;

    return rc;
}

static int opal_pstat_base_unsupported_query(pid_t pid, opal_pstats_t *stats)
{
    /* neither argument is examined */
    (void) pid;
    (void) stats;

    return OPAL_ERR_NOT_SUPPORTED;
}

static int opal_pstat_base_unsupported_finalize(void)
{
    const int rc = OPAL_ERR_NOT_SUPPORTED;

    return rc;
}

66
opal/mca/pstat/base/pstat_base_select.c Обычный файл
Просмотреть файл

@ -0,0 +1,66 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/util/output.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/pstat/base/base.h"
/*
* Globals
*/
/*
 * Pick the best available pstat component and initialize its module.
 *
 * If no component can be selected this is not treated as an error:
 * the global opal_pstat module is left as it was and OPAL_SUCCESS is
 * returned.  Otherwise the winning component and module are stored in
 * the globals and the result of the module's init() is returned.
 */
int opal_pstat_base_select(void)
{
    opal_pstat_base_component_t *winner_component = NULL;
    opal_pstat_base_module_t *winner_module = NULL;
    int rc;

    /* Select the best component */
    rc = mca_base_select("pstat", opal_pstat_base_output,
                         &opal_pstat_base_components_opened,
                         (mca_base_module_t **) &winner_module,
                         (mca_base_component_t **) &winner_component);
    if (OPAL_SUCCESS != rc) {
        /* It is okay if we don't find a runnable component - default
         * to the unsupported default.
         */
        return OPAL_SUCCESS;
    }

    /* Save the winner */
    opal_pstat_base_component = winner_component;
    opal_pstat = *winner_module;

    /* Initialize the winner; its status is our status */
    return opal_pstat.init();
}

13
opal/mca/pstat/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,13 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2007 Los Alamos National Security, LLC.
dnl All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
dnl we only want those at same priority
m4_define(MCA_pstat_CONFIGURE_MODE, STOP_AT_FIRST_PRIORITY)

43
opal/mca/pstat/darwin/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,43 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
pstat_darwin.h \
pstat_darwin_component.c \
pstat_darwin_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_pstat_darwin_DSO
component_noinst =
component_install = mca_pstat_darwin.la
else
component_noinst = libmca_pstat_darwin.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pstat_darwin_la_SOURCES = $(sources)
mca_pstat_darwin_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_pstat_darwin_la_SOURCES =$(sources)
libmca_pstat_darwin_la_LDFLAGS = -module -avoid-version

33
opal/mca/pstat/darwin/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,33 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007-2008 Cisco, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pstat_darwin_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
AC_DEFUN([MCA_pstat_darwin_CONFIG],[
    OMPI_VAR_SCOPE_PUSH([pstat_darwin_happy])

    # <mach/mach_host.h> is a Darwin-specific header, so its presence
    # decides whether this component can be built
    AC_CHECK_HEADER([mach/mach_host.h],
                    [pstat_darwin_happy=yes],
                    [pstat_darwin_happy=no])

    AS_IF([test "$pstat_darwin_happy" = "yes"], [$1], [$2])

    OMPI_VAR_SCOPE_POP
])dnl

28
opal/mca/pstat/darwin/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,28 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_CONFIG_FILES="Makefile"
#
# Set the config priority so that, if we can build,
# only this component will build
PARAM_CONFIG_PRIORITY=50

40
opal/mca/pstat/darwin/pstat_darwin.h Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_PSTAT_DARWIN_EXPORT_H
#define MCA_PSTAT_DARWIN_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/pstat/pstat.h"
BEGIN_C_DECLS
/*
* Globally exported variable
*/
OPAL_DECLSPEC extern const opal_pstat_base_component_t mca_pstat_darwin_component;
OPAL_DECLSPEC extern const opal_pstat_base_module_t opal_pstat_darwin_module;
END_C_DECLS
#endif /* MCA_PSTAT_DARWIN_EXPORT_H */

Просмотреть файл

@ -0,0 +1,88 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/pstat/pstat.h"
#include "pstat_darwin.h"
/*
* Public string showing the pstat ompi_darwin component version number
*/
const char *opal_pstat_darwin_component_version_string =
"OPAL darwin pstat MCA component version " OPAL_VERSION;
/*
* Local function
*/
static int pstat_darwin_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
const opal_pstat_base_component_t mca_pstat_darwin_component = {
/* First, the base component struct containing meta information
about the component itself */
{
/* Indicate that we are a pstat v2.0.0 component (which also
implies a specific MCA version) */
OPAL_PSTAT_BASE_VERSION_2_0_0,
/* Component name and version */
"darwin",
OPAL_MAJOR_VERSION,
OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION,
/* Component open and close functions (none needed), followed by
the query function that hands out the module and its priority.
NOTE(review): the meaning of the final NULL slot is defined by
the MCA base component struct, which is not visible here. */
NULL,
NULL,
pstat_darwin_component_query,
NULL
},
/* Next the component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Component query: hand back the darwin module together with a fixed
 * selection priority of 20.  Always succeeds.
 */
static int pstat_darwin_component_query(mca_base_module_t **module, int *priority)
{
    const int darwin_priority = 20;

    *priority = darwin_priority;
    *module = (mca_base_module_t *) &opal_pstat_darwin_module;

    return OPAL_SUCCESS;
}

149
opal/mca/pstat/darwin/pstat_darwin_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,149 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
/* This component will only be compiled on Mac OSX, where we are
guaranteed to have these headers */
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysctl.h>
#include <assert.h>
#include <time.h>
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/pstat/pstat.h"
#include "opal/mca/pstat/base/base.h"
#include "opal/util/output.h"
#include "pstat_darwin.h"
/* Module entry points; all three are wired into the module struct below */
static int init(void);
static int query(pid_t pid, opal_pstats_t *stats);
static int fini(void);
/*
* Darwin (Mac OS X) pstat module
*/
const opal_pstat_base_module_t opal_pstat_darwin_module = {
init,
query,
fini
};
/* Module initialization: nothing to set up, always reports success */
static int init(void)
{
return OPAL_SUCCESS;
}
/* Module finalization: nothing to tear down, always reports success */
static int fini(void)
{
return OPAL_SUCCESS;
}
/*
 * Map a system errno value onto the matching OPAL_ERR_* code.
 *
 * 0 becomes OPAL_SUCCESS, ENOSYS becomes OPAL_ERR_NOT_SUPPORTED,
 * EINVAL becomes OPAL_ERR_BAD_PARAM; every other value collapses to
 * the generic OPAL_ERROR.
 */
static int convert(int ret)
{
    if (0 == ret) {
        return OPAL_SUCCESS;
    }
    if (ENOSYS == ret) {
        return OPAL_ERR_NOT_SUPPORTED;
    }
    if (EINVAL == ret) {
        return OPAL_ERR_BAD_PARAM;
    }

    /* anything we do not recognize */
    return OPAL_ERROR;
}
/* Mac OSX does things a little differently than Linux
 * by providing process stats via an API. This means we
 * don't have to parse files that could change!
 *
 * Fill in "stats" for the process identified by "pid" using the
 * { CTL_KERN, KERN_PROC, KERN_PROC_PID, pid } sysctl.
 *
 * Only the pid, cmd, time and priority fields are written; every
 * other field of "stats" is left as the caller provided it.
 *
 * Returns OPAL_SUCCESS on success, the converted errno if sysctl
 * fails, OPAL_ERR_OUT_OF_RESOURCE if the result buffer cannot be
 * allocated, and OPAL_ERROR if the kernel does not hand back exactly
 * one process record.
 */
static int query(pid_t pid, opal_pstats_t *stats)
{
    struct kinfo_proc *procs;
    int kprocinfo[] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, 0, 0 };
    size_t length;
    size_t cnt;
    size_t cmdlen;
    int err;

    kprocinfo[3] = pid;

    /* Call sysctl with a NULL buffer to find out how much memory the
     * eventual data will consume
     */
    length = 0;
    if (0 != sysctl(kprocinfo, (sizeof(kprocinfo) / sizeof(*kprocinfo)) - 1,
                    NULL, &length, NULL, 0)) {
        /* something went wrong */
        return convert(errno);
    }

    /* a zero length means there is no record to fetch - bail out now
     * rather than asking malloc for zero bytes and reading from
     * whatever it hands back
     */
    if (0 == length) {
        return OPAL_ERROR;
    }

    /* Allocate an appropriately sized buffer based on the results
     * from the previous call.
     */
    if (NULL == (procs = malloc(length))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Call sysctl again with the new buffer to get the info */
    if (0 != sysctl(kprocinfo, (sizeof(kprocinfo) / sizeof(*kprocinfo)) - 1,
                    procs, &length,
                    NULL, 0)) {
        /* something went wrong - save errno before free() gets a
         * chance to disturb it
         */
        err = errno;
        free(procs);
        return convert(err);
    }

    /* figure out how many results we got - we asked for a single pid,
     * so anything other than exactly one record is an error (zero
     * records would otherwise have us read an unfilled buffer)
     */
    cnt = length / sizeof(struct kinfo_proc);
    if (1 != cnt) {
        free(procs);
        return OPAL_ERROR;
    }

    stats->pid = pid;

    /* copy as much of the command name as fits and always terminate it */
    cmdlen = (MAXCOMLEN < OPAL_PSTAT_MAX_STRING_LEN) ?
        MAXCOMLEN : OPAL_PSTAT_MAX_STRING_LEN-1;
    memcpy(stats->cmd, procs->kp_proc.p_comm, cmdlen);
    stats->cmd[cmdlen] = '\0';

    /* we aren't getting anything useful back on state, so just leave it
     * as undefined
     * stats->state = procs->kp_proc.p_stat;
     */

    /* NOTE(review): p_cpticks is divided by CLOCKS_PER_SEC exactly as
     * before - confirm that this is the right unit conversion
     */
    stats->time = procs->kp_proc.p_cpticks / CLOCKS_PER_SEC;
    stats->priority = procs->kp_proc.p_priority;

    /* the kernel data has been copied out - release the buffer */
    free(procs);

    return OPAL_SUCCESS;
}

43
opal/mca/pstat/linux/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,43 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the component; the same list feeds both the DSO
# and the static library definitions below
sources = \
pstat_linux.h \
pstat_linux_component.c \
pstat_linux_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_pstat_linux_DSO
component_noinst =
component_install = mca_pstat_linux.la
else
component_noinst = libmca_pstat_linux.la
component_install =
endif
# DSO build: the library is listed for installation into $(pkglibdir)
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_pstat_linux_la_SOURCES = $(sources)
mca_pstat_linux_la_LDFLAGS = -module -avoid-version
# Static build: the library is built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_pstat_linux_la_SOURCES =$(sources)
libmca_pstat_linux_la_LDFLAGS = -module -avoid-version

39
opal/mca/pstat/linux/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,39 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_pstat_linux_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
# Runs action-if-can-compile when the host triplet matches one of the
# CPU families listed in the case statement and /proc/cpuinfo is
# readable on the machine running configure; runs
# action-if-cant-compile in every other situation.
# NOTE(review): the probe looks at /proc/cpuinfo, while the module
# itself reads the per-process stat and status files under /proc -
# confirm that this probe is the intended one.
AC_DEFUN([MCA_pstat_linux_CONFIG],[
case "${host}" in
i?86-*|x86_64*|ia64-*|powerpc-*|powerpc64-*|sparc*-*)
AS_IF([test -r "/proc/cpuinfo"],
[pstat_linux_happy="yes"],
[pstat_linux_happy="no"])
;;
*)
pstat_linux_happy="no"
;;
esac
AS_IF([test "$pstat_linux_happy" = "yes"],
[$1],
[$2])
])

28
opal/mca/pstat/linux/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,28 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files listed for this component (presumably generated by configure -
# TODO confirm against the MCA build system)
PARAM_CONFIG_FILES="Makefile"
#
# Set the config priority so that, if we can build,
# only this component will build
PARAM_CONFIG_PRIORITY=60

48
opal/mca/pstat/linux/pstat_linux.h Обычный файл
Просмотреть файл

@ -0,0 +1,48 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Process statistics (pstat) component for Linux systems.
*
*/
#ifndef MCA_PSTAT_LINUX_EXPORT_H
#define MCA_PSTAT_LINUX_EXPORT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/pstat/pstat.h"
BEGIN_C_DECLS
/**
* Globally exported variables: the component descriptor and the
* module (init/query/finalize function table) of the linux pstat
* component
*/
OPAL_DECLSPEC extern const opal_pstat_base_component_t mca_pstat_linux_component;
OPAL_DECLSPEC extern const opal_pstat_base_module_t opal_pstat_linux_module;
END_C_DECLS
#endif /* MCA_PSTAT_LINUX_EXPORT_H */

Просмотреть файл

@ -0,0 +1,82 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/pstat/pstat.h"
#include "pstat_linux.h"
/*
* Public string showing the pstat linux component version number
*/
const char *opal_pstat_linux_component_version_string =
"OPAL linux pstat MCA component version " OPAL_VERSION;
/*
* Local function: the component query entry point referenced from the
* component struct below and defined at the end of this file
*/
static int pstat_linux_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
const opal_pstat_base_component_t mca_pstat_linux_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a pstat v2.0.0 component */
OPAL_PSTAT_BASE_VERSION_2_0_0,
/* Component name and version */
"linux",
OPAL_MAJOR_VERSION,
OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION,
/* Component open and close functions (none are provided), then
the query function; the final slot is left unset */
NULL,
NULL,
pstat_linux_component_query,
NULL,
},
/* Next the MCA component meta data */
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Component query entry point: hands back the linux pstat module
 * together with the fixed priority (20) this component reports.
 * Always returns OPAL_SUCCESS.
 */
static int pstat_linux_component_query(mca_base_module_t **module, int *priority)
{
    /* the module struct is const; the MCA base works with the generic type */
    *module = (mca_base_module_t *) &opal_pstat_linux_module;
    *priority = 20;

    return OPAL_SUCCESS;
}

262
opal/mca/pstat/linux/pstat_linux_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,262 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
/* This component will only be compiled on Linux, where we are
guaranteed to have <unistd.h> and friends */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <asm/page.h> /* provides conversion of pages to memory bytes */
#include <sys/param.h> /* for HZ to convert jiffies to actual time */
#include "opal/mca/base/mca_base_param.h"
#include "opal/dss/dss_types.h"
#include "opal/util/printf.h"
#include "pstat_linux.h"
/*
* Local functions: the three module entry points wired into the
* module struct below
*/
static int linux_module_init(void);
static int query(pid_t pid, opal_pstats_t *stats);
static int linux_module_fini(void);
/*
* Linux pstat module
*/
const opal_pstat_base_module_t opal_pstat_linux_module = {
/* Initialization function */
linux_module_init,
/* Per-process query function */
query,
/* Finalization function */
linux_module_fini
};
/* Module initialization: nothing to set up, always reports success */
static int linux_module_init(void)
{
return OPAL_SUCCESS;
}
/* Module finalization: nothing to tear down, always reports success */
static int linux_module_fini(void)
{
return OPAL_SUCCESS;
}
/*
 * Advance to the start of the next whitespace-separated field.
 *
 * ptr     - position inside a NUL-terminated buffer, normally on (or
 *           inside) the current field
 * barrier - maximum number of characters this call may step over
 *
 * Returns a pointer to the first character of the following field,
 * or to the string terminator if there is no further field.  The
 * scan never moves past the terminator, and never moves more than
 * "barrier" characters.
 */
static char *next_field(char *ptr, int barrier)
{
    int i = 0;

    /* we are probably pointing to the last char
     * of the current field, so look for whitespace - but stop at
     * the end of the string so we cannot run off the buffer
     */
    while ('\0' != *ptr && !isspace((unsigned char)*ptr) && i < barrier) {
        ptr++;  /* step over the current char */
        i++;
    }

    /* now look for the next field; isspace() is false for the
     * terminator, so this loop also stops at the end of the string.
     * The cast keeps chars with the high bit set from being passed
     * to isspace() as negative values.
     */
    while (isspace((unsigned char)*ptr) && i < barrier) {
        ptr++;
        i++;
    }

    return ptr;
}
static int query(pid_t pid, opal_pstats_t *stats)
{
char data[4096];
int fd;
size_t numchars;
char *ptr, *eptr;
int i;
int len;
/* create the stat filename for this proc */
numchars = snprintf(data, sizeof(data), "/proc/%d/stat", pid);
if (numchars >= sizeof(data)) {
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
}
if (0 > (fd = open(data, O_RDONLY))) {
/* can't access this file - most likely, this means we
* aren't really on a supported system, or the proc no
* longer exists. Just return an error
*/
return OPAL_ERR_FILE_OPEN_FAILURE;
}
/* absorb all of the file's contents in one gulp - we'll process
* it once it is in memory for speed
*/
memset(data, 0, sizeof(data));
len = read(fd, data, sizeof(data)-1);
close(fd);
/* remove newline at end */
data[len] = '\0';
/* the stat file consists of a single line in a carefully formatted
* form. Parse it field by field as per proc(3) to get the ones we want
*/
/* we don't need to read the pid from the file - we already know it! */
stats->pid = pid;
/* the cmd is surrounded by parentheses - find the start */
if (NULL == (ptr = strchr(data, '('))) {
/* no cmd => something wrong with data, return error */
return OPAL_ERR_BAD_PARAM;
}
/* step over the paren */
ptr++;
/* find the ending paren */
if (NULL == (eptr = strchr(ptr, ')'))) {
/* no end to cmd => something wrong with data, return error */
return OPAL_ERR_BAD_PARAM;
}
/* save the cmd name, up to the limit of the array */
i = 0;
while (ptr < eptr && i < OPAL_PSTAT_MAX_STRING_LEN) {
stats->cmd[i++] = *ptr++;
}
/* move to the next field in the data */
ptr = next_field(eptr, len);
/* next is the process state - a single character */
stats->state = *ptr;
/* move to next field */
ptr = next_field(ptr, len);
/* skip fields until we get to the times */
ptr = next_field(ptr, len); /* ppid */
ptr = next_field(ptr, len); /* pgrp */
ptr = next_field(ptr, len); /* session */
ptr = next_field(ptr, len); /* tty_nr */
ptr = next_field(ptr, len); /* tpgid */
ptr = next_field(ptr, len); /* flags */
ptr = next_field(ptr, len); /* minflt */
ptr = next_field(ptr, len); /* cminflt */
ptr = next_field(ptr, len); /* majflt */
ptr = next_field(ptr, len); /* cmajflt */
/* grab the process time usage fields */
stats->time = strtoul(ptr, &ptr, 10); /* utime */
stats->time += strtoul(ptr, &ptr, 10); /* add the stime */
stats->time = stats->time / HZ; /* convert to time */
/* move to next field */
ptr = next_field(ptr, len);
/* skip fields until we get to priority */
ptr = next_field(ptr, len); /* cutime */
ptr = next_field(ptr, len); /* cstime */
/* save the priority */
stats->priority = strtol(ptr, &ptr, 10);
/* that's all we care about from this data - ignore the rest */
/* now create the status filename for this proc */
memset(data, 0, sizeof(data));
numchars = snprintf(data, sizeof(data), "/proc/%d/status", pid);
if (numchars >= sizeof(data)) {
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
}
if (0 > (fd = open(data, O_RDONLY))) {
/* can't access this file - most likely, this means we
* aren't really on a supported system, or the proc no
* longer exists. Just return an error
*/
return OPAL_ERR_FILE_OPEN_FAILURE;
}
/* absorb all of the file's contents in one gulp - we'll process
* it once it is in memory for speed
*/
memset(data, 0, sizeof(data));
len = read(fd, data, sizeof(data)-1);
close(fd);
/* remove newline at end */
data[len] = '\0';
/* parse it according to proc(3) */
eptr = data;
/* look for VmPeak */
if (NULL != (ptr = strstr(data, "VmPeak:"))) {
/* found it - step past colon */
ptr += 8;
eptr = strchr(ptr, 'k');
*eptr = '\0';
stats->peak_vsize = strtoul(ptr, NULL, 10); /* already in kB */
eptr++;
}
/* look for VmSize */
if (NULL != (ptr = strstr(eptr, "VmSize:"))) {
/* found it - step past colon */
ptr += 8;
eptr = strchr(ptr, 'k');
*eptr = '\0';
stats->vsize = strtoul(ptr, NULL, 10); /* already in kB */
eptr++;
}
/* look for RSS */
if (NULL != (ptr = strstr(eptr, "VmRSS:"))) {
/* found it - step past colon */
ptr += 8;
eptr = strchr(ptr, 'k');
*eptr = '\0';
stats->rss = strtoul(ptr, NULL, 10); /* already in kB */
eptr++;
}
/* look for Libraries */
if (NULL != (ptr = strstr(eptr, "VmLib:"))) {
/* found it - step past colon */
ptr += 8;
eptr = strchr(ptr, 'k');
*eptr = '\0';
stats->shared_size = strtoul(ptr, NULL, 10); /* already in kB */
eptr++;
}
/* look for threads */
if (NULL != (ptr = strstr(eptr, "Threads:"))) {
/* found it - step past colon */
ptr += 8;
stats->num_threads = strtoul(ptr, NULL, 10);
}
return OPAL_SUCCESS;
}

95
opal/mca/pstat/pstat.h Обычный файл
Просмотреть файл

@ -0,0 +1,95 @@
/*
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* pstat (process statistics) framework component interface.
*
* Intent
*
* This is a very thin framework to abstract the platform-specific
* means of obtaining statistics about a running process, such as
* reading the per-process files under /proc on Linux or calling
* sysctl on Mac OS X.
*
* A module provides an init function, a query function that fills
* in an opal_pstats_t for a given pid, and a finalize function.
*/
#ifndef OPAL_MCA_PSTAT_H
#define OPAL_MCA_PSTAT_H
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/dss/dss_types.h"
/**
* Module initialization function. Should return OPAL_SUCCESS.
*/
typedef int (*opal_pstat_base_module_init_fn_t)(void);
/**
* Module query function. Takes the pid of the process of interest and
* the opal_pstats_t the module is to fill in; returns an int status
* code.
*/
typedef int (*opal_pstat_base_module_query_fn_t)(pid_t pid, opal_pstats_t *stats);
/**
* Module finalization function. Returns an int status code.
*/
typedef int (*opal_pstat_base_module_fini_fn_t)(void);
/**
* Structure for pstat components.
*/
struct opal_pstat_base_component_2_0_0_t {
/** MCA base component */
mca_base_component_t base_version;
/** MCA base data */
mca_base_component_data_t base_data;
};
/**
* Convenience typedef
*/
typedef struct opal_pstat_base_component_2_0_0_t opal_pstat_base_component_2_0_0_t;
typedef struct opal_pstat_base_component_2_0_0_t opal_pstat_base_component_t;
/**
* Structure for pstat modules: the table of functions a module
* exposes
*/
struct opal_pstat_base_module_1_0_0_t {
/** module initialization */
opal_pstat_base_module_init_fn_t init;
/** obtain the stats of one process */
opal_pstat_base_module_query_fn_t query;
/** module finalization */
opal_pstat_base_module_fini_fn_t finalize;
};
/**
* Convenience typedef
*/
typedef struct opal_pstat_base_module_1_0_0_t opal_pstat_base_module_1_0_0_t;
typedef struct opal_pstat_base_module_1_0_0_t opal_pstat_base_module_t;
/**
* Macro for use in components that are of type pstat: expands to the
* MCA base version followed by the framework name and the framework
* version (2.0.0)
*/
#define OPAL_PSTAT_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"pstat", 2, 0, 0
/* Global structure for accessing pstat functions */
OPAL_DECLSPEC extern opal_pstat_base_module_t opal_pstat;
#endif /* OPAL_MCA_PSTAT_H */

Просмотреть файл

@ -32,6 +32,7 @@
#include "orte/util/show_help.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/mca/pstat/base/base.h"
#include "orte/mca/rml/base/base.h"
#include "orte/mca/routed/base/base.h"
@ -41,7 +42,6 @@
#include "orte/mca/plm/base/base.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#if OPAL_ENABLE_FT == 1
#include "orte/mca/snapc/base/base.h"
#endif
@ -66,6 +66,20 @@ int orte_ess_base_orted_setup(void)
char *error = NULL;
char *plm_to_use;
/* open and setup the opal_pstat framework so we can provide
 * process stats if requested. On failure, "error" names the call
 * that failed before jumping to the error exit.
 */
if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) {
    ORTE_ERROR_LOG(ret);
    error = "opal_pstat_base_open";
    goto error;
}
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
    ORTE_ERROR_LOG(ret);
    /* report the function that actually failed - it lives in opal */
    error = "opal_pstat_base_select";
    goto error;
}
/* some environments allow remote launches - e.g., ssh - so
* open the PLM and select something -only- if we are given
* a specific module to use
@ -290,7 +304,6 @@ int orte_ess_base_orted_finalize(void)
if (plm_in_use) {
orte_plm_base_close();
}
orte_errmgr_base_close();
/* now can close the rml and its friendly group comm */
orte_grpcomm_base_close();

Просмотреть файл

@ -36,6 +36,7 @@
#include "opal/util/os_path.h"
#include "opal/util/malloc.h"
#include "opal/util/basename.h"
#include "opal/mca/pstat/base/base.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/base/base.h"
@ -116,6 +117,20 @@ static int rte_init(char flags)
goto error;
}
/* open and setup the opal_pstat framework so we can provide
 * process stats if requested. On failure, "error" names the call
 * that failed before jumping to the error exit.
 */
if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) {
    ORTE_ERROR_LOG(ret);
    error = "opal_pstat_base_open";
    goto error;
}
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
    ORTE_ERROR_LOG(ret);
    /* report the function that actually failed - it lives in opal */
    error = "opal_pstat_base_select";
    goto error;
}
/* Since we are the HNP, then responsibility for
* defining the name falls to the PLM component for our
* respective environment - hence, we have to open the PLM

Просмотреть файл

@ -32,6 +32,9 @@
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#include <signal.h>
@ -42,6 +45,7 @@
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
@ -2680,3 +2684,60 @@ CLEANUP:
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
return rc;
}
/*
 * Collect process stats for the local children matching "proc" and
 * append them to "answer".
 *
 * A child matches when its jobid equals proc->jobid and either its
 * vpid equals proc->vpid or proc->vpid is ORTE_VPID_WILDCARD.  For
 * each match, two items are packed into "answer": the requested name
 * ("proc", exactly as passed in) and an opal_pstats_t holding the node
 * name (up to the first '.'), the child's vpid as rank, and whatever
 * opal_pstat.query reports for the child's pid.
 *
 * Returns ORTE_SUCCESS once the whole child list has been examined
 * (including when nothing matched), or the first error returned by
 * opal_pstat.query or opal_dss.pack.
 */
int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
                                  orte_process_name_t *proc)
{
    int rc;
    orte_odls_child_t *child;
    opal_list_item_t *item;
    opal_pstats_t stats, *statsptr;
    int j;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                         "%s odls:get_proc_stats for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* find this child */
    for (item = opal_list_get_first(&orte_odls_globals.children);
         item != opal_list_get_end(&orte_odls_globals.children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        /* skip children that were not asked about */
        if (proc->jobid != child->name->jobid ||
            (proc->vpid != child->name->vpid &&
             ORTE_VPID_WILDCARD != proc->vpid)) {
            continue;
        }

        /* found it */
        OBJ_CONSTRUCT(&stats, opal_pstats_t);

        /* record node up to first '.' - walk to the terminator rather
         * than recomputing strlen on every pass, and terminate the
         * copy explicitly
         */
        for (j=0; j < OPAL_PSTAT_MAX_STRING_LEN-1 &&
                  '\0' != orte_process_info.nodename[j] &&
                  '.' != orte_process_info.nodename[j]; j++) {
            stats.node[j] = orte_process_info.nodename[j];
        }
        stats.node[j] = '\0';

        /* record rank */
        stats.rank = child->name->vpid;

        /* get stats */
        rc = opal_pstat.query(child->pid, &stats);
        if (ORTE_SUCCESS != rc) {
            OBJ_DESTRUCT(&stats);
            return rc;
        }

        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, proc, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&stats);
            return rc;
        }

        /* the pack routine is handed the address of a pointer to the
         * stats object
         */
        statsptr = &stats;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &statsptr, 1, OPAL_PSTAT))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&stats);
            return rc;
        }

        OBJ_DESTRUCT(&stats);
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -183,10 +183,15 @@ ORTE_DECLSPEC int orte_odls_base_preload_files_app_context(orte_app_context_t* c
ORTE_DECLSPEC int orte_odls_base_default_collect_data(orte_process_name_t *proc, opal_buffer_t *buf);
/*
* Retrive the daemon map
* Retrieve the daemon map
*/
ORTE_DECLSPEC opal_pointer_array_t* orte_odls_base_get_daemon_map(void);
/*
* Obtain process stats on a child proc
*/
ORTE_DECLSPEC int orte_odls_base_get_proc_stats(opal_buffer_t *answer, orte_process_name_t *proc);
END_C_DECLS
#endif

Просмотреть файл

@ -50,20 +50,19 @@ typedef uint8_t orte_daemon_cmd_flag_t;
#define ORTE_DAEMON_REPORT_JOB_INFO_CMD (orte_daemon_cmd_flag_t) 14
#define ORTE_DAEMON_REPORT_NODE_INFO_CMD (orte_daemon_cmd_flag_t) 15
#define ORTE_DAEMON_REPORT_PROC_INFO_CMD (orte_daemon_cmd_flag_t) 16
#define ORTE_DAEMON_ATTACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 17
#define ORTE_DAEMON_ATTACH_STDERR_CMD (orte_daemon_cmd_flag_t) 18
#define ORTE_DAEMON_DETACH_STDOUT_CMD (orte_daemon_cmd_flag_t) 19
#define ORTE_DAEMON_DETACH_STDERR_CMD (orte_daemon_cmd_flag_t) 20
#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 21
#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 22
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 23
#define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 17
#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 18
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 19
/* collective-based cmds */
#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 24
#define ORTE_DAEMON_COLL_CMD (orte_daemon_cmd_flag_t) 20
/* proc termination sync cmds */
#define ORTE_DAEMON_WAITPID_FIRED (orte_daemon_cmd_flag_t) 25
#define ORTE_DAEMON_IOF_COMPLETE (orte_daemon_cmd_flag_t) 26
#define ORTE_DAEMON_WAITPID_FIRED (orte_daemon_cmd_flag_t) 21
#define ORTE_DAEMON_IOF_COMPLETE (orte_daemon_cmd_flag_t) 22
/* request proc resource usage */
#define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 23
END_C_DECLS

Просмотреть файл

@ -54,9 +54,8 @@
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/dss/dss.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
@ -71,6 +70,7 @@
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/base/odls_private.h"
@ -439,8 +439,11 @@ static int process_commands(orte_process_name_t* sender,
opal_buffer_t *answer;
orte_rml_cmd_flag_t rml_cmd;
orte_job_t *jdata;
orte_process_name_t proc;
orte_process_name_t proc, proc2;
int32_t status;
orte_process_name_t *return_addr;
int32_t num_replies;
bool hnp_accounted_for;
/* unpack the command */
n = 1;
@ -1091,103 +1094,6 @@ SEND_ANSWER:
}
break;
/**** ATTACH_STDIO COMMAND ****/
case ORTE_DAEMON_ATTACH_STDOUT_CMD:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received attach stdio cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
#if 0
/* if we are not the HNP, we can do nothing - report
* back error so the tool won't hang
*/
if (!orte_process_info.hnp) {
int status=ORTE_ERR_NOT_SUPPORTED;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &status, 1, OPAL_INT))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
goto CLEANUP;
}
/* callback function will release buffer */
if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0,
send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
}
} else {
/* if we are the HNP, process the request */
int fd, status;
orte_vpid_t vpid;
orte_process_name_t source;
/* setup the answer */
answer = OBJ_NEW(opal_buffer_t);
/* unpack the jobid */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* unpack the vpid */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
goto PACK_ANSWER;
}
/* unpack the file descriptor */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &fd, &n, OPAL_INT))) {
ORTE_ERROR_LOG(ret);
goto PACK_ANSWER;
}
/* tell the iof to attach it */
status = orte_iof.pull(
/* if they asked for a specific proc, then just get that info */
if (ORTE_VPID_WILDCARD != vpid) {
/* find this proc */
procs = (orte_proc_t**)jdata->procs->addr;
for (i=0; i < jdata->procs->size; i++) {
if (NULL == procs[i]) break; /* stop when we get past the end of data */
if (vpid == procs[i]->name.vpid) {
procs = &procs[i];
num_procs = 1;
break;
}
}
} else {
procs = (orte_proc_t**)jdata->procs->addr;
num_procs = jdata->num_procs;
}
PACK_ANSWER:
/* pack number of procs */
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
goto SEND_ANSWER;
}
if (0 < num_procs) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, procs, jdata->num_procs, ORTE_PROC))) {
ORTE_ERROR_LOG(ret);
goto SEND_ANSWER;
}
}
SEND_ANSWER:
/* callback function will release buffer */
if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0,
send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
}
}
#endif
break;
/**** HEARTBEAT COMMAND ****/
case ORTE_DAEMON_HEARTBEAT_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
@ -1219,6 +1125,160 @@ SEND_ANSWER:
}
break;
/**** TOP COMMAND ****/
case ORTE_DAEMON_TOP_CMD:
/* setup the answer */
answer = OBJ_NEW(opal_buffer_t);
num_replies = 0;
hnp_accounted_for = false;
n = 1;
while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
/* the jobid provided will, of course, have the job family of
* the requestor. We need to convert that to our own job family
*/
proc.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, proc.jobid);
if (orte_process_info.hnp) {
return_addr = sender;
/* if the request is for a wildcard vpid, then it goes to every
* daemon. For scalability, we should probably xcast this some
* day - but for now, we just loop
*/
if (ORTE_VPID_WILDCARD == proc.vpid) {
/* loop across all daemons */
proc2.jobid = ORTE_PROC_MY_NAME->jobid;
for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) {
/* setup the cmd */
relay_msg = OBJ_NEW(opal_buffer_t);
command = ORTE_DAEMON_TOP_CMD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
/* the callback function will release relay_msg buffer */
if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON, 0,
send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(relay_msg);
ret = ORTE_ERR_COMM_FAILURE;
}
num_replies++;
}
/* account for our own reply */
if (!hnp_accounted_for) {
hnp_accounted_for = true;
num_replies++;
}
/* now get the data for my own procs */
goto GET_TOP;
} else {
/* this is for a single proc - see which daemon
* this rank is on
*/
if (ORTE_VPID_INVALID == (proc2.vpid = orte_ess.proc_get_daemon(&proc))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto SEND_TOP_ANSWER;
}
/* if the vpid is me, then just handle this myself */
if (proc2.vpid == ORTE_PROC_MY_NAME->vpid) {
if (!hnp_accounted_for) {
hnp_accounted_for = true;
num_replies++;
}
goto GET_TOP;
}
/* otherwise, forward the cmd on to the appropriate daemon */
relay_msg = OBJ_NEW(opal_buffer_t);
command = ORTE_DAEMON_TOP_CMD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
proc2.jobid = ORTE_PROC_MY_NAME->jobid;
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay_msg);
goto SEND_TOP_ANSWER;
}
/* the callback function will release relay_msg buffer */
if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON, 0,
send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(relay_msg);
ret = ORTE_ERR_COMM_FAILURE;
}
}
/* end if HNP */
} else {
/* this came from the HNP, but needs to go back to the original
* requestor. Unpack the name of that entity first
*/
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc2, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto SEND_TOP_ANSWER;
}
return_addr = &proc2;
GET_TOP:
/* this rank must be local to me, or the HNP wouldn't
* have sent it to me - process the request
*/
if (ORTE_SUCCESS != (ret = orte_odls_base_get_proc_stats(answer, &proc))) {
ORTE_ERROR_LOG(ret);
goto SEND_TOP_ANSWER;
}
}
}
SEND_TOP_ANSWER:
/* send the answer back to requester - callback
* function will release buffer
*/
if (orte_process_info.hnp) {
/* if I am the HNP, I need to also provide the number of
* replies the caller should recv and the sample time
*/
time_t mytime;
char *cptr;
relay_msg = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &num_replies, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
}
time(&mytime);
cptr = ctime(&mytime);
cptr[strlen(cptr)-1] = '\0'; /* remove trailing newline */
if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &cptr, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
}
/* copy the stats payload */
opal_dss.copy_payload(relay_msg, answer);
OBJ_RELEASE(answer);
answer = relay_msg;
}
if (0 > orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL, 0,
send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
ret = ORTE_ERR_COMM_FAILURE;
}
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
ret = ORTE_ERR_BAD_PARAM;

Просмотреть файл

@ -29,7 +29,8 @@ SUBDIRS += \
tools/orte-restart \
tools/orted \
tools/orterun \
tools/wrappers
tools/wrappers \
tools/orte-top
DIST_SUBDIRS += \
tools/orte-checkpoint \
@ -39,4 +40,6 @@ DIST_SUBDIRS += \
tools/orte-restart \
tools/orted \
tools/orterun \
tools/wrappers
tools/wrappers \
tools/orte-top

Просмотреть файл

@ -20,7 +20,7 @@
include $(top_srcdir)/Makefile.man-page-rules
man_pages = orte-checkpoint.1 ompi-checkpoint.1
man_pages = orte-checkpoint.1
EXTRA_DIST = orte-checkpoint.1in
if !ORTE_DISABLE_FULL_SUPPORT
@ -38,12 +38,6 @@ $(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_pkgdata_DATA = help-orte-checkpoint.txt
install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT))
uninstall-local:
rm -f $(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT)
endif # OMPI_INSTALL_BINARIES
orte_checkpoint_SOURCES = orte-checkpoint.c
@ -52,8 +46,5 @@ orte_checkpoint_LDADD = $(top_builddir)/orte/libopen-rte.la
endif # WANT_FT
endif # !ORTE_DISABLE_FULL_SUPPORT
ompi-checkpoint.1: $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1
cp -f $(top_builddir)/orte/tools/orte-checkpoint/orte-checkpoint.1 ompi-checkpoint.1
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -20,7 +20,7 @@
include $(top_srcdir)/Makefile.man-page-rules
man_pages = orte-restart.1 ompi-restart.1
man_pages = orte-restart.1
EXTRA_DIST = orte-restart.1in
if !ORTE_DISABLE_FULL_SUPPORT
@ -38,12 +38,6 @@ $(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_pkgdata_DATA = help-orte-restart.txt
install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT))
uninstall-local:
rm -f $(DESTDIR)$(bindir)/ompi-restart$(EXEEXT)
endif # OMPI_INSTALL_BINARIES
orte_restart_SOURCES = orte-restart.c
@ -52,8 +46,5 @@ orte_restart_LDADD = $(top_builddir)/orte/libopen-rte.la
endif # WANT_FT
endif # ORTE_DISABLE_FULL_SUPPORT
ompi-restart.1: $(top_builddir)/orte/tools/orte-restart/orte-restart.1
cp -f $(top_builddir)/orte/tools/orte-restart/orte-restart.1 ompi-restart.1
distclean-local:
rm -f $(man_pages)

48
orte/tools/orte-top/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,48 @@
#
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
include $(top_srcdir)/Makefile.man-page-rules
man_pages = orte-top.1
EXTRA_DIST = orte-top.1in
# Everything between here and the matching endif applies only when full
# ORTE support has not been disabled at configure time.
if !ORTE_DISABLE_FULL_SUPPORT
# The program, its man page and its help file are only declared for
# installation when binaries are being installed.
if OMPI_INSTALL_BINARIES
bin_PROGRAMS = orte-top
nodist_man_MANS = $(man_pages)
# Ensure that the man pages are rebuilt if the opal_config.h file
# changes; a "good enough" way to know if configure was run again (and
# therefore the release date or version may have changed)
$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h
dist_pkgdata_DATA = help-orte-top.txt
endif # OMPI_INSTALL_BINARIES
# Sources and link dependencies of the orte-top program
orte_top_SOURCES = orte-top.c
orte_top_LDADD = $(top_builddir)/orte/libopen-rte.la
endif # !ORTE_DISABLE_FULL_SUPPORT
distclean-local:
rm -f $(man_pages)

37
orte/tools/orte-top/help-orte-top.txt Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI's orte-top tool.
#
[orte-top:usage]
Return statistics on specified process ranks
Usage: %s [OPTIONS]
%s
#
[orte-top:pid-not-found]
We could not find an mpirun matching the provided pid on this machine.
Pid provided: %d
#
[orte-top:pid-required]
This tool requires that you specify the pid of the mpirun executing
the specified rank(s). Please use the --help option for more information.

93
orte/tools/orte-top/orte-top.1in Обычный файл
Просмотреть файл

@ -0,0 +1,93 @@
.\"
.\" Copyright (c) 2007 Los Alamos National Security, LLC
.\" All rights reserved.
.\" Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
.\"
.\" Man page for OMPI's ompi-top command
.\"
.\" .TH name section center-footer left-footer center-header
.TH OMPI-TOP 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#"
.\" **************************
.\" Name Section
.\" **************************
.SH NAME
.
ompi-top \- Diagnostic to provide process info similar to the popular "top" program.
.
.PP
.
.\" **************************
.\" Synopsis Section
.\" **************************
.SH SYNOPSIS
.
.BR ompi-top " [ options ]"
.
.\" **************************
.\" Options Section
.\" **************************
.SH Options
.
\fIompi-top\fR collects and displays process information in a manner similar
to that of the popular "top" program.
.
.TP 10
.B -h | --help
Display help for this command
.
.
.TP
.B -pid | --pid \fR<value>\fP
The pid of the mpirun whose processes you want information about. Note that
the ompi-top command must be executed on the same node as mpirun.
.
.
.TP
.B -rank | --rank \fR<value>\fP
The rank of the processes to be monitored. This can consist of a single rank, or
a comma-separated list of ranks. These can include rank ranges separated by a '-'.
If this option is not provided, or a value of -1 is given, ompi-top will default
to displaying information on all ranks.
.
.
.TP
.B -bynode | --bynode
Display the results grouped by node, with each node's processes reported in rank
order. If this option is not provided, ompi-top will default to displaying all
results in rank order.
.
.
.TP
.B -update-rate | --update-rate \fR<value>\fP
The time (in seconds) between updates of the displayed information. If this option
is not provided, ompi-top will default to executing only once.
.
.
.TP
.B -timestamp | --timestamp
Provide an approximate time when each sample was taken. This time is approximate as it
only shows the time when the sample command was issued.
.
.
.TP
.B -log-file | --log-file \fR<value>\fP
Log the results to the specified file instead of displaying them to stdout.
.
.
.\" **************************
.\" Description Section
.\" **************************
.SH DESCRIPTION
.
.PP
\fIompi-top\fR collects and displays process information in a manner similar
to that of the popular "top" program. It doesn't do the fancy screen display, but
does allow you to monitor available process information (to the limits of the underlying
operating system) of processes irrespective of their location.
.
.\" **************************
.\" See Also Section
.\" **************************
.
.SH SEE ALSO
.

936
orte/tools/orte-top/orte-top.c Обычный файл
Просмотреть файл

@ -0,0 +1,936 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include "opal/util/cmd_line.h"
#include "opal/util/argv.h"
#include "opal/dss/dss.h"
#include "opal/mca/base/base.h"
#include "opal/util/opal_environ.h"
#include "opal/runtime/opal.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hnp_contact.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_wait.h"
/*
 * Local variables & functions
 */
/* callback used for the exit trigger and for SIGTERM/SIGINT: cleans up,
 * finalizes and exits the process
 */
static void abort_exit_callback(int fd, short flags, void *arg);
/* signal events registered in main() for SIGTERM and SIGINT */
static struct opal_event term_handler;
static struct opal_event int_handler;
/* list filled by orte_list_local_hnps() in main() */
static opal_list_t hnp_list;
/* reply bookkeeping for one query cycle: send_cmd() resets these, and
 * process_stats() sets all_recvd once num_recvd reaches num_replies
 * (num_replies starts at INT_MAX until the HNP's answer provides it)
 */
static bool all_recvd;
static int32_t num_replies;
static int32_t num_recvd;
/* the packed command that send_cmd() sends to the target HNP */
static opal_buffer_t cmdbuf;
/* event handle filled in by orte_wait_event() in main() */
static opal_event_t *my_exit_event;
/* output stream: stdout, or the log file when one was requested */
static FILE *fp = NULL;
/* targets of the command line options declared in cmd_line_opts */
static bool help;
static char *hnppidstr;
static char *ranks;
/* entry of hnp_list whose pid matched the requested one */
static orte_hnp_contact_t *target_hnp;
static int update_rate;
static bool timestamp;
static char *logfile;
static bool bynode;
/* opal_pstats_t objects unpacked by process_stats(), consumed by pretty_print() */
static opal_list_t recvd_stats;
/* sample time string unpacked from the HNP's answer; freed by pretty_print() */
static char *sample_time;
/* header handling: pretty_print() prints the header when need_header is set
 * and sets it again once more than MAX_LINES rows have been printed
 */
static bool need_header = true;
static int num_lines=0;
/* set by send_cmd() after the first cycle; process_stats() only measures
 * field widths while this is false
 */
static bool fields_set = false;
/* widest value seen so far for each column, measured in process_stats() */
static int nodefield = 0;
static int rankfield = 0;
static int pidfield = 0;
static int cmdfield = 0;
/* not measured; used by print_headers() as the data width of the Time column */
static int timefield = 6;
static int prifield = 0;
static int thrfield = 0;
static int vsizefield = 0;
static int rssfield = 0;
static int pkvfield = 0;
static int shfield = 0;
static int pfield = 0;
/* flag what fields were actually found */
static bool pri_found = false;
static bool thr_found = false;
static bool vsize_found = false;
static bool rss_found = false;
static bool pkv_found = false;
static bool sh_found = false;
static bool p_found = false;
/* number of printed rows after which the header is repeated */
#define MAX_LINES 20
/*
 * Command line options understood by the tool. Each entry names its
 * short/long option, the number of parameters it takes, the variable
 * that receives the parsed value and the text shown in the usage message.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
/* -h | --help */
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
/* -pid | --pid <value> */
{ NULL, NULL, NULL,
'\0', "pid", "pid",
1,
&hnppidstr, OPAL_CMD_LINE_TYPE_STRING,
"The pid of the mpirun that you wish to query/monitor" },
/* -rank | --rank <value> */
{ NULL, NULL, NULL,
'\0', "rank", "rank",
1,
&ranks, OPAL_CMD_LINE_TYPE_STRING,
"Rank whose resource usage is to be displayed/monitored" },
/* -update-rate | --update-rate <value> */
{ NULL, NULL, NULL,
'\0', "update-rate", "update-rate",
1,
&update_rate, OPAL_CMD_LINE_TYPE_INT,
"Number of seconds between updates" },
/* -timestamp | --timestamp */
{ NULL, NULL, NULL,
'\0', "timestamp", "timestamp",
0,
&timestamp, OPAL_CMD_LINE_TYPE_BOOL,
"Time stamp each sample" },
/* -log-file | --log-file <value> */
{ NULL, NULL, NULL,
'\0', "log-file", "log-file",
1,
&logfile, OPAL_CMD_LINE_TYPE_STRING,
"Output file for returned statistics" },
/* -bynode | --bynode */
{ NULL, NULL, NULL,
'\0', "bynode", "bynode",
0,
&bynode, OPAL_CMD_LINE_TYPE_BOOL,
"Group statistics by node, sorted by rank within each node" },
/* End of list */
{ NULL, NULL, NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/* RML receive callback for answers arriving on ORTE_RML_TAG_TOOL */
static void recv_stats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
/* print everything collected in recvd_stats for the current cycle */
static void pretty_print(void);
/* compute the column widths and print the table header */
static void print_headers(void);
/*
 * Run one query cycle: send the prepared command buffer to the target
 * HNP, wait until all expected answers have been processed, print them,
 * and then either schedule the next cycle or trigger termination.
 *
 * The signature matches an event callback so that the function can be
 * re-armed through ORTE_TIMER_EVENT; none of the arguments are used.
 */
static void send_cmd(int fd, short dummy, void *arg)
{
    int ret;

    /* reset the reply bookkeeping - the real number of replies is only
     * known once the HNP's answer has been unpacked
     */
    all_recvd = false;
    num_replies = INT_MAX;
    num_recvd = 0;

    if (0 > (ret = orte_rml.send_buffer(&(target_hnp->name), &cmdbuf, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(ret);
        /* fire the trigger that main() constructed and is waiting on;
         * orteds_exit is never set up by this tool, so triggering it
         * would not terminate us
         */
        orte_trigger_event(&orte_exit);
        return;
    }

    ORTE_PROGRESSED_WAIT(all_recvd, 0, 1);

    /* flag that field sizes are set */
    fields_set = true;

    /* pretty-print what we got */
    pretty_print();

    /* see if we want to do it again */
    if (0 < update_rate) {
        ORTE_TIMER_EVENT(update_rate, 0, send_cmd);
    } else {
        orte_trigger_event(&orte_exit);
    }
}
int
main(int argc, char *argv[])
{
int ret;
opal_cmd_line_t cmd_line;
opal_list_item_t* item = NULL;
orte_daemon_cmd_flag_t command;
pid_t hnppid;
orte_process_name_t proc;
char **r1=NULL, **r2;
int i;
orte_vpid_t vstart, vend;
int vint;
/***************
* Initialize
***************/
/*
* Make sure to init util before parse_args
* to ensure installdirs is setup properly
* before calling mca_base_open();
*/
if( ORTE_SUCCESS != (ret = opal_init_util()) ) {
return ret;
}
/* initialize the globals */
help = false;
hnppidstr = NULL;
ranks = NULL;
target_hnp = NULL;
update_rate = -1;
timestamp = false;
logfile = NULL;
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
/**
* Now start parsing our specific arguments
*/
if (OPAL_SUCCESS != ret || help) {
char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-top.txt", "orte-top:usage", true, args);
free(args);
return ORTE_ERROR;
}
/*
* Must specify the mpirun pid
*/
if (NULL == hnppidstr) {
orte_show_help("help-orte-top.txt", "orte-top:pid-required", true);
return ORTE_ERROR;
}
/* convert the pid */
hnppid = strtoul(hnppidstr, NULL, 10);
/* if an output file was specified, open it */
if (NULL != logfile) {
fp = fopen(logfile, "w");
if (NULL == fp) {
orte_show_help("help-orte-top.txt", "orte-top:cant-open-logfile", true, logfile);
return ORTE_ERROR;
}
} else {
fp = stdout;
}
/***************************
* We need all of OPAL and the TOOL portion of ORTE
***************************/
if (ORTE_SUCCESS != orte_init(ORTE_TOOL)) {
orte_finalize();
return 1;
}
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
if (ORTE_SUCCESS != orte_wait_event(&my_exit_event, &orte_exit, "job_complete", abort_exit_callback)) {
orte_finalize();
return 1;
}
/** setup callbacks for abort signals - from this point
* forward, we need to abort in a manner that allows us
* to cleanup
*/
opal_signal_set(&term_handler, SIGTERM,
abort_exit_callback, &term_handler);
opal_signal_add(&term_handler, NULL);
opal_signal_set(&int_handler, SIGINT,
abort_exit_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
/*
* Get the list of available hnp's and setup contact info
* to them in the RML
*/
OBJ_CONSTRUCT(&hnp_list, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_list_local_hnps(&hnp_list, true) ) ) {
goto cleanup;
}
/*
* For each hnp in the listing
*/
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
orte_hnp_contact_t *hnp = (orte_hnp_contact_t*)item;
if (hnppid == hnp->pid) {
/* this is the one we want */
target_hnp = hnp;
break;
}
OBJ_RELEASE(hnp);
}
/* if we get here without finding the one we wanted, then abort */
if (NULL == target_hnp) {
orte_show_help("help-orte-top.txt", "orte-top:pid-not-found", true, hnppid);
goto cleanup;
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(&target_hnp->name);
/* setup a non-blocking recv to get answers - we don't know how
* many daemons are going to send replies, so we just have to
* accept whatever comes back
*/
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
if (ret != ORTE_SUCCESS) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
/* setup the command to get the resource usage */
OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
command = ORTE_DAEMON_TOP_CMD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
proc.jobid = ORTE_PROC_MY_NAME->jobid+1; /* only support initial launch at this time */
/* parse the rank list - this can be a comma-separated list of ranks,
* each element being either a single rank or a range. We also allow
* for a -1 to indicate all ranks. If not rank is given, we assume -1
*/
if (NULL == ranks) {
/* take all ranks */
proc.vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
goto SEND;
}
/* split on commas */
r1 = opal_argv_split(ranks, ',');
/* for each resulting element, check for range */
for (i=0; i < opal_argv_count(r1); i++) {
r2 = opal_argv_split(r1[i], '-');
if (1 < opal_argv_count(r2)) {
/* given range - get start and end */
vstart = strtol(r2[0], NULL, 10);
vend = strtol(r2[1], NULL, 10);
} else {
/* check for wildcard - have to do this here because
* the -1 would have been caught in the split
*/
vint = strtol(r1[i], NULL, 10);
if (-1 == vint) {
proc.vpid = ORTE_VPID_WILDCARD;
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
opal_argv_free(r2);
goto SEND;
}
vstart = strtol(r2[0], NULL, 10);
vend = vstart + 1;
}
for (proc.vpid = vstart; proc.vpid < vend; proc.vpid++) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;
}
}
opal_argv_free(r2);
}
SEND:
if (NULL != r1) {
opal_argv_free(r1);
}
send_cmd(0, 0, NULL);
/* now wait until the termination event fires */
opal_event_dispatch();
/***************
* Cleanup
***************/
cleanup:
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&recvd_stats);
OBJ_DESTRUCT(&cmdbuf);
if (NULL != fp && fp != stdout) {
fclose(fp);
}
orte_finalize();
return ret;
}
static void abort_exit_callback(int fd, short ign, void *arg)
{
opal_list_item_t *item;
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
while (NULL != (item = opal_list_remove_first(&hnp_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&hnp_list);
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&recvd_stats);
OBJ_DESTRUCT(&cmdbuf);
if (NULL != fp && fp != stdout) {
fclose(fp);
}
orte_finalize();
exit(1);
}
/* Number of characters needed to print a signed value with "%d". */
static int stat_int_width(int value)
{
    char scratch[32];
    return snprintf(scratch, sizeof(scratch), "%d", value);
}

/* Number of characters needed to print an unsigned value with "%lu". */
static int stat_ulong_width(unsigned long value)
{
    char scratch[32];
    return snprintf(scratch, sizeof(scratch), "%lu", value);
}

/* Grow *field to width if width is larger than the current value. */
static void stat_widen(int *field, int width)
{
    if (*field < width) {
        *field = width;
    }
}

/*
 * Event callback that processes one answer message.
 *
 * data is the orte_message_event_t created by recv_stats(). A message
 * from vpid 0 (the HNP we contacted) starts with the number of replies
 * to expect and the sample time. Every message then carries zero or more
 * (process name, opal_pstats_t) pairs, which are appended to recvd_stats.
 * Until fields_set is true, the column widths and the "found" flags are
 * updated from each received entry.
 *
 * The message event is released, the reply counter is advanced, and the
 * non-persistent receive is reposted before returning.
 */
static void process_stats(int fd, short event, void *data)
{
    orte_message_event_t *mev = (orte_message_event_t*)data;
    opal_buffer_t *buffer = mev->buffer;
    orte_process_name_t *sender = &(mev->sender);
    int32_t n;
    opal_pstats_t *stats;
    orte_process_name_t proc;
    int ret;

    /* if the sender is the HNP we contacted, this message
     * contains info on the number of responses we should get
     */
    if (sender->vpid == 0) {
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_replies, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        /* drop any sample time that was never printed so it cannot leak */
        if (NULL != sample_time) {
            free(sample_time);
            sample_time = NULL;
        }
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sample_time, &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    n = 1;
    while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &stats, &n, OPAL_PSTAT))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /* if field sizes are not yet set, do so now */
        if (!fields_set) {
            /* these columns are always displayed */
            stat_widen(&nodefield, (int)strlen(stats->node));
            stat_widen(&rankfield, stat_int_width(stats->rank));
            stat_widen(&pidfield, stat_ulong_width((unsigned long)stats->pid));
            stat_widen(&cmdfield, (int)strlen(stats->cmd));

            /* the remaining columns are only displayed if at least
             * one entry reported a usable value for them
             */
            if (0 <= stats->priority) {
                pri_found = true;
                stat_widen(&prifield, stat_int_width(stats->priority));
            }
            if (0 <= stats->num_threads) {
                thr_found = true;
                stat_widen(&thrfield, stat_int_width(stats->num_threads));
            }
            if (0 < stats->vsize) {
                vsize_found = true;
                stat_widen(&vsizefield, stat_ulong_width((unsigned long)stats->vsize));
            }
            if (0 < stats->rss) {
                rss_found = true;
                stat_widen(&rssfield, stat_ulong_width((unsigned long)stats->rss));
            }
            if (0 < stats->peak_vsize) {
                pkv_found = true;
                stat_widen(&pkvfield, stat_ulong_width((unsigned long)stats->peak_vsize));
            }
            if (0 < stats->shared_size) {
                sh_found = true;
                stat_widen(&shfield, stat_ulong_width((unsigned long)stats->shared_size));
            }
            if (0 <= stats->processor) {
                p_found = true;
                stat_widen(&pfield, stat_int_width(stats->processor));
            }
        }

        /* add it to the list */
        opal_list_append(&recvd_stats, &stats->super);
    }

 cleanup:
    OBJ_RELEASE(mev);

    /* check for completion */
    num_recvd++;
    if (num_replies <= num_recvd) {
        all_recvd = true;
    }

    /* repost the receive */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL,
                                  ORTE_RML_NON_PERSISTENT, recv_stats, NULL);
    if (ret != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(ret);
    }
}
/*
 * RML receive callback for answers arriving on ORTE_RML_TAG_TOOL.
 * It does no processing itself: it hands the message to process_stats()
 * through ORTE_MESSAGE_EVENT and emits a verbose debug message.
 */
static void recv_stats(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
{
/* don't process this right away - we need to get out of the recv before
* we process the message as it may ask us to do something that involves
* more messaging! Instead, setup an event so that the message gets processed
* as soon as we leave the recv.
*
* The macro makes a copy of the buffer, which we release when processed - the incoming
* buffer, however, is NOT released here, although its payload IS transferred
* to the message buffer for later processing
*/
ORTE_MESSAGE_EVENT(sender, buffer, tag, process_stats);
/* NOTE(review): the message text says "reissued recv", but the receive is
* not reposted in this function - process_stats() reposts it
*/
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s recv_stats: reissued recv",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
/* static values needed for printing: the width of each output column.
 * print_headers() computes them (the larger of the column title and the
 * widest data value) and print_ranks()/pretty_print() use them to align
 * the rows. Widths of optional columns stay 0 unless the column was found.
 */
static int lennode = 0;
static int lenrank = 0;
static int lenpid = 0;
static int lencmd = 0;
static int lenstate = 0;
static int lentime = 0;
static int lenpri = 0;
static int lenthr = 0;
static int lenvsize = 0;
static int lenrss = 0;
static int lenpkv = 0;
static int lensh = 0;
static int lenp = 0;
/*
 * Print one table row per entry of statlist, in ascending rank order.
 *
 * Each entry is removed from the list and released once printed, so the
 * list is empty on return. When bynode is set the node column is left
 * blank (the caller prints the node name on its own line); otherwise the
 * row starts with the rank followed by the node name. Optional columns
 * are printed only if the corresponding *_found flag is set.
 */
static void print_ranks(opal_list_t *statlist)
{
    opal_list_item_t *item;
    opal_pstats_t *stats, *pstats;
    unsigned long secs;
    char pretty_time[32];

    /* sort the results by rank: repeatedly pick the lowest remaining rank */
    while (0 < opal_list_get_size(statlist)) {
        /* seed the search with the first entry so that a candidate
         * always exists, whatever the rank values are
         */
        pstats = (opal_pstats_t*)opal_list_get_first(statlist);
        for (item = opal_list_get_next(&pstats->super);
             item != opal_list_get_end(statlist);
             item = opal_list_get_next(item)) {
            stats = (opal_pstats_t*)item;
            if (stats->rank < pstats->rank) {
                pstats = stats;
            }
        }

        /* show hours with one decimal once we reach an hour, and
         * minutes:seconds below that
         */
        memset(pretty_time, 0, sizeof(pretty_time));
        if (pstats->time >= 3600) {
            snprintf(pretty_time, sizeof(pretty_time), "%5.1fH",
                     (double)pstats->time / (double)(3600));
        } else {
            secs = (unsigned long)pstats->time;
            snprintf(pretty_time, sizeof(pretty_time), "%3lu:%02lu",
                     secs / 60, secs % 60);
        }

        if (bynode) {
            /* print blanks in the nodename field */
            fprintf(fp, "%*s | ", lennode, "");
            /* print fields */
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
        } else {
            fprintf(fp, "%*d | ", lenrank, pstats->rank);
            fprintf(fp, "%*s | ", lennode, pstats->node);
        }
        fprintf(fp, "%*s | ", lencmd, pstats->cmd);
        fprintf(fp, "%*lu | ", lenpid, (unsigned long)pstats->pid);
        fprintf(fp, "%*c | ", lenstate, pstats->state);
        fprintf(fp, "%*s | ", lentime, pretty_time);
        if (pri_found) {
            fprintf(fp, "%*d | ", lenpri, pstats->priority);
        }
        if (thr_found) {
            fprintf(fp, "%*d | ", lenthr, pstats->num_threads);
        }
        if (vsize_found) {
            fprintf(fp, "%*lu | ", lenvsize, (unsigned long)pstats->vsize);
        }
        if (rss_found) {
            /* use the RSS column's own width so it lines up with its header */
            fprintf(fp, "%*lu | ", lenrss, (unsigned long)pstats->rss);
        }
        if (pkv_found) {
            fprintf(fp, "%*lu | ", lenpkv, (unsigned long)pstats->peak_vsize);
        }
        if (sh_found) {
            fprintf(fp, "%*lu | ", lensh, (unsigned long)pstats->shared_size);
        }
        if (p_found) {
            fprintf(fp, "%*d | ", lenp, pstats->processor);
        }
        fprintf(fp, "\n");
        num_lines++;

        /* this entry is done - take it off the list and free it */
        opal_list_remove_item(statlist, &pstats->super);
        OBJ_RELEASE(pstats);
    }
}
static void pretty_print(void)
{
opal_list_item_t *item, *next;
opal_pstats_t *stats;
opal_list_t tmplist;
char *node;
if (bynode) {
if (need_header) {
print_headers();
need_header = false;
}
if (timestamp) {
fprintf(fp, "TIMESTAMP: %s\n", sample_time);
}
if (NULL != sample_time) {
free(sample_time);
sample_time = NULL;
}
/* sort the results by node and then rank */
while (NULL != (item = opal_list_remove_first(&recvd_stats))) {
OBJ_CONSTRUCT(&tmplist, opal_list_t);
stats = (opal_pstats_t*)item;
node = strdup(stats->node);
opal_list_append(&tmplist, &stats->super);
/* cycle through the rest of the list looking
* for matching nodes
*/
item = opal_list_get_first(&recvd_stats);
while (item != opal_list_get_end(&recvd_stats)) {
stats = (opal_pstats_t*)item;
next = opal_list_get_next(item);
if (0 == strcmp(stats->node, node)) {
opal_list_remove_item(&recvd_stats, item);
opal_list_append(&tmplist, &stats->super);
}
item = next;
}
fprintf(fp, "%*s\n", lennode, node);
free(node);
print_ranks(&tmplist);
OBJ_DESTRUCT(&tmplist);
}
} else {
if (need_header) {
print_headers();
need_header = false;
}
if (timestamp) {
fprintf(fp, "\n\nTIMESTAMP: %s\n", sample_time);
}
if (NULL != sample_time) {
free(sample_time);
sample_time = NULL;
}
print_ranks(&recvd_stats);
}
/* provide some separation between iterations */
fprintf(fp, "\n");
/* if we have printed more than MAX_LINES since the last header,
* flag that we need to print the header next time
*/
if (MAX_LINES < num_lines) {
need_header = true;
num_lines = 0;
fprintf(fp, "\n\n");
}
}
static void print_headers(void)
{
int num_fields = 0;
int i;
int linelen;
lennode = strlen("Nodename");
if (nodefield > lennode) {
lennode = nodefield;
}
num_fields++;
lenrank = strlen("Rank");
if (rankfield > lenrank) {
lenrank = rankfield;
}
num_fields++;
lenpid = strlen("Pid");
if (pidfield > lenpid) {
lenpid = pidfield;
}
num_fields++;
lencmd = strlen("Command");
if (cmdfield > lencmd) {
lencmd = cmdfield;
}
num_fields++;
lenstate = strlen("State");
num_fields++;
lentime = strlen("Time");
if (timefield > lentime) {
lentime = timefield;
}
num_fields++;
if (pri_found) {
lenpri = strlen("Pri");
if (prifield > lenpri) {
lenpri = prifield;
}
num_fields++;
}
if (thr_found) {
lenthr = strlen("#threads");
if (thrfield > lenthr) {
lenthr = thrfield;
}
num_fields++;
}
if (vsize_found) {
lenvsize = strlen("Vsize");
if (vsizefield > lenvsize) {
lenvsize = vsizefield;
}
num_fields++;
}
if (rss_found) {
lenrss = strlen("RSS");
if (rssfield > lenrss) {
lenrss = rssfield;
}
num_fields++;
}
if (pkv_found) {
lenpkv = strlen("Peak Vsize");
if (pkvfield > lenpkv) {
lenpkv = pkvfield;
}
num_fields++;
}
if (sh_found) {
lensh = strlen("Shr Size");
if (shfield > lensh) {
lensh = shfield;
}
num_fields++;
}
if (p_found) {
lenp = strlen("Processor");
if (pfield > lenp) {
lenp = pfield;
}
num_fields++;
}
linelen = lennode + lenrank + lenpid + lencmd + lenstate + lentime + lenpri + lenthr + lenvsize + lenrss + lenpkv + lensh + lenp;
/* add spacing */
linelen += num_fields * 3;
/* print the rip line */
for(i = 0; i < linelen; ++i) {
fprintf(fp, "=");
}
fprintf(fp, "\n");
/* print the header */
if (bynode) {
fprintf(fp, "%*s | ", lennode , "Nodename");
fprintf(fp, "%*s | ", lenrank , "Rank");
} else {
fprintf(fp, "%*s | ", lenrank , "Rank");
fprintf(fp, "%*s | ", lennode , "Nodename");
}
fprintf(fp, "%*s | ", lencmd , "Command");
fprintf(fp, "%*s | ", lenpid , "Pid");
fprintf(fp, "%*s | ", lenstate , "State");
fprintf(fp, "%*s | ", lentime , "Time");
if (pri_found) {
fprintf(fp, "%*s | ", lenpri , "Pri");
}
if (thr_found) {
fprintf(fp, "%*s | ", lenthr , "#threads");
}
if (vsize_found) {
fprintf(fp, "%*s | ", lenvsize , "Vsize");
}
if (rss_found) {
fprintf(fp, "%*s | ", lenrss , "RSS");
}
if (pkv_found) {
fprintf(fp, "%*s | ", lenpkv , "Peak Vsize");
}
if (sh_found) {
fprintf(fp, "%*s | ", lensh , "Shr Size");
}
if (p_found) {
fprintf(fp, "%*s | ", lenp , "Processor");
}
fprintf(fp, "\n");
/* print the separator */
for(i = 0; i < linelen; ++i) {
fprintf(fp, "-");
}
fprintf(fp, "\n");
}