Update DFS to support multi-node operations
This commit was SVN r27594.
Этот коммит содержится в:
родитель
fefec03e78
Коммит
fe6dfad625
@ -214,7 +214,10 @@ static void recv_dfs(int status, orte_process_name_t* sender,
|
|||||||
trk->requestor.vpid = ORTE_PROC_MY_NAME->vpid;
|
trk->requestor.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||||
trk->host_daemon.jobid = sender->jobid;
|
trk->host_daemon.jobid = sender->jobid;
|
||||||
trk->host_daemon.vpid = sender->vpid;
|
trk->host_daemon.vpid = sender->vpid;
|
||||||
trk->filename = strdup(dfs->uri);
|
trk->uri = strdup(dfs->uri);
|
||||||
|
/* break the uri down into scheme and filename */
|
||||||
|
trk->scheme = opal_uri_get_scheme(dfs->uri);
|
||||||
|
trk->filename = opal_filename_from_uri(dfs->uri, NULL);
|
||||||
/* define the local fd */
|
/* define the local fd */
|
||||||
trk->local_fd = local_fd++;
|
trk->local_fd = local_fd++;
|
||||||
/* record the remote file descriptor */
|
/* record the remote file descriptor */
|
||||||
@ -472,6 +475,9 @@ static void open_local_file(orte_dfs_request_t *dfs)
|
|||||||
trk = OBJ_NEW(orte_dfs_tracker_t);
|
trk = OBJ_NEW(orte_dfs_tracker_t);
|
||||||
trk->requestor.jobid = ORTE_PROC_MY_NAME->jobid;
|
trk->requestor.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
trk->requestor.vpid = ORTE_PROC_MY_NAME->vpid;
|
trk->requestor.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||||
|
trk->uri = strdup(dfs->uri);
|
||||||
|
/* break the uri down into scheme and filename */
|
||||||
|
trk->scheme = opal_uri_get_scheme(dfs->uri);
|
||||||
trk->filename = strdup(filename);
|
trk->filename = strdup(filename);
|
||||||
/* define the local fd */
|
/* define the local fd */
|
||||||
trk->local_fd = local_fd++;
|
trk->local_fd = local_fd++;
|
||||||
@ -503,13 +509,11 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
bool found;
|
bool found;
|
||||||
orte_vpid_t v;
|
orte_vpid_t v;
|
||||||
|
|
||||||
opal_output(0, "%s PROCESSING OPEN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
||||||
/* get the scheme to determine if we can process locally or not */
|
/* get the scheme to determine if we can process locally or not */
|
||||||
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
goto complete;
|
goto complete;
|
||||||
}
|
}
|
||||||
opal_output(0, "%s GOT SCHEME", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
||||||
|
|
||||||
if (0 == strcmp(scheme, "nfs")) {
|
if (0 == strcmp(scheme, "nfs")) {
|
||||||
open_local_file(dfs);
|
open_local_file(dfs);
|
||||||
@ -529,12 +533,10 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
if (NULL == (filename = opal_filename_from_uri(dfs->uri, &host))) {
|
if (NULL == (filename = opal_filename_from_uri(dfs->uri, &host))) {
|
||||||
goto complete;
|
goto complete;
|
||||||
}
|
}
|
||||||
opal_output(0, "%s GOT FILENAME %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filename);
|
|
||||||
if (NULL == host) {
|
if (NULL == host) {
|
||||||
host = strdup(orte_process_info.nodename);
|
host = strdup(orte_process_info.nodename);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
/* if the host is our own, then treat it as a local file */
|
/* if the host is our own, then treat it as a local file */
|
||||||
if (NULL == host ||
|
if (NULL == host ||
|
||||||
0 == strcmp(host, orte_process_info.nodename) ||
|
0 == strcmp(host, orte_process_info.nodename) ||
|
||||||
@ -549,7 +551,6 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(dfs);
|
OBJ_RELEASE(dfs);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/* ident the daemon on that host */
|
/* ident the daemon on that host */
|
||||||
daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
|
daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
|
||||||
@ -575,7 +576,7 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
"%s file %s on host %s daemon %s",
|
"%s file %s on host %s daemon %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
filename, host, ORTE_NAME_PRINT(&daemon));
|
filename, host, ORTE_NAME_PRINT(&daemon));
|
||||||
#if 0
|
|
||||||
/* double-check: if it is our local daemon, then we
|
/* double-check: if it is our local daemon, then we
|
||||||
* treat this as local
|
* treat this as local
|
||||||
*/
|
*/
|
||||||
@ -589,7 +590,7 @@ static void process_opens(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(dfs);
|
OBJ_RELEASE(dfs);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
/* add this request to our local list so we can
|
/* add this request to our local list so we can
|
||||||
* match it with the returned response when it comes
|
* match it with the returned response when it comes
|
||||||
*/
|
*/
|
||||||
|
@ -49,7 +49,9 @@ typedef struct {
|
|||||||
opal_list_item_t super;
|
opal_list_item_t super;
|
||||||
orte_process_name_t requestor;
|
orte_process_name_t requestor;
|
||||||
orte_process_name_t host_daemon;
|
orte_process_name_t host_daemon;
|
||||||
char *filename; /* for debug purposes */
|
char *uri;
|
||||||
|
char *scheme;
|
||||||
|
char *filename;
|
||||||
int local_fd;
|
int local_fd;
|
||||||
int remote_fd;
|
int remote_fd;
|
||||||
size_t location;
|
size_t location;
|
||||||
|
@ -92,11 +92,19 @@ static void trk_con(orte_dfs_tracker_t *trk)
|
|||||||
{
|
{
|
||||||
trk->host_daemon.jobid = ORTE_JOBID_INVALID;
|
trk->host_daemon.jobid = ORTE_JOBID_INVALID;
|
||||||
trk->host_daemon.vpid = ORTE_VPID_INVALID;
|
trk->host_daemon.vpid = ORTE_VPID_INVALID;
|
||||||
|
trk->uri = NULL;
|
||||||
|
trk->scheme = NULL;
|
||||||
trk->filename = NULL;
|
trk->filename = NULL;
|
||||||
trk->location = 0;
|
trk->location = 0;
|
||||||
}
|
}
|
||||||
static void trk_des(orte_dfs_tracker_t *trk)
|
static void trk_des(orte_dfs_tracker_t *trk)
|
||||||
{
|
{
|
||||||
|
if (NULL != trk->uri) {
|
||||||
|
free(trk->uri);
|
||||||
|
}
|
||||||
|
if (NULL != trk->scheme) {
|
||||||
|
free(trk->scheme);
|
||||||
|
}
|
||||||
if (NULL != trk->filename) {
|
if (NULL != trk->filename) {
|
||||||
free(trk->filename);
|
free(trk->filename);
|
||||||
}
|
}
|
||||||
|
@ -21,15 +21,16 @@ BEGIN_C_DECLS
|
|||||||
typedef uint8_t orte_dfs_cmd_t;
|
typedef uint8_t orte_dfs_cmd_t;
|
||||||
#define ORTE_DFS_CMD_T OPAL_UINT8
|
#define ORTE_DFS_CMD_T OPAL_UINT8
|
||||||
|
|
||||||
#define ORTE_DFS_OPEN_CMD 1
|
#define ORTE_DFS_OPEN_CMD 1
|
||||||
#define ORTE_DFS_CLOSE_CMD 2
|
#define ORTE_DFS_CLOSE_CMD 2
|
||||||
#define ORTE_DFS_SIZE_CMD 3
|
#define ORTE_DFS_SIZE_CMD 3
|
||||||
#define ORTE_DFS_SEEK_CMD 4
|
#define ORTE_DFS_SEEK_CMD 4
|
||||||
#define ORTE_DFS_READ_CMD 5
|
#define ORTE_DFS_READ_CMD 5
|
||||||
#define ORTE_DFS_POST_CMD 6
|
#define ORTE_DFS_POST_CMD 6
|
||||||
#define ORTE_DFS_GETFM_CMD 7
|
#define ORTE_DFS_GETFM_CMD 7
|
||||||
#define ORTE_DFS_LOAD_CMD 8
|
#define ORTE_DFS_LOAD_CMD 8
|
||||||
#define ORTE_DFS_PURGE_CMD 9
|
#define ORTE_DFS_PURGE_CMD 9
|
||||||
|
#define ORTE_DFS_RELAY_POSTS_CMD 10
|
||||||
|
|
||||||
/* file maps */
|
/* file maps */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -949,6 +949,7 @@ static void process_getfm(int fd, short args, void *cbdata)
|
|||||||
* data for all jobids - else, find the one
|
* data for all jobids - else, find the one
|
||||||
*/
|
*/
|
||||||
ntotal = 0;
|
ntotal = 0;
|
||||||
|
n = -1;
|
||||||
for (item = opal_list_get_first(&file_maps);
|
for (item = opal_list_get_first(&file_maps);
|
||||||
item != opal_list_get_end(&file_maps);
|
item != opal_list_get_end(&file_maps);
|
||||||
item = opal_list_get_next(item)) {
|
item = opal_list_get_next(item)) {
|
||||||
@ -1051,6 +1052,10 @@ static void process_load(int fd, short args, void *cbdata)
|
|||||||
goto complete;
|
goto complete;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output_verbose(1, orte_dfs_base.output,
|
||||||
|
"%s loading file maps from %d vpids",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nvpids);
|
||||||
|
|
||||||
/* unpack the buffer */
|
/* unpack the buffer */
|
||||||
for (i=0; i < nvpids; i++) {
|
for (i=0; i < nvpids; i++) {
|
||||||
/* unpack this vpid */
|
/* unpack this vpid */
|
||||||
@ -1136,15 +1141,15 @@ static void process_purge(int fd, short args, void *cbdata)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NULL == jptr) {
|
if (NULL == jfm) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
} else {
|
} else {
|
||||||
/* remove it from the list */
|
/* remove it from the list */
|
||||||
opal_list_remove_item(&file_maps, &jptr->super);
|
opal_list_remove_item(&file_maps, &jfm->super);
|
||||||
/* the destructor will release the list of maps
|
/* the destructor will release the list of maps
|
||||||
* in the jobfm object
|
* in the jobfm object
|
||||||
*/
|
*/
|
||||||
OBJ_RELEASE(jptr);
|
OBJ_RELEASE(jfm);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NULL != dfs->purge_cbfunc) {
|
if (NULL != dfs->purge_cbfunc) {
|
||||||
@ -1195,10 +1200,14 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
|
|||||||
int whence;
|
int whence;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
orte_process_name_t source;
|
orte_process_name_t source;
|
||||||
opal_buffer_t *bptr;
|
opal_buffer_t *bptr, *xfer;
|
||||||
orte_dfs_request_t *dfs;
|
orte_dfs_request_t *dfs;
|
||||||
orte_dfs_jobfm_t *jfm;
|
orte_dfs_jobfm_t *jfm, *jptr;
|
||||||
|
orte_dfs_vpidfm_t *vfm, *vptr;
|
||||||
opal_buffer_t *answer, bucket;
|
opal_buffer_t *answer, bucket;
|
||||||
|
int i, j;
|
||||||
|
orte_vpid_t vpid;
|
||||||
|
int32_t nentries, ncontributors;
|
||||||
|
|
||||||
/* unpack the command */
|
/* unpack the command */
|
||||||
cnt = 1;
|
cnt = 1;
|
||||||
@ -1539,6 +1548,10 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_DFS_POST_CMD:
|
case ORTE_DFS_POST_CMD:
|
||||||
|
opal_output_verbose(1, orte_dfs_base.output,
|
||||||
|
"%s received post command from %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(sender));
|
||||||
/* unpack their request id */
|
/* unpack their request id */
|
||||||
cnt = 1;
|
cnt = 1;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &rid, &cnt, OPAL_UINT64))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &rid, &cnt, OPAL_UINT64))) {
|
||||||
@ -1566,22 +1579,120 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
|
|||||||
process_posts(0, 0, (void*)dfs);
|
process_posts(0, 0, (void*)dfs);
|
||||||
OBJ_RELEASE(bptr);
|
OBJ_RELEASE(bptr);
|
||||||
answer_post:
|
answer_post:
|
||||||
/* return an ack */
|
if (UINT64_MAX != rid) {
|
||||||
answer = OBJ_NEW(opal_buffer_t);
|
/* return an ack */
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(answer, &cmd, 1, ORTE_DFS_CMD_T))) {
|
answer = OBJ_NEW(opal_buffer_t);
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(answer, &cmd, 1, ORTE_DFS_CMD_T))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(answer, &rid, 1, OPAL_UINT64))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
|
||||||
|
ORTE_RML_TAG_DFS_DATA, 0,
|
||||||
|
orte_rml_send_callback, NULL))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(answer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_DFS_RELAY_POSTS_CMD:
|
||||||
|
/* unpack the name of the source of this data */
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &source, &cnt, ORTE_NAME))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(answer, &rid, 1, OPAL_UINT64))) {
|
opal_output_verbose(1, orte_dfs_base.output,
|
||||||
|
"%s received relayed posts from sender %s for source %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(sender),
|
||||||
|
ORTE_NAME_PRINT(&source));
|
||||||
|
/* lookup the job map */
|
||||||
|
jfm = NULL;
|
||||||
|
for (item = opal_list_get_first(&file_maps);
|
||||||
|
item != opal_list_get_end(&file_maps);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
jptr = (orte_dfs_jobfm_t*)item;
|
||||||
|
if (jptr->jobid == source.jobid) {
|
||||||
|
jfm = jptr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (NULL == jfm) {
|
||||||
|
/* add it */
|
||||||
|
jfm = OBJ_NEW(orte_dfs_jobfm_t);
|
||||||
|
jfm->jobid = source.jobid;
|
||||||
|
opal_list_append(&file_maps, &jfm->super);
|
||||||
|
}
|
||||||
|
/* see if we already have an entry for this source */
|
||||||
|
vfm = NULL;
|
||||||
|
for (item = opal_list_get_first(&jfm->maps);
|
||||||
|
item != opal_list_get_end(&jfm->maps);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
vptr = (orte_dfs_vpidfm_t*)item;
|
||||||
|
if (vptr->vpid == source.vpid) {
|
||||||
|
vfm = vptr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (NULL == vfm) {
|
||||||
|
/* add it */
|
||||||
|
vfm = OBJ_NEW(orte_dfs_vpidfm_t);
|
||||||
|
vfm->vpid = source.vpid;
|
||||||
|
opal_list_append(&jfm->maps, &vfm->super);
|
||||||
|
}
|
||||||
|
/* unpack their buffer object */
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
|
/* the buffer object came from a call to get_file_maps, so it isn't quite
|
||||||
ORTE_RML_TAG_DFS_DATA, 0,
|
* the same as when someone posts directly to us. So process it here by
|
||||||
orte_rml_send_callback, NULL))) {
|
* starting with getting the number of vpids that contributed. This
|
||||||
|
* should always be one, but leave it open for flexibility
|
||||||
|
*/
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(bptr, &ncontributors, &cnt, OPAL_INT32))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(answer);
|
return;
|
||||||
}
|
}
|
||||||
|
/* loop thru the number of contributors */
|
||||||
|
for (i=0; i < ncontributors; i++) {
|
||||||
|
/* unpack the vpid of the contributor */
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(bptr, &vpid, &cnt, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/* unpack the number of entries */
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(bptr, &nentries, &cnt, OPAL_INT32))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (j=0; j < nentries; j++) {
|
||||||
|
/* get the entry */
|
||||||
|
cnt = 1;
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(bptr, &xfer, &cnt, OPAL_BUFFER))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/* store it */
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(&vfm->data, &xfer, 1, OPAL_BUFFER))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
OBJ_RELEASE(xfer);
|
||||||
|
vfm->num_entries++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
OBJ_RELEASE(bptr);
|
||||||
|
/* no reply required */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_DFS_GETFM_CMD:
|
case ORTE_DFS_GETFM_CMD:
|
||||||
@ -1645,9 +1756,9 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&bucket);
|
OBJ_DESTRUCT(&bucket);
|
||||||
opal_output_verbose(1, orte_dfs_base.output,
|
opal_output_verbose(1, orte_dfs_base.output,
|
||||||
"%s getf-cmd: returning %d maps to sender %s",
|
"%s getf-cmd: returning %d maps with %d bytes to sender %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nmaps,
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nmaps,
|
||||||
ORTE_NAME_PRINT(sender));
|
(int)answer->bytes_used, ORTE_NAME_PRINT(sender));
|
||||||
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
|
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
|
||||||
ORTE_RML_TAG_DFS_DATA, 0,
|
ORTE_RML_TAG_DFS_DATA, 0,
|
||||||
orte_rml_send_callback, NULL))) {
|
orte_rml_send_callback, NULL))) {
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
|
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
|
#include "orte/mca/dfs/dfs.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/iof/iof.h"
|
#include "orte/mca/iof/iof.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
@ -158,17 +159,98 @@ static void track_jobs(int fd, short argc, void *cbdata)
|
|||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Callback passed to orte_dfs.get_file_map() by track_procs() when a
 * local process has terminated.
 *
 * Steps, in order:
 *   1. wrap the returned file-map buffer in an ORTE_DFS_RELAY_POSTS_CMD
 *      message (command, name of the terminated proc, buffer) and send
 *      it to the HNP on ORTE_RML_TAG_DFS_CMD;
 *   2. finalize the proc's session directory;
 *   3. send an ORTE_PLM_UPDATE_PROC_STATE message for the proc to the
 *      HNP on ORTE_RML_TAG_PLM;
 *   4. remove the proc from orte_local_children and release it.
 *
 * @param bptr    buffer holding the file maps retrieved for the proc
 * @param cbdata  the orte_proc_t of the terminated process
 *
 * NOTE(review): bptr is dereferenced unconditionally for the log
 * message below - this assumes get_file_map never hands back a NULL
 * buffer; confirm against the DFS component.
 *
 * NOTE(review): any failure while building or sending the relay
 * message returns immediately, so steps 2-4 (including the termination
 * update) are skipped in that case. Left as-is here; confirm that this
 * is intended.
 */
static void send_fms(opal_buffer_t *bptr, void *cbdata)
{
    orte_proc_t *pdata = (orte_proc_t*)cbdata;
    orte_proc_t *pptr;
    orte_job_t *jdata;
    opal_buffer_t *xfer, *alert;
    orte_dfs_cmd_t cmd = ORTE_DFS_RELAY_POSTS_CMD;
    int rc, i;
    orte_plm_cmd_flag_t cmd2;

    opal_output(0, "%s SENDING FILE MAPS FOR %s OF SIZE %d",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ORTE_NAME_PRINT(&pdata->name), (int)bptr->bytes_used);

    /* build the relay message: command, source proc name, file maps */
    xfer = OBJ_NEW(opal_buffer_t);
    if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &cmd, 1, ORTE_DFS_CMD_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &pdata->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &bptr, 1, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        return;
    }
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
                                          ORTE_RML_TAG_DFS_CMD, 0,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        return;
    }

    /* Clean up the session directory as if we were the process
     * itself. This covers the case where the process died abnormally
     * and didn't cleanup its own session directory.
     */
    orte_session_dir_finalize(&pdata->name);

    /* alert the HNP */
    cmd2 = ORTE_PLM_UPDATE_PROC_STATE;
    alert = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd2, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        return;
    }
    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(pdata->name.jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        /* the alert buffer was never handed to the RML, so it is
         * still ours to release
         */
        OBJ_RELEASE(alert);
        return;
    }
    /* pack the info - a failure is logged, but the update is still sent */
    if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
        ORTE_ERROR_LOG(rc);
    }
    /* send it */
    OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
                         "%s SENDING TERMINATION UPDATE FOR PROC %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&pdata->name)));
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                          ORTE_RML_TAG_PLM, 0,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        /* release the buffer ourselves on a failed send, matching the
         * handling of the relay message above
         */
        OBJ_RELEASE(alert);
    }

    /* find this proc in the children array and remove it so
     * we don't keep telling the HNP that it died
     */
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (pptr == pdata) {
            opal_pointer_array_set_item(orte_local_children, i, NULL);
            OBJ_RELEASE(pdata);
            break;
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
static void track_procs(int fd, short argc, void *cbdata)
|
static void track_procs(int fd, short argc, void *cbdata)
|
||||||
{
|
{
|
||||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||||
orte_process_name_t *proc = &caddy->name;
|
orte_process_name_t *proc = &caddy->name;
|
||||||
orte_proc_state_t state = caddy->proc_state;
|
orte_proc_state_t state = caddy->proc_state;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_proc_t *pdata, *pptr;
|
orte_proc_t *pdata;
|
||||||
opal_buffer_t *alert;
|
|
||||||
int rc;
|
|
||||||
orte_plm_cmd_flag_t cmd;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
||||||
"%s state:staged_orted:track_procs called for proc %s state %s",
|
"%s state:staged_orted:track_procs called for proc %s state %s",
|
||||||
@ -213,45 +295,10 @@ static void track_procs(int fd, short argc, void *cbdata)
|
|||||||
/* the proc has terminated */
|
/* the proc has terminated */
|
||||||
pdata->alive = false;
|
pdata->alive = false;
|
||||||
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
/* Clean up the session directory as if we were the process
|
/* retrieve any file maps posted by this process and forward them
|
||||||
* itself. This covers the case where the process died abnormally
|
* to the HNP for collection
|
||||||
* and didn't cleanup its own session directory.
|
|
||||||
*/
|
*/
|
||||||
orte_session_dir_finalize(proc);
|
orte_dfs.get_file_map(proc, send_fms, pdata);
|
||||||
/* alert the HNP */
|
|
||||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
|
||||||
alert = OBJ_NEW(opal_buffer_t);
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* pack the info */
|
|
||||||
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* send it */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
|
||||||
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&pdata->name)));
|
|
||||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
|
||||||
ORTE_RML_TAG_PLM, 0,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* find this proc in the children array and remove it so
|
|
||||||
* we don't keep telling the HNP that it died
|
|
||||||
*/
|
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
|
||||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (pptr == pdata) {
|
|
||||||
opal_pointer_array_set_item(orte_local_children, i, NULL);
|
|
||||||
OBJ_RELEASE(pdata);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
/* Release the stdin IOF file descriptor for this child, if one
|
/* Release the stdin IOF file descriptor for this child, if one
|
||||||
* was defined. File descriptors for the other IOF channels - stdout,
|
* was defined. File descriptors for the other IOF channels - stdout,
|
||||||
@ -275,45 +322,10 @@ static void track_procs(int fd, short argc, void *cbdata)
|
|||||||
/* the proc has terminated */
|
/* the proc has terminated */
|
||||||
pdata->alive = false;
|
pdata->alive = false;
|
||||||
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
pdata->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
/* Clean up the session directory as if we were the process
|
/* retrieve any file maps posted by this process and forward them
|
||||||
* itself. This covers the case where the process died abnormally
|
* to the HNP for collection
|
||||||
* and didn't cleanup its own session directory.
|
|
||||||
*/
|
*/
|
||||||
orte_session_dir_finalize(proc);
|
orte_dfs.get_file_map(proc, send_fms, pdata);
|
||||||
/* alert the HNP */
|
|
||||||
cmd = ORTE_PLM_UPDATE_PROC_STATE;
|
|
||||||
alert = OBJ_NEW(opal_buffer_t);
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* pack the info */
|
|
||||||
if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* send it */
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_state_base_output,
|
|
||||||
"%s SENDING TERMINATION UPDATE FOR PROC %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&pdata->name)));
|
|
||||||
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
|
|
||||||
ORTE_RML_TAG_PLM, 0,
|
|
||||||
orte_rml_send_callback, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
}
|
|
||||||
/* find this proc in the children array and remove it so
|
|
||||||
* we don't keep telling the HNP that it died
|
|
||||||
*/
|
|
||||||
for (i=0; i < orte_local_children->size; i++) {
|
|
||||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (pptr == pdata) {
|
|
||||||
opal_pointer_array_set_item(orte_local_children, i, NULL);
|
|
||||||
OBJ_RELEASE(pdata);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -83,7 +83,6 @@ static void read_cbfunc(long status, uint8_t *buffer, void *cbdata)
|
|||||||
{
|
{
|
||||||
int *check = (int*)cbdata;
|
int *check = (int*)cbdata;
|
||||||
|
|
||||||
opal_output(0, "GOT READ STATUS %d", (int)status);
|
|
||||||
if (status < 0) {
|
if (status < 0) {
|
||||||
read_active = false;
|
read_active = false;
|
||||||
active = false;
|
active = false;
|
||||||
@ -158,7 +157,6 @@ int main(int argc, char* argv[])
|
|||||||
numread = 0;
|
numread = 0;
|
||||||
while (read_active) {
|
while (read_active) {
|
||||||
i = READ_SIZE;
|
i = READ_SIZE;
|
||||||
opal_output(0, "%s reading next %d bytes\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
|
|
||||||
active = true;
|
active = true;
|
||||||
orte_dfs.read(fd, buffer, READ_SIZE, read_cbfunc, &i);
|
orte_dfs.read(fd, buffer, READ_SIZE, read_cbfunc, &i);
|
||||||
ORTE_WAIT_FOR_COMPLETION(active);
|
ORTE_WAIT_FOR_COMPLETION(active);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user