mdtest/src/aiori-MPIIO.c

616 lines
25 KiB
C
Executable File

/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
/******************************************************************************\
* *
* Copyright (c) 2003, The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license. *
* *
********************************************************************************
*
* Implement abstract I/O interface for MPIIO.
*
\******************************************************************************/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include "ior.h"
#include "iordef.h"
#include "aiori.h"
#include "utilities.h"
#ifndef MPIAPI
#define MPIAPI /* defined as __stdcall on Windows */
#endif
/**************************** P R O T O T Y P E S *****************************/
/*
* Forward declarations of the file-local backend entry points.
* NOTE(review): the non-static entry points defined in this file
* (MPIIO_xfer_hints, MPIIO_Access, MPIIO_Delete, MPIIO_GetFileSize) have no
* prototype here; presumably they are declared in one of the included project
* headers -- confirm.
*/
/* Seek 'fd' to the position matching an absolute offset; returns the offset. */
static IOR_offset_t SeekOffset(MPI_File, IOR_offset_t, aiori_mod_opt_t *);
/* Create a file; simply forwards to MPIIO_Open. */
static aiori_fd_t *MPIIO_Create(char *, int iorflags, aiori_mod_opt_t *);
/* Open a file and set up the optional file view. */
static aiori_fd_t *MPIIO_Open(char *, int flags, aiori_mod_opt_t *);
/* Read or write one transfer (access, handle, buffer, length, offset, options). */
static IOR_offset_t MPIIO_Xfer(int, aiori_fd_t *, IOR_size_t *,
IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *);
/* Close the file, free the view datatypes and release the handle. */
static void MPIIO_Close(aiori_fd_t *, aiori_mod_opt_t *);
/* Return the MPI version formatted as "(major.minor)". */
static char* MPIIO_GetVersion();
/* Flush the file with MPI_File_sync. */
static void MPIIO_Fsync(aiori_fd_t *, aiori_mod_opt_t *);
/* Reject unsupported option/hint combinations. */
static int MPIIO_check_params(aiori_mod_opt_t * options);
/************************** D E C L A R A T I O N S ***************************/
/*
* Per-file state of the MPIIO backend. An instance is allocated and zeroed by
* MPIIO_Open, handed to the caller as an aiori_fd_t pointer, and released by
* MPIIO_Close. The three datatypes are only created when the useFileView
* option is set.
*/
typedef struct{
MPI_File fd; /* file handle filled in by MPI_File_open */
MPI_Datatype transferType; /* datatype for transfer */
MPI_Datatype contigType; /* elem datatype */
MPI_Datatype fileType; /* filetype for file view */
} mpiio_fd_t;
/*
* Backend-specific options. An instance is allocated by MPIIO_options and
* reaches every entry point as the opaque aiori_mod_opt_t pointer.
* useSharedFilePointer has no command-line option in MPIIO_options and is
* rejected by MPIIO_check_params as not implemented.
*/
typedef struct {
int showHints; /* show hints */
int useFileView; /* use MPI_File_set_view */
int preallocate; /* preallocate file size */
int useSharedFilePointer; /* use shared file pointer */
int useStridedDatatype; /* put strided access into datatype */
char * hintsFileName; /* full name for hints file */
} mpiio_options_t;
/*
 * Allocate the MPIIO option block and build the option table describing it.
 *
 * init_backend_options: out parameter; receives the newly allocated
 *                       mpiio_options_t, cast to aiori_mod_opt_t *.
 * init_values:          optional template. When non-NULL, its first
 *                       sizeof(mpiio_options_t) bytes are copied into the new
 *                       block; otherwise the block is zero-filled.
 *
 * Returns a heap-allocated copy of the option table. The table entries point
 * at the fields of the new option block, so both allocations must stay alive
 * for as long as the table is in use.
 *
 * The process is aborted if either allocation fails, since the entries below
 * take the addresses of fields inside the option block.
 */
static option_help * MPIIO_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){
        mpiio_options_t * o = malloc(sizeof(*o));
        if (o == NULL) {
                fprintf(stderr, "MPIIO: cannot allocate backend options\n");
                abort();
        }

        if (init_values != NULL){
                memcpy(o, init_values, sizeof(*o));
        }else{
                memset(o, 0, sizeof(*o));
        }

        *init_backend_options = (aiori_mod_opt_t*) o;

        /*
         * No entry is offered for useSharedFilePointer: shared file pointers
         * are not implemented and MPIIO_check_params rejects the setting.
         */
        option_help h [] = {
                {0, "mpiio.hintsFileName","Full name for hints file", OPTION_OPTIONAL_ARGUMENT, 's', & o->hintsFileName},
                {0, "mpiio.showHints", "Show MPI hints", OPTION_FLAG, 'd', & o->showHints},
                {0, "mpiio.preallocate", "Preallocate file size", OPTION_FLAG, 'd', & o->preallocate},
                {0, "mpiio.useStridedDatatype", "put strided access into datatype", OPTION_FLAG, 'd', & o->useStridedDatatype},
                {0, "mpiio.useFileView", "Use MPI_File_set_view", OPTION_FLAG, 'd', & o->useFileView},
                LAST_OPTION
        };

        /* the table lives on the stack; hand the caller a heap copy */
        option_help * help = malloc(sizeof(h));
        if (help == NULL) {
                fprintf(stderr, "MPIIO: cannot allocate option table\n");
                abort();
        }
        memcpy(help, h, sizeof(h));
        return help;
}
/*
 * Backend descriptor for MPIIO: binds the generic I/O interface slots to the
 * functions of this file. Directory handling, stat and statfs are delegated
 * to the aiori_posix_* helpers.
 */
ior_aiori_t mpiio_aiori = {
        /* identification */
        .name = "MPIIO",
        .name_legacy = NULL,
        .get_version = MPIIO_GetVersion,

        /* configuration */
        .get_options = MPIIO_options,
        .check_params = MPIIO_check_params,
        .xfer_hints = MPIIO_xfer_hints,

        /* file lifecycle and data transfer */
        .create = MPIIO_Create,
        .open = MPIIO_Open,
        .xfer = MPIIO_Xfer,
        .fsync = MPIIO_Fsync,
        .close = MPIIO_Close,
        .delete = MPIIO_Delete,

        /* queries answered through MPI-IO */
        .get_file_size = MPIIO_GetFileSize,
        .access = MPIIO_Access,

        /* operations delegated to the POSIX helpers */
        .statfs = aiori_posix_statfs,
        .mkdir = aiori_posix_mkdir,
        .rmdir = aiori_posix_rmdir,
        .stat = aiori_posix_stat,
};
/***************************** F U N C T I O N S ******************************/
/* Transfer hints consulted by every entry point; set by MPIIO_xfer_hints(). */
static aiori_xfer_hint_t *hints = NULL;

/*
 * Remember the hint structure for later calls. Only the pointer is stored,
 * so the structure must outlive its use by this backend.
 */
void MPIIO_xfer_hints(aiori_xfer_hint_t * params)
{
        hints = params;
}
/*
 * Validate the MPIIO options against the transfer hints and report every
 * unsupported combination through ERR(). Returns 0 when the end of the
 * function is reached.
 */
static int MPIIO_check_params(aiori_mod_opt_t * module_options)
{
        mpiio_options_t *opts = (mpiio_options_t *) module_options;

        /* file views rely on MPI_Aint, which is used for 64-bit datatypes */
        if (opts->useFileView == TRUE && sizeof(MPI_Aint) < 8) {
                if ((hints->numTasks * hints->blockSize) >
                    (2 * (IOR_offset_t) GIBIBYTE)) {
                        ERR("segment size must be < 2GiB");
                }
        }

        if (opts->useSharedFilePointer) {
                ERR("shared file pointer not implemented");
        }

        /* the strided datatype needs at least one IOR_size_t per block and transfer */
        if (opts->useStridedDatatype) {
                if (hints->blockSize < sizeof(IOR_size_t)
                    || hints->transferSize < sizeof(IOR_size_t)) {
                        ERR("need larger file size for strided datatype in MPIIO");
                }
        }

        /* random offsets are rejected with collective I/O and with file views */
        if (hints->randomOffset) {
                if (hints->collective) {
                        ERR("random offset not available with collective MPIIO");
                }
                if (opts->useFileView) {
                        ERR("random offset not available with MPIIO fileviews");
                }
        }

        return 0;
}
/*
 * Probe whether a file can be opened through MPI-IO with the requested
 * access mode.
 *
 * path:           file to probe.
 * mode:           combination of R_OK / W_OK; anything without W_OK is
 *                 treated as a read-only probe.
 * module_options: mpiio_options_t providing the hints file name.
 *
 * Returns the result of MPI_File_open (MPI_SUCCESS when the file could be
 * opened), or MPI_SUCCESS unconditionally in dry-run mode.
 */
int MPIIO_Access(const char *path, int mode, aiori_mod_opt_t *module_options)
{
        if (hints->dryRun) {
                return MPI_SUCCESS;
        }

        mpiio_options_t *opts = (mpiio_options_t *) module_options;
        const int wantWrite = mode & W_OK;
        const int wantRead = mode & R_OK;

        /* pick the narrowest MPI access mode that covers the request */
        int amode = MPI_MODE_UNIQUE_OPEN;
        if (wantWrite) {
                amode |= wantRead ? MPI_MODE_RDWR : MPI_MODE_WRONLY;
        } else {
                amode |= MPI_MODE_RDONLY;
        }

        MPI_Info info = MPI_INFO_NULL;
        SetHints(&info, opts->hintsFileName);

        /* open on the calling process only; a successful open is closed again */
        MPI_File probe;
        int status = MPI_File_open(MPI_COMM_SELF, path, amode, info, &probe);
        if (status == 0) {
                MPI_File_close(&probe);
        }

        if (info != MPI_INFO_NULL) {
                MPI_CHECK(MPI_Info_free(&info), "MPI_Info_free failed");
        }
        return status;
}
/*
 * Create and open a file through the MPIIO interface.
 * Creation needs no handling of its own: the flags are passed on unchanged
 * and MPIIO_Open acts on them.
 */
static aiori_fd_t *MPIIO_Create(char *testFileName, int iorflags, aiori_mod_opt_t * module_options)
{
        aiori_fd_t *handle = MPIIO_Open(testFileName, iorflags, module_options);

        return handle;
}
/*
 * Open a file through the MPIIO interface. Setup file view.
 *
 * testFileName:   path of the file to open.
 * flags:          bitwise OR of IOR_* open flags, translated to MPI_MODE_*.
 *                 IOR_DIRECT is not supported (a notice is printed) and
 *                 IOR_TRUNC is applied by resizing the file to 0 after opening.
 * module_options: mpiio_options_t selecting hints, preallocation and views.
 *
 * Returns a heap-allocated mpiio_fd_t (as an aiori_fd_t pointer) that
 * MPIIO_Close releases. In dry-run mode no file is opened; with useFileView
 * the contiguous and transfer datatypes are still created, but the file type
 * is not.
 *
 * The process is aborted if the handle cannot be allocated, because the other
 * entry points dereference the handle unconditionally.
 */
static aiori_fd_t *MPIIO_Open(char *testFileName, int flags, aiori_mod_opt_t * module_options)
{
        mpiio_options_t * param = (mpiio_options_t*) module_options;
        int fd_mode = (int)0,
            offsetFactor,
            tasksPerFile,
            transfersPerBlock = hints->blockSize / hints->transferSize;
        MPI_Comm comm;
        MPI_Info mpiHints = MPI_INFO_NULL;

        /* zero-filled allocation, checked before first use */
        mpiio_fd_t * mfd = calloc(1, sizeof(*mfd));
        if (mfd == NULL) {
                fprintf(stderr, "MPIIO: cannot allocate file handle for %s\n", testFileName);
                abort();
        }

        /* set IOR file flags to MPIIO flags */
        /* -- file open flags -- */
        if (flags & IOR_RDONLY) {
                fd_mode |= MPI_MODE_RDONLY;
        }
        if (flags & IOR_WRONLY) {
                fd_mode |= MPI_MODE_WRONLY;
        }
        if (flags & IOR_RDWR) {
                fd_mode |= MPI_MODE_RDWR;
        }
        if (flags & IOR_APPEND) {
                fd_mode |= MPI_MODE_APPEND;
        }
        if (flags & IOR_CREAT) {
                fd_mode |= MPI_MODE_CREATE;
        }
        if (flags & IOR_EXCL) {
                fd_mode |= MPI_MODE_EXCL;
        }
        if (flags & IOR_DIRECT) {
                fprintf(stdout, "O_DIRECT not implemented in MPIIO\n");
        }

        /*
         * MPI_MODE_UNIQUE_OPEN mode optimization eliminates the overhead of file
         * locking. Only open a file in this mode when the file will not be con-
         * currently opened elsewhere, either inside or outside the MPI environment.
         */
        fd_mode |= MPI_MODE_UNIQUE_OPEN;

        /* one file per process is opened by each process on its own */
        if (hints->filePerProc) {
                comm = MPI_COMM_SELF;
        } else {
                comm = testComm;
        }

        SetHints(&mpiHints, param->hintsFileName);
        /*
         * note that with MP_HINTS_FILTERED=no, all key/value pairs will
         * be in the info object. The info object that is attached to
         * the file during MPI_File_open() will only contain those pairs
         * deemed valid by the implementation.
         */
        /* show hints passed to file */
        if (rank == 0 && param->showHints) {
                fprintf(stdout, "\nhints passed to MPI_File_open() {\n");
                ShowHints(&mpiHints);
                fprintf(stdout, "}\n");
        }

        if(! hints->dryRun){
                MPI_CHECKF(MPI_File_open(comm, testFileName, fd_mode, mpiHints, & mfd->fd),
                           "cannot open file: %s", testFileName);
                if (flags & IOR_TRUNC) {
                        MPI_CHECKF(MPI_File_set_size(mfd->fd, 0), "cannot truncate file: %s", testFileName);
                }
        }

        /* show hints actually attached to file handle */
        if (rank == 0 && param->showHints && ! hints->dryRun) {
                /* drop the requested hints; mpiHints now holds the file's own info */
                if (mpiHints != MPI_INFO_NULL)
                        MPI_CHECK(MPI_Info_free(&mpiHints), "MPI_Info_free failed");
                MPI_CHECK(MPI_File_get_info(mfd->fd, &mpiHints),
                          "cannot get file info");
                fprintf(stdout, "\nhints returned from opened file {\n");
                ShowHints(&mpiHints);
                fprintf(stdout, "}\n");
        }

        /* preallocate space for file */
        if (param->preallocate && flags & IOR_CREAT && ! hints->dryRun) {
                MPI_CHECK(MPI_File_preallocate(mfd->fd,
                                               (MPI_Offset) (hints->segmentCount
                                                             *
                                                             hints->blockSize *
                                                             hints->numTasks)),
                          "cannot preallocate file");
        }

        /* create file view */
        if (param->useFileView) {
                /* Create in-memory datatype */
                MPI_CHECK(MPI_Type_contiguous (hints->transferSize / sizeof(IOR_size_t), MPI_LONG_LONG_INT, & mfd->contigType), "cannot create contiguous datatype");
                MPI_CHECK(MPI_Type_create_resized( mfd->contigType, 0, 0, & mfd->transferType), "cannot create resized type");
                MPI_CHECK(MPI_Type_commit(& mfd->contigType), "cannot commit datatype");
                MPI_CHECK(MPI_Type_commit(& mfd->transferType), "cannot commit datatype");

                /* create contiguous transfer datatype */
                if (hints->filePerProc) {
                        offsetFactor = 0;
                        tasksPerFile = 1;
                } else {
                        offsetFactor = (rank + rankOffset) % hints->numTasks;
                        tasksPerFile = hints->numTasks;
                }

                /* the file type is only needed when a file is actually open */
                if(! hints->dryRun) {
                        if(! param->useStridedDatatype){
                                struct fileTypeStruct {
                                        int globalSizes[2], localSizes[2], startIndices[2];
                                } fileTypeStruct;

                                /*
                                 * create file type using subarray
                                 */
                                fileTypeStruct.globalSizes[0] = 1;
                                fileTypeStruct.globalSizes[1] = transfersPerBlock * tasksPerFile;
                                fileTypeStruct.localSizes[0] = 1;
                                fileTypeStruct.localSizes[1] = transfersPerBlock;
                                fileTypeStruct.startIndices[0] = 0;
                                fileTypeStruct.startIndices[1] = transfersPerBlock * offsetFactor;

                                MPI_CHECK(MPI_Type_create_subarray
                                          (2, fileTypeStruct.globalSizes,
                                           fileTypeStruct.localSizes,
                                           fileTypeStruct.startIndices, MPI_ORDER_C,
                                           mfd->contigType, & mfd->fileType),
                                          "cannot create subarray");
                                MPI_CHECK(MPI_Type_commit(& mfd->fileType), "cannot commit datatype");

                                MPI_CHECK(MPI_File_set_view(mfd->fd, 0,
                                                            mfd->contigType,
                                                            mfd->fileType,
                                                            "native",
                                                            (MPI_Info) MPI_INFO_NULL),
                                          "cannot set file view");
                        }else{
                                /*
                                 * strided access: one transfer element whose extent spans
                                 * a whole segment; the view is set per transfer in MPIIO_Xfer
                                 */
                                MPI_CHECK(MPI_Type_create_resized(mfd->contigType, 0, tasksPerFile * hints->blockSize, & mfd->fileType), "cannot create resized file datatype");
                                MPI_CHECK(MPI_Type_commit(& mfd->fileType), "cannot commit datatype");
                        }
                }
        }

        if (mpiHints != MPI_INFO_NULL)
                MPI_CHECK(MPI_Info_free(&mpiHints), "MPI_Info_free failed");
        return ((void *) mfd);
}
/*
* Write or read access to file using the MPIIO interface.
*
* access:         WRITE selects the MPI_File_write* family; any other value
*                 (READ or CHECK) selects the MPI_File_read* family.
* fdp:            handle returned by MPIIO_Open (an mpiio_fd_t).
* buffer:         data to write, or destination of the data read.
* length:         transfer length; used as a count of MPI_BYTE on the
*                 explicit-offset paths and overwritten on the file-view paths.
* offset:         absolute file offset of the transfer.
* module_options: mpiio_options_t selecting file view / strided datatype.
*
* Returns 'length' unchanged in dry-run mode and hints->transferSize on every
* other path.
*
* NOTE(review): the 'length = -1' assignments made when SeekOffset() reports
* a failure are never returned, because the function ends with
* 'return hints->transferSize'.
* NOTE(review): 'length' is an IOR_offset_t while the Access* function
* pointers take an int count, so the value is converted to int at the call.
*/
static IOR_offset_t MPIIO_Xfer(int access, aiori_fd_t * fdp, IOR_size_t * buffer,
IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * module_options)
{
/* NOTE: The second arg is (void *) for reads, and (const void *)
for writes. Therefore, one of the two sets of assignments below
will get "assignment from incompatible pointer-type" warnings,
if we only use this one set of signatures. */
mpiio_options_t * param = (mpiio_options_t*) module_options;
/* dry run: pretend the whole request was transferred */
if(hints->dryRun)
return length;
mpiio_fd_t * mfd = (mpiio_fd_t*) fdp;
/* one function pointer per calling convention: individual file pointer or
explicit offset ("_at"), independent or collective ("_all") */
int (MPIAPI * Access) (MPI_File, void *, int,
MPI_Datatype, MPI_Status *);
int (MPIAPI * Access_at) (MPI_File, MPI_Offset, void *, int,
MPI_Datatype, MPI_Status *);
int (MPIAPI * Access_all) (MPI_File, void *, int,
MPI_Datatype, MPI_Status *);
int (MPIAPI * Access_at_all) (MPI_File, MPI_Offset, void *, int,
MPI_Datatype, MPI_Status *);
/*
* this needs to be properly implemented:
*
* int (*Access_ordered)(MPI_File, void *, int,
* MPI_Datatype, MPI_Status *);
*/
MPI_Status status;
/* point functions to appropriate MPIIO calls */
if (access == WRITE) { /* WRITE */
/* the casts adapt the write functions to the read-style signatures above */
Access = (int (MPIAPI *)(MPI_File, void *, int,
MPI_Datatype, MPI_Status *)) MPI_File_write;
Access_at = (int (MPIAPI *)(MPI_File, MPI_Offset, void *, int,
MPI_Datatype, MPI_Status *)) MPI_File_write_at;
Access_all = (int (MPIAPI *) (MPI_File, void *, int,
MPI_Datatype, MPI_Status *)) MPI_File_write_all;
Access_at_all = (int (MPIAPI *) (MPI_File, MPI_Offset, void *, int,
MPI_Datatype, MPI_Status *)) MPI_File_write_at_all;
/*
* this needs to be properly implemented:
*
* Access_ordered = MPI_File_write_ordered;
*/
} else { /* READ or CHECK */
Access = MPI_File_read;
Access_at = MPI_File_read_at;
Access_all = MPI_File_read_all;
Access_at_all = MPI_File_read_at_all;
/*
* this needs to be properly implemented:
*
* Access_ordered = MPI_File_read_ordered;
*/
}
/*
* 'useFileView' uses derived datatypes and individual file pointers
*/
if (param->useFileView) {
/* find offset in file */
if (SeekOffset(mfd->fd, offset, module_options) <
0) {
/* if unsuccessful (the value set here is not returned, see header) */
length = -1;
} else {
/*
* 'useStridedDatatype' fits multi-strided pattern into a datatype;
* must use 'length' to determine repetitions (fix this for
* multi-segments someday, WEL):
* e.g., 'IOR -s 2 -b 32K -t 32K -a MPIIO --mpiio.useStridedDatatype --mpiio.useFileView'
*/
if (param->useStridedDatatype) {
/* offsets at or past (rank+1)*blockSize are reported as done
without any I/O; presumably the strided call issued below for the
earlier offsets already covered them */
if(offset >= (rank+1) * hints->blockSize){
/* we shall write only once per transferSize */
/* printf("FAKE access %d %lld\n", rank, offset); */
return hints->transferSize;
}
/* one call transfers segmentCount elements of transferType */
length = hints->segmentCount;
MPI_CHECK(MPI_File_set_view(mfd->fd, offset,
mfd->contigType,
mfd->fileType,
"native",
(MPI_Info) MPI_INFO_NULL), "cannot set file view");
/* printf("ACCESS %d %lld -> %lld\n", rank, offset, length); */
}else{
/* a single element of transferType per call */
length = 1;
}
if (hints->collective) {
/* individual, collective call */
MPI_CHECK(Access_all
(mfd->fd, buffer, length,
mfd->transferType, &status),
"cannot access collective");
} else {
/* individual, noncollective call */
MPI_CHECK(Access
(mfd->fd, buffer, length,
mfd->transferType, &status),
"cannot access noncollective");
}
/* MPI-IO driver does "noncontiguous" by transferring
* 'segment' regions of 'transfersize' bytes, but
* our caller WriteOrReadSingle does not know how to
* deal with us reporting that we wrote N times more
* data than requested. */
length = hints->transferSize;
}
} else {
/*
* !useFileView does not use derived datatypes, but it uses either
* shared or explicit file pointers
*/
if (param->useSharedFilePointer) {
/* find offset in file */
if (SeekOffset
(mfd->fd, offset, module_options) < 0) {
/* if unsuccessful (the value set here is not returned, see header) */
length = -1;
} else {
/* shared, collective call */
/*
* this needs to be properly implemented:
*
* MPI_CHECK(Access_ordered(fd.MPIIO, buffer, length,
* MPI_BYTE, &status),
* "cannot access shared, collective");
*/
fprintf(stdout,
"useSharedFilePointer not implemented\n");
}
} else {
/* explicit offsets: 'length' is a byte count and 'offset' a byte offset */
if (hints->collective) {
/* explicit, collective call */
MPI_CHECK(Access_at_all
(mfd->fd, offset,
buffer, length, MPI_BYTE, &status),
"cannot access explicit, collective");
} else {
/* explicit, noncollective call */
MPI_CHECK(Access_at
(mfd->fd, offset,
buffer, length, MPI_BYTE, &status),
"cannot access explicit, noncollective");
}
}
}
/* every non-dry-run path reports exactly one transfer size */
return hints->transferSize;
}
/*
 * Perform fsync().
 *
 * fdp:            handle returned by MPIIO_Open (an mpiio_fd_t).
 * module_options: unused; no MPIIO option affects syncing.
 *
 * Does nothing in dry-run mode. A failing MPI_File_sync is reported through
 * EWARN.
 */
static void MPIIO_Fsync(aiori_fd_t *fdp, aiori_mod_opt_t * module_options)
{
        (void) module_options;

        if(hints->dryRun)
                return;

        mpiio_fd_t * mfd = (mpiio_fd_t*) fdp;
        if (MPI_File_sync(mfd->fd) != MPI_SUCCESS)
                EWARN("fsync() failed");
}
/*
 * Close a file through the MPIIO interface.
 *
 * fdp:            handle returned by MPIIO_Open; it is freed here and must
 *                 not be used afterwards.
 * module_options: mpiio_options_t; useFileView decides whether datatypes
 *                 have to be released.
 *
 * The cleanup mirrors what MPIIO_Open set up: the file is closed only when
 * one was opened (not in dry-run mode), and with useFileView the datatypes
 * created by MPIIO_Open are freed.
 */
static void MPIIO_Close(aiori_fd_t *fdp, aiori_mod_opt_t * module_options)
{
        mpiio_options_t * param = (mpiio_options_t*) module_options;
        mpiio_fd_t * mfd = (mpiio_fd_t*) fdp;

        if(! hints->dryRun){
                MPI_CHECK(MPI_File_close(& mfd->fd), "cannot close file");
        }

        /* same truthiness test as MPIIO_Open uses when creating the types */
        if (param->useFileView) {
                /*
                 * need to free the datatype, so done in the close process
                 */
                /*
                 * MPIIO_Open creates fileType only outside dry-run mode; in a
                 * dry run the field still holds the zeroed bytes of the fresh
                 * handle and must not be passed to MPI_Type_free.
                 */
                if (! hints->dryRun) {
                        MPI_CHECK(MPI_Type_free(& mfd->fileType), "cannot free MPI file datatype");
                }
                MPI_CHECK(MPI_Type_free(& mfd->transferType), "cannot free MPI transfer datatype");
                MPI_CHECK(MPI_Type_free(& mfd->contigType), "cannot free type");
        }

        free(fdp);
}
/*
 * Delete a file through the MPIIO interface.
 *
 * testFileName:   path of the file to delete.
 * module_options: unused; deletion is done without hints (MPI_INFO_NULL).
 *
 * Does nothing in dry-run mode. A failing MPI_File_delete is reported
 * through MPI_CHECKF together with the file name.
 */
void MPIIO_Delete(char *testFileName, aiori_mod_opt_t * module_options)
{
        (void) module_options;

        if(hints->dryRun)
                return;

        MPI_CHECKF(MPI_File_delete(testFileName, (MPI_Info) MPI_INFO_NULL),
                   "cannot delete file: %s", testFileName);
}
/*
 * Determine api version.
 *
 * Returns a pointer to a static buffer holding the MPI version reported by
 * MPI_Get_version, formatted as "(major.minor)". The buffer is overwritten
 * by every call and must not be freed by the caller.
 */
static char* MPIIO_GetVersion(void)
{
        /* static storage is zero-initialized; no initializer needed */
        static char ver[1024];
        int version, subversion;

        MPI_CHECK(MPI_Get_version(&version, &subversion), "cannot get MPI version");
        /* bounded formatting keeps the write inside the buffer */
        snprintf(ver, sizeof(ver), "(%d.%d)", version, subversion);
        return ver;
}
/*
 * Position the individual file pointer of 'fd' for a transfer that starts at
 * the absolute byte offset 'offset'.
 *
 * Without a file view the byte offset is used directly. With a file view,
 * positions are counted in units of the transfer size, so the offset is
 * converted first.
 *
 * Returns 'offset' unchanged; a failing MPI_File_seek is reported through
 * MPI_CHECK.
 */
static IOR_offset_t SeekOffset(MPI_File fd, IOR_offset_t offset,
                               aiori_mod_opt_t * module_options)
{
        mpiio_options_t *opts = (mpiio_options_t *) module_options;
        const int sharedFile = !hints->filePerProc;

        /* position of this task within a segment, and tasks sharing the file */
        const int taskSlot = sharedFile ? (rank + rankOffset) % hints->numTasks : 0;
        const int sharers = sharedFile ? hints->numTasks : 1;

        IOR_offset_t seekTarget = offset;

        if (opts->useFileView) {
                if (!sharedFile) {
                        seekTarget = offset / hints->transferSize;
                } else {
                        /*
                         * find the file view offset of this task from the
                         * absolute offset: whole segments before it, plus
                         * the transfers inside the task's own block
                         */
                        const IOR_offset_t segmentBytes = hints->blockSize * sharers;
                        const IOR_offset_t transfersPerBlock =
                                hints->blockSize / hints->transferSize;
                        const IOR_offset_t segmentIndex = offset / segmentBytes;
                        const IOR_offset_t bytesIntoBlock =
                                (offset % segmentBytes) - (taskSlot * hints->blockSize);

                        seekTarget = (transfersPerBlock * segmentIndex)
                                   + (bytesIntoBlock / hints->transferSize);
                }
        }

        MPI_CHECK(MPI_File_seek(fd, seekTarget, MPI_SEEK_SET),
                  "cannot seek offset");
        return offset;
}
/*
 * Use MPI_File_get_size() to return the size of a file.
 * NOTE: This function is used by the HDF5 and NCMPI backends.
 *
 * module_options: mpiio_options_t providing the hints file name.
 * testFileName:   path of the file to query.
 *
 * The file is opened read-only (on MPI_COMM_SELF for file-per-process runs,
 * on testComm otherwise), queried and closed again. The value returned is
 * the size reported for that one file; no reduction across processes is done
 * here. Returns 0 in dry-run mode.
 */
IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * module_options, char *testFileName)
{
        mpiio_options_t * test = (mpiio_options_t*) module_options;

        if(hints->dryRun)
                return 0;

        /* query into a real MPI_Offset instead of casting an IOR_offset_t pointer */
        MPI_Offset fileSize = 0;
        MPI_File fd;
        MPI_Comm comm;
        MPI_Info mpiHints = MPI_INFO_NULL;

        /* same communicator choice as MPIIO_Open */
        if (hints->filePerProc) {
                comm = MPI_COMM_SELF;
        } else {
                comm = testComm;
        }

        SetHints(&mpiHints, test->hintsFileName);
        MPI_CHECK(MPI_File_open(comm, testFileName, MPI_MODE_RDONLY,
                                mpiHints, &fd),
                  "cannot open file to get file size");
        MPI_CHECK(MPI_File_get_size(fd, &fileSize),
                  "cannot get file size");
        MPI_CHECK(MPI_File_close(&fd), "cannot close file");

        if (mpiHints != MPI_INFO_NULL) {
                MPI_CHECK(MPI_Info_free(&mpiHints), "MPI_Info_free failed");
        }
        return (IOR_offset_t) fileSize;
}