diff --git a/.gitignore b/.gitignore
index 7d88e4a..73dd929 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,6 @@ src/*.i
 src/*.s
 src/ior
 
-doc/doxygen/html
-doc/doxygen/xml
+doc/doxygen/build
 doc/sphinx/_*/
+!doc/sphinx/Makefile
diff --git a/doc/doxygen/Doxyfile b/doc/doxygen/Doxyfile
index 5d7ae94..032ac39 100644
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -58,7 +58,7 @@ PROJECT_LOGO           =
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       =
+OUTPUT_DIRECTORY       = build
 
 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
@@ -1111,7 +1111,7 @@ GENERATE_HTML          = YES
 # The default directory is: html.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_OUTPUT            = html
+HTML_OUTPUT            = doxygen_html
 
 # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
 # generated HTML page (for example: .htm, .php, .asp).
@@ -1932,7 +1932,7 @@ GENERATE_XML           = YES
 # The default directory is: xml.
 # This tag requires that the tag GENERATE_XML is set to YES.
 
-XML_OUTPUT             = xml
+XML_OUTPUT             = doxygen_xml
 
 # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
 # listings (including syntax highlighting and cross-referencing information) to
diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py
index 02cae4e..05e5807 100644
--- a/doc/sphinx/conf.py
+++ b/doc/sphinx/conf.py
@@ -22,16 +22,19 @@ import sys
 sys.path.insert(0, os.path.abspath('.'))
 
 
-# -- Breathe -------------------------------------------------------------
-
-sys.path.append( "/usr/local/bin/breathe-apidoc" )
-
-# compile doxygen
+# -- compile doxygen --------------
+# this is needed for breath and to compile doxygen on read the docs
 import subprocess
 subprocess.call('cd ../doxygen ; doxygen', shell=True)
 
-breathe_projects = { "IOR":"../doxygen/xml/" }
-breathe_default_project = 'IOR'
+html_extra_path = ['../doxygen/build/']
+
+# -- Breathe -------------------------------------------------------------
+#
+# sys.path.append( "/usr/local/bin/breathe-apidoc" )
+
+# breathe_projects = { "IOR":"../doxygen/xml/" }
+# breathe_default_project = 'IOR'
 # breathe_default_members = ('members', 'private-members', 'undoc-members')
 # breathe_domain_by_extension = {"h" : "c", 'c': 'c',}
 # breathe_build_directory
@@ -45,7 +48,8 @@ breathe_default_project = 'IOR'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['sphinx.ext.imgmath', 'sphinx.ext.todo', 'breathe' ]
+# extensions = ['sphinx.ext.imgmath', 'sphinx.ext.todo', 'breathe' ]
+extensions = ['sphinx.ext.imgmath', 'sphinx.ext.todo']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -69,7 +73,7 @@ author = u'IOR'
 # built documents.
 #
 # The short X.Y version.
-version = u'3.0.1'
+version = u'3.1.0'
 # The full version, including alpha/beta/rc tags.
 release = u'0'
 
diff --git a/doc/sphinx/devDoc/CI.rst b/doc/sphinx/devDoc/CI.rst
new file mode 100644
index 0000000..39cbf9e
--- /dev/null
+++ b/doc/sphinx/devDoc/CI.rst
@@ -0,0 +1,10 @@
+Continues Integration
+=====================
+
+Continues Integration is used for basic sanity checking. Travis-CI provides free
+CI for open source github projects and is configured via a .travis.yml.
+
+For now this is set up to compile IOR on a ubuntu 14.04 machine with gcc 4.8,
+openmpi and hdf5 for the backends. This is a pretty basic check and should be
+advance over time. Nevertheless this should detect major errors early as they
+are shown in pull requests.
diff --git a/doc/sphinx/devDoc/doxygen.rst b/doc/sphinx/devDoc/doxygen.rst
index 0a46b61..53bfb28 100644
--- a/doc/sphinx/devDoc/doxygen.rst
+++ b/doc/sphinx/devDoc/doxygen.rst
@@ -1,8 +1,8 @@
 Doxygen
 =======
 
-Click `here <../../../../doxygen/html/index.html>`_ for doxygen.
+Click `here <../doxygen_html/index.html>`_ for doxygen.
 
 This documentation utilities doxygen for parsing the c code. Therefore a doxygen
-instances is created in the background. This might be helpfull as doxygen
+instances is created in the background anyway. This might be helpful as doxygen
 produces nice call graphs.
diff --git a/doc/sphinx/index.rst b/doc/sphinx/index.rst
index b0a1a11..de693ac 100644
--- a/doc/sphinx/index.rst
+++ b/doc/sphinx/index.rst
@@ -11,20 +11,25 @@
     :caption: User Documentation
 
     userDoc/install
-    userDoc/tutorial
-    userDoc/userguid
+    First Steps <userDoc/tutorial>
+    userDoc/options
+    userDoc/skripts
+    userDoc/compatibility
+    FAQ <userDoc/faq>
+
 
 .. toctree::
     :hidden:
     :caption: Developer Documentation
 
     devDoc/doxygen
+    devDoc/CI
 
 .. toctree::
     :hidden:
     :caption: Miscellaneous
 
-    Git Repository <github.com/IOR-LANL/ior#http://>
+    Git Repository <https://github.com/hpc/ior>
     changes
 
 ..  Indices and tables
diff --git a/doc/sphinx/intro.rst b/doc/sphinx/intro.rst
index 1e2e875..3964bbc 100644
--- a/doc/sphinx/intro.rst
+++ b/doc/sphinx/intro.rst
@@ -1,14 +1,26 @@
 Introduction
 ============
 
+Welcome to the IOR documentation.
+
+**I**\ nterleaved **o**\ r **R**\ andom is a parallel IO benchmark.
 IOR can be used for testing performance of parallel file systems using various
-interfaces and access patterns.  IOR uses MPI for process synchronization.
-IOR version 2 is a complete rewrite of the original IOR (Interleaved-Or-Random)
-version 1 code.
+interfaces and access patterns. IOR uses MPI for process synchronization.
+This documentation provides information for versions 3 and higher, for other
+versions check :ref:`compatibility`
 
+This documentation consists of tow parts.
 
-RUNNING IOR
---------------
+The first part is a user documentation were you find instructions on compilation, a
+beginners tutorial (:ref:`first-steps`) as well as information about all
+available :ref:`options`.
 
-GENERAL:
-^^^^^^^^^^^^^^
+The second part is the developer documentation. It currently only consists of a
+auto generated Doxygen and some notes about the contiguous integration with travis.
+As there are quite some people how needs to modify or extend IOR to there needs
+it would be great to have documentation on what and how to alter IOR without
+breaking other stuff. Currently there is neither a documentation on the overall
+concept of the code nor on implementation details. If you are getting your
+hands dirty in code anyways or have deeper understanding of IOR, you are more
+then welcome to comment the code directly, which will result in better Doxygen
+output or add your insight to this sphinx documentation.
diff --git a/doc/sphinx/userDoc/compatibility.rst b/doc/sphinx/userDoc/compatibility.rst
new file mode 100644
index 0000000..6b87d68
--- /dev/null
+++ b/doc/sphinx/userDoc/compatibility.rst
@@ -0,0 +1,27 @@
+.. _compatibility:
+
+Compatibility
+=============
+
+IOR has a long history. Here are some hints about compatibility with older
+versions.
+
+1)  IOR version 1 (c. 1996-2002) and IOR version 2 (c. 2003-present) are
+    incompatible.  Input decks from one will not work on the other.  As version
+    1 is not included in this release, this shouldn't be case for concern.  All
+    subsequent compatibility issues are for IOR version 2.
+
+2)  IOR versions prior to release 2.8 provided data size and rates in powers
+    of two.  E.g., 1 MB/sec referred to 1,048,576 bytes per second.  With the
+    IOR release 2.8 and later versions, MB is now defined as 1,000,000 bytes
+    and MiB is 1,048,576 bytes.
+
+3)  In IOR versions 2.5.3 to 2.8.7, IOR could be run without any command line
+    options.  This assumed that if both write and read options (-w -r) were
+    omitted, the run with them both set as default.  Later, it became clear
+    that in certain cases (data checking, e.g.) this caused difficulties.  In
+    IOR versions 2.8.8 and later, if not one of the -w -r -W or -R options is
+    set, then -w and -r are set implicitly.
+
+4)  IOR version 3 (Jan 2012-present) has changed the output of IOR somewhat,
+    and the "testNum" option was renamed "refNum".
diff --git a/doc/sphinx/userDoc/faq.rst b/doc/sphinx/userDoc/faq.rst
new file mode 100644
index 0000000..0e9a8a9
--- /dev/null
+++ b/doc/sphinx/userDoc/faq.rst
@@ -0,0 +1,175 @@
+Frequently Asked Questions
+==========================
+
+HOW DO I PERFORM MULTIPLE DATA CHECKS ON AN EXISTING FILE?
+
+  Use this command line:  IOR -k -E -W -i 5 -o file
+
+  -k keeps the file after the access rather than deleting it
+  -E uses the existing file rather than truncating it first
+  -W performs the writecheck
+  -i number of iterations of checking
+  -o filename
+
+  On versions of IOR prior to 2.8.8, you need the -r flag also, otherwise
+  you'll first overwrite the existing file.  (In earlier versions, omitting -w
+  and -r implied using both.  This semantic has been subsequently altered to be
+  omitting -w, -r, -W, and -R implied using both -w and -r.)
+
+  If you're running new tests to create a file and want repeat data checking on
+  this file multiple times, there is an undocumented option for this.  It's -O
+  multiReRead=1, and you'd need to have an IOR version compiled with the
+  USE_UNDOC_OPT=1 (in iordef.h).  The command line would look like this:
+
+  IOR -k -E -w -W -i 5 -o file -O multiReRead=1
+
+  For the first iteration, the file would be written (w/o data checking).  Then
+  for any additional iterations (four, in this example) the file would be
+  reread for whatever data checking option is used.
+
+
+HOW DOES IOR CALCULATE PERFORMANCE?
+
+  IOR performs get a time stamp START, then has all participating tasks open a
+  shared or independent file, transfer data, close the file(s), and then get a
+  STOP time.  A stat() or MPI_File_get_size() is performed on the file(s) and
+  compared against the aggregate amount of data transferred.  If this value
+  does not match, a warning is issued and the amount of data transferred as
+  calculated from write(), e.g., return codes is used.  The calculated
+  bandwidth is the amount of data transferred divided by the elapsed
+  STOP-minus-START time.
+
+  IOR also gets time stamps to report the open, transfer, and close times.
+  Each of these times is based on the earliest start time for any task and the
+  latest stop time for any task.  Without using barriers between these
+  operations (-g), the sum of the open, transfer, and close times may not equal
+  the elapsed time from the first open to the last close.
+
+
+HOW DO I ACCESS MULTIPLE FILE SYSTEMS IN IOR?
+
+  It is possible when using the filePerProc option to have tasks round-robin
+  across multiple file names.  Rather than use a single file name '-o file',
+  additional names '-o file1@file2@file3' may be used.  In this case, a file
+  per process would have three different file names (which may be full path
+  names) to access.  The '@' delimiter is arbitrary, and may be set in the
+  FILENAME_DELIMITER definition in iordef.h.
+
+  Note that this option of multiple filenames only works with the filePerProc
+  -F option.  This will not work for shared files.
+
+
+HOW DO I BALANCE LOAD ACROSS MULTIPLE FILE SYSTEMS?
+
+  As for the balancing of files per file system where different file systems
+  offer different performance, additional instances of the same destination
+  path can generally achieve good balance.
+
+  For example, with FS1 getting 50% better performance than FS2, set the '-o'
+  flag such that there are additional instances of the FS1 directory.  In this
+  case, '-o FS1/file@FS1/file@FS1/file@FS2/file@FS2/file' should adjust for
+  the performance difference and balance accordingly.
+
+
+HOW DO I USE STONEWALLING?
+
+  To use stonewalling (-D), it's generally best to separate write testing from
+  read testing.  Start with writing a file with '-D 0' (stonewalling disabled)
+  to determine how long the file takes to be written.  If it takes 10 seconds
+  for the data transfer, run again with a shorter duration, '-D 7' e.g., to
+  stop before the file would be completed without stonewalling.  For reading,
+  it's best to create a full file (not an incompletely written file from a
+  stonewalling run) and then run with stonewalling set on this preexisting
+  file.  If a write and read test are performed in the same run with
+  stonewalling, it's likely that the read will encounter an error upon hitting
+  the EOF.  Separating the runs can correct for this.  E.g.,
+
+  IOR -w -k -o file -D 10  # write and keep file, stonewall after 10 seconds
+  IOR -r -E -o file -D 7   # read existing file, stonewall after 7 seconds
+
+  Also, when running multiple iterations of a read-only stonewall test, it may
+  be necessary to set the -D value high enough so that each iteration is not
+  reading from cache.  Otherwise, in some cases, the first iteration may show
+  100 MB/s, the next 200 MB/s, the third 300 MB/s.  Each of these tests is
+  actually reading the same amount from disk in the allotted time, but they
+  are also reading the cached data from the previous test each time to get the
+  increased performance.  Setting -D high enough so that the cache is
+  overfilled will prevent this.
+
+
+HOW DO I BYPASS CACHING WHEN READING BACK A FILE I'VE JUST WRITTEN?
+
+  One issue with testing file systems is handling cached data.  When a file is
+  written, that data may be stored locally on the node writing the file.  When
+  the same node attempts to read the data back from the file system either for
+  performance or data integrity checking, it may be reading from its own cache
+  rather from the file system.
+
+  The reorderTasksConstant '-C' option attempts to address this by having a
+  different node read back data than wrote it.  For example, node N writes the
+  data to file, node N+1 reads back the data for read performance, node N+2
+  reads back the data for write data checking, and node N+3 reads the data for
+  read data checking, comparing this with the reread data from node N+4.  The
+  objective is to make sure on file access that the data is not being read from
+  cached data.
+
+    Node 0: writes data
+    Node 1: reads data
+    Node 2: reads written data for write checking
+    Node 3: reads written data for read checking
+    Node 4: reads written data for read checking, comparing with Node 3
+
+  The algorithm for skipping from N to N+1, e.g., expects consecutive task
+  numbers on nodes (block assignment), not those assigned round robin (cyclic
+  assignment).  For example, a test running 6 tasks on 3 nodes would expect
+  tasks 0,1 on node 0; tasks 2,3 on node 1; and tasks 4,5 on node 2.  Were the
+  assignment for tasks-to-node in round robin fashion, there would be tasks 0,3
+  on node 0; tasks 1,4 on node 1; and tasks 2,5 on node 2.  In this case, there
+  would be no expectation that a task would not be reading from data cached on
+  a node.
+
+
+HOW DO I USE HINTS?
+
+  It is possible to pass hints to the I/O library or file system layers
+  following this form::
+    'setenv IOR_HINT__<layer>__<hint> <value>'
+
+  For example::
+    'setenv IOR_HINT__MPI__IBM_largeblock_io true'
+    'setenv IOR_HINT__GPFS__important_hint true'
+
+  or, in a file in the form::
+    'IOR_HINT__<layer>__<hint>=<value>'
+
+  Note that hints to MPI from the HDF5 or NCMPI layers are of the form::
+    'setenv IOR_HINT__MPI__<hint> <value>'
+
+
+HOW DO I EXPLICITY SET THE FILE DATA SIGNATURE?
+
+  The data signature for a transfer contains the MPI task number, transfer-
+  buffer offset, and also timestamp for the start of iteration.  As IOR works
+  with 8-byte long long ints, the even-numbered long longs written contain a
+  32-bit MPI task number and a 32-bit timestamp.  The odd-numbered long longs
+  contain a 64-bit transferbuffer offset (or file offset if the '-l'
+  storeFileOffset option is used).  To set the timestamp value, use '-G' or
+  setTimeStampSignature.
+
+
+HOW DO I EASILY CHECK OR CHANGE A BYTE IN AN OUTPUT DATA FILE?
+
+  There is a simple utility IOR/src/C/cbif/cbif.c that may be built.  This is a
+  stand-alone, serial application called cbif (Change Byte In File).  The
+  utility allows a file offset to be checked, returning the data at that
+  location in IOR's data check format.  It also allows a byte at that location
+  to be changed.
+
+
+HOW DO I CORRECT FOR CLOCK SKEW BETWEEN NODES IN A CLUSTER?
+
+  To correct for clock skew between nodes, IOR compares times between nodes,
+  then broadcasts the root node's timestamp so all nodes can adjust by the
+  difference.  To see an egregious outlier, use the '-j' option.  Be sure
+  to set this value high enough to only show a node outside a certain time
+  from the mean.
diff --git a/doc/sphinx/userDoc/install.rst b/doc/sphinx/userDoc/install.rst
index 53b1696..48948ac 100644
--- a/doc/sphinx/userDoc/install.rst
+++ b/doc/sphinx/userDoc/install.rst
@@ -1,4 +1,22 @@
 Install
 =======
 
-sdgsd
+Building
+--------
+
+0. If "configure" is missing from the top level directory, you
+   probably retrieved this code directly from the repository.
+   Run "./bootstrap".
+
+   If your versions of the autotools are not new enough to run
+   this script, download and official tarball in which the
+   configure script is already provided.
+
+1. Run "./configure"
+
+   See "./configure --help" for configuration options.
+
+2. Run "make"
+
+3. Optionally, run "make install".  The installation prefix
+   can be changed as an option to the "configure" script.
diff --git a/doc/sphinx/userDoc/userguid.rst b/doc/sphinx/userDoc/options.rst
similarity index 53%
rename from doc/sphinx/userDoc/userguid.rst
rename to doc/sphinx/userDoc/options.rst
index 88cb6c0..7626d0e 100644
--- a/doc/sphinx/userDoc/userguid.rst
+++ b/doc/sphinx/userDoc/options.rst
@@ -1,36 +1,21 @@
-IOR USER GUIDE
-===============
+.. _options:
+
+Options
+=======
+
+IOR provides many options, in fact there are now more than there are one letter
+flags in the alphabet.
+For this and to run IOR by a config script, there are some options which are
+only available via directives. When both script and command line options are in
+use, command line options set in front of -f are the defaults which may be
+overridden by the script.
+Directives can also be set from the command line via "-O" option. In combination
+with a script they behave like the normal command line options. But directives and
+normal parameters override each other, so the last one executed.
 
 
-1.  DESCRIPTION
----------------
-IOR can be used for testing performance of parallel file systems using various
-interfaces and access patterns.  IOR uses MPI for process synchronization.
-IOR version 2 is a complete rewrite of the original IOR (Interleaved-Or-Random)
-version 1 code.
-
-
-
-2. RUNNING IOR
---------------
-Two ways to run IOR:
-
-  * Command line with arguments -- executable followed by command line options.
-
-    E.g., to execute:  IOR -w -r -o filename
-    This performs a write and a read to the file 'filename'.
-
-  * Command line with scripts -- any arguments on the command line will
-    establish the default for the test run, but a script may be used in
-    conjunction with this for varying specific tests during an execution of the
-    code.
-
-    E.g., to execute:  IOR -W -f script
-    This defaults all tests in 'script' to use write data checking.
-
-
-3. OPTIONS
-----------
+Command line options
+--------------------
 These options are to be used on the command line. E.g., 'IOR -a POSIX -b 4K'.
   -a S  api --  API for I/O [POSIX|MPIIO|HDF5|HDFS|S3|S3_EMC|NCMPI]
   -A N  refNum -- user reference number to include in long summary
@@ -89,7 +74,7 @@ NOTES: * S is a string, N is an integer number.
          suffices are recognized.  I.e., '4k' or '4K' is accepted as 4096.
 
 
-4. OPTION DETAILS
+Directive Options
 ------------------
 For each of the general settings, note the default is shown in brackets.
 IMPORTANT NOTE: For all true/false options below [1]=true, [0]=false
@@ -173,9 +158,9 @@ GENERAL:
 
   * checkWrite           - read data back and check for errors against known
                            pattern; can be used independently of writeFile [0=FALSE]
-                           NOTES: * data checking is not timed and does not
+                           NOTES: - data checking is not timed and does not
                                     affect other performance timings
-                                  * all errors tallied and returned as program
+                                  - all errors tallied and returned as program
                                     exit code, unless quitOnError set
 
   * checkRead            - reread data and check for errors between reads; can
@@ -190,12 +175,12 @@ GENERAL:
   * useExistingTestFile  - do not remove test file before write access [0=FALSE]
 
   * segmentCount         - number of segments in file [1]
-                           NOTES: * a segment is a contiguous chunk of data
+                           NOTES: - a segment is a contiguous chunk of data
                                     accessed by multiple clients each writing/
                                     reading their own contiguous data;
                                     comprised of blocks accessed by multiple
                                     clients
-                                  * with HDF5 this repeats the pattern of an
+                                  - with HDF5 this repeats the pattern of an
                                     entire shared dataset
 
   * blockSize            - size (in bytes) of a contiguous chunk of data
@@ -238,7 +223,7 @@ GENERAL:
                                     to complete without interruption
 
   * deadlineForStonewalling - seconds before stopping write or read phase [0]
-                           NOTES: * used for measuring the amount of data moved
+                           NOTES: - used for measuring the amount of data moved
                                     in a fixed time.  After the barrier, each
                                     task starts its own timer, begins moving
                                     data, and the stops moving data at a pre-
@@ -248,11 +233,11 @@ GENERAL:
                                     data moved in a fixed amount of time.  The
                                     objective is to prevent tasks slow to
                                     complete from skewing the performance.
-                                  * setting this to zero (0) unsets this option
-                                  * this option is incompatible w/data checking
+                                  - setting this to zero (0) unsets this option
+                                  - this option is incompatible w/data checking
 
   * randomOffset         - access is to random, not sequential, offsets within a file [0=FALSE]
-                           NOTES: * this option is currently incompatible with:
+                           NOTES: - this option is currently incompatible with:
                                     -checkRead
                                     -storeFileOffset
                                     -MPIIO collective or useFileView
@@ -330,118 +315,28 @@ GPFS-SPECIFIC
 			   traffic when many proceses write/read to same file.
 
 
-5. VERBOSITY LEVELS
+
+Verbosity levels
 ---------------------
 The verbosity of output for IOR can be set with -v.  Increasing the number of
 -v instances on a command line sets the verbosity higher.
 
 Here is an overview of the information shown for different verbosity levels:
-  0 - default; only bare essentials shown
-  1 - max clock deviation, participating tasks, free space, access pattern,
-      commence/verify access notification w/time
-  2 - rank/hostname, machine name, timer used, individual repetition
-      performance results, timestamp used for data signature
-  3 - full test details, transfer block/offset compared, individual data
-      checking errors, environment variables, task writing/reading file name,
-      all test operation times
-  4 - task id and offset for each transfer
-  5 - each 8-byte data signature comparison (WARNING: more data to STDOUT
-      than stored in file, use carefully)
+
+0)  default; only bare essentials shown
+1)  max clock deviation, participating tasks, free space, access pattern,
+    commence/verify access notification w/time
+2)  rank/hostname, machine name, timer used, individual repetition
+    performance results, timestamp used for data signature
+3)  full test details, transfer block/offset compared, individual data
+    checking errors, environment variables, task writing/reading file name,
+    all test operation times
+4)  task id and offset for each transfer
+5)  each 8-byte data signature comparison (WARNING: more data to STDOUT
+    than stored in file, use carefully)
 
 
-6. USING SCRIPTS
------------------
-IOR can use a script with the command line.  Any options on the command line
-will be considered the default settings for running the script.  (I.e.,
-'IOR -W -f script' will have all tests in the script run with the -W option as
-default.)  The script itself can override these settings and may be set to run
-run many different tests of IOR under a single execution.
-The command line is: ::
-
-  IOR/bin/IOR -f script
-
-In IOR/scripts, there are scripts of testcases for simulating I/O behavior of
-various application codes.  Details are included in each script as necessary.
-
-An example of a script: ::
-
-  IOR START
-    api=[POSIX|MPIIO|HDF5|HDFS|S3|S3_EMC|NCMPI]
-    testFile=testFile
-    hintsFileName=hintsFile
-    repetitions=8
-    multiFile=0
-    interTestDelay=5
-    readFile=1
-    writeFile=1
-    filePerProc=0
-    checkWrite=0
-    checkRead=0
-    keepFile=1
-    quitOnError=0
-    segmentCount=1
-    blockSize=32k
-    outlierThreshold=0
-    setAlignment=1
-    transferSize=32
-    singleXferAttempt=0
-    individualDataSets=0
-    verbose=0
-    numTasks=32
-    collective=1
-    preallocate=0
-    useFileView=0
-    keepFileWithError=0
-    setTimeStampSignature=0
-    useSharedFilePointer=0
-    useStridedDatatype=0
-    uniqueDir=0
-    fsync=0
-    storeFileOffset=0
-    maxTimeDuration=60
-    deadlineForStonewalling=0
-    useExistingTestFile=0
-    useO_DIRECT=0
-    showHints=0
-    showHelp=0
-  RUN
-    # additional tests are optional
-    <snip>
-  RUN
-    <snip>
-  RUN
-  IOR STOP
-
-
-NOTES:
-  * Not all test parameters need be set.
-  * White space is ignored in script, as are comments starting with '#'.
-
-
-7. COMPATIBILITY WITH OLDER VERSIONS
--------------------------------------
-1)  IOR version 1 (c. 1996-2002) and IOR version 2 (c. 2003-present) are
-    incompatible.  Input decks from one will not work on the other.  As version
-    1 is not included in this release, this shouldn't be case for concern.  All
-    subsequent compatibility issues are for IOR version 2.
-
-2)  IOR versions prior to release 2.8 provided data size and rates in powers
-    of two.  E.g., 1 MB/sec referred to 1,048,576 bytes per second.  With the
-    IOR release 2.8 and later versions, MB is now defined as 1,000,000 bytes
-    and MiB is 1,048,576 bytes.
-
-3)  In IOR versions 2.5.3 to 2.8.7, IOR could be run without any command line
-    options.  This assumed that if both write and read options (-w -r) were
-    omitted, the run with them both set as default.  Later, it became clear
-    that in certain cases (data checking, e.g.) this caused difficulties.  In
-    IOR versions 2.8.8 and later, if not one of the -w -r -W or -R options is
-    set, then -w and -r are set implicitly.
-
-4)  IOR version 3 (Jan 2012-present) has changed the output of IOR somewhat,
-    and the "testNum" option was renamed "refNum".
-
-
-8. INCOMPRESSIBLE NOTES
+Incompressible notes
 -------------------------
 Please note that incompressibility is a factor of how large a block compression
 algorithm uses.  The incompressible buffer is filled only once before write times,
@@ -449,190 +344,13 @@ so if the compression algorithm takes in blocks larger than the transfer size,
 there will be compression.  Below are some baselines that I established for
 zip, gzip, and bzip.
 
-1) zip:  For zipped files, a transfer size of 1k is sufficient.
+1)  zip:  For zipped files, a transfer size of 1k is sufficient.
 
-2) gzip: For gzipped files, a transfer size of 1k is sufficient.
+2)  gzip: For gzipped files, a transfer size of 1k is sufficient.
 
-3) bzip2: For bziped files a transfer size of 1k is insufficient (~50% compressed).
-   To avoid compression a transfer size of greater than the bzip block size is required
-   (default = 900KB). I suggest a transfer size of greather than 1MB to avoid bzip2 compression.
+3)  bzip2: For bziped files a transfer size of 1k is insufficient (~50% compressed).
+    To avoid compression a transfer size of greater than the bzip block size is required
+    (default = 900KB). I suggest a transfer size of greather than 1MB to avoid bzip2 compression.
 
 Be aware of the block size your compression algorithm will look at, and adjust the transfer size
 accordingly.
-
-
-9. FREQUENTLY ASKED QUESTIONS
-------------------------------
-HOW DO I PERFORM MULTIPLE DATA CHECKS ON AN EXISTING FILE?
-
-  Use this command line:  IOR -k -E -W -i 5 -o file
-
-  -k keeps the file after the access rather than deleting it
-  -E uses the existing file rather than truncating it first
-  -W performs the writecheck
-  -i number of iterations of checking
-  -o filename
-
-  On versions of IOR prior to 2.8.8, you need the -r flag also, otherwise
-  you'll first overwrite the existing file.  (In earlier versions, omitting -w
-  and -r implied using both.  This semantic has been subsequently altered to be
-  omitting -w, -r, -W, and -R implied using both -w and -r.)
-
-  If you're running new tests to create a file and want repeat data checking on
-  this file multiple times, there is an undocumented option for this.  It's -O
-  multiReRead=1, and you'd need to have an IOR version compiled with the
-  USE_UNDOC_OPT=1 (in iordef.h).  The command line would look like this:
-
-  IOR -k -E -w -W -i 5 -o file -O multiReRead=1
-
-  For the first iteration, the file would be written (w/o data checking).  Then
-  for any additional iterations (four, in this example) the file would be
-  reread for whatever data checking option is used.
-
-
-HOW DOES IOR CALCULATE PERFORMANCE?
-
-  IOR performs get a time stamp START, then has all participating tasks open a
-  shared or independent file, transfer data, close the file(s), and then get a
-  STOP time.  A stat() or MPI_File_get_size() is performed on the file(s) and
-  compared against the aggregate amount of data transferred.  If this value
-  does not match, a warning is issued and the amount of data transferred as
-  calculated from write(), e.g., return codes is used.  The calculated
-  bandwidth is the amount of data transferred divided by the elapsed
-  STOP-minus-START time.
-
-  IOR also gets time stamps to report the open, transfer, and close times.
-  Each of these times is based on the earliest start time for any task and the
-  latest stop time for any task.  Without using barriers between these
-  operations (-g), the sum of the open, transfer, and close times may not equal
-  the elapsed time from the first open to the last close.
-
-
-HOW DO I ACCESS MULTIPLE FILE SYSTEMS IN IOR?
-
-  It is possible when using the filePerProc option to have tasks round-robin
-  across multiple file names.  Rather than use a single file name '-o file',
-  additional names '-o file1@file2@file3' may be used.  In this case, a file
-  per process would have three different file names (which may be full path
-  names) to access.  The '@' delimiter is arbitrary, and may be set in the
-  FILENAME_DELIMITER definition in iordef.h.
-
-  Note that this option of multiple filenames only works with the filePerProc
-  -F option.  This will not work for shared files.
-
-
-HOW DO I BALANCE LOAD ACROSS MULTIPLE FILE SYSTEMS?
-
-  As for the balancing of files per file system where different file systems
-  offer different performance, additional instances of the same destination
-  path can generally achieve good balance.
-
-  For example, with FS1 getting 50% better performance than FS2, set the '-o'
-  flag such that there are additional instances of the FS1 directory.  In this
-  case, '-o FS1/file@FS1/file@FS1/file@FS2/file@FS2/file' should adjust for
-  the performance difference and balance accordingly.
-
-
-HOW DO I USE STONEWALLING?
-
-  To use stonewalling (-D), it's generally best to separate write testing from
-  read testing.  Start with writing a file with '-D 0' (stonewalling disabled)
-  to determine how long the file takes to be written.  If it takes 10 seconds
-  for the data transfer, run again with a shorter duration, '-D 7' e.g., to
-  stop before the file would be completed without stonewalling.  For reading,
-  it's best to create a full file (not an incompletely written file from a
-  stonewalling run) and then run with stonewalling set on this preexisting
-  file.  If a write and read test are performed in the same run with
-  stonewalling, it's likely that the read will encounter an error upon hitting
-  the EOF.  Separating the runs can correct for this.  E.g.,
-
-  IOR -w -k -o file -D 10  # write and keep file, stonewall after 10 seconds
-  IOR -r -E -o file -D 7   # read existing file, stonewall after 7 seconds
-
-  Also, when running multiple iterations of a read-only stonewall test, it may
-  be necessary to set the -D value high enough so that each iteration is not
-  reading from cache.  Otherwise, in some cases, the first iteration may show
-  100 MB/s, the next 200 MB/s, the third 300 MB/s.  Each of these tests is
-  actually reading the same amount from disk in the allotted time, but they
-  are also reading the cached data from the previous test each time to get the
-  increased performance.  Setting -D high enough so that the cache is
-  overfilled will prevent this.
-
-
-HOW DO I BYPASS CACHING WHEN READING BACK A FILE I'VE JUST WRITTEN?
-
-  One issue with testing file systems is handling cached data.  When a file is
-  written, that data may be stored locally on the node writing the file.  When
-  the same node attempts to read the data back from the file system either for
-  performance or data integrity checking, it may be reading from its own cache
-  rather from the file system.
-
-  The reorderTasksConstant '-C' option attempts to address this by having a
-  different node read back data than wrote it.  For example, node N writes the
-  data to file, node N+1 reads back the data for read performance, node N+2
-  reads back the data for write data checking, and node N+3 reads the data for
-  read data checking, comparing this with the reread data from node N+4.  The
-  objective is to make sure on file access that the data is not being read from
-  cached data.
-
-    Node 0: writes data
-    Node 1: reads data
-    Node 2: reads written data for write checking
-    Node 3: reads written data for read checking
-    Node 4: reads written data for read checking, comparing with Node 3
-
-  The algorithm for skipping from N to N+1, e.g., expects consecutive task
-  numbers on nodes (block assignment), not those assigned round robin (cyclic
-  assignment).  For example, a test running 6 tasks on 3 nodes would expect
-  tasks 0,1 on node 0; tasks 2,3 on node 1; and tasks 4,5 on node 2.  Were the
-  assignment for tasks-to-node in round robin fashion, there would be tasks 0,3
-  on node 0; tasks 1,4 on node 1; and tasks 2,5 on node 2.  In this case, there
-  would be no expectation that a task would not be reading from data cached on
-  a node.
-
-
-HOW DO I USE HINTS?
-
-  It is possible to pass hints to the I/O library or file system layers
-  following this form:
-    'setenv IOR_HINT__<layer>__<hint> <value>'
-  For example:
-    'setenv IOR_HINT__MPI__IBM_largeblock_io true'
-    'setenv IOR_HINT__GPFS__important_hint true'
-  or, in a file in the form:
-    'IOR_HINT__<layer>__<hint>=<value>'
-  Note that hints to MPI from the HDF5 or NCMPI layers are of the form:
-    'setenv IOR_HINT__MPI__<hint> <value>'
-
-
-HOW DO I EXPLICITY SET THE FILE DATA SIGNATURE?
-
-  The data signature for a transfer contains the MPI task number, transfer-
-  buffer offset, and also timestamp for the start of iteration.  As IOR works
-  with 8-byte long long ints, the even-numbered long longs written contain a
-  32-bit MPI task number and a 32-bit timestamp.  The odd-numbered long longs
-  contain a 64-bit transferbuffer offset (or file offset if the '-l'
-  storeFileOffset option is used).  To set the timestamp value, use '-G' or
-  setTimeStampSignature.
-
-
-HOW DO I EASILY CHECK OR CHANGE A BYTE IN AN OUTPUT DATA FILE?
-
-  There is a simple utility IOR/src/C/cbif/cbif.c that may be built.  This is a
-  stand-alone, serial application called cbif (Change Byte In File).  The
-  utility allows a file offset to be checked, returning the data at that
-  location in IOR's data check format.  It also allows a byte at that location
-  to be changed.
-
-
-HOW DO I CORRECT FOR CLOCK SKEW BETWEEN NODES IN A CLUSTER?
-
-  To correct for clock skew between nodes, IOR compares times between nodes,
-  then broadcasts the root node's timestamp so all nodes can adjust by the
-  difference.  To see an egregious outlier, use the '-j' option.  Be sure
-  to set this value high enough to only show a node outside a certain time
-  from the mean.
-
-
-Copyright (c) 2003, The Regents of the University of California
-See the file COPYRIGHT for a complete copyright notice and license.
diff --git a/doc/sphinx/userDoc/skripts.rst b/doc/sphinx/userDoc/skripts.rst
new file mode 100644
index 0000000..bfb07b2
--- /dev/null
+++ b/doc/sphinx/userDoc/skripts.rst
@@ -0,0 +1,72 @@
+Scripting
+=========
+
+IOR can use a script with the command line.  Any options on the command line set
+before the script will be considered the default settings for running the script.
+(I.e.,'$ ./IOR -W -f script' will have all tests in the script run with the -W
+option as default.)
+The script itself can override these settings and may be set to run
+run many different tests of IOR under a single execution.
+The command line is: ::
+
+  ./IOR -f script
+
+In IOR/scripts, there are scripts of test cases for simulating I/O behavior of
+various application codes.  Details are included in each script as necessary.
+
+Syntax:
+    * IOR START / IOR END: marks the beginning and end of the script
+    * RUN: Delimiter for next Test
+    * All previous set parameter stay set for the next test. They are not reset
+      to the default! For default the musst be rest manually.
+    * White space is ignored in script, as are comments starting with '#'.
+    * Not all test parameters need be set.
+
+An example of a script: ::
+
+  IOR START
+    api=[POSIX|MPIIO|HDF5|HDFS|S3|S3_EMC|NCMPI]
+    testFile=testFile
+    hintsFileName=hintsFile
+    repetitions=8
+    multiFile=0
+    interTestDelay=5
+    readFile=1
+    writeFile=1
+    filePerProc=0
+    checkWrite=0
+    checkRead=0
+    keepFile=1
+    quitOnError=0
+    segmentCount=1
+    blockSize=32k
+    outlierThreshold=0
+    setAlignment=1
+    transferSize=32
+    singleXferAttempt=0
+    individualDataSets=0
+    verbose=0
+    numTasks=32
+    collective=1
+    preallocate=0
+    useFileView=0
+    keepFileWithError=0
+    setTimeStampSignature=0
+    useSharedFilePointer=0
+    useStridedDatatype=0
+    uniqueDir=0
+    fsync=0
+    storeFileOffset=0
+    maxTimeDuration=60
+    deadlineForStonewalling=0
+    useExistingTestFile=0
+    useO_DIRECT=0
+    showHints=0
+    showHelp=0
+  RUN
+    # additional tests are optional
+    <snip>
+  RUN
+    <snip>
+  RUN
+  IOR STOP
diff --git a/doc/sphinx/userDoc/tutorial-cache-vs-nocache.png b/doc/sphinx/userDoc/tutorial-cache-vs-nocache.png
new file mode 100644
index 0000000..a743350
Binary files /dev/null and b/doc/sphinx/userDoc/tutorial-cache-vs-nocache.png differ
diff --git a/doc/sphinx/userDoc/tutorial-ior-io-pattern.png b/doc/sphinx/userDoc/tutorial-ior-io-pattern.png
new file mode 100644
index 0000000..02c52a6
Binary files /dev/null and b/doc/sphinx/userDoc/tutorial-ior-io-pattern.png differ
diff --git a/doc/sphinx/userDoc/tutorial-ior-memPerNode-test.png b/doc/sphinx/userDoc/tutorial-ior-memPerNode-test.png
new file mode 100644
index 0000000..c5dd127
Binary files /dev/null and b/doc/sphinx/userDoc/tutorial-ior-memPerNode-test.png differ
diff --git a/doc/sphinx/userDoc/tutorial-ior-overflowing-cache.png b/doc/sphinx/userDoc/tutorial-ior-overflowing-cache.png
new file mode 100644
index 0000000..d4e54fd
Binary files /dev/null and b/doc/sphinx/userDoc/tutorial-ior-overflowing-cache.png differ
diff --git a/doc/sphinx/userDoc/tutorial-ior-reorderTasks.png b/doc/sphinx/userDoc/tutorial-ior-reorderTasks.png
new file mode 100644
index 0000000..c325985
Binary files /dev/null and b/doc/sphinx/userDoc/tutorial-ior-reorderTasks.png differ
diff --git a/doc/sphinx/userDoc/tutorial.rst b/doc/sphinx/userDoc/tutorial.rst
index 488498b..9556ceb 100644
--- a/doc/sphinx/userDoc/tutorial.rst
+++ b/doc/sphinx/userDoc/tutorial.rst
@@ -1,12 +1,274 @@
+.. _first-steps:
+
 First Steps with IOR
 ====================
 
-test
+This is a short tutorial for the basic usage of IOR and some tips on how to use
+IOR to handel caching effects as these are very likely to affect your
+measurements.
 
-.. doxygenvariable:: buffer
-   :project: IOR
+Running IOR
+-----------
+There are two ways of running IOR:
 
-.. doxygenfunction:: main()
-  :project: IOR
+    1) Command line with arguments -- executable followed by command line
+        options.
 
-.. doxygenindex::
+        ::
+            $ ./IOR -w -r -o filename
+
+        This performs a write and a read to the file 'filename'.
+
+    2) Command line with scripts -- any arguments on the command line will
+        establish the default for the test run, but a script may be used in
+        conjunction with this for varying specific tests during an execution of
+        the code. Only arguments before the script will be used!
+
+        ::
+            $ ./IOR -W -f script
+
+        This defaults all tests in 'script' to use write data checking.
+
+
+In this tutorial the first one is used as it is much easier to toy around with
+an get to know IOR. The second option thought is much more useful to safe
+benchmark setups to rerun later or to test many different cases.
+
+
+Getting Started with IOR
+------------------------
+
+IOR writes data sequentially with the following parameters:
+
+   * blockSize (-b)
+   * transferSize (-t)
+   * segmentCount (-s)
+   * numTasks (-n)
+
+which are best illustrated with a diagram:
+
+.. image:: tutorial-ior-io-pattern.png
+
+
+These four parameters are all you need to get started with IOR.  However,
+naively running IOR usually gives disappointing results.  For example, if we run
+a four-node IOR test that writes a total of 16 GiB::
+
+    $ mpirun -n 64 ./ior -t 1m -b 16m -s 16
+    ...
+    access bw(MiB/s) block(KiB) xfer(KiB) open(s)  wr/rd(s) close(s) total(s) iter
+    ------ --------- ---------- --------- -------- -------- -------- -------- ----
+    write  427.36    16384      1024.00   0.107961 38.34    32.48    38.34    2
+    read   239.08    16384      1024.00   0.005789 68.53    65.53    68.53    2
+    remove -         -          -         -        -        -        0.534400 2
+
+
+we can only get a couple hundred megabytes per second out of a Lustre file
+system that should be capable of a lot more.
+
+Switching from writing to a single-shared file to one file per process using the
+-F (filePerProcess=1) option changes the performance dramatically::
+
+    $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F
+    ...
+    access bw(MiB/s) block(KiB) xfer(KiB) open(s)  wr/rd(s) close(s) total(s) iter
+    ------ --------- ---------- --------- -------- -------- -------- -------- ----
+    write  33645     16384      1024.00   0.007693 0.486249 0.195494 0.486972 1
+    read   149473    16384      1024.00   0.004936 0.108627 0.016479 0.109612 1
+    remove -         -          -         -        -        -        6.08     1
+
+
+This is in large part because letting each MPI process work on its own file cuts
+out any contention that would arise because of file locking.
+
+However, the performance difference between our naive test and the
+file-per-process test is a bit extreme.  In fact, the only way that 146 GB/sec
+read rate could be achievable on Lustre is if each of the four compute nodes had
+over 45 GB/sec of network bandwidth to Lustre--that is, a 400 Gbit link on every
+compute and storage node.
+
+
+Effect of Page Cache on Benchmarking
+------------------------------------
+What's really happening is that the data being read by IOR isn't actually coming
+from Lustre; rather, files' contents are already cached, and IOR is able to
+read them directly out of each compute node's DRAM.  The data wound up getting
+cached during the write phase of IOR as a result of Linux (and Lustre) using a
+write-back cache to buffer I/O, so that instead of IOR writing and reading data
+directly to Lustre, it's actually mostly talking to the memory on each compute
+node.
+
+To be more specific, although each IOR process thinks it is writing to a file on
+Lustre and then reading back the contents of that file from Lustre, it is
+actually
+
+    1)  writing data to a copy of the file that is cached in memory.  If there
+        is no copy of the file cached in memory before this write, the parts
+        being modified are loaded into memory first.
+    2)  those parts of the file in memory (called "pages") that are now
+        different from what's on Lustre are marked as being "dirty"
+    3)  the write() call completes and IOR continues on, even though the written
+        data still hasn't been committed to Lustre
+    4)  independent of IOR, the OS kernel continually scans the file cache for
+        files who have been updated in memory but not on Lustre ("dirt pages"),
+        and then commits the cached modifications to Lustre
+    5)  dirty pages are declared non-dirty since they are now in sync with
+        what's on disk, but they remain in memory
+
+Then when the read phase of IOR follows the write phase, IOR is able to just
+retrieve the file's contents from memory instead of having to communicate with
+Lustre over the network.
+
+There are a couple of ways to measure the read performance of the underlying
+Lustre file system. The most crude way is to simply write more data than will
+fit into the total page cache so that by the time the write phase has completed,
+the beginning of the file has already been evicted from cache. For example,
+increasing the number of segments (-s) to write more data reveals the point at
+which the nodes' page cache on my test system runs over very clearly:
+
+.. image:: tutorial-ior-overflowing-cache.png
+
+
+However, this can make running IOR on systems with a lot of on-node memory take
+forever.
+
+A better option would be to get the MPI processes on each node to only read data
+that they didn't write.  For example, on a four-process-per-node test, shifting
+the mapping of MPI processes to blocks by four makes each node N read the data
+written by node N-1.
+
+.. image:: tutorial-ior-reorderTasks.png
+
+Since page cache is not shared between compute nodes, shifting tasks this way
+ensures that each MPI process is reading data it did not write.
+
+IOR provides the -C option (reorderTasks) to do this, and it forces each MPI
+process to read the data written by its neighboring node. Running IOR with
+this option gives much more credible read performance::
+
+    $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C
+    ...
+    access bw(MiB/s) block(KiB) xfer(KiB) open(s)  wr/rd(s) close(s) total(s) iter
+    ------ --------- ---------- --------- -------- -------- -------- -------- ----
+    write  41326     16384      1024.00   0.005756 0.395859 0.095360 0.396453 0
+    read   3310.00   16384      1024.00   0.011786 4.95     4.20     4.95     1
+    remove -         -          -         -        -        -        0.237291 1
+
+
+But now it should seem obvious that the write performance is also ridiculously
+high. And again, this is due to the page cache, which signals to IOR that writes
+are complete when they have been committed to memory rather than the underlying
+Lustre file system.
+
+To work around the effects of the page cache on write performance, we can issue
+an fsync() call immediately after all of the write()s return to force the dirty
+pages we just wrote to flush out to Lustre. Including the time it takes for
+fsync() to finish gives us a measure of how long it takes for our data to write
+to the page cache and for the page cache to write back to Lustre.
+
+IOR provides another convenient option, -e (fsync), to do just this. And, once
+again, using this option changes our performance measurement quite a bit::
+
+    $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C -e
+    ...
+    access bw(MiB/s) block(KiB) xfer(KiB) open(s)  wr/rd(s) close(s) total(s) iter
+    ------ --------- ---------- --------- -------- -------- -------- -------- ----
+    write  2937.89   16384      1024.00   0.011841 5.56     4.93     5.58     0
+    read   2712.55   16384      1024.00   0.005214 6.04     5.08     6.04     3
+    remove -         -          -         -        -        -        0.037706 0
+
+
+and we finally have a believable bandwidth measurement for our file system.
+
+Defeating Page Cache
+Since IOR is specifically designed to benchmark I/O, it provides these options
+that make it as easy as possible to ensure that you are actually measuring the
+performance of your file system and not your compute nodes' memory.  That being
+said, the I/O patterns it generates are designed to demonstrate peak performance,
+not reflect what a real application might be trying to do, and as a result,
+there are plenty of cases where measuring I/O performance with IOR is not always
+the best choice.  There are several ways in which we can get clever and defeat
+page cache in a more general sense to get meaningful performance numbers.
+
+When measuring write performance, bypassing page cache is actually quite simple;
+opening a file with the O_DIRECT flag going directly to disk.  In addition,
+the fsync() call can be inserted into applications, as is done with IOR's -e
+option.
+
+Measuring read performance is a lot trickier.  If you are fortunate enough to
+have root access on a test system, you can force the Linux kernel to empty out
+its page cache by doing
+
+::
+    # echo 1 > /proc/sys/vm/drop_caches
+
+and in fact, this is often good practice before running any benchmark
+(e.g., Linpack) because it ensures that you aren't losing performance to the
+kernel trying to evict pages as your benchmark application starts allocating
+memory for its own use.
+
+Unfortunately, many of us do not have root on our systems, so we have to get
+even more clever.  As it turns out, there is a way to pass a hint to the kernel
+that a file is no longer needed in page cache::
+
+    #define _XOPEN_SOURCE 600
+    #include <unistd.h>
+    #include <fcntl.h>
+    int main(int argc, char *argv[]) {
+        int fd;
+        fd = open(argv[1], O_RDONLY);
+        fdatasync(fd);
+        posix_fadvise(fd, 0,0,POSIX_FADV_DONTNEED);
+        close(fd);
+        return 0;
+    }
+
+The effect of passing POSIX_FADV_DONTNEED using posix_fadvise() is usually that
+all pages belonging to that file are evicted from page cache in Linux.  However,
+this is just a hint--not a guarantee--and the kernel evicts these pages
+asynchronously, so it may take a second or two for pages to actually leave page
+cache.  Fortunately, Linux also provides a way to probe pages in a file to see
+if they are resident in memory.
+
+Finally, it's often easiest to just limit the amount of memory available for
+page cache.  Because application memory always takes precedence over cache
+memory, simply allocating most of the memory on a node will force most of the
+cached pages to be evicted.  Newer versions of IOR provide the memoryPerNode
+option that do just that, and the effects are what one would expect:
+
+.. image:: tutorial-ior-memPerNode-test.png
+
+The above diagram shows the measured bandwidth from a single node with 128 GiB
+of total DRAM.  The first percent on each x-label is the amount of this 128 GiB
+that was reserved by the benchmark as application memory, and the second percent
+is the total write volume.  For example, the "50%/150%" data points correspond
+to 50% of the node memory (64 GiB) being allocated for the application, and a
+total of 192 GiB of data being read.
+
+This benchmark was run on a single spinning disk which is not capable of more
+than 130 MB/sec, so the conditions that showed performance higher than this were
+benefiting from some pages being served from cache.  And this makes perfect
+sense given that the anomalously high performance measurements were obtained
+when there was plenty of memory to cache relative to the amount of data being
+read.
+
+Corollary
+---------
+Measuring I/O performance is a bit trickier than CPU performance in large part
+due to the effects of page caching.  That being said, page cache exists for a
+reason, and there are many cases where an application's I/O performance really
+is best represented by a benchmark that heavily utilizes cache.
+
+For example, the BLAST bioinformatics application re-reads all of its input data
+twice; the first time initializes data structures, and the second time fills
+them up.  Because the first read caches each page and allows the second read to
+come out of cache rather than the file system, running this I/O pattern with
+page cache disabled causes it to be about 2x slower:
+
+.. image:: tutorial-cache-vs-nocache.png
+
+
+Thus, letting the page cache do its thing is often the most realistic way to
+benchmark with realistic application I/O patterns.  Once you know how page cache
+might be affecting your measurements, you stand a good chance of being able to
+reason about what the most meaningful performance metrics are.