Improving the IO - works now with rank of arrays

mikaem · mikaem · commit fb22b8779e35 · 2019-03-13T12:56:22.000+01:00
diff --git a/docs/source/howtocite.rst b/docs/source/howtocite.rst
@@ -10,7 +10,7 @@ Please cite mpi4py-fft using
         year = {{2019}},
         title = {{Fast parallel multidimensional FFT using advanced MPI}},
         journal = {{Journal of Parallel and Distributed Computing}},
-        volume = {{in press}}
+        doi = {10.1016/j.jpdc.2019.02.006}
     }
     @electronic{mpi4py-fft,
         author = {{Lisandro Dalcin and Mikael Mortensen}},
diff --git a/docs/source/io.rst b/docs/source/io.rst
@@ -24,26 +24,31 @@ reads data in parallel. A simple example of usage is::
     u[:] = np.random.random(u.shape)
     # Store by first creating output files
     fields = {'u': [u], 'v': [v]}
-    f0 = HDF5File('h5test.h5', global_shape=N, mode='w')
-    f1 = NCFile('nctest.nc', global_shape=N, mode='w')
+    f0 = HDF5File('h5test.h5', mode='w')
+    f1 = NCFile('nctest.nc', mode='w')
     f0.write(0, fields)
     f1.write(0, fields)
     v[:] = 3
     f0.write(1, fields)
     f1.write(1, fields)
-    # Alternatively, just use write method of each distributed array
-    u.write('h5test.h5', 'u', step=2)
-    v.write('h5test.h5', 'v', step=2)
-    u.write('nctest.nc', 'u', step=2)
-    v.write('nctest.nc', 'v', step=2)
 
 Note that we are here creating two datafiles ``h5test.h5`` and ``nctest.nc``,
 for storing in HDF5 or NetCDF4 formats respectively. Normally, one would be
 satisfied using only one format, so this is only for illustration. We store
 the fields ``u`` and ``v`` on three different occasions,
 so the datafiles will contain three snapshots of each field ``u`` and ``v``.
 
-The stored dataarrays can be retrieved later on::
+Also note that an alternative and perhaps simpler approach is to just use
+the ``write`` method of each distributed array::
+
+    u.write('h5test.h5', 'u', step=2)
+    v.write('h5test.h5', 'v', step=2)
+    u.write('nctest.nc', 'u', step=2)
+    v.write('nctest.nc', 'v', step=2)
+
+The two different approaches can be used on the same output files.
+
+The stored dataarrays can also be retrieved later on::
 
     u0 = newDistArray(T, forward_output=False)
     u1 = newDistArray(T, forward_output=False)
@@ -53,26 +58,33 @@ The stored dataarrays can be retrieved later on::
     #u0.read('nctest.nc', 'u', 0)
     #u1.read('nctest.nc', 'u', 1)
 
-
 Note that one does not have to use the same number of processors when
 retrieving the data as when they were stored.
 
 It is also possible to store only parts of the, potentially large, arrays.
-Any chosen slice may be stored, using a *global* view of the arrays::
+Any chosen slice may be stored, using a *global* view of the arrays. It is
+possible to store both complete fields and slices in one single call by
+using the following approach::
 
-    f2 = HDF5File('variousfields.h5', global_shape=N, mode='w')
+    f2 = HDF5File('variousfields.h5', mode='w')
     fields = {'u': [u,
                     (u, [slice(None), slice(None), 4]),
                     (u, [5, 5, slice(None)])],
               'v': [v,
                     (v, [slice(None), 6, slice(None)])]}
     f2.write(0, fields)
     f2.write(1, fields)
-    f2.write(2, fields)
-    # or, using write method of field, e.g.
-    #u.write('variousfields.h5', 'u', 0, [slice(None), slice(None), 4])
 
-This will lead to an hdf5-file with groups::
+Alternatively, one can use the write method of each field with the ``global_slice``
+keyword argument::
+
+    u.write('variousfields.h5', 'u', 2)
+    u.write('variousfields.h5', 'u', 2, global_slice=[slice(None), slice(None), 4])
+    u.write('variousfields.h5', 'u', 2, global_slice=[5, 5, slice(None)])
+    v.write('variousfields.h5', 'v', 2)
+    v.write('variousfields.h5', 'v', 2, global_slice=[slice(None), 6, slice(None)])
+
+In the end this will lead to an hdf5-file with groups::
 
     variousfields.h5/
     ├─ u/
@@ -86,41 +98,49 @@ This will lead to an hdf5-file with groups::
     |  |     ├─ 0
     |  |     ├─ 1
     |  |     └─ 2
-    |  └─ 3D/
-    |     ├─ 0
-    |     ├─ 1
-    |     └─ 2
-    ├─ v/
-    |  ├─ 2D/
-    |  |  └─ slice_6_slice/
-    |  |     ├─ 0
-    |  |     ├─ 1
-    |  |     └─ 2
-    |  └─ 3D/
-    |     ├─ 0
-    |     ├─ 1
-    |     └─ 2
-    └─ mesh/
-       ├─ x0
-       ├─ x1
-       └─ x2
-
-Note that a mesh is stored along with all the data. This mesh can be given in
-two different ways when creating the datafiles:
+    |  ├─ 3D/
+    |  |   ├─ 0
+    |  |   ├─ 1
+    |  |   └─ 2
+    |  └─ mesh/
+    |      ├─ x0
+    |      ├─ x1
+    |      └─ x2
+    └─ v/
+       ├─ 2D/
+       |  └─ slice_6_slice/
+       |     ├─ 0
+       |     ├─ 1
+       |     └─ 2
+       ├─ 3D/
+       |  ├─ 0
+       |  ├─ 1
+       |  └─ 2
+       └─ mesh/
+          ├─ x0
+          ├─ x1
+          └─ x2
+
+Note that a mesh is stored along with each group of data. This mesh can be
+given in two different ways when creating the datafiles:
 
     1) A sequence of 2-tuples, where each 2-tuple contains the (origin, length)
        of the domain along its dimension. For example, a uniform mesh that
        originates from the origin, with lengths :math:`\pi, 2\pi, 3\pi`, can be
-       given as::
+       given when creating the output file as::
+
+        f0 = HDF5File('filename.h5', domain=((0, np.pi), (0, 2*np.pi), (0, 3*np.pi)))
+
+        or, using the write method of the distributed array:
 
-        f0 = HDF5File('filename.h5', global_shape=N, domain=((0, pi), (0, 2*np.pi), (0, 3*np.pi)))
+        u.write('filename.h5', 'u', 0, domain=((0, np.pi), (0, 2*np.pi), (0, 3*np.pi)))
 
     2) A sequence of arrays giving the coordinates for each dimension. For example::
 
         d = (np.arange(N[0], dtype=np.float)*1*np.pi/N[0],
              np.arange(N[1], dtype=np.float)*2*np.pi/N[1],
              np.arange(N[2], dtype=np.float)*2*np.pi/N[2])
-        f0 = HDF5File('filename.h5', global_shape=N, domain=d)
+        f0 = HDF5File('filename.h5', domain=d)
 
 With NetCDF4 the layout is somewhat different. For ``variousfields`` above,
 if we were using :class:`.NCFile` instead of :class:`.HDF5File`,
diff --git a/mpi4py_fft/distarray.py b/mpi4py_fft/distarray.py
@@ -55,7 +55,7 @@ class DistArray(np.ndarray):
     """
     def __new__(cls, global_shape, subcomm=None, val=None, dtype=np.float,
                 buffer=None, alignment=None, rank=0):
-        if len(global_shape) < 2:
+        if len(global_shape[rank:]) < 2:
             obj = np.ndarray.__new__(cls, global_shape, dtype=dtype, buffer=buffer)
             if buffer is None and isinstance(val, Number):
                 obj.fill(val)
@@ -356,7 +356,7 @@ def redistribute(self, axis=None, out=None):
         return out
 
     def write(self, filename, name='darray', step=0, global_slice=None,
-              as_scalar=False):
+              domain=None, as_scalar=False):
         """Write snapshot ``step`` of ``self`` to file ``filename``
 
         Parameters
@@ -370,6 +370,14 @@ def write(self, filename, name='darray', step=0, global_slice=None,
             Index used for snapshot in file.
         global_slice : sequence of slices or integers, optional
             Store only this global slice of ``self``
+        domain : sequence, optional
+            An optional spatial mesh or domain to go with the data.
+            Sequence of either
+
+                - 2-tuples, where each 2-tuple contains the (origin, length)
+                  of each dimension, e.g., (0, 2*pi).
+                - Arrays of coordinates, e.g., np.linspace(0, 2*pi, N). One
+                  array per dimension.
         as_scalar : boolean, optional
             Whether to store rank > 0 arrays as scalars. Default is False.
 
@@ -382,7 +390,7 @@ def write(self, filename, name='darray', step=0, global_slice=None,
         """
         if isinstance(filename, str):
             writer = HDF5File if filename.endswith('.h5') else NCFile
-            f = writer(filename, u=self, mode='a')
+            f = writer(filename, domain=domain, mode='a')
         elif isinstance(filename, FileBase):
             f = filename
         field = [self] if global_slice is None else [(self, global_slice)]
diff --git a/mpi4py_fft/io/h5py_file.py b/mpi4py_fft/io/h5py_file.py
@@ -18,8 +18,6 @@ class HDF5File(FileBase):
     ----------
     h5name : str
         Name of hdf5 file to be created.
-    mode : str, optional
-        ``r``, ``w`` or ``a`` for read, write or append. Default is ``a``.
     domain : sequence, optional
         An optional spatial mesh or domain to go with the data.
         Sequence of either
@@ -28,8 +26,10 @@ class HDF5File(FileBase):
               of each dimension, e.g., (0, 2*pi).
             - Arrays of coordinates, e.g., np.linspace(0, 2*pi, N). One
               array per dimension.
+    mode : str, optional
+        ``r``, ``w`` or ``a`` for read, write or append. Default is ``a``.
     """
-    def __init__(self, h5name, mode='a', domain=None, **kw):
+    def __init__(self, h5name, domain=None, mode='a', **kw):
         FileBase.__init__(self, domain=domain, **kw)
         import h5py
         self.filename = h5name
diff --git a/mpi4py_fft/io/nc_file.py b/mpi4py_fft/io/nc_file.py
@@ -23,8 +23,6 @@ class NCFile(FileBase):
     ----------
     ncname : str
         Name of netcdf file to be created
-    mode : str
-        ``r``, ``w`` or ``a`` for read, write or append. Default is ``a``.
     domain : Sequence, optional
         An optional spatial mesh or domain to go with the data.
         Sequence of either
@@ -33,7 +31,10 @@ class NCFile(FileBase):
               of each dimension, e.g., (0, 2*pi).
             - Arrays of coordinates, e.g., np.linspace(0, 2*pi, N). One
               array per dimension.
+    mode : str, optional
+        ``r``, ``w`` or ``a`` for read, write or append. Default is ``a``.
     clobber : bool, optional
+
     Note
     ----
     Each class instance creates one unique NetCDF4-file, with one step-counter.
@@ -42,7 +43,7 @@ class NCFile(FileBase):
     every 10th timestep and another every 20th timestep, then use two different
     class instances and as such two NetCDF4-files.
     """
-    def __init__(self, ncname, domain=None, clobber=True, mode='a', **kw):
+    def __init__(self, ncname, domain=None, mode='a', clobber=True, **kw):
         FileBase.__init__(self, domain=domain, **kw)
         from netCDF4 import Dataset
         self.filename = ncname
@@ -122,7 +123,7 @@ def write(self, step, fields, **kw):
             it = np.argwhere(nc_t.__array__() == step)[0][0]
         else:
             nc_t[it] = step
-        FileBase.write(self, it, fields)
+        FileBase.write(self, it, fields, **kw)
         self.close()
 
     def read(self, u, name, **kw):
diff --git a/tests/test_darray.py b/tests/test_darray.py
@@ -5,6 +5,12 @@
 
 comm = MPI.COMM_WORLD
 
+def test_1Darray():
+    N = (8,)
+    z = DistArray(N, val=2)
+    assert z[0] == 2
+    assert z.shape == N
+
 def test_2Darray():
     N = (8, 8)
     for subcomm in ((0, 1), (1, 0), None, Subcomm(comm, (0, 1))):
@@ -114,6 +120,7 @@ def test_newDistArray():
                     assert a.base.rank == rank
 
 if __name__ == '__main__':
+    test_1Darray()
     test_2Darray()
     test_3Darray()
     test_newDistArray()
diff --git a/tests/test_io.py b/tests/test_io.py

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ Please cite mpi4py-fft using`
`10`	`10`	`year = {{2019}},`
`11`	`11`	`title = {{Fast parallel multidimensional FFT using advanced MPI}},`
`12`	`12`	`journal = {{Journal of Parallel and Distributed Computing}},`
`13`		`- volume = {{in press}}`
	`13`	`+ doi = {10.1016/j.jpdc.2019.02.006}`
`14`	`14`	`}`
`15`	`15`	`@electronic{mpi4py-fft,`
`16`	`16`	`author = {{Lisandro Dalcin and Mikael Mortensen}},`