Merge pull request #999 from pauldmccarthy/rf/gifti-externalfile

effigies · web-flow · commit 62aea04248e7 · 2021-03-09T08:05:45.000-05:00
ENH: Add support for GIFTI ExternalFileBinary
diff --git a/nibabel/gifti/gifti.py b/nibabel/gifti/gifti.py
@@ -881,27 +881,41 @@ def to_file_map(self, file_map=None):
         f.write(self.to_xml())
 
     @classmethod
-    def from_file_map(klass, file_map, buffer_size=35000000):
-        """ Load a Gifti image from a file_map
+    def from_file_map(klass, file_map, buffer_size=35000000, mmap=True):
+        """Load a Gifti image from a file_map
 
         Parameters
         ----------
         file_map : dict
             Dictionary with single key ``image`` with associated value which is
             a :class:`FileHolder` instance pointing to the image file.
 
+        buffer_size : None or int, optional
+            Size of read buffer. ``None`` uses the default buffer size
+            from ``xml.parsers.expat``.
+
+        mmap : {True, False, 'c', 'r', 'r+'}
+            Controls the use of numpy memory mapping for reading data.  Only
+            has an effect when loading GIFTI images with data stored in
+            external files (``DataArray`` elements with an ``Encoding`` equal
+            to ``ExternalFileBinary``).  If ``False``, do not try numpy
+            ``memmap`` for data array.  If one of ``{'c', 'r', 'r+'}``, try
+            numpy ``memmap`` with ``mode=mmap``.  A `mmap` value of ``True``
+            gives the same behavior as ``mmap='c'``.  If the file cannot be
+            memory-mapped, ignore `mmap` value and read array from file.
+
         Returns
         -------
         img : GiftiImage
         """
-        parser = klass.parser(buffer_size=buffer_size)
+        parser = klass.parser(buffer_size=buffer_size, mmap=mmap)
         parser.parse(fptr=file_map['image'].get_prepare_fileobj('rb'))
         return parser.img
 
     @classmethod
-    def from_filename(klass, filename, buffer_size=35000000):
+    def from_filename(klass, filename, buffer_size=35000000, mmap=True):
         file_map = klass.filespec_to_file_map(filename)
-        img = klass.from_file_map(file_map, buffer_size=buffer_size)
+        img = klass.from_file_map(file_map, buffer_size=buffer_size, mmap=mmap)
         return img
 
 
diff --git a/nibabel/gifti/parse_gifti_fast.py b/nibabel/gifti/parse_gifti_fast.py
@@ -11,6 +11,7 @@
 import sys
 import warnings
 import zlib
+import os.path as op
 from io import StringIO
 from xml.parsers.expat import ExpatError
 
@@ -30,45 +31,109 @@ class GiftiParseError(ExpatError):
     """ Gifti-specific parsing error """
 
 
-def read_data_block(encoding, endian, ordering, datatype, shape, data):
-    """ Tries to unzip, decode, parse the funny string data """
-    enclabel = gifti_encoding_codes.label[encoding]
-    dtype = data_type_codes.type[datatype]
+def read_data_block(darray, fname, data, mmap):
+    """Parses data from a <Data> element, or loads from an external file.
+
+    Parameters
+    ----------
+    darray : GiftiDataArray
+         GiftiDataArray object representing the parent <DataArray> of this
+         <Data> element
+
+    fname : str or None
+         Name of GIFTI file being loaded, or None if in-memory
+
+    data : str or None
+         Data to parse, or None if data is in an external file
+
+    mmap : {True, False, 'c', 'r', 'r+'}
+        Controls the use of numpy memory mapping for reading data.  Only has
+        an effect when loading GIFTI images with data stored in external files
+        (``DataArray`` elements with an ``Encoding`` equal to
+        ``ExternalFileBinary``).  If ``False``, do not try numpy ``memmap``
+        for data array.  If one of ``{'c', 'r', 'r+'}``, try numpy ``memmap``
+        with ``mode=mmap``.  A `mmap` value of ``True`` gives the same
+        behavior as ``mmap='c'``.  If the file cannot be memory-mapped, ignore
+        `mmap` value and read array from file.
+
+    Returns
+    -------
+    ``numpy.ndarray`` or ``numpy.memmap`` containing the parsed data
+    """
+    if mmap not in (True, False, 'c', 'r', 'r+'):
+        raise ValueError("mmap value should be one of True, False, 'c', "
+                         "'r', 'r+'")
+    if mmap is True:
+        mmap = 'c'
+    enclabel = gifti_encoding_codes.label[darray.encoding]
+    dtype = data_type_codes.type[darray.datatype]
+
     if enclabel == 'ASCII':
         # GIFTI_ENCODING_ASCII
         c = StringIO(data)
         da = np.loadtxt(c, dtype=dtype)
         return da  # independent of the endianness
-
-    elif enclabel == 'External':
-        # GIFTI_ENCODING_EXTBIN
-        raise NotImplementedError("In what format are the external files?")
-
-    elif enclabel not in ('B64BIN', 'B64GZ'):
+    elif enclabel not in ('B64BIN', 'B64GZ', 'External'):
         return 0
 
+    # GIFTI_ENCODING_EXTBIN
+    # We assume that the external data file is raw uncompressed binary, with
+    # the data type/endianness/ordering specified by the other DataArray
+    # attributes
+    if enclabel == 'External':
+        if fname is None:
+            raise GiftiParseError('ExternalFileBinary is not supported '
+                                  'when loading from in-memory XML')
+        ext_fname = op.join(op.dirname(fname), darray.ext_fname)
+        if not op.exists(ext_fname):
+            raise GiftiParseError('Cannot locate external file ' + ext_fname)
+        # We either create a memmap, or load into memory
+        newarr = None
+        if mmap:
+            try:
+                newarr = np.memmap(ext_fname,
+                                   dtype=dtype,
+                                   mode=mmap,
+                                   offset=darray.ext_offset,
+                                   shape=tuple(darray.dims))
+            # If the memmap fails, we ignore the error and load the data into
+            # memory below
+            except (AttributeError, TypeError, ValueError):
+                pass
+        # mmap=False or np.memmap failed
+        if newarr is None:
+            # We can replace this with a call to np.fromfile in numpy>=1.17,
+            # as an "offset" parameter was added in that version.
+            with open(ext_fname, 'rb') as f:
+                f.seek(darray.ext_offset)
+                nbytes = np.prod(darray.dims) * dtype().itemsize
+                buff = f.read(nbytes)
+                newarr = np.frombuffer(buff, dtype=dtype)
+
     # Numpy arrays created from bytes objects are read-only.
     # Neither b64decode nor decompress will return bytearrays, and there
     # are not equivalents to fobj.readinto to allow us to pass them, so
     # there is not a simple way to avoid making copies.
     # If this becomes a problem, we should write a decoding interface with
     # a tunable chunk size.
-    dec = base64.b64decode(data.encode('ascii'))
-    if enclabel == 'B64BIN':
-        # GIFTI_ENCODING_B64BIN
-        buff = bytearray(dec)
     else:
-        # GIFTI_ENCODING_B64GZ
-        buff = bytearray(zlib.decompress(dec))
-    del dec
-
-    sh = tuple(shape)
-    newarr = np.frombuffer(buff, dtype=dtype)
+        dec = base64.b64decode(data.encode('ascii'))
+        if enclabel == 'B64BIN':
+            # GIFTI_ENCODING_B64BIN
+            buff = bytearray(dec)
+        else:
+            # GIFTI_ENCODING_B64GZ
+            buff = bytearray(zlib.decompress(dec))
+        del dec
+        newarr = np.frombuffer(buff, dtype=dtype)
+
+    sh = tuple(darray.dims)
     if len(newarr.shape) != len(sh):
-        newarr = newarr.reshape(sh, order=array_index_order_codes.npcode[ordering])
+        newarr = newarr.reshape(
+            sh, order=array_index_order_codes.npcode[darray.ind_ord])
 
     # check if we need to byteswap
-    required_byteorder = gifti_endian_codes.byteorder[endian]
+    required_byteorder = gifti_endian_codes.byteorder[darray.endian]
     if (required_byteorder in ('big', 'little') and
             required_byteorder != sys.byteorder):
         newarr = newarr.byteswap()
@@ -82,13 +147,17 @@ def _str2int(in_str):
 
 class GiftiImageParser(XmlParser):
 
-    def __init__(self, encoding=None, buffer_size=35000000, verbose=0):
+    def __init__(self, encoding=None, buffer_size=35000000, verbose=0,
+                 mmap=True):
         super(GiftiImageParser, self).__init__(encoding=encoding,
                                                buffer_size=buffer_size,
                                                verbose=verbose)
         # output
         self.img = None
 
+        # Queried when loading data from <Data> elements - see read_data_block
+        self.mmap = mmap
+
         # finite state machine stack
         self.fsm_state = []
 
@@ -288,12 +357,17 @@ def CharacterDataHandler(self, data):
 
     def flush_chardata(self):
         """ Collate and process collected character data"""
-        if self._char_blocks is None:
+        # Nothing to do for empty elements, except for Data elements which
+        # are within a DataArray with an external file
+        if self.write_to != 'Data' and self._char_blocks is None:
             return
         # Just join the strings to get the data.  Maybe there are some memory
         # optimizations we could do by passing the list of strings to the
         # read_data_block function.
-        data = ''.join(self._char_blocks)
+        if self._char_blocks is not None:
+            data = ''.join(self._char_blocks)
+        else:
+            data = None
         # Reset the char collector
         self._char_blocks = None
 
@@ -321,10 +395,8 @@ def flush_chardata(self):
             c.close()
 
         elif self.write_to == 'Data':
-            da_tmp = self.img.darrays[-1]
-            da_tmp.data = read_data_block(da_tmp.encoding, da_tmp.endian,
-                                          da_tmp.ind_ord, da_tmp.datatype,
-                                          da_tmp.dims, data)
+            self.da.data = read_data_block(self.da, self.fname, data,
+                                           self.mmap)
             # update the endianness according to the
             # current machine setting
             self.endian = gifti_endian_codes.code[sys.byteorder]
diff --git a/nibabel/gifti/tests/data/external.dat b/nibabel/gifti/tests/data/external.dat
diff --git a/nibabel/gifti/tests/data/external.gii b/nibabel/gifti/tests/data/external.gii
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/115/gifti.dtd">
+<GIFTI Version="1.0"  NumberOfDataArrays="2">
+   <DataArray  ArrayIndexingOrder="RowMajorOrder"
+               DataType="NIFTI_TYPE_FLOAT32"
+               Dim0="8"
+               Dim1="3"
+               Dimensionality="2"
+               Encoding="ExternalFileBinary"
+               Endian="LittleEndian"
+               ExternalFileName="external.dat"
+               ExternalFileOffset="0"
+               Intent="NIFTI_INTENT_POINTSET">
+      <MetaData>
+      </MetaData>
+      <CoordinateSystemTransformMatrix>
+         <DataSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></DataSpace>
+         <TransformedSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></TransformedSpace>
+         <MatrixData>1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 </MatrixData>
+      </CoordinateSystemTransformMatrix>
+      <Data></Data>
+   </DataArray>
+   <DataArray  ArrayIndexingOrder="RowMajorOrder"
+               DataType="NIFTI_TYPE_INT32"
+               Dim0="12"
+               Dim1="3"
+               Dimensionality="2"
+               Encoding="ExternalFileBinary"
+               Endian="LittleEndian"
+               ExternalFileName="external.dat"
+               ExternalFileOffset="96"
+               Intent="NIFTI_INTENT_TRIANGLE">
+      <MetaData>
+      </MetaData>
+      <Data></Data>
+   </DataArray>
+</GIFTI>
diff --git a/nibabel/gifti/tests/test_parse_gifti_fast.py b/nibabel/gifti/tests/test_parse_gifti_fast.py
@@ -7,15 +7,18 @@
 #
 ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 
-from os.path import join as pjoin, dirname
+from os.path import join as pjoin, dirname, basename
 import sys
 import warnings
+import shutil
+from unittest import mock
 
 import numpy as np
 
 from .. import gifti as gi
 from ..util import gifti_endian_codes
-from ..parse_gifti_fast import Outputter, parse_gifti_file
+from ..parse_gifti_fast import (Outputter, parse_gifti_file, GiftiParseError,
+                                GiftiImageParser)
 from ...loadsave import load, save
 from ...nifti1 import xform_codes
 from ...tmpdirs import InTemporaryDirectory
@@ -38,9 +41,10 @@
 # wb_command -gifti-convert ASCII base64bin.gii test.gii
 DATA_FILE5 = pjoin(IO_DATA_PATH, 'base64bin.gii')
 DATA_FILE6 = pjoin(IO_DATA_PATH, 'rh.aparc.annot.gii')
+DATA_FILE7 = pjoin(IO_DATA_PATH, 'external.gii')
 
-datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6]
-numDA = [2, 1, 1, 1, 2, 1]
+datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6, DATA_FILE7]
+numDA = [2, 1, 1, 1, 2, 1, 2]
 
 DATA_FILE1_darr1 = np.array(
     [[-16.07201, -66.187515, 21.266994],
@@ -96,6 +100,28 @@
 
 DATA_FILE6_darr1 = np.array([9182740, 9182740, 9182740], dtype=np.float32)
 
+DATA_FILE7_darr1 = np.array([[-1., -1., -1.],
+                             [-1., -1.,  1.],
+                             [-1.,  1., -1.],
+                             [-1.,  1.,  1.],
+                             [ 1., -1., -1.],
+                             [ 1., -1.,  1.],
+                             [ 1.,  1., -1.],
+                             [ 1.,  1.,  1.]], dtype=np.float32)
+
+DATA_FILE7_darr2 = np.array([[0, 6, 4],
+                             [0, 2, 6],
+                             [1, 5, 3],
+                             [3, 5, 7],
+                             [0, 4, 1],
+                             [1, 4, 5],
+                             [2, 7, 6],
+                             [2, 3, 7],
+                             [0, 1, 2],
+                             [1, 3, 2],
+                             [4, 7, 5],
+                             [4, 6, 7]], dtype=np.int32)
+
 
 def assert_default_types(loaded):
     default = loaded.__class__()
@@ -382,3 +408,57 @@ def test_parse_with_buffersize():
     for buff_sz in [None, 1, 2**12]:
         img2 = load(DATA_FILE2, buffer_size=buff_sz)
         assert img2.darrays[0].data.shape == (143479, 1)
+
+
+def test_dataarray7():
+    img7 = load(DATA_FILE7)
+    assert_array_almost_equal(img7.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img7.darrays[1].data, DATA_FILE7_darr2)
+
+
+def test_parse_with_memmmap():
+    img1 = load(DATA_FILE7)
+    img2 = load(DATA_FILE7, mmap=True)
+    img3 = load(DATA_FILE7, mmap=False)
+    assert len(img1.darrays) == len(img2.darrays) == 2
+    assert isinstance(img1.darrays[0].data, np.memmap)
+    assert isinstance(img1.darrays[1].data, np.memmap)
+    assert isinstance(img2.darrays[0].data, np.memmap)
+    assert isinstance(img2.darrays[1].data, np.memmap)
+    assert not isinstance(img3.darrays[0].data, np.memmap)
+    assert not isinstance(img3.darrays[1].data, np.memmap)
+    assert_array_almost_equal(img1.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img1.darrays[1].data, DATA_FILE7_darr2)
+    assert_array_almost_equal(img2.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img2.darrays[1].data, DATA_FILE7_darr2)
+    assert_array_almost_equal(img3.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img3.darrays[1].data, DATA_FILE7_darr2)
+
+
+def test_parse_with_memmap_fallback():
+    img1 = load(DATA_FILE7, mmap=True)
+    with mock.patch('numpy.memmap', side_effect=ValueError):
+        img2 = load(DATA_FILE7, mmap=True)
+    assert isinstance(img1.darrays[0].data, np.memmap)
+    assert isinstance(img1.darrays[1].data, np.memmap)
+    assert not isinstance(img2.darrays[0].data, np.memmap)
+    assert not isinstance(img2.darrays[1].data, np.memmap)
+    assert_array_almost_equal(img1.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img1.darrays[1].data, DATA_FILE7_darr2)
+    assert_array_almost_equal(img2.darrays[0].data, DATA_FILE7_darr1)
+    assert_array_almost_equal(img2.darrays[1].data, DATA_FILE7_darr2)
+
+
+def test_external_file_failure_cases():
+    # external file cannot be found
+    with InTemporaryDirectory() as tmpdir:
+        shutil.copy(DATA_FILE7, '.')
+        filename = pjoin(tmpdir, basename(DATA_FILE7))
+        with pytest.raises(GiftiParseError):
+            img = load(filename)
+    # load from in-memory xml string (parser requires it as bytes)
+    with open(DATA_FILE7, 'rb') as f:
+        xmldata = f.read()
+    parser = GiftiImageParser()
+    with pytest.raises(GiftiParseError):
+        img = parser.parse(xmldata)
diff --git a/nibabel/xmlutils.py b/nibabel/xmlutils.py