Skip to content

ENH: Add support for GIFTI ExternalFileBinary #999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions nibabel/gifti/gifti.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,27 +881,41 @@ def to_file_map(self, file_map=None):
f.write(self.to_xml())

@classmethod
def from_file_map(klass, file_map, buffer_size=35000000):
""" Load a Gifti image from a file_map
def from_file_map(klass, file_map, buffer_size=35000000, mmap=True):
"""Load a Gifti image from a file_map

Parameters
----------
file_map : dict
Dictionary with single key ``image`` with associated value which is
a :class:`FileHolder` instance pointing to the image file.

buffer_size: None or int, optional
size of read buffer. None uses default buffer_size
from xml.parsers.expat.

mmap : {True, False, 'c', 'r', 'r+'}
Controls the use of numpy memory mapping for reading data. Only
has an effect when loading GIFTI images with data stored in
external files (``DataArray`` elements with an ``Encoding`` equal
to ``ExternalFileBinary``). If ``False``, do not try numpy
``memmap`` for data array. If one of ``{'c', 'r', 'r+'}``, try
numpy ``memmap`` with ``mode=mmap``. A `mmap` value of ``True``
gives the same behavior as ``mmap='c'``. If the file cannot be
memory-mapped, ignore `mmap` value and read array from file.

Returns
-------
img : GiftiImage
"""
parser = klass.parser(buffer_size=buffer_size)
parser = klass.parser(buffer_size=buffer_size, mmap=mmap)
parser.parse(fptr=file_map['image'].get_prepare_fileobj('rb'))
return parser.img

@classmethod
def from_filename(klass, filename, buffer_size=35000000):
def from_filename(klass, filename, buffer_size=35000000, mmap=True):
file_map = klass.filespec_to_file_map(filename)
img = klass.from_file_map(file_map, buffer_size=buffer_size)
img = klass.from_file_map(file_map, buffer_size=buffer_size, mmap=mmap)
return img


Expand Down
130 changes: 101 additions & 29 deletions nibabel/gifti/parse_gifti_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys
import warnings
import zlib
import os.path as op
from io import StringIO
from xml.parsers.expat import ExpatError

Expand All @@ -30,45 +31,109 @@ class GiftiParseError(ExpatError):
""" Gifti-specific parsing error """


def read_data_block(encoding, endian, ordering, datatype, shape, data):
""" Tries to unzip, decode, parse the funny string data """
enclabel = gifti_encoding_codes.label[encoding]
dtype = data_type_codes.type[datatype]
def read_data_block(darray, fname, data, mmap):
"""Parses data from a <Data> element, or loads from an external file.

Parameters
----------
darray : GiftiDataArray
GiftiDataArray object representing the parent <DataArray> of this
<Data> element

fname : str or None
Name of GIFTI file being loaded, or None if in-memory

data : str or None
Data to parse, or None if data is in an external file

mmap : {True, False, 'c', 'r', 'r+'}
Controls the use of numpy memory mapping for reading data. Only has
an effect when loading GIFTI images with data stored in external files
(``DataArray`` elements with an ``Encoding`` equal to
``ExternalFileBinary``). If ``False``, do not try numpy ``memmap``
for data array. If one of ``{'c', 'r', 'r+'}``, try numpy ``memmap``
with ``mode=mmap``. A `mmap` value of ``True`` gives the same
behavior as ``mmap='c'``. If the file cannot be memory-mapped, ignore
`mmap` value and read array from file.

Returns
-------
``numpy.ndarray`` or ``numpy.memmap`` containing the parsed data
"""
if mmap not in (True, False, 'c', 'r', 'r+'):
raise ValueError("mmap value should be one of True, False, 'c', "
"'r', 'r+'")
if mmap is True:
mmap = 'c'
enclabel = gifti_encoding_codes.label[darray.encoding]
dtype = data_type_codes.type[darray.datatype]

if enclabel == 'ASCII':
# GIFTI_ENCODING_ASCII
c = StringIO(data)
da = np.loadtxt(c, dtype=dtype)
return da # independent of the endianness

elif enclabel == 'External':
# GIFTI_ENCODING_EXTBIN
raise NotImplementedError("In what format are the external files?")

elif enclabel not in ('B64BIN', 'B64GZ'):
elif enclabel not in ('B64BIN', 'B64GZ', 'External'):
return 0

# GIFTI_ENCODING_EXTBIN
# We assume that the external data file is raw uncompressed binary, with
# the data type/endianness/ordering specified by the other DataArray
# attributes
if enclabel == 'External':
if fname is None:
raise GiftiParseError('ExternalFileBinary is not supported '
'when loading from in-memory XML')
ext_fname = op.join(op.dirname(fname), darray.ext_fname)
if not op.exists(ext_fname):
raise GiftiParseError('Cannot locate external file ' + ext_fname)
# We either create a memmap, or load into memory
newarr = None
if mmap:
try:
newarr = np.memmap(ext_fname,
dtype=dtype,
mode=mmap,
offset=darray.ext_offset,
shape=tuple(darray.dims))
# If the memmap fails, we ignore the error and load the data into
# memory below
except (AttributeError, TypeError, ValueError):
pass
# mmap=False or np.memmap failed
if newarr is None:
# We can replace this with a call to np.fromfile in numpy>=1.17,
# as an "offset" parameter was added in that version.
with open(ext_fname, 'rb') as f:
f.seek(darray.ext_offset)
nbytes = np.prod(darray.dims) * dtype().itemsize
buff = f.read(nbytes)
newarr = np.frombuffer(buff, dtype=dtype)

# Numpy arrays created from bytes objects are read-only.
# Neither b64decode nor decompress will return bytearrays, and there
# are not equivalents to fobj.readinto to allow us to pass them, so
# there is not a simple way to avoid making copies.
# If this becomes a problem, we should write a decoding interface with
# a tunable chunk size.
dec = base64.b64decode(data.encode('ascii'))
if enclabel == 'B64BIN':
# GIFTI_ENCODING_B64BIN
buff = bytearray(dec)
else:
# GIFTI_ENCODING_B64GZ
buff = bytearray(zlib.decompress(dec))
del dec

sh = tuple(shape)
newarr = np.frombuffer(buff, dtype=dtype)
dec = base64.b64decode(data.encode('ascii'))
if enclabel == 'B64BIN':
# GIFTI_ENCODING_B64BIN
buff = bytearray(dec)
else:
# GIFTI_ENCODING_B64GZ
buff = bytearray(zlib.decompress(dec))
del dec
newarr = np.frombuffer(buff, dtype=dtype)

sh = tuple(darray.dims)
if len(newarr.shape) != len(sh):
newarr = newarr.reshape(sh, order=array_index_order_codes.npcode[ordering])
newarr = newarr.reshape(
sh, order=array_index_order_codes.npcode[darray.ind_ord])

# check if we need to byteswap
required_byteorder = gifti_endian_codes.byteorder[endian]
required_byteorder = gifti_endian_codes.byteorder[darray.endian]
if (required_byteorder in ('big', 'little') and
required_byteorder != sys.byteorder):
newarr = newarr.byteswap()
Expand All @@ -82,13 +147,17 @@ def _str2int(in_str):

class GiftiImageParser(XmlParser):

def __init__(self, encoding=None, buffer_size=35000000, verbose=0):
def __init__(self, encoding=None, buffer_size=35000000, verbose=0,
mmap=True):
super(GiftiImageParser, self).__init__(encoding=encoding,
buffer_size=buffer_size,
verbose=verbose)
# output
self.img = None

# Queried when loading data from <Data> elements - see read_data_block
self.mmap = mmap

# finite state machine stack
self.fsm_state = []

Expand Down Expand Up @@ -288,12 +357,17 @@ def CharacterDataHandler(self, data):

def flush_chardata(self):
""" Collate and process collected character data"""
if self._char_blocks is None:
# Nothing to do for empty elements, except for Data elements which
# are within a DataArray with an external file
if self.write_to != 'Data' and self._char_blocks is None:
return
# Just join the strings to get the data. Maybe there are some memory
# optimizations we could do by passing the list of strings to the
# read_data_block function.
data = ''.join(self._char_blocks)
if self._char_blocks is not None:
data = ''.join(self._char_blocks)
else:
data = None
# Reset the char collector
self._char_blocks = None

Expand Down Expand Up @@ -321,10 +395,8 @@ def flush_chardata(self):
c.close()

elif self.write_to == 'Data':
da_tmp = self.img.darrays[-1]
da_tmp.data = read_data_block(da_tmp.encoding, da_tmp.endian,
da_tmp.ind_ord, da_tmp.datatype,
da_tmp.dims, data)
self.da.data = read_data_block(self.da, self.fname, data,
self.mmap)
# update the endianness according to the
# current machine setting
self.endian = gifti_endian_codes.code[sys.byteorder]
Expand Down
Binary file added nibabel/gifti/tests/data/external.dat
Binary file not shown.
37 changes: 37 additions & 0 deletions nibabel/gifti/tests/data/external.gii
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/115/gifti.dtd">
<GIFTI Version="1.0" NumberOfDataArrays="2">
<DataArray ArrayIndexingOrder="RowMajorOrder"
DataType="NIFTI_TYPE_FLOAT32"
Dim0="8"
Dim1="3"
Dimensionality="2"
Encoding="ExternalFileBinary"
Endian="LittleEndian"
ExternalFileName="external.dat"
ExternalFileOffset="0"
Intent="NIFTI_INTENT_POINTSET">
<MetaData>
</MetaData>
<CoordinateSystemTransformMatrix>
<DataSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></DataSpace>
<TransformedSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></TransformedSpace>
<MatrixData>1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 </MatrixData>
</CoordinateSystemTransformMatrix>
<Data></Data>
</DataArray>
<DataArray ArrayIndexingOrder="RowMajorOrder"
DataType="NIFTI_TYPE_INT32"
Dim0="12"
Dim1="3"
Dimensionality="2"
Encoding="ExternalFileBinary"
Endian="LittleEndian"
ExternalFileName="external.dat"
ExternalFileOffset="96"
Intent="NIFTI_INTENT_TRIANGLE">
<MetaData>
</MetaData>
<Data></Data>
</DataArray>
</GIFTI>
88 changes: 84 additions & 4 deletions nibabel/gifti/tests/test_parse_gifti_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##

from os.path import join as pjoin, dirname
from os.path import join as pjoin, dirname, basename
import sys
import warnings
import shutil
from unittest import mock

import numpy as np

from .. import gifti as gi
from ..util import gifti_endian_codes
from ..parse_gifti_fast import Outputter, parse_gifti_file
from ..parse_gifti_fast import (Outputter, parse_gifti_file, GiftiParseError,
GiftiImageParser)
from ...loadsave import load, save
from ...nifti1 import xform_codes
from ...tmpdirs import InTemporaryDirectory
Expand All @@ -38,9 +41,10 @@
# wb_command -gifti-convert ASCII base64bin.gii test.gii
DATA_FILE5 = pjoin(IO_DATA_PATH, 'base64bin.gii')
DATA_FILE6 = pjoin(IO_DATA_PATH, 'rh.aparc.annot.gii')
DATA_FILE7 = pjoin(IO_DATA_PATH, 'external.gii')

datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6]
numDA = [2, 1, 1, 1, 2, 1]
datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6, DATA_FILE7]
numDA = [2, 1, 1, 1, 2, 1, 2]

DATA_FILE1_darr1 = np.array(
[[-16.07201, -66.187515, 21.266994],
Expand Down Expand Up @@ -96,6 +100,28 @@

DATA_FILE6_darr1 = np.array([9182740, 9182740, 9182740], dtype=np.float32)

# Expected contents of the first DataArray declared in external.gii
# (DATA_FILE7): NIFTI_INTENT_POINTSET, NIFTI_TYPE_FLOAT32, Dim0=8 x Dim1=3.
# That DataArray uses Encoding="ExternalFileBinary" and points at
# external.dat with ExternalFileOffset="0".
DATA_FILE7_darr1 = np.array([[-1., -1., -1.],
[-1., -1., 1.],
[-1., 1., -1.],
[-1., 1., 1.],
[ 1., -1., -1.],
[ 1., -1., 1.],
[ 1., 1., -1.],
[ 1., 1., 1.]], dtype=np.float32)

# Expected contents of the second DataArray declared in external.gii
# (DATA_FILE7): NIFTI_INTENT_TRIANGLE, NIFTI_TYPE_INT32, Dim0=12 x Dim1=3.
# It is read from the same external.dat at ExternalFileOffset="96", i.e.
# directly after the 8 * 3 * 4 = 96 bytes of the float32 point set.
DATA_FILE7_darr2 = np.array([[0, 6, 4],
[0, 2, 6],
[1, 5, 3],
[3, 5, 7],
[0, 4, 1],
[1, 4, 5],
[2, 7, 6],
[2, 3, 7],
[0, 1, 2],
[1, 3, 2],
[4, 7, 5],
[4, 6, 7]], dtype=np.int32)


def assert_default_types(loaded):
default = loaded.__class__()
Expand Down Expand Up @@ -382,3 +408,57 @@ def test_parse_with_buffersize():
for buff_sz in [None, 1, 2**12]:
img2 = load(DATA_FILE2, buffer_size=buff_sz)
assert img2.darrays[0].data.shape == (143479, 1)


def test_dataarray7():
    """Compare both data arrays loaded from DATA_FILE7 with their references."""
    loaded = load(DATA_FILE7)
    expected_arrays = (DATA_FILE7_darr1, DATA_FILE7_darr2)
    for position, expected in enumerate(expected_arrays):
        assert_array_almost_equal(loaded.darrays[position].data, expected)


def test_parse_with_memmmap():
    """Check array types and values for default, mmap=True and mmap=False."""
    default_img = load(DATA_FILE7)
    mapped_img = load(DATA_FILE7, mmap=True)
    in_memory_img = load(DATA_FILE7, mmap=False)
    expected_arrays = (DATA_FILE7_darr1, DATA_FILE7_darr2)

    assert len(default_img.darrays) == len(mapped_img.darrays) == 2

    # Default loading and mmap=True must both yield memory-mapped arrays
    for img in (default_img, mapped_img):
        for idx in (0, 1):
            assert isinstance(img.darrays[idx].data, np.memmap)
    # mmap=False must not
    for idx in (0, 1):
        assert not isinstance(in_memory_img.darrays[idx].data, np.memmap)

    # Values are the same however the data was read
    for img in (default_img, mapped_img, in_memory_img):
        for idx, expected in enumerate(expected_arrays):
            assert_array_almost_equal(img.darrays[idx].data, expected)


def test_parse_with_memmap_fallback():
    """Check that loading still succeeds when ``numpy.memmap`` raises."""
    expected_arrays = (DATA_FILE7_darr1, DATA_FILE7_darr2)

    mapped_img = load(DATA_FILE7, mmap=True)
    # Make every memmap attempt raise so the data has to be read another way
    with mock.patch('numpy.memmap', side_effect=ValueError):
        fallback_img = load(DATA_FILE7, mmap=True)

    for idx in (0, 1):
        assert isinstance(mapped_img.darrays[idx].data, np.memmap)
    for idx in (0, 1):
        assert not isinstance(fallback_img.darrays[idx].data, np.memmap)

    for img in (mapped_img, fallback_img):
        for idx, expected in enumerate(expected_arrays):
            assert_array_almost_equal(img.darrays[idx].data, expected)


def test_external_file_failure_cases():
    """Check that unresolvable external data raises GiftiParseError."""
    # Case 1: the external file cannot be found. Only the .gii file is copied
    # into the temporary directory, so the data file it names is not there.
    with InTemporaryDirectory() as tmpdir:
        shutil.copy(DATA_FILE7, '.')
        filename = pjoin(tmpdir, basename(DATA_FILE7))
        # No binding of the result: the call is expected to raise, so an
        # assignment here could never complete.
        with pytest.raises(GiftiParseError):
            load(filename)

    # Case 2: load from in-memory xml string (parser requires it as bytes)
    with open(DATA_FILE7, 'rb') as f:
        xmldata = f.read()
    parser = GiftiImageParser()
    with pytest.raises(GiftiParseError):
        parser.parse(xmldata)
Loading