Skip to content

Commit 62aea04

Browse files
authored
Merge pull request #999 from pauldmccarthy/rf/gifti-externalfile
ENH: Add support for GIFTI ExternalFileBinary
2 parents 95e1fbe + 8cd83c2 commit 62aea04

File tree

6 files changed

+246
-40
lines changed

6 files changed

+246
-40
lines changed

nibabel/gifti/gifti.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -881,27 +881,41 @@ def to_file_map(self, file_map=None):
881881
f.write(self.to_xml())
882882

883883
@classmethod
884-
def from_file_map(klass, file_map, buffer_size=35000000):
885-
""" Load a Gifti image from a file_map
884+
def from_file_map(klass, file_map, buffer_size=35000000, mmap=True):
885+
"""Load a Gifti image from a file_map
886886
887887
Parameters
888888
----------
889889
file_map : dict
890890
Dictionary with single key ``image`` with associated value which is
891891
a :class:`FileHolder` instance pointing to the image file.
892892
893+
buffer_size : None or int, optional
894+
size of read buffer. None uses default buffer_size
895+
from xml.parsers.expat.
896+
897+
mmap : {True, False, 'c', 'r', 'r+'}
898+
Controls the use of numpy memory mapping for reading data. Only
899+
has an effect when loading GIFTI images with data stored in
900+
external files (``DataArray`` elements with an ``Encoding`` equal
901+
to ``ExternalFileBinary``). If ``False``, do not try numpy
902+
``memmap`` for data array. If one of ``{'c', 'r', 'r+'}``, try
903+
numpy ``memmap`` with ``mode=mmap``. A `mmap` value of ``True``
904+
gives the same behavior as ``mmap='c'``. If the file cannot be
905+
memory-mapped, ignore `mmap` value and read array from file.
906+
893907
Returns
894908
-------
895909
img : GiftiImage
896910
"""
897-
parser = klass.parser(buffer_size=buffer_size)
911+
parser = klass.parser(buffer_size=buffer_size, mmap=mmap)
898912
parser.parse(fptr=file_map['image'].get_prepare_fileobj('rb'))
899913
return parser.img
900914

901915
@classmethod
902-
def from_filename(klass, filename, buffer_size=35000000):
916+
def from_filename(klass, filename, buffer_size=35000000, mmap=True):
903917
file_map = klass.filespec_to_file_map(filename)
904-
img = klass.from_file_map(file_map, buffer_size=buffer_size)
918+
img = klass.from_file_map(file_map, buffer_size=buffer_size, mmap=mmap)
905919
return img
906920

907921

nibabel/gifti/parse_gifti_fast.py

Lines changed: 101 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
import warnings
1313
import zlib
14+
import os.path as op
1415
from io import StringIO
1516
from xml.parsers.expat import ExpatError
1617

@@ -30,45 +31,109 @@ class GiftiParseError(ExpatError):
3031
""" Gifti-specific parsing error """
3132

3233

33-
def read_data_block(encoding, endian, ordering, datatype, shape, data):
34-
""" Tries to unzip, decode, parse the funny string data """
35-
enclabel = gifti_encoding_codes.label[encoding]
36-
dtype = data_type_codes.type[datatype]
34+
def read_data_block(darray, fname, data, mmap):
35+
"""Parses data from a <Data> element, or loads from an external file.
36+
37+
Parameters
38+
----------
39+
darray : GiftiDataArray
40+
GiftiDataArray object representing the parent <DataArray> of this
41+
<Data> element
42+
43+
fname : str or None
44+
Name of GIFTI file being loaded, or None if in-memory
45+
46+
data : str or None
47+
Data to parse, or None if data is in an external file
48+
49+
mmap : {True, False, 'c', 'r', 'r+'}
50+
Controls the use of numpy memory mapping for reading data. Only has
51+
an effect when loading GIFTI images with data stored in external files
52+
(``DataArray`` elements with an ``Encoding`` equal to
53+
``ExternalFileBinary``). If ``False``, do not try numpy ``memmap``
54+
for data array. If one of ``{'c', 'r', 'r+'}``, try numpy ``memmap``
55+
with ``mode=mmap``. A `mmap` value of ``True`` gives the same
56+
behavior as ``mmap='c'``. If the file cannot be memory-mapped, ignore
57+
`mmap` value and read array from file.
58+
59+
Returns
60+
-------
61+
``numpy.ndarray`` or ``numpy.memmap`` containing the parsed data
62+
"""
63+
if mmap not in (True, False, 'c', 'r', 'r+'):
64+
raise ValueError("mmap value should be one of True, False, 'c', "
65+
"'r', 'r+'")
66+
if mmap is True:
67+
mmap = 'c'
68+
enclabel = gifti_encoding_codes.label[darray.encoding]
69+
dtype = data_type_codes.type[darray.datatype]
70+
3771
if enclabel == 'ASCII':
3872
# GIFTI_ENCODING_ASCII
3973
c = StringIO(data)
4074
da = np.loadtxt(c, dtype=dtype)
4175
return da # independent of the endianness
42-
43-
elif enclabel == 'External':
44-
# GIFTI_ENCODING_EXTBIN
45-
raise NotImplementedError("In what format are the external files?")
46-
47-
elif enclabel not in ('B64BIN', 'B64GZ'):
76+
elif enclabel not in ('B64BIN', 'B64GZ', 'External'):
4877
return 0
4978

79+
# GIFTI_ENCODING_EXTBIN
80+
# We assume that the external data file is raw uncompressed binary, with
81+
# the data type/endianness/ordering specified by the other DataArray
82+
# attributes
83+
if enclabel == 'External':
84+
if fname is None:
85+
raise GiftiParseError('ExternalFileBinary is not supported '
86+
'when loading from in-memory XML')
87+
ext_fname = op.join(op.dirname(fname), darray.ext_fname)
88+
if not op.exists(ext_fname):
89+
raise GiftiParseError('Cannot locate external file ' + ext_fname)
90+
# We either create a memmap, or load into memory
91+
newarr = None
92+
if mmap:
93+
try:
94+
newarr = np.memmap(ext_fname,
95+
dtype=dtype,
96+
mode=mmap,
97+
offset=darray.ext_offset,
98+
shape=tuple(darray.dims))
99+
# If the memmap fails, we ignore the error and load the data into
100+
# memory below
101+
except (AttributeError, TypeError, ValueError):
102+
pass
103+
# mmap=False or np.memmap failed
104+
if newarr is None:
105+
# We can replace this with a call to np.fromfile in numpy>=1.17,
106+
# as an "offset" parameter was added in that version.
107+
with open(ext_fname, 'rb') as f:
108+
f.seek(darray.ext_offset)
109+
nbytes = np.prod(darray.dims) * dtype().itemsize
110+
buff = f.read(nbytes)
111+
newarr = np.frombuffer(buff, dtype=dtype)
112+
50113
# Numpy arrays created from bytes objects are read-only.
51114
# Neither b64decode nor decompress will return bytearrays, and there
52115
# are not equivalents to fobj.readinto to allow us to pass them, so
53116
# there is not a simple way to avoid making copies.
54117
# If this becomes a problem, we should write a decoding interface with
55118
# a tunable chunk size.
56-
dec = base64.b64decode(data.encode('ascii'))
57-
if enclabel == 'B64BIN':
58-
# GIFTI_ENCODING_B64BIN
59-
buff = bytearray(dec)
60119
else:
61-
# GIFTI_ENCODING_B64GZ
62-
buff = bytearray(zlib.decompress(dec))
63-
del dec
64-
65-
sh = tuple(shape)
66-
newarr = np.frombuffer(buff, dtype=dtype)
120+
dec = base64.b64decode(data.encode('ascii'))
121+
if enclabel == 'B64BIN':
122+
# GIFTI_ENCODING_B64BIN
123+
buff = bytearray(dec)
124+
else:
125+
# GIFTI_ENCODING_B64GZ
126+
buff = bytearray(zlib.decompress(dec))
127+
del dec
128+
newarr = np.frombuffer(buff, dtype=dtype)
129+
130+
sh = tuple(darray.dims)
67131
if len(newarr.shape) != len(sh):
68-
newarr = newarr.reshape(sh, order=array_index_order_codes.npcode[ordering])
132+
newarr = newarr.reshape(
133+
sh, order=array_index_order_codes.npcode[darray.ind_ord])
69134

70135
# check if we need to byteswap
71-
required_byteorder = gifti_endian_codes.byteorder[endian]
136+
required_byteorder = gifti_endian_codes.byteorder[darray.endian]
72137
if (required_byteorder in ('big', 'little') and
73138
required_byteorder != sys.byteorder):
74139
newarr = newarr.byteswap()
@@ -82,13 +147,17 @@ def _str2int(in_str):
82147

83148
class GiftiImageParser(XmlParser):
84149

85-
def __init__(self, encoding=None, buffer_size=35000000, verbose=0):
150+
def __init__(self, encoding=None, buffer_size=35000000, verbose=0,
151+
mmap=True):
86152
super(GiftiImageParser, self).__init__(encoding=encoding,
87153
buffer_size=buffer_size,
88154
verbose=verbose)
89155
# output
90156
self.img = None
91157

158+
# Queried when loading data from <Data> elements - see read_data_block
159+
self.mmap = mmap
160+
92161
# finite state machine stack
93162
self.fsm_state = []
94163

@@ -288,12 +357,17 @@ def CharacterDataHandler(self, data):
288357

289358
def flush_chardata(self):
290359
""" Collate and process collected character data"""
291-
if self._char_blocks is None:
360+
# Nothing to do for empty elements, except for Data elements which
361+
# are within a DataArray with an external file
362+
if self.write_to != 'Data' and self._char_blocks is None:
292363
return
293364
# Just join the strings to get the data. Maybe there are some memory
294365
# optimizations we could do by passing the list of strings to the
295366
# read_data_block function.
296-
data = ''.join(self._char_blocks)
367+
if self._char_blocks is not None:
368+
data = ''.join(self._char_blocks)
369+
else:
370+
data = None
297371
# Reset the char collector
298372
self._char_blocks = None
299373

@@ -321,10 +395,8 @@ def flush_chardata(self):
321395
c.close()
322396

323397
elif self.write_to == 'Data':
324-
da_tmp = self.img.darrays[-1]
325-
da_tmp.data = read_data_block(da_tmp.encoding, da_tmp.endian,
326-
da_tmp.ind_ord, da_tmp.datatype,
327-
da_tmp.dims, data)
398+
self.da.data = read_data_block(self.da, self.fname, data,
399+
self.mmap)
328400
# update the endianness according to the
329401
# current machine setting
330402
self.endian = gifti_endian_codes.code[sys.byteorder]

nibabel/gifti/tests/data/external.dat

240 Bytes
Binary file not shown.

nibabel/gifti/tests/data/external.gii

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/115/gifti.dtd">
3+
<GIFTI Version="1.0" NumberOfDataArrays="2">
4+
<DataArray ArrayIndexingOrder="RowMajorOrder"
5+
DataType="NIFTI_TYPE_FLOAT32"
6+
Dim0="8"
7+
Dim1="3"
8+
Dimensionality="2"
9+
Encoding="ExternalFileBinary"
10+
Endian="LittleEndian"
11+
ExternalFileName="external.dat"
12+
ExternalFileOffset="0"
13+
Intent="NIFTI_INTENT_POINTSET">
14+
<MetaData>
15+
</MetaData>
16+
<CoordinateSystemTransformMatrix>
17+
<DataSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></DataSpace>
18+
<TransformedSpace><![CDATA[NIFTI_XFORM_UNKNOWN]]></TransformedSpace>
19+
<MatrixData>1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 </MatrixData>
20+
</CoordinateSystemTransformMatrix>
21+
<Data></Data>
22+
</DataArray>
23+
<DataArray ArrayIndexingOrder="RowMajorOrder"
24+
DataType="NIFTI_TYPE_INT32"
25+
Dim0="12"
26+
Dim1="3"
27+
Dimensionality="2"
28+
Encoding="ExternalFileBinary"
29+
Endian="LittleEndian"
30+
ExternalFileName="external.dat"
31+
ExternalFileOffset="96"
32+
Intent="NIFTI_INTENT_TRIANGLE">
33+
<MetaData>
34+
</MetaData>
35+
<Data></Data>
36+
</DataArray>
37+
</GIFTI>

nibabel/gifti/tests/test_parse_gifti_fast.py

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,18 @@
77
#
88
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
99

10-
from os.path import join as pjoin, dirname
10+
from os.path import join as pjoin, dirname, basename
1111
import sys
1212
import warnings
13+
import shutil
14+
from unittest import mock
1315

1416
import numpy as np
1517

1618
from .. import gifti as gi
1719
from ..util import gifti_endian_codes
18-
from ..parse_gifti_fast import Outputter, parse_gifti_file
20+
from ..parse_gifti_fast import (Outputter, parse_gifti_file, GiftiParseError,
21+
GiftiImageParser)
1922
from ...loadsave import load, save
2023
from ...nifti1 import xform_codes
2124
from ...tmpdirs import InTemporaryDirectory
@@ -38,9 +41,10 @@
3841
# wb_command -gifti-convert ASCII base64bin.gii test.gii
3942
DATA_FILE5 = pjoin(IO_DATA_PATH, 'base64bin.gii')
4043
DATA_FILE6 = pjoin(IO_DATA_PATH, 'rh.aparc.annot.gii')
44+
DATA_FILE7 = pjoin(IO_DATA_PATH, 'external.gii')
4145

42-
datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6]
43-
numDA = [2, 1, 1, 1, 2, 1]
46+
datafiles = [DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4, DATA_FILE5, DATA_FILE6, DATA_FILE7]
47+
numDA = [2, 1, 1, 1, 2, 1, 2]
4448

4549
DATA_FILE1_darr1 = np.array(
4650
[[-16.07201, -66.187515, 21.266994],
@@ -96,6 +100,28 @@
96100

97101
DATA_FILE6_darr1 = np.array([9182740, 9182740, 9182740], dtype=np.float32)
98102

103+
DATA_FILE7_darr1 = np.array([[-1., -1., -1.],
104+
[-1., -1., 1.],
105+
[-1., 1., -1.],
106+
[-1., 1., 1.],
107+
[ 1., -1., -1.],
108+
[ 1., -1., 1.],
109+
[ 1., 1., -1.],
110+
[ 1., 1., 1.]], dtype=np.float32)
111+
112+
DATA_FILE7_darr2 = np.array([[0, 6, 4],
113+
[0, 2, 6],
114+
[1, 5, 3],
115+
[3, 5, 7],
116+
[0, 4, 1],
117+
[1, 4, 5],
118+
[2, 7, 6],
119+
[2, 3, 7],
120+
[0, 1, 2],
121+
[1, 3, 2],
122+
[4, 7, 5],
123+
[4, 6, 7]], dtype=np.int32)
124+
99125

100126
def assert_default_types(loaded):
101127
default = loaded.__class__()
@@ -382,3 +408,57 @@ def test_parse_with_buffersize():
382408
for buff_sz in [None, 1, 2**12]:
383409
img2 = load(DATA_FILE2, buffer_size=buff_sz)
384410
assert img2.darrays[0].data.shape == (143479, 1)
411+
412+
413+
def test_dataarray7():
414+
img7 = load(DATA_FILE7)
415+
assert_array_almost_equal(img7.darrays[0].data, DATA_FILE7_darr1)
416+
assert_array_almost_equal(img7.darrays[1].data, DATA_FILE7_darr2)
417+
418+
419+
def test_parse_with_memmmap():
420+
img1 = load(DATA_FILE7)
421+
img2 = load(DATA_FILE7, mmap=True)
422+
img3 = load(DATA_FILE7, mmap=False)
423+
assert len(img1.darrays) == len(img2.darrays) == 2
424+
assert isinstance(img1.darrays[0].data, np.memmap)
425+
assert isinstance(img1.darrays[1].data, np.memmap)
426+
assert isinstance(img2.darrays[0].data, np.memmap)
427+
assert isinstance(img2.darrays[1].data, np.memmap)
428+
assert not isinstance(img3.darrays[0].data, np.memmap)
429+
assert not isinstance(img3.darrays[1].data, np.memmap)
430+
assert_array_almost_equal(img1.darrays[0].data, DATA_FILE7_darr1)
431+
assert_array_almost_equal(img1.darrays[1].data, DATA_FILE7_darr2)
432+
assert_array_almost_equal(img2.darrays[0].data, DATA_FILE7_darr1)
433+
assert_array_almost_equal(img2.darrays[1].data, DATA_FILE7_darr2)
434+
assert_array_almost_equal(img3.darrays[0].data, DATA_FILE7_darr1)
435+
assert_array_almost_equal(img3.darrays[1].data, DATA_FILE7_darr2)
436+
437+
438+
def test_parse_with_memmap_fallback():
439+
img1 = load(DATA_FILE7, mmap=True)
440+
with mock.patch('numpy.memmap', side_effect=ValueError):
441+
img2 = load(DATA_FILE7, mmap=True)
442+
assert isinstance(img1.darrays[0].data, np.memmap)
443+
assert isinstance(img1.darrays[1].data, np.memmap)
444+
assert not isinstance(img2.darrays[0].data, np.memmap)
445+
assert not isinstance(img2.darrays[1].data, np.memmap)
446+
assert_array_almost_equal(img1.darrays[0].data, DATA_FILE7_darr1)
447+
assert_array_almost_equal(img1.darrays[1].data, DATA_FILE7_darr2)
448+
assert_array_almost_equal(img2.darrays[0].data, DATA_FILE7_darr1)
449+
assert_array_almost_equal(img2.darrays[1].data, DATA_FILE7_darr2)
450+
451+
452+
def test_external_file_failure_cases():
453+
# external file cannot be found
454+
with InTemporaryDirectory() as tmpdir:
455+
shutil.copy(DATA_FILE7, '.')
456+
filename = pjoin(tmpdir, basename(DATA_FILE7))
457+
with pytest.raises(GiftiParseError):
458+
img = load(filename)
459+
# load from in-memory xml string (parser requires it as bytes)
460+
with open(DATA_FILE7, 'rb') as f:
461+
xmldata = f.read()
462+
parser = GiftiImageParser()
463+
with pytest.raises(GiftiParseError):
464+
img = parser.parse(xmldata)

0 commit comments

Comments
 (0)