11
11
import sys
12
12
import warnings
13
13
import zlib
14
+ import os .path as op
14
15
from io import StringIO
15
16
from xml .parsers .expat import ExpatError
16
17
@@ -30,45 +31,109 @@ class GiftiParseError(ExpatError):
30
31
""" Gifti-specific parsing error """
31
32
32
33
33
- def read_data_block (encoding , endian , ordering , datatype , shape , data ):
34
- """ Tries to unzip, decode, parse the funny string data """
35
- enclabel = gifti_encoding_codes .label [encoding ]
36
- dtype = data_type_codes .type [datatype ]
34
+ def read_data_block (darray , fname , data , mmap ):
35
+ """Parses data from a <Data> element, or loads from an external file.
36
+
37
+ Parameters
38
+ ----------
39
+ darray : GiftiDataArray
40
+ GiftiDataArray object representing the parent <DataArray> of this
41
+ <Data> element
42
+
43
+ fname : str or None
44
+ Name of GIFTI file being loaded, or None if in-memory
45
+
46
+ data : str or None
47
+ Data to parse, or None if data is in an external file
48
+
49
+ mmap : {True, False, 'c', 'r', 'r+'}
50
+ Controls the use of numpy memory mapping for reading data. Only has
51
+ an effect when loading GIFTI images with data stored in external files
52
+ (``DataArray`` elements with an ``Encoding`` equal to
53
+ ``ExternalFileBinary``). If ``False``, do not try numpy ``memmap``
54
+ for data array. If one of ``{'c', 'r', 'r+'}``, try numpy ``memmap``
55
+ with ``mode=mmap``. A `mmap` value of ``True`` gives the same
56
+ behavior as ``mmap='c'``. If the file cannot be memory-mapped, ignore
57
+ `mmap` value and read array from file.
58
+
59
+ Returns
60
+ -------
61
+ ``numpy.ndarray`` or ``numpy.memmap`` containing the parsed data
62
+ """
63
+ if mmap not in (True , False , 'c' , 'r' , 'r+' ):
64
+ raise ValueError ("mmap value should be one of True, False, 'c', "
65
+ "'r', 'r+'" )
66
+ if mmap is True :
67
+ mmap = 'c'
68
+ enclabel = gifti_encoding_codes .label [darray .encoding ]
69
+ dtype = data_type_codes .type [darray .datatype ]
70
+
37
71
if enclabel == 'ASCII' :
38
72
# GIFTI_ENCODING_ASCII
39
73
c = StringIO (data )
40
74
da = np .loadtxt (c , dtype = dtype )
41
75
return da # independent of the endianness
42
-
43
- elif enclabel == 'External' :
44
- # GIFTI_ENCODING_EXTBIN
45
- raise NotImplementedError ("In what format are the external files?" )
46
-
47
- elif enclabel not in ('B64BIN' , 'B64GZ' ):
76
+ elif enclabel not in ('B64BIN' , 'B64GZ' , 'External' ):
48
77
return 0
49
78
79
+ # GIFTI_ENCODING_EXTBIN
80
+ # We assume that the external data file is raw uncompressed binary, with
81
+ # the data type/endianness/ordering specified by the other DataArray
82
+ # attributes
83
+ if enclabel == 'External' :
84
+ if fname is None :
85
+ raise GiftiParseError ('ExternalFileBinary is not supported '
86
+ 'when loading from in-memory XML' )
87
+ ext_fname = op .join (op .dirname (fname ), darray .ext_fname )
88
+ if not op .exists (ext_fname ):
89
+ raise GiftiParseError ('Cannot locate external file ' + ext_fname )
90
+ # We either create a memmap, or load into memory
91
+ newarr = None
92
+ if mmap :
93
+ try :
94
+ newarr = np .memmap (ext_fname ,
95
+ dtype = dtype ,
96
+ mode = mmap ,
97
+ offset = darray .ext_offset ,
98
+ shape = tuple (darray .dims ))
99
+ # If the memmap fails, we ignore the error and load the data into
100
+ # memory below
101
+ except (AttributeError , TypeError , ValueError ):
102
+ pass
103
+ # mmap=False or np.memmap failed
104
+ if newarr is None :
105
+ # We can replace this with a call to np.fromfile in numpy>=1.17,
106
+ # as an "offset" parameter was added in that version.
107
+ with open (ext_fname , 'rb' ) as f :
108
+ f .seek (darray .ext_offset )
109
+ nbytes = np .prod (darray .dims ) * dtype ().itemsize
110
+ buff = f .read (nbytes )
111
+ newarr = np .frombuffer (buff , dtype = dtype )
112
+
50
113
# Numpy arrays created from bytes objects are read-only.
51
114
# Neither b64decode nor decompress will return bytearrays, and there
52
115
# are not equivalents to fobj.readinto to allow us to pass them, so
53
116
# there is not a simple way to avoid making copies.
54
117
# If this becomes a problem, we should write a decoding interface with
55
118
# a tunable chunk size.
56
- dec = base64 .b64decode (data .encode ('ascii' ))
57
- if enclabel == 'B64BIN' :
58
- # GIFTI_ENCODING_B64BIN
59
- buff = bytearray (dec )
60
119
else :
61
- # GIFTI_ENCODING_B64GZ
62
- buff = bytearray (zlib .decompress (dec ))
63
- del dec
64
-
65
- sh = tuple (shape )
66
- newarr = np .frombuffer (buff , dtype = dtype )
120
+ dec = base64 .b64decode (data .encode ('ascii' ))
121
+ if enclabel == 'B64BIN' :
122
+ # GIFTI_ENCODING_B64BIN
123
+ buff = bytearray (dec )
124
+ else :
125
+ # GIFTI_ENCODING_B64GZ
126
+ buff = bytearray (zlib .decompress (dec ))
127
+ del dec
128
+ newarr = np .frombuffer (buff , dtype = dtype )
129
+
130
+ sh = tuple (darray .dims )
67
131
if len (newarr .shape ) != len (sh ):
68
- newarr = newarr .reshape (sh , order = array_index_order_codes .npcode [ordering ])
132
+ newarr = newarr .reshape (
133
+ sh , order = array_index_order_codes .npcode [darray .ind_ord ])
69
134
70
135
# check if we need to byteswap
71
- required_byteorder = gifti_endian_codes .byteorder [endian ]
136
+ required_byteorder = gifti_endian_codes .byteorder [darray . endian ]
72
137
if (required_byteorder in ('big' , 'little' ) and
73
138
required_byteorder != sys .byteorder ):
74
139
newarr = newarr .byteswap ()
@@ -82,13 +147,17 @@ def _str2int(in_str):
82
147
83
148
class GiftiImageParser (XmlParser ):
84
149
85
- def __init__ (self , encoding = None , buffer_size = 35000000 , verbose = 0 ):
150
+ def __init__ (self , encoding = None , buffer_size = 35000000 , verbose = 0 ,
151
+ mmap = True ):
86
152
super (GiftiImageParser , self ).__init__ (encoding = encoding ,
87
153
buffer_size = buffer_size ,
88
154
verbose = verbose )
89
155
# output
90
156
self .img = None
91
157
158
+ # Queried when loading data from <Data> elements - see read_data_block
159
+ self .mmap = mmap
160
+
92
161
# finite state machine stack
93
162
self .fsm_state = []
94
163
@@ -288,12 +357,17 @@ def CharacterDataHandler(self, data):
288
357
289
358
def flush_chardata (self ):
290
359
""" Collate and process collected character data"""
291
- if self ._char_blocks is None :
360
+ # Nothing to do for empty elements, except for Data elements which
361
+ # are within a DataArray with an external file
362
+ if self .write_to != 'Data' and self ._char_blocks is None :
292
363
return
293
364
# Just join the strings to get the data. Maybe there are some memory
294
365
# optimizations we could do by passing the list of strings to the
295
366
# read_data_block function.
296
- data = '' .join (self ._char_blocks )
367
+ if self ._char_blocks is not None :
368
+ data = '' .join (self ._char_blocks )
369
+ else :
370
+ data = None
297
371
# Reset the char collector
298
372
self ._char_blocks = None
299
373
@@ -321,10 +395,8 @@ def flush_chardata(self):
321
395
c .close ()
322
396
323
397
elif self .write_to == 'Data' :
324
- da_tmp = self .img .darrays [- 1 ]
325
- da_tmp .data = read_data_block (da_tmp .encoding , da_tmp .endian ,
326
- da_tmp .ind_ord , da_tmp .datatype ,
327
- da_tmp .dims , data )
398
+ self .da .data = read_data_block (self .da , self .fname , data ,
399
+ self .mmap )
328
400
# update the endianness according to the
329
401
# current machine setting
330
402
self .endian = gifti_endian_codes .code [sys .byteorder ]
0 commit comments