Skip to content

BF: Make sure xml is encoded as utf-8 #354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nibabel/gifti/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@

from .giftiio import read, write
from .gifti import (GiftiMetaData, GiftiNVPairs, GiftiLabelTable, GiftiLabel,
GiftiCoordSystem, data_tag, GiftiDataArray, GiftiImage)
GiftiCoordSystem, GiftiDataArray, GiftiImage)
172 changes: 95 additions & 77 deletions nibabel/gifti/gifti.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import numpy as np

from .. import xmlutils as xml
from ..nifti1 import data_type_codes, xform_codes, intent_codes
from .util import (array_index_order_codes, gifti_encoding_codes,
gifti_endian_codes, KIND2FMT)
Expand All @@ -22,7 +23,7 @@
import base64


class GiftiMetaData(object):
class GiftiMetaData(xml.XmlSerializable):
""" A list of GiftiNVPairs in stored in
the list self.data """
def __init__(self, nvpair=None):
Expand Down Expand Up @@ -50,18 +51,15 @@ def metadata(self):
self.data_as_dict[ele.name] = ele.value
return self.data_as_dict

def to_xml(self):
if len(self.data) == 0:
return "<MetaData/>\n"
res = "<MetaData>\n"
def _to_xml_element(self):
metadata = xml.Element('MetaData')
for ele in self.data:
nvpair = """<MD>
\t<Name><![CDATA[%s]]></Name>
\t<Value><![CDATA[%s]]></Value>
</MD>\n""" % (ele.name, ele.value)
res = res + nvpair
res = res + "</MetaData>\n"
return res
md = xml.SubElement(metadata, 'MD')
name = xml.SubElement(md, 'Name')
value = xml.SubElement(md, 'Value')
name.text = ele.name
value.text = ele.value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we lose the CDATA bit here? Maybe:

name.text = '<![CDATA[{0}]]>'.format(ele.name)
value.text = '<![CDATA[{0}]]>'.format(ele.value)

Edit: Or possibly we don't care, because xml will handle any escaping that needs to be done? (I am not very familiar with XML.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the CDATA bit is a way to avoid encoding strings as XML-safe. That trick isn't necessary now; the library will XML encode.

In addition, if you do as you suggest, I believe the < and > characters will be escaped, and you'll get something you don't want.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right. Just did a little reading and playing in ipython.

return metadata

def print_summary(self):
print(self.metadata)
Expand All @@ -77,7 +75,7 @@ def __init__(self, name='', value=''):
self.value = value


class GiftiLabelTable(object):
class GiftiLabelTable(xml.XmlSerializable):

def __init__(self):
self.labels = []
Expand All @@ -88,31 +86,22 @@ def get_labels_as_dict(self):
self.labels_as_dict[ele.key] = ele.label
return self.labels_as_dict

def to_xml(self):
if len(self.labels) == 0:
return "<LabelTable/>\n"
res = "<LabelTable>\n"
def _to_xml_element(self):
labeltable = xml.Element('LabelTable')
for ele in self.labels:
col = ''
if not ele.red is None:
col += ' Red="%s"' % str(ele.red)
if not ele.green is None:
col += ' Green="%s"' % str(ele.green)
if not ele.blue is None:
col += ' Blue="%s"' % str(ele.blue)
if not ele.alpha is None:
col += ' Alpha="%s"' % str(ele.alpha)
lab = """\t<Label Key="%s"%s><![CDATA[%s]]></Label>\n""" % \
(str(ele.key), col, ele.label)
res = res + lab
res = res + "</LabelTable>\n"
return res
label = xml.SubElement(labeltable, 'Label')
label.attrib['Key'] = str(ele.key)
label.text = ele.label
for attr in ['Red', 'Green', 'Blue', 'Alpha']:
if getattr(ele, attr.lower(), None) is not None:
label.attrib[attr] = str(getattr(ele, attr.lower()))
return labeltable

def print_summary(self):
print(self.get_labels_as_dict())


class GiftiLabel(object):
class GiftiLabel(xml.XmlSerializable):
key = int
label = str
# rgba
Expand Down Expand Up @@ -165,7 +154,7 @@ def _arr2txt(arr, elem_fmt):
return '\n'.join(fmt % tuple(row) for row in arr)


class GiftiCoordSystem(object):
class GiftiCoordSystem(xml.XmlSerializable):
dataspace = int
xformspace = int
xform = np.ndarray # 4x4 numpy array
Expand All @@ -179,28 +168,37 @@ def __init__(self, dataspace=0, xformspace=0, xform=None):
else:
self.xform = xform

def to_xml(self):
if self.xform is None:
return "<CoordinateSystemTransformMatrix/>\n"
res = ("""<CoordinateSystemTransformMatrix>
\t<DataSpace><![CDATA[%s]]></DataSpace>
\t<TransformedSpace><![CDATA[%s]]></TransformedSpace>\n"""
% (xform_codes.niistring[self.dataspace],
xform_codes.niistring[self.xformspace]))
res = res + "<MatrixData>\n"
res += _arr2txt(self.xform, '%10.6f')
res = res + "</MatrixData>\n"
res = res + "</CoordinateSystemTransformMatrix>\n"
return res
def _to_xml_element(self):
coord_xform = xml.Element('CoordinateSystemTransformMatrix')
if self.xform is not None:
dataspace = xml.SubElement(coord_xform, 'DataSpace')
dataspace.text = xform_codes.niistring[self.dataspace]
xformed_space = xml.SubElement(coord_xform, 'TransformedSpace')
xformed_space.text = xform_codes.niistring[self.xformspace]
matrix_data = xml.SubElement(coord_xform, 'MatrixData')
matrix_data.text = _arr2txt(self.xform, '%10.6f')
return coord_xform

def print_summary(self):
print('Dataspace: ', xform_codes.niistring[self.dataspace])
print('XFormSpace: ', xform_codes.niistring[self.xformspace])
print('Affine Transformation Matrix: \n', self.xform)


@np.deprecate_with_doc("This is an internal API that will be discontinued.")
def data_tag(dataarray, encoding, datatype, ordering):
""" Creates the data tag depending on the required encoding """
class DataTag(xml.XmlSerializable):
def __init__(self, *args):
self.args = args
def _to_xml_element(self):
return _data_tag_element(*self.args)

return DataTag(dataarray, encoding, datatype, ordering).to_xml()


def _data_tag_element(dataarray, encoding, datatype, ordering):
""" Creates the data tag depending on the required encoding,
returns as XML element"""
import zlib
ord = array_index_order_codes.npcode[ordering]
enclabel = gifti_encoding_codes.label[encoding]
Expand All @@ -215,10 +213,13 @@ def data_tag(dataarray, encoding, datatype, ordering):
raise NotImplementedError("In what format are the external files?")
else:
da = ''
return "<Data>" + da + "</Data>\n"

data = xml.Element('Data')
data.text = da
return data
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return xml.Element('Data', text=da)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I read the docs and had tried this. On my machine it set an attribute: <Data text='blah' />.



class GiftiDataArray(object):
class GiftiDataArray(xml.XmlSerializable):

# These are for documentation only; we don't use these class variables
intent = int
Expand Down Expand Up @@ -299,26 +300,37 @@ def from_array(klass,
cda.meta = GiftiMetaData.from_dict(meta)
return cda

def to_xml(self):
def _to_xml_element(self):
# fix endianness to machine endianness
self.endian = gifti_endian_codes.code[sys.byteorder]
result = ""
result += self.to_xml_open()
# write metadata
if not self.meta is None:
result += self.meta.to_xml()
# write coord sys
if not self.coordsys is None:
result += self.coordsys.to_xml()

data_array = xml.Element('DataArray', attrib={
'Intent': intent_codes.niistring[self.intent],
'DataType': data_type_codes.niistring[self.datatype],
'ArrayIndexingOrder': array_index_order_codes.label[self.ind_ord],
'Dimensionality': str(self.num_dim),
'Encoding': gifti_encoding_codes.specs[self.encoding],
'Endian': gifti_endian_codes.specs[self.endian],
'ExternalFileName': self.ext_fname,
'ExternalFileOffset': self.ext_offset})
for di, dn in enumerate(self.dims):
data_array.attrib['Dim%d' % di] = str(dn)

if self.meta is not None:
data_array.append(self.meta._to_xml_element())
if self.coordsys is not None:
data_array.append(self.coordsys._to_xml_element())
# write data array depending on the encoding
dt_kind = data_type_codes.dtype[self.datatype].kind
result += data_tag(self.data,
gifti_encoding_codes.specs[self.encoding],
KIND2FMT[dt_kind],
self.ind_ord)
result = result + self.to_xml_close()
return result
data_array.append(
_data_tag_element(self.data,
gifti_encoding_codes.specs[self.encoding],
KIND2FMT[dt_kind],
self.ind_ord))

return data_array

@np.deprecate_with_doc("Use the to_xml() function instead.")
def to_xml_open(self):
out = """<DataArray Intent="%s"
\tDataType="%s"
Expand All @@ -342,6 +354,7 @@ def to_xml_open(self):
self.ext_offset,
)

@np.deprecate_with_doc("Use the to_xml() function instead.")
def to_xml_close(self):
return "</DataArray>\n"

Expand Down Expand Up @@ -371,7 +384,8 @@ def metadata(self):
return self.meta.metadata


class GiftiImage(object):
class GiftiImage(xml.XmlSerializable):

def __init__(self, meta=None, labeltable=None, darrays=None,
version="1.0"):
if darrays is None:
Expand Down Expand Up @@ -497,17 +511,21 @@ def print_summary(self):
print(da.print_summary())
print('----end----')

def to_xml(self):

def _to_xml_element(self):
GIFTI = xml.Element('GIFTI', attrib={
'Version': self.version,
'NumberOfDataArrays': str(self.numDA)})
if self.meta is not None:
GIFTI.append(self.meta._to_xml_element())
if self.labeltable is not None:
GIFTI.append(self.labeltable._to_xml_element())
for dar in self.darrays:
GIFTI.append(dar._to_xml_element())
return GIFTI

def to_xml(self, enc='utf-8'):
""" Return XML corresponding to image content """
res = """<?xml version="1.0" encoding="UTF-8"?>
return b"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/115/gifti.dtd">
<GIFTI Version="%s" NumberOfDataArrays="%s">\n""" % (self.version,
str(self.numDA))
if not self.meta is None:
res += self.meta.to_xml()
if not self.labeltable is None:
res += self.labeltable.to_xml()
for dar in self.darrays:
res += dar.to_xml()
res += "</GIFTI>"
return res
""" + xml.XmlSerializable.to_xml(self, enc)
2 changes: 1 addition & 1 deletion nibabel/gifti/giftiio.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ def write(image, filename):
The Gifti file is stored in endian convention of the current machine.
"""
# Our giftis are always utf-8 encoded - see GiftiImage.to_xml
with codecs.open(filename, 'wb', encoding='utf-8') as f:
with open(filename, 'wb') as f:
f.write(image.to_xml())
34 changes: 27 additions & 7 deletions nibabel/gifti/tests/test_gifti.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@

import numpy as np

from nibabel.gifti import giftiio
from nibabel.externals.six import string_types
from nibabel.gifti import (GiftiImage, GiftiDataArray, GiftiLabel,
GiftiLabelTable, GiftiMetaData, giftiio)
from nibabel.gifti.gifti import data_tag
from nibabel.nifti1 import data_type_codes, intent_codes

from .test_giftiio import (DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4,
DATA_FILE5, DATA_FILE6)
from ..gifti import (GiftiImage, GiftiDataArray, GiftiLabel, GiftiLabelTable,
GiftiMetaData)
from ...nifti1 import data_type_codes, intent_codes
from ...testing import clear_and_catch_warnings
from numpy.testing import (assert_array_almost_equal,
assert_array_equal)
from nose.tools import (assert_true, assert_false, assert_equal, assert_raises)
from nibabel.testing import clear_and_catch_warnings
from .test_giftiio import (DATA_FILE1, DATA_FILE2, DATA_FILE3, DATA_FILE4,
DATA_FILE5, DATA_FILE6)


def test_gifti_image():
Expand Down Expand Up @@ -70,6 +71,17 @@ def test_dataarray():
da = GiftiDataArray.from_array(bs_arr, 'triangle')
assert_equal(da.datatype, data_type_codes[arr.dtype])

# Smoke test on deprecated functions
da = GiftiDataArray.from_array(np.ones((1,)), 'triangle')
with clear_and_catch_warnings() as w:
warnings.filterwarnings('always', category=DeprecationWarning)
assert_true(isinstance(da.to_xml_open(), string_types))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

string_types needs to be imported from externals.six.

Rebase and I think we're almost done.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Sorry for all the errors; I'm juggling changes three branches (trying to keep things simple/clean #fail) and I'm doing a poor job :)

Fixed, rebased, and pushed up. Looking forward to getting this one in; the next PR is the most interesting one! Looking forward to discussing!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries. Thanks for your patience. Heading out of town in a couple hours, so the next PR might have to wait for Monday, but I do want to get this in so you don't have to juggle too much more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for all your work. No worries--that one is a WIP and will take some time. I've just been looking forward to beginning that discussion!

I made the change requested. Just more sloppiness from my side. Reordered & diff looks cleaner now.

assert_equal(len(w), 1)
with clear_and_catch_warnings() as w:
warnings.filterwarnings('once', category=DeprecationWarning)
assert_true(isinstance(da.to_xml_close(), string_types))
assert_equal(len(w), 1)


def test_labeltable():
img = GiftiImage()
Expand Down Expand Up @@ -163,3 +175,11 @@ def assign_labeltable(val):
def assign_metadata(val):
img.meta = val
assert_raises(TypeError, assign_metadata, 'not-a-meta')


def test_data_tag_deprecated():
img = GiftiImage()
with clear_and_catch_warnings() as w:
warnings.filterwarnings('once', category=DeprecationWarning)
data_tag(np.array([]), 'ASCII', '%i', 1)
assert_equal(len(w), 1)
25 changes: 25 additions & 0 deletions nibabel/xmlutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the NiBabel package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""
Thin layer around xml.etree.ElementTree, to abstract nibabel xml support.
"""
from xml.etree.ElementTree import Element, SubElement, tostring


class XmlSerializable(object):
""" Basic interface for serializing an object to xml"""

def _to_xml_element(self):
""" Output should be a xml.etree.ElementTree.Element"""
raise NotImplementedError()

def to_xml(self, enc='utf-8'):
""" Output should be an xml string with the given encoding.
(default: utf-8)"""
return tostring(self._to_xml_element(), enc)