Skip to content

Add sz using ctypes #422

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci-linux.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ jobs:
c-compiler cxx-compiler
python=${{matrix.python-version}} wheel pip

- name: Add libaec
shell: "bash -l {0}"
run: >
conda install -n env libaec -c conda-forge --no-deps

- name: Show info about `env` environment
shell: "bash -l {0}"
run: |
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Contents
zfpy
zstd
zlib
szip
gzip
bz2
lzma
Expand Down
3 changes: 3 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ Enhancements
* Add ``fletcher32`` checksum codec
By :user:`Martin Durant <martindurant>`, :issue:`410`.

* Add ``hdf5_szip`` codec
By :user:`Martin Durant <martindurant>`, :issue:`422`.

Fix
~~~

Expand Down
19 changes: 19 additions & 0 deletions docs/szip.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
SZIP
====

This codec is a numcodecs translation of the `HDF5 implementation`_. The binary driver is
available in the open source package ``libaec``, which must be installed for use.

SZIP is used, for instance, by `NASA-EOSS`_. By including this codec, we
make such data readable in zarr by using kerchunk. As such, the
compression parameters are provided in the HDF5 metadata.

.. _HDF5 implementation: https://portal.hdfgroup.org/display/HDF5/Szip+Compression+in+HDF+Products

.. _NASA-EOSS: https://www.earthdata.nasa.gov/esdis/esco/standards-and-practices/hdf-eos5

If you wish to compress new data with this codec, you will need to
find reasonable values for the parameters, see `this section of HDF5 code`_


.. _this section of HDF code: https://github.com/HDFGroup/hdf5/blob/7b833f04b5146bdad339ff10d42aadc416fb2f00/src/H5Zszip.c#L106-L244)
3 changes: 3 additions & 0 deletions numcodecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,6 @@

from numcodecs.fletcher32 import Fletcher32
register_codec(Fletcher32)

from numcodecs.sz import HDF5SzipCodec
register_codec(HDF5SzipCodec)
131 changes: 131 additions & 0 deletions numcodecs/sz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import ctypes

from numcodecs.abc import Codec
from numcodecs.compat import ensure_ndarray_like

# constants from libaec.h
AEC_OK = 0
AEC_CONF_ERROR = -1
AEC_STREAM_ERROR = -2
AEC_DATA_ERROR = -3
AEC_MEM_ERROR = -4

# constants from szlib.h
# Only SZ_MSB_OPTION_MASK and SZ_NN_OPTION_MASK are relevant to the libaec implementation
SZ_ALLOW_K13_OPTION_MASK = 1
SZ_CHIP_OPTION_MASK = 2
SZ_EC_OPTION_MASK = 4
SZ_LSB_OPTION_MASK = 8
SZ_MSB_OPTION_MASK = 16
SZ_NN_OPTION_MASK = 32
SZ_RAW_OPTION_MASK = 128

SZ_OK = AEC_OK
SZ_OUTBUFF_FULL = 2

SZ_NO_ENCODER_ERROR = -1
SZ_PARAM_ERROR = AEC_CONF_ERROR
SZ_MEM_ERROR = AEC_MEM_ERROR

SZ_MAX_PIXELS_PER_BLOCK = 32
SZ_MAX_BLOCKS_PER_SCANLINE = 128
SZ_MAX_PIXELS_PER_SCANLINE = SZ_MAX_PIXELS_PER_BLOCK * SZ_MAX_BLOCKS_PER_SCANLINE


class Params(ctypes.Structure):

_fields_ = [
("options_mask", ctypes.c_int),
("bits_per_pixel", ctypes.c_int),
("pixels_per_block", ctypes.c_int),
("pixels_per_scanline", ctypes.c_int)
]


libsz = False
for ext in [".so", ".dylib", ".dll"]:
try:
libsz = ctypes.cdll.LoadLibrary("libsz" + ext)
except OSError:
pass

if libsz:
BufftoBuffCompress = libsz.SZ_BufftoBuffCompress
BufftoBuffCompress.restype = ctypes.c_int
BufftoBuffCompress.argtypes = [
ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_size_t,
ctypes.POINTER(Params)
]

BufftoBuffDecompress = libsz.SZ_BufftoBuffDecompress
BufftoBuffDecompress.restype = ctypes.c_int
BufftoBuffDecompress.argtypes = [
ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_size_t,
ctypes.POINTER(Params)
]


def check_sz():
if not libsz: # pragma: no cover
raise ImportError("libsz could not be loaded, please install libaec")


class HDF5SzipCodec(Codec):
"""The SZIP codec, as implemented in NASA HDF5

See description:

The shared library used could in principle be the original SZIP, but it is
also includes in libaec, which is available on conda-forge and recommended.

All parameters must be defined.
"""

codec_id = "hdf5_szip"

def __init__(self, mask, bits_per_pixel, pixels_per_block, pixels_per_scanline):
assert pixels_per_block <= SZ_MAX_PIXELS_PER_BLOCK
assert pixels_per_scanline <= SZ_MAX_PIXELS_PER_SCANLINE
self.mask = mask
self.pixels_per_block = pixels_per_block
self.bits_per_pixel = bits_per_pixel
self.pixels_per_scanline = pixels_per_scanline

def decode(self, buf, out=None):
check_sz()
buf = memoryview(buf)
param = Params(
self.mask, self.bits_per_pixel, self.pixels_per_block, self.pixels_per_scanline
)
lout = int.from_bytes(buf[:4], "little")
dest_len = ctypes.c_size_t(lout)
p_dest_len = ctypes.pointer(dest_len)
if out is None:
out = ctypes.create_string_buffer(lout)
buf2 = ctypes.c_char_p(buf.obj[4:])
ok = BufftoBuffDecompress(
out, p_dest_len, buf2, buf.nbytes - 4, param
)
assert dest_len.value == lout # that we got the expected number of bytes
assert ok == 0
return out

def encode(self, buf):
check_sz()
buf = ensure_ndarray_like(buf)
param = Params(
self.mask, self.bits_per_pixel, self.pixels_per_block, self.pixels_per_scanline
)
buf_nbytes = buf.nbytes
buf2 = buf.ctypes.data
out = ctypes.create_string_buffer(buf_nbytes+4)
# Store input nbytes as first four bytes little endian
in_len = ctypes.c_int32(buf_nbytes)
out[:4] = bytes(in_len)
dest_len = ctypes.c_size_t(buf_nbytes)
p_dest_len = ctypes.pointer(dest_len)
ok = BufftoBuffCompress(
ctypes.addressof(out)+4, p_dest_len, buf2, buf_nbytes, param
)
assert ok == 0
return out[:dest_len.value+4]
39 changes: 39 additions & 0 deletions numcodecs/tests/test_sz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pytest

import numpy as np

from numcodecs.sz import HDF5SzipCodec, libsz


@pytest.mark.skipif(not libsz, reason="no libaec")
def test_canonical():
sample_buffer = b'\x00\x02\x00\x00\x15UUUUUUUQUUUUUUUU\x15UUUUUUUQUUUUUUUU' \
b'\x15UUUUUUUQUUUUUUUU\x15UUUUUUUQUUUUUUUU'
# pl = h5obj.id.get_create_plist().get_filter(0)
# mask, pixels_per_block, bits_per_pixel, pixels_per_scanline = pl[2]
# (141, 32, 16, 256)
# lout = 512

codec = HDF5SzipCodec(mask=141, pixels_per_block=32, bits_per_pixel=16, pixels_per_scanline=256)
out = codec.decode(sample_buffer) # Bus Error
arr = np.frombuffer(out, dtype="uint16")
assert (arr == 1).all()

comp_buff = codec.encode(arr)
assert comp_buff == sample_buffer


@pytest.mark.skipif(not libsz, reason="no libaec")
@pytest.mark.parametrize(
"shape", [(10, 100), (10000,)]
)
@pytest.mark.parametrize(
"dtype", ["uint16", "int32", "int64"]
)
def test_random(shape, dtype):
arr = np.random.randint(0, 40, size=np.prod(shape), dtype=dtype).reshape(shape)
codec = HDF5SzipCodec(mask=141, pixels_per_block=32, bits_per_pixel=16, pixels_per_scanline=256)
buff = codec.encode(arr)
buff2 = codec.decode(buff)
arr2 = np.frombuffer(buff2, dtype=dtype).reshape(shape)
assert (arr == arr2).all()