diff --git a/.github/workflows/ci-linux.yaml b/.github/workflows/ci-linux.yaml index bd617ba6..bff619c9 100644 --- a/.github/workflows/ci-linux.yaml +++ b/.github/workflows/ci-linux.yaml @@ -36,6 +36,11 @@ jobs: c-compiler cxx-compiler python=${{matrix.python-version}} wheel pip + - name: Add libaec + shell: "bash -l {0}" + run: > + conda install -n env libaec -c conda-forge --no-deps + - name: Show info about `env` environment shell: "bash -l {0}" run: | diff --git a/docs/index.rst b/docs/index.rst index db9f935d..2e715c6a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -65,6 +65,7 @@ Contents zfpy zstd zlib + szip gzip bz2 lzma diff --git a/docs/release.rst b/docs/release.rst index 90d62750..78ef9325 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -18,6 +18,9 @@ Enhancements * Add ``fletcher32`` checksum codec By :user:`Martin Durant `, :issue:`410`. +* Add ``hdf5_szip`` codec + By :user:`Martin Durant `, :issue:`422`. + Fix ~~~ diff --git a/docs/szip.rst b/docs/szip.rst new file mode 100644 index 00000000..da7fda69 --- /dev/null +++ b/docs/szip.rst @@ -0,0 +1,19 @@ +SZIP +==== + +This codec is a numcodecs translation of the `HDF5 implementation`_. The binary driver is +available in the open source package ``libaec``, which must be installed for use. + +SZIP is used, for instance, by `NASA-EOSS`_. By including this codec, we +make such data readable in zarr by using kerchunk. As such, the +compression parameters are provided in the HDF5 metadata. + +.. _HDF5 implementation: https://portal.hdfgroup.org/display/HDF5/Szip+Compression+in+HDF+Products + +.. _NASA-EOSS: https://www.earthdata.nasa.gov/esdis/esco/standards-and-practices/hdf-eos5 + +If you wish to compress new data with this codec, you will need to +find reasonable values for the parameters, see `this section of HDF5 code`_ + + +.. _this section of HDF code: https://github.com/HDFGroup/hdf5/blob/7b833f04b5146bdad339ff10d42aadc416fb2f00/src/H5Zszip.c#L106-L244) diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py index 1e3c8536..84cb076f 100644 --- a/numcodecs/__init__.py +++ b/numcodecs/__init__.py @@ -114,3 +114,6 @@ from numcodecs.fletcher32 import Fletcher32 register_codec(Fletcher32) + +from numcodecs.sz import HDF5SzipCodec +register_codec(HDF5SzipCodec) diff --git a/numcodecs/sz.py b/numcodecs/sz.py new file mode 100644 index 00000000..775433dd --- /dev/null +++ b/numcodecs/sz.py @@ -0,0 +1,131 @@ +import ctypes + +from numcodecs.abc import Codec +from numcodecs.compat import ensure_ndarray_like + +# constants from libaec.h +AEC_OK = 0 +AEC_CONF_ERROR = -1 +AEC_STREAM_ERROR = -2 +AEC_DATA_ERROR = -3 +AEC_MEM_ERROR = -4 + +# constants from szlib.h +# Only SZ_MSB_OPTION_MASK and SZ_NN_OPTION_MASK are relevant to the libaec implementation +SZ_ALLOW_K13_OPTION_MASK = 1 +SZ_CHIP_OPTION_MASK = 2 +SZ_EC_OPTION_MASK = 4 +SZ_LSB_OPTION_MASK = 8 +SZ_MSB_OPTION_MASK = 16 +SZ_NN_OPTION_MASK = 32 +SZ_RAW_OPTION_MASK = 128 + +SZ_OK = AEC_OK +SZ_OUTBUFF_FULL = 2 + +SZ_NO_ENCODER_ERROR = -1 +SZ_PARAM_ERROR = AEC_CONF_ERROR +SZ_MEM_ERROR = AEC_MEM_ERROR + +SZ_MAX_PIXELS_PER_BLOCK = 32 +SZ_MAX_BLOCKS_PER_SCANLINE = 128 +SZ_MAX_PIXELS_PER_SCANLINE = SZ_MAX_PIXELS_PER_BLOCK * SZ_MAX_BLOCKS_PER_SCANLINE + + +class Params(ctypes.Structure): + + _fields_ = [ + ("options_mask", ctypes.c_int), + ("bits_per_pixel", ctypes.c_int), + ("pixels_per_block", ctypes.c_int), + ("pixels_per_scanline", ctypes.c_int) + ] + + +libsz = False +for ext in [".so", ".dylib", ".dll"]: + try: + libsz = ctypes.cdll.LoadLibrary("libsz" + ext) + except OSError: + pass + +if libsz: + BufftoBuffCompress = libsz.SZ_BufftoBuffCompress + BufftoBuffCompress.restype = ctypes.c_int + BufftoBuffCompress.argtypes = [ + ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_size_t, + ctypes.POINTER(Params) + ] + + BufftoBuffDecompress = libsz.SZ_BufftoBuffDecompress + BufftoBuffDecompress.restype = ctypes.c_int + BufftoBuffDecompress.argtypes = [ + ctypes.c_void_p, ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_size_t, + ctypes.POINTER(Params) + ] + + +def check_sz(): + if not libsz: # pragma: no cover + raise ImportError("libsz could not be loaded, please install libaec") + + +class HDF5SzipCodec(Codec): + """The SZIP codec, as implemented in NASA HDF5 + + See description: + + The shared library used could in principle be the original SZIP, but it is + also includes in libaec, which is available on conda-forge and recommended. + + All parameters must be defined. + """ + + codec_id = "hdf5_szip" + + def __init__(self, mask, bits_per_pixel, pixels_per_block, pixels_per_scanline): + assert pixels_per_block <= SZ_MAX_PIXELS_PER_BLOCK + assert pixels_per_scanline <= SZ_MAX_PIXELS_PER_SCANLINE + self.mask = mask + self.pixels_per_block = pixels_per_block + self.bits_per_pixel = bits_per_pixel + self.pixels_per_scanline = pixels_per_scanline + + def decode(self, buf, out=None): + check_sz() + buf = memoryview(buf) + param = Params( + self.mask, self.bits_per_pixel, self.pixels_per_block, self.pixels_per_scanline + ) + lout = int.from_bytes(buf[:4], "little") + dest_len = ctypes.c_size_t(lout) + p_dest_len = ctypes.pointer(dest_len) + if out is None: + out = ctypes.create_string_buffer(lout) + buf2 = ctypes.c_char_p(buf.obj[4:]) + ok = BufftoBuffDecompress( + out, p_dest_len, buf2, buf.nbytes - 4, param + ) + assert dest_len.value == lout # that we got the expected number of bytes + assert ok == 0 + return out + + def encode(self, buf): + check_sz() + buf = ensure_ndarray_like(buf) + param = Params( + self.mask, self.bits_per_pixel, self.pixels_per_block, self.pixels_per_scanline + ) + buf_nbytes = buf.nbytes + buf2 = buf.ctypes.data + out = ctypes.create_string_buffer(buf_nbytes+4) + # Store input nbytes as first four bytes little endian + in_len = ctypes.c_int32(buf_nbytes) + out[:4] = bytes(in_len) + dest_len = ctypes.c_size_t(buf_nbytes) + p_dest_len = ctypes.pointer(dest_len) + ok = BufftoBuffCompress( + ctypes.addressof(out)+4, p_dest_len, buf2, buf_nbytes, param + ) + assert ok == 0 + return out[:dest_len.value+4] diff --git a/numcodecs/tests/test_sz.py b/numcodecs/tests/test_sz.py new file mode 100644 index 00000000..bbaffd9b --- /dev/null +++ b/numcodecs/tests/test_sz.py @@ -0,0 +1,39 @@ +import pytest + +import numpy as np + +from numcodecs.sz import HDF5SzipCodec, libsz + + +@pytest.mark.skipif(not libsz, reason="no libaec") +def test_canonical(): + sample_buffer = b'\x00\x02\x00\x00\x15UUUUUUUQUUUUUUUU\x15UUUUUUUQUUUUUUUU' \ + b'\x15UUUUUUUQUUUUUUUU\x15UUUUUUUQUUUUUUUU' + # pl = h5obj.id.get_create_plist().get_filter(0) + # mask, pixels_per_block, bits_per_pixel, pixels_per_scanline = pl[2] + # (141, 32, 16, 256) + # lout = 512 + + codec = HDF5SzipCodec(mask=141, pixels_per_block=32, bits_per_pixel=16, pixels_per_scanline=256) + out = codec.decode(sample_buffer) # Bus Error + arr = np.frombuffer(out, dtype="uint16") + assert (arr == 1).all() + + comp_buff = codec.encode(arr) + assert comp_buff == sample_buffer + + +@pytest.mark.skipif(not libsz, reason="no libaec") +@pytest.mark.parametrize( + "shape", [(10, 100), (10000,)] +) +@pytest.mark.parametrize( + "dtype", ["uint16", "int32", "int64"] +) +def test_random(shape, dtype): + arr = np.random.randint(0, 40, size=np.prod(shape), dtype=dtype).reshape(shape) + codec = HDF5SzipCodec(mask=141, pixels_per_block=32, bits_per_pixel=16, pixels_per_scanline=256) + buff = codec.encode(arr) + buff2 = codec.decode(buff) + arr2 = np.frombuffer(buff2, dtype=dtype).reshape(shape) + assert (arr == arr2).all()