From 4f9d8ae04107ae6f76a1ce70f6d252c73c2f4935 Mon Sep 17 00:00:00 2001 From: Tiramisu Mokka Date: Wed, 8 Oct 2025 17:16:07 +0200 Subject: [PATCH] RFC: keep hdf5 file open We see that an application that accesses an HDF5 file with the CLAM library creates an extremely high load on the metadata servers if the data is on a cluster filesystem. The reason is that the file is opened and closed again for every item access during iteration. This change is just an idea of how such an access pattern can be optimised by keeping the file open and closing it when the object is deleted. NOTE: The code is not tested and should be treated only as a proof of concept (PoC). --- dataset_modules/dataset_h5.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dataset_modules/dataset_h5.py b/dataset_modules/dataset_h5.py index bbc19ed9..6108a224 100755 --- a/dataset_modules/dataset_h5.py +++ b/dataset_modules/dataset_h5.py @@ -60,11 +60,11 @@ def __init__(self, self.file_path = file_path - with h5py.File(self.file_path, "r") as f: - dset = f['coords'] - self.patch_level = f['coords'].attrs['patch_level'] - self.patch_size = f['coords'].attrs['patch_size'] - self.length = len(dset) + self.hdf5_file = h5py.File(self.file_path, "r") + dset = self.hdf5_file['coords'] + self.patch_level = self.hdf5_file['coords'].attrs['patch_level'] + self.patch_size = self.hdf5_file['coords'].attrs['patch_size'] + self.length = len(dset) self.summary() @@ -72,8 +72,7 @@ def __len__(self): return self.length def summary(self): - hdf5_file = h5py.File(self.file_path, "r") - dset = hdf5_file['coords'] + dset = self.hdf5_file['coords'] for name, value in dset.attrs.items(): print(name, value) @@ -81,13 +80,15 @@ def summary(self): print('transformations: ', self.roi_transforms) def __getitem__(self, idx): - with h5py.File(self.file_path,'r') as hdf5_file: - coord = hdf5_file['coords'][idx] + coord = self.hdf5_file['coords'][idx] img = self.wsi.read_region(coord, self.patch_level, (self.patch_size, 
self.patch_size)).convert('RGB') img = self.roi_transforms(img) return {'img': img, 'coord': coord} + def __del__(self): + self.hdf5_file.close() + class Dataset_All_Bags(Dataset): def __init__(self, csv_path):