15
15
from zarr .codecs import BloscCodec , BytesCodec
16
16
17
17
from .. import constants , core , provenance
18
+ from ..zarr_v3_utils import VLenUTF8Codec
18
19
from . import icf
19
20
20
21
logger = logging .getLogger (__name__ )
@@ -34,6 +35,8 @@ def inspect(path):
34
35
35
36
DEFAULT_ZARR_COMPRESSOR = numcodecs .Blosc (cname = "zstd" , clevel = 7 )
36
37
DEFAULT_ZARR_CODECS = [BytesCodec (), BloscCodec (cname = "lz4" , clevel = 7 )]
38
+ STRING_ZARR_CODECS = [VLenUTF8Codec (), BloscCodec (cname = "lz4" , clevel = 7 )]
39
+
37
40
38
41
_fixed_field_descriptions = {
39
42
"variant_contig" : "An identifier from the reference genome or an angle-bracketed ID"
@@ -574,27 +577,28 @@ def init(
574
577
def encode_samples(self, root):
    """Write the schema's sample IDs into a ``sample_id`` array under *root*.

    The IDs are stored as an object-dtype array using the string codec
    pipeline, chunked along the samples dimension by the schema's
    ``samples_chunk_size``.

    Raises:
        ValueError: if the schema's samples differ from the samples
            recorded in the ICF metadata.
    """
    schema_samples = self.schema.samples
    if schema_samples != self.icf.metadata.samples:
        raise ValueError("Subsetting or reordering samples not supported currently")

    # Object dtype keeps the IDs as Python strings for the string codecs.
    sample_ids = np.array([entry.id for entry in schema_samples], dtype=object)
    chunk_shape = (self.schema.samples_chunk_size,)

    sample_array = root.create_array(
        "sample_id",
        shape=sample_ids.shape,
        dtype=sample_ids.dtype,
        codecs=STRING_ZARR_CODECS,
        chunks=chunk_shape,
    )
    sample_array[...] = sample_ids
    # Dimension-name attribute attached to the array.
    sample_array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
    logger.debug("Samples done")
588
591
589
592
def encode_contig_id (self , root ):
590
- data = np .array ([contig .id for contig in self .schema .contigs ], dtype = str )
593
+ data = np .array ([contig .id for contig in self .schema .contigs ], dtype = object )
591
594
array = root .create_array (
592
595
"contig_id" ,
593
596
shape = data .shape ,
594
597
dtype = data .dtype ,
595
- codecs = DEFAULT_ZARR_CODECS ,
598
+ codecs = STRING_ZARR_CODECS ,
596
599
chunks = data .shape , # no chunking
597
600
)
601
+ array [...] = data
598
602
array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
599
603
if all (contig .length is not None for contig in self .schema .contigs ):
600
604
data = np .array (
@@ -604,9 +608,10 @@ def encode_contig_id(self, root):
604
608
"contig_length" ,
605
609
shape = data .shape ,
606
610
dtype = data .dtype ,
607
- compressor = DEFAULT_ZARR_CODECS ,
611
+ codecs = DEFAULT_ZARR_CODECS ,
608
612
chunks = data .shape , # no chunking
609
613
)
614
+ array [...] = data
610
615
array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
611
616
612
617
def encode_filter_id (self , root ):
@@ -617,23 +622,29 @@ def encode_filter_id(self, root):
617
622
"filter_id" ,
618
623
shape = data .shape ,
619
624
dtype = data .dtype ,
620
- codecs = DEFAULT_ZARR_CODECS ,
625
+ codecs = STRING_ZARR_CODECS ,
621
626
chunks = data .shape , # no chunking
622
627
)
628
+ array [...] = data
623
629
array .attrs ["_ARRAY_DIMENSIONS" ] = ["filters" ]
624
630
625
631
def init_array (self , root , array_spec , variants_dim_size ):
626
632
object_codec = None
627
633
if array_spec .dtype == "O" :
628
634
object_codec = numcodecs .VLenUTF8 ()
635
+ codecs = STRING_ZARR_CODECS
636
+ else :
637
+ codecs = DEFAULT_ZARR_CODECS
629
638
shape = list (array_spec .shape )
630
639
# Truncate the variants dimension if max_variant_chunks was specified
631
640
shape [0 ] = variants_dim_size
641
+ compressor = numcodecs .get_codec (array_spec .compressor )
632
642
a = root .create_array ( # empty raises NotImplemented
633
643
array_spec .name ,
634
644
shape = shape ,
635
645
chunks = array_spec .chunks ,
636
646
dtype = array_spec .dtype ,
647
+ codecs = codecs ,
637
648
# TODO
638
649
# compressor=numcodecs.get_codec(array_spec.compressor),
639
650
# filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
@@ -915,7 +926,7 @@ def finalise(self, show_progress=False):
915
926
logger .debug (f"Removing { self .wip_path } " )
916
927
shutil .rmtree (self .wip_path )
917
928
logger .info ("Consolidating Zarr metadata" )
918
- zarr .consolidate_metadata (self .path )
929
+ # zarr.consolidate_metadata(self.path)
919
930
920
931
######################
921
932
# encode_all_partitions
0 commit comments