Skip to content

Commit 883a37e

Browse files
Add chunk size options to mkschema
Closes #294
1 parent b1d7ef2 commit 883a37e

File tree

5 files changed

+68
-5
lines changed

5 files changed

+68
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ Breaking changes
55
- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
66
and numpy >= 2. Existing ICFs will need to be recreated.
77

8+
- Add chunksize options to mkschema (issue:294)
9+
810
# 0.1.1 2024-06-19
911

1012
Maintenance release:

bio2zarr/cli.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,12 +338,19 @@ def inspect(path, verbose):
338338

339339
@click.command
340340
@icf_path
341-
def mkschema(icf_path):
341+
@variants_chunk_size
342+
@samples_chunk_size
343+
def mkschema(icf_path, variants_chunk_size, samples_chunk_size):
342344
"""
343345
Generate a schema for zarr encoding
344346
"""
345347
stream = click.get_text_stream("stdout")
346-
vcf2zarr.mkschema(icf_path, stream)
348+
vcf2zarr.mkschema(
349+
icf_path,
350+
stream,
351+
variants_chunk_size=variants_chunk_size,
352+
samples_chunk_size=samples_chunk_size,
353+
)
347354

348355

349356
@click.command

bio2zarr/vcf2zarr/vcz.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,9 +1027,13 @@ def encode_all_partitions(
10271027
pwm.submit(self.encode_partition, partition_index)
10281028

10291029

1030-
def mkschema(if_path, out):
1030+
def mkschema(if_path, out, *, variants_chunk_size=None, samples_chunk_size=None):
10311031
store = icf.IntermediateColumnarFormat(if_path)
1032-
spec = VcfZarrSchema.generate(store)
1032+
spec = VcfZarrSchema.generate(
1033+
store,
1034+
variants_chunk_size=variants_chunk_size,
1035+
samples_chunk_size=samples_chunk_size,
1036+
)
10331037
out.write(spec.asjson())
10341038

10351039

tests/test_cli.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,9 @@ def test_inspect(self, mocked, tmp_path):
461461
def test_mkschema(self, mocked, tmp_path):
462462
runner = ct.CliRunner(mix_stderr=False)
463463
result = runner.invoke(
464-
cli.vcf2zarr_main, f"mkschema {tmp_path}", catch_exceptions=False
464+
cli.vcf2zarr_main,
465+
f"mkschema {tmp_path} --variants-chunk-size=3 " "--samples-chunk-size=4",
466+
catch_exceptions=False,
465467
)
466468
assert result.exit_code == 0
467469
assert len(result.stdout) == 0
@@ -705,6 +707,25 @@ def test_explode(self, tmp_path):
705707
# Arbitrary check
706708
assert "CHROM" in result.stdout
707709

710+
def test_mkschema(self, tmp_path):
711+
icf_path = tmp_path / "icf"
712+
runner = ct.CliRunner(mix_stderr=False)
713+
result = runner.invoke(
714+
cli.vcf2zarr_main,
715+
f"explode {self.vcf_path} {icf_path}",
716+
catch_exceptions=False,
717+
)
718+
assert result.exit_code == 0
719+
result = runner.invoke(
720+
cli.vcf2zarr_main,
721+
f"mkschema {icf_path} --variants-chunk-size=3 " "--samples-chunk-size=2",
722+
catch_exceptions=False,
723+
)
724+
assert result.exit_code == 0
725+
d = json.loads(result.stdout)
726+
assert d["samples_chunk_size"] == 2
727+
assert d["variants_chunk_size"] == 3
728+
708729
def test_encode(self, tmp_path):
709730
icf_path = tmp_path / "icf"
710731
zarr_path = tmp_path / "zarr"

tests/test_vcz.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,35 @@ def test_bad_value(self, tmp_path, icf_path, dimension_separator):
136136
)
137137

138138

139+
class TestSchemaChunkSize:
140+
@pytest.mark.parametrize(
141+
("samples_chunk_size", "variants_chunk_size"),
142+
[
143+
(1, 2),
144+
(2, 1),
145+
(3, 5),
146+
],
147+
)
148+
def test_chunk_sizes(self, icf_path, samples_chunk_size, variants_chunk_size):
149+
icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
150+
schema = vcf2zarr.VcfZarrSchema.generate(
151+
icf,
152+
variants_chunk_size=variants_chunk_size,
153+
samples_chunk_size=samples_chunk_size,
154+
)
155+
assert schema.samples_chunk_size == samples_chunk_size
156+
assert schema.variants_chunk_size == variants_chunk_size
157+
found = 0
158+
for field in schema.fields:
159+
assert field.dimensions[0] == "variants"
160+
assert field.chunks[0] == variants_chunk_size
161+
if "samples" in field.dimensions:
162+
dim = field.dimensions.index("samples")
163+
assert field.chunks[dim] == samples_chunk_size
164+
found += 1
165+
assert found > 0
166+
167+
139168
class TestSchemaJsonRoundTrip:
140169
def assert_json_round_trip(self, schema):
141170
schema2 = vcf2zarr.VcfZarrSchema.fromjson(schema.asjson())

0 commit comments

Comments
 (0)