@@ -96,7 +96,9 @@ def count_call_alleles(
96
96
def count_variant_alleles (
97
97
ds : Dataset ,
98
98
* ,
99
+ call_genotype : Hashable = variables .call_genotype ,
99
100
call_allele_count : Hashable = variables .call_allele_count ,
101
+ from_call_allele_count : bool = True ,
100
102
merge : bool = True ,
101
103
) -> Dataset :
102
104
"""Compute allele count from per-sample allele counts, or genotype calls.
@@ -105,11 +107,22 @@ def count_variant_alleles(
105
107
----------
106
108
ds
107
109
Dataset containing genotype calls.
110
+ call_genotype
111
+ Input variable name holding call_genotype as defined by
112
+ :data:`sgkit.variables.call_genotype_spec`.
113
+ Must be present in ``ds`` unless from_call_allele_count is True.
108
114
call_allele_count
109
115
Input variable name holding call_allele_count as defined by
110
116
:data:`sgkit.variables.call_allele_count_spec`.
111
117
If the variable is not present in ``ds``, it will be computed
112
118
using :func:`count_call_alleles`.
119
+ This variable is only used if from_call_allele_count is True.
120
+ from_call_allele_count
121
+ If True (the default), the result will be calculated from the
122
+ call_allele_count variable rather than the call_genotype variable.
123
+ If False, the result will be calculated directly from the
124
+ call_genotype variable without computing the call_allele_count
125
+ variable as an intermediate.
113
126
merge
114
127
If True (the default), merge the input dataset and the computed
115
128
output variables into a single dataset, otherwise return only
@@ -141,14 +154,25 @@ def count_variant_alleles(
141
154
[2, 2],
142
155
[4, 0]], dtype=uint64)
143
156
"""
144
- ds = define_variable_if_absent (
145
- ds , variables .call_allele_count , call_allele_count , count_call_alleles
146
- )
147
- variables .validate (ds , {call_allele_count : variables .call_allele_count_spec })
148
-
149
- new_ds = create_dataset (
150
- {variables .variant_allele_count : ds [call_allele_count ].sum (dim = "samples" )}
151
- )
157
+ if from_call_allele_count :
158
+ ds = define_variable_if_absent (
159
+ ds , variables .call_allele_count , call_allele_count , count_call_alleles
160
+ )
161
+ variables .validate (ds , {call_allele_count : variables .call_allele_count_spec })
162
+ AC = ds [call_allele_count ].sum (dim = "samples" )
163
+ else :
164
+ from .aggregation_numba_fns import count_alleles
165
+
166
+ variables .validate (ds , {call_genotype : variables .call_genotype_spec })
167
+ n_alleles = ds .dims ["alleles" ]
168
+ n_variant = ds .dims ["variants" ]
169
+ G = da .asarray (ds [call_genotype ]).reshape ((n_variant , - 1 ))
170
+ shape = (G .chunks [0 ], n_alleles )
171
+ # use uint64 dummy array to return uint64 counts array
172
+ N = np .empty (n_alleles , dtype = np .uint64 )
173
+ AC = da .map_blocks (count_alleles , G , N , chunks = shape , drop_axis = 1 , new_axis = 1 )
174
+ AC = xr .DataArray (AC , dims = ["variants" , "alleles" ])
175
+ new_ds = create_dataset ({variables .variant_allele_count : AC })
152
176
return conditional_merge_datasets (ds , new_ds , merge )
153
177
154
178
@@ -629,7 +653,6 @@ def allele_frequency(
629
653
def variant_stats (
630
654
ds : Dataset ,
631
655
* ,
632
- call_genotype_mask : Hashable = variables .call_genotype_mask ,
633
656
call_genotype : Hashable = variables .call_genotype ,
634
657
variant_allele_count : Hashable = variables .variant_allele_count ,
635
658
merge : bool = True ,
@@ -644,10 +667,6 @@ def variant_stats(
644
667
Input variable name holding call_genotype.
645
668
Defined by :data:`sgkit.variables.call_genotype_spec`.
646
669
Must be present in ``ds``.
647
- call_genotype_mask
648
- Input variable name holding call_genotype_mask.
649
- Defined by :data:`sgkit.variables.call_genotype_mask_spec`
650
- Must be present in ``ds``.
651
670
variant_allele_count
652
671
Input variable name holding variant_allele_count,
653
672
as defined by :data:`sgkit.variables.variant_allele_count_spec`.
@@ -681,31 +700,85 @@ def variant_stats(
681
700
The number of occurrences of all alleles.
682
701
- :data:`sgkit.variables.variant_allele_frequency_spec` (variants, alleles):
683
702
The frequency of occurrence of each allele.
703
+
704
+ Note
705
+ ----
706
+ If the dataset contains partial genotype calls (i.e., genotype calls with
707
+ a mixture of called and missing alleles), these genotypes will be ignored
708
+ when counting the number of homozygous, heterozygous or total genotype calls.
709
+ However, the called alleles will be counted when calculating allele counts
710
+ and frequencies using :func:`count_variant_alleles`.
711
+
712
+ Note
713
+ ----
714
+ When used on autopolyploid genotypes, this method treats genotype calls
715
+ with any level of heterozygosity as 'heterozygous'. Only fully homozygous
716
+ genotype calls (e.g. 0/0/0/0) will be classified as 'homozygous'.
717
+
718
+ Warnings
719
+ --------
720
+ This method does not support mixed-ploidy datasets.
721
+
722
+ Raises
723
+ ------
724
+ ValueError
725
+ If the dataset contains mixed-ploidy genotype calls.
726
+
727
+ See Also
728
+ --------
729
+ :func:`count_variant_genotypes`
684
730
"""
685
- variables .validate (
731
+ from .aggregation_numba_fns import count_hom
732
+
733
+ variables .validate (ds , {call_genotype : variables .call_genotype_spec })
734
+ mixed_ploidy = ds [call_genotype ].attrs .get ("mixed_ploidy" , False )
735
+ if mixed_ploidy :
736
+ raise ValueError ("Mixed-ploidy dataset" )
737
+ AC = define_variable_if_absent (
686
738
ds ,
687
- {
688
- call_genotype : variables .call_genotype_spec ,
689
- call_genotype_mask : variables .call_genotype_mask_spec ,
690
- },
739
+ variables .variant_allele_count ,
740
+ variant_allele_count ,
741
+ count_variant_alleles ,
742
+ from_call_allele_count = False ,
743
+ merge = False ,
744
+ )[variant_allele_count ]
745
+ G = da .array (ds [call_genotype ].data )
746
+ H = xr .DataArray (
747
+ da .map_blocks (
748
+ count_hom ,
749
+ G ,
750
+ np .zeros (3 , np .uint64 ),
751
+ drop_axis = (1 , 2 ),
752
+ new_axis = 1 ,
753
+ dtype = np .int64 ,
754
+ chunks = (G .chunks [0 ], 3 ),
755
+ ),
756
+ dims = ["variants" , "categories" ],
691
757
)
692
- new_ds = xr .merge (
693
- [
694
- call_rate (ds , dim = "samples" , call_genotype_mask = call_genotype_mask ),
695
- count_genotypes (
696
- ds ,
697
- dim = "samples" ,
698
- call_genotype = call_genotype ,
699
- call_genotype_mask = call_genotype_mask ,
700
- merge = False ,
701
- ),
702
- allele_frequency (
703
- ds ,
704
- call_genotype_mask = call_genotype_mask ,
705
- variant_allele_count = variant_allele_count ,
706
- ),
707
- ]
758
+ _ , n_sample , _ = G .shape
759
+ n_called = H .sum (axis = - 1 )
760
+ call_rate = n_called / n_sample
761
+ n_hom_ref = H [:, 0 ]
762
+ n_hom_alt = H [:, 1 ]
763
+ n_het = H [:, 2 ]
764
+ n_non_ref = n_called - n_hom_ref
765
+ allele_total = AC .sum (axis = - 1 ).astype (int ) # backwards compatibility
766
+ new_ds = xr .Dataset (
767
+ {
768
+ variables .variant_n_called : n_called ,
769
+ variables .variant_call_rate : call_rate ,
770
+ variables .variant_n_het : n_het ,
771
+ variables .variant_n_hom_ref : n_hom_ref ,
772
+ variables .variant_n_hom_alt : n_hom_alt ,
773
+ variables .variant_n_non_ref : n_non_ref ,
774
+ variables .variant_allele_count : AC ,
775
+ variables .variant_allele_total : allele_total ,
776
+ variables .variant_allele_frequency : AC / allele_total ,
777
+ }
708
778
)
779
+ # for backwards compatible behavior
780
+ if (variant_allele_count in ds ) and merge :
781
+ new_ds = new_ds .drop_vars (variant_allele_count )
709
782
return conditional_merge_datasets (ds , variables .validate (new_ds ), merge )
710
783
711
784
0 commit comments