Improve performance of variant_stats sgkit-dev#1116

timothymillar · timothymillar · commit 3f0dbda2c543 · 2023-08-30T09:40:07.000+12:00
* Add count_variant_alleles option to calculate directly from calls
* Improve performance of variant_stats using gufuncs
* Raise error if variant_stats is used on mixed-ploidy data
* Document behavior of variant_stats with partial genotype calls
diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py
@@ -96,7 +96,9 @@ def count_call_alleles(
 def count_variant_alleles(
     ds: Dataset,
     *,
+    call_genotype: Hashable = variables.call_genotype,
     call_allele_count: Hashable = variables.call_allele_count,
+    from_call_allele_count: bool = True,
     merge: bool = True,
 ) -> Dataset:
     """Compute allele count from per-sample allele counts, or genotype calls.
@@ -105,11 +107,22 @@ def count_variant_alleles(
     ----------
     ds
         Dataset containing genotype calls.
+    call_genotype
+        Input variable name holding call_genotype as defined by
+        :data:`sgkit.variables.call_genotype_spec`.
+        Must be present in ``ds`` unless from_call_allele_count is True.
     call_allele_count
         Input variable name holding call_allele_count as defined by
         :data:`sgkit.variables.call_allele_count_spec`.
         If the variable is not present in ``ds``, it will be computed
         using :func:`count_call_alleles`.
+        This variable is only used if from_call_allele_count is True.
+    from_call_allele_count
+        If True (the default), the result will be calculated from the
+        call_allele_count variable rather than the call_genotype variable.
+        If False, the result will be calculated directly from the
+        call_genotype variable without computing the call_allele_count
+        variable as an intermediate.
     merge
         If True (the default), merge the input dataset and the computed
         output variables into a single dataset, otherwise return only
@@ -141,14 +154,25 @@ def count_variant_alleles(
            [2, 2],
            [4, 0]], dtype=uint64)
     """
-    ds = define_variable_if_absent(
-        ds, variables.call_allele_count, call_allele_count, count_call_alleles
-    )
-    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})
-
-    new_ds = create_dataset(
-        {variables.variant_allele_count: ds[call_allele_count].sum(dim="samples")}
-    )
+    if from_call_allele_count:
+        ds = define_variable_if_absent(
+            ds, variables.call_allele_count, call_allele_count, count_call_alleles
+        )
+        variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})
+        AC = ds[call_allele_count].sum(dim="samples")
+    else:
+        from .aggregation_numba_fns import count_alleles
+
+        variables.validate(ds, {call_genotype: variables.call_genotype_spec})
+        n_alleles = ds.dims["alleles"]
+        n_variant = ds.dims["variants"]
+        G = da.asarray(ds[call_genotype]).reshape((n_variant, -1))
+        shape = (G.chunks[0], n_alleles)
+        # use uint64 dummy array to return uint64 counts array
+        N = np.empty(n_alleles, dtype=np.uint64)
+        AC = da.map_blocks(count_alleles, G, N, chunks=shape, drop_axis=1, new_axis=1)
+        AC = xr.DataArray(AC, dims=["variants", "alleles"])
+    new_ds = create_dataset({variables.variant_allele_count: AC})
     return conditional_merge_datasets(ds, new_ds, merge)
 
 
@@ -629,7 +653,6 @@ def allele_frequency(
 def variant_stats(
     ds: Dataset,
     *,
-    call_genotype_mask: Hashable = variables.call_genotype_mask,
     call_genotype: Hashable = variables.call_genotype,
     variant_allele_count: Hashable = variables.variant_allele_count,
     merge: bool = True,
@@ -644,10 +667,6 @@ def variant_stats(
         Input variable name holding call_genotype.
         Defined by :data:`sgkit.variables.call_genotype_spec`.
         Must be present in ``ds``.
-    call_genotype_mask
-        Input variable name holding call_genotype_mask.
-        Defined by :data:`sgkit.variables.call_genotype_mask_spec`
-        Must be present in ``ds``.
     variant_allele_count
         Input variable name holding variant_allele_count,
         as defined by :data:`sgkit.variables.variant_allele_count_spec`.
@@ -681,31 +700,85 @@ def variant_stats(
       The number of occurrences of all alleles.
     - :data:`sgkit.variables.variant_allele_frequency_spec` (variants, alleles):
       The frequency of occurrence of each allele.
+
+    Note
+    ----
+    If the dataset contains partial genotype calls (i.e., genotype calls with
+    a mixture of called and missing alleles), these genotypes will be ignored
+    when counting the number of homozygous, heterozygous or total genotype calls.
+    However, the called alleles will be counted when calculating allele counts
+    and frequencies using :func:`count_variant_alleles`.
+
+    Note
+    ----
+    When used on autopolyploid genotypes, this method treats genotype calls
+    with any level of heterozygosity as 'heterozygous'. Only fully homozygous
+    genotype calls (e.g. 0/0/0/0) will be classified as 'homozygous'.
+
+    Warnings
+    --------
+    This method does not support mixed-ploidy datasets.
+
+    Raises
+    ------
+    ValueError
+        If the dataset contains mixed-ploidy genotype calls.
+
+    See Also
+    --------
+    :func:`count_variant_genotypes`
     """
-    variables.validate(
+    from .aggregation_numba_fns import count_hom
+
+    variables.validate(ds, {call_genotype: variables.call_genotype_spec})
+    mixed_ploidy = ds[call_genotype].attrs.get("mixed_ploidy", False)
+    if mixed_ploidy:
+        raise ValueError("Mixed-ploidy dataset")
+    AC = define_variable_if_absent(
         ds,
-        {
-            call_genotype: variables.call_genotype_spec,
-            call_genotype_mask: variables.call_genotype_mask_spec,
-        },
+        variables.variant_allele_count,
+        variant_allele_count,
+        count_variant_alleles,
+        from_call_allele_count=False,
+        merge=False,
+    )[variant_allele_count]
+    G = da.array(ds[call_genotype].data)
+    H = xr.DataArray(
+        da.map_blocks(
+            count_hom,
+            G,
+            np.zeros(3, np.uint64),
+            drop_axis=(1, 2),
+            new_axis=1,
+            dtype=np.int64,
+            chunks=(G.chunks[0], 3),
+        ),
+        dims=["variants", "categories"],
     )
-    new_ds = xr.merge(
-        [
-            call_rate(ds, dim="samples", call_genotype_mask=call_genotype_mask),
-            count_genotypes(
-                ds,
-                dim="samples",
-                call_genotype=call_genotype,
-                call_genotype_mask=call_genotype_mask,
-                merge=False,
-            ),
-            allele_frequency(
-                ds,
-                call_genotype_mask=call_genotype_mask,
-                variant_allele_count=variant_allele_count,
-            ),
-        ]
+    _, n_sample, _ = G.shape
+    n_called = H.sum(axis=-1)
+    call_rate = n_called / n_sample
+    n_hom_ref = H[:, 0]
+    n_hom_alt = H[:, 1]
+    n_het = H[:, 2]
+    n_non_ref = n_called - n_hom_ref
+    allele_total = AC.sum(axis=-1).astype(int)  # backwards compatibility
+    new_ds = xr.Dataset(
+        {
+            variables.variant_n_called: n_called,
+            variables.variant_call_rate: call_rate,
+            variables.variant_n_het: n_het,
+            variables.variant_n_hom_ref: n_hom_ref,
+            variables.variant_n_hom_alt: n_hom_alt,
+            variables.variant_n_non_ref: n_non_ref,
+            variables.variant_allele_count: AC,
+            variables.variant_allele_total: allele_total,
+            variables.variant_allele_frequency: AC / allele_total,
+        }
     )
+    # for backwards compatible behavior
+    if (variant_allele_count in ds) and merge:
+        new_ds = new_ds.drop_vars(variant_allele_count)
     return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
 
 
diff --git a/sgkit/stats/aggregation_numba_fns.py b/sgkit/stats/aggregation_numba_fns.py
@@ -2,7 +2,7 @@
 # in a separate file here, and imported dynamically to avoid
 # initial compilation overhead.
 
-from sgkit.accelerate import numba_guvectorize
+from sgkit.accelerate import numba_guvectorize, numba_jit
 from sgkit.typing import ArrayLike
 
 
@@ -12,6 +12,10 @@
         "void(int16[:], uint8[:], uint8[:])",
         "void(int32[:], uint8[:], uint8[:])",
         "void(int64[:], uint8[:], uint8[:])",
+        "void(int8[:], uint64[:], uint64[:])",
+        "void(int16[:], uint64[:], uint64[:])",
+        "void(int32[:], uint64[:], uint64[:])",
+        "void(int64[:], uint64[:], uint64[:])",
     ],
     "(k),(n)->(n)",
 )
@@ -26,9 +30,10 @@ def count_alleles(
         Genotype call of shape (ploidy,) containing alleles encoded as
         type `int` with values < 0 indicating a missing allele.
     _
-        Dummy variable of type `uint8` and shape (alleles,) used to
-        define the number of unique alleles to be counted in the
-        return value.
+        Dummy variable of type `uint8` or `uint64` and shape (alleles,)
+        used to define the number of unique alleles to be counted in the
+        return value. The dtype of this array determines the dtype of the
+        returned array.
 
     Returns
     -------
@@ -43,3 +48,57 @@ def count_alleles(
         a = g[i]
         if a >= 0:
             out[a] += 1
+
+
+@numba_jit(nogil=True)
+def _classify_hom(genotype: ArrayLike) -> int:
+    a0 = genotype[0]
+    cat = min(a0, 1)  # -1, 0, 1
+    for i in range(1, len(genotype)):
+        if cat < 0:
+            break
+        a = genotype[i]
+        if a != a0:
+            cat = 2  # het
+        if a < 0:
+            cat = -1
+    return cat
+
+
+@numba_guvectorize(  # type: ignore
+    [
+        "void(int8[:,:], uint64[:], int64[:])",
+        "void(int16[:,:], uint64[:], int64[:])",
+        "void(int32[:,:], uint64[:], int64[:])",
+        "void(int64[:,:], uint64[:], int64[:])",
+    ],
+    "(n, k),(c)->(c)",
+)
+def count_hom(
+    genotypes: ArrayLike, _: ArrayLike, out: ArrayLike
+) -> None:  # pragma: no cover
+    """Generalized U-function for counting homozygous and heterozygous genotypes.
+
+    Parameters
+    ----------
+    genotypes
+        Genotype calls of shape (samples, ploidy) containing alleles encoded
+        as type `int` with values < 0 indicating a missing allele.
+    _
+        Dummy variable of type `uint64` with length 3 which determines the
+        number of categories returned (this should always be 3).
+
+    Note
+    ----
+    This method is not suitable for mixed-ploidy genotypes.
+
+    Returns
+    -------
+    counts : ndarray
+        Counts of homozygous reference, homozygous alternate, and heterozygous genotypes.
+    """
+    out[:] = 0
+    for i in range(len(genotypes)):
+        index = _classify_hom(genotypes[i])
+        if index >= 0:
+            out[index] += 1
diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py