diff --git a/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java new file mode 100644 index 0000000..b05922b --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java @@ -0,0 +1,316 @@ +/* + * Copyright (C) 2015 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.genomics.dataflow.pipelines; + +import com.google.api.client.util.Strings; +import com.google.api.services.genomics.Genomics; +import com.google.api.services.genomics.model.Annotation; +import com.google.api.services.genomics.model.AnnotationSet; +import com.google.api.services.genomics.model.ListBasesResponse; +import com.google.api.services.genomics.model.QueryRange; +import com.google.api.services.genomics.model.RangePosition; +import com.google.api.services.genomics.model.SearchAnnotationsRequest; +import com.google.api.services.genomics.model.Variant; +import com.google.api.services.genomics.model.VariantAnnotation; +import com.google.cloud.dataflow.sdk.Pipeline; +import com.google.cloud.dataflow.sdk.io.TextIO; +import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; +import com.google.cloud.dataflow.sdk.transforms.Create; +import com.google.cloud.dataflow.sdk.transforms.DoFn; +import com.google.cloud.dataflow.sdk.transforms.GroupByKey; +import com.google.cloud.dataflow.sdk.transforms.ParDo; +import com.google.cloud.dataflow.sdk.values.KV; +import com.google.cloud.genomics.dataflow.utils.AnnotationUtils; +import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect; +import com.google.cloud.genomics.dataflow.utils.DataflowWorkarounds; +import com.google.cloud.genomics.dataflow.utils.GenomicsDatasetOptions; +import com.google.cloud.genomics.dataflow.utils.GenomicsOptions; +import com.google.cloud.genomics.dataflow.utils.VariantUtils; +import com.google.cloud.genomics.utils.Contig; +import com.google.cloud.genomics.utils.Contig.SexChromosomeFilter; +import com.google.cloud.genomics.utils.GenomicsFactory; +import com.google.cloud.genomics.utils.Paginator; +import com.google.cloud.genomics.utils.Paginator.ShardBoundary; +import com.google.common.base.Joiner; +import com.google.common.base.Stopwatch; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.FluentIterable; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Maps; +import com.google.common.collect.Range; + +import htsjdk.samtools.util.IntervalTree; +import htsjdk.samtools.util.IntervalTree.Node; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.logging.Logger; + +/** + * Demonstrates a simple variant annotation program which takes + * {@code VariantSet}s and {@code AnnotationSet}s as input and emits + * {@code VariantAnnotation}s. This program is intended to serve as an example + * of how a variant annotation program can be structured to run in parallel, + * operating over Google Genomics input sources; the current output has limited + * biological significance. + * + * Currently only annotates SNPs and uses the following two methods: + * + */ +public final class AnnotateVariants extends DoFn> { + private static final Logger LOG = Logger.getLogger(AnnotateVariants.class.getName()); + private static final int VARIANTS_PAGE_SIZE = 5000; + private static final String VARIANT_FIELDS + = "nextPageToken,variants(id,referenceName,start,end,alternateBases,referenceBases)"; + + private final GenomicsFactory.OfflineAuth auth; + private final String varsetId; + private final List callSetIds, transcriptSetIds, variantAnnotationSetIds; + private final Map, String> refBaseCache; + + public AnnotateVariants(GenomicsFactory.OfflineAuth auth, String varsetId, + List callSetIds, List transcriptSetIds, + List variantAnnotationSetIds) { + this.auth = auth; + this.varsetId = varsetId; + this.callSetIds = callSetIds; + this.transcriptSetIds = transcriptSetIds; + this.variantAnnotationSetIds = variantAnnotationSetIds; + refBaseCache = Maps.newHashMap(); + } + + @Override + public void processElement( + DoFn>.ProcessContext c) throws Exception { + Genomics genomics = auth.getGenomics(auth.getDefaultFactory()); + + Contig contig = c.element(); + LOG.info("processing contig " + contig); + Iterable varIter = FluentIterable + .from(Paginator.Variants.create(genomics, ShardBoundary.STRICT).search( + contig.getVariantsRequest(varsetId) + // TODO: Variants-only retrieval is not well support currently. For now + // we parameterize by CallSet for performance. + .setCallSetIds(callSetIds) + .setPageSize(VARIANTS_PAGE_SIZE), + VARIANT_FIELDS)) + .filter(VariantUtils.IS_SNP); + if (!varIter.iterator().hasNext()) { + LOG.info("region has no variants, skipping"); + return; + } + + IntervalTree transcripts = retrieveTranscripts(genomics, contig); + ListMultimap, Annotation> variantAnnotations = + retrieveVariantAnnotations(genomics, contig); + + Stopwatch stopwatch = Stopwatch.createStarted(); + int varCount = 0; + for (Variant variant : varIter) { + List alleles = ImmutableList.builder() + .addAll(variant.getAlternateBases()) + .add(variant.getReferenceBases()) + .build(); + Range pos = Range.openClosed(variant.getStart(), variant.getEnd()); + for (String allele : alleles) { + String outKey = Joiner.on(":").join( + variant.getReferenceName(), variant.getStart(), allele, variant.getId()); + for (Annotation match : variantAnnotations.get(pos)) { + if (allele.equals(match.getVariant().getAlternateBases())) { + // Exact match to a known variant annotation; straightforward join. + c.output(KV.of(outKey, match.getVariant())); + } + } + + Iterator> transcriptIter = transcripts.overlappers( + pos.lowerEndpoint().intValue(), pos.upperEndpoint().intValue() - 1); // Inclusive. + while (transcriptIter.hasNext()) { + // Calculate an effect of this allele on the coding region of the given transcript. + Annotation transcript = transcriptIter.next().getValue(); + VariantEffect effect = AnnotationUtils.determineVariantTranscriptEffect( + variant.getStart(), allele, transcript, + getCachedTranscriptBases(genomics, transcript)); + if (effect != null && !VariantEffect.SYNONYMOUS_SNP.equals(effect)) { + c.output(KV.of(outKey, new VariantAnnotation() + .setAlternateBases(allele) + .setType("SNP") + .setEffect(effect.toString()) + .setGeneId(transcript.getTranscript().getGeneId()) + .setTranscriptIds(ImmutableList.of(transcript.getId())))); + } + } + } + varCount++; + if (varCount%1e3 == 0) { + LOG.info(String.format("read %d variants (%.2f / s)", + varCount, (double)varCount / stopwatch.elapsed(TimeUnit.SECONDS))); + } + } + LOG.info("finished reading " + varCount + " variants in " + stopwatch); + } + + private ListMultimap, Annotation> retrieveVariantAnnotations( + Genomics genomics, Contig contig) { + Stopwatch stopwatch = Stopwatch.createStarted(); + ListMultimap, Annotation> annotationMap = ArrayListMultimap.create(); + Iterable annotationIter = + Paginator.Annotations.create(genomics, ShardBoundary.OVERLAPS).search( + new SearchAnnotationsRequest() + .setAnnotationSetIds(variantAnnotationSetIds) + .setRange(new QueryRange() + .setReferenceName(canonicalizeRefName(contig.referenceName)) + .setStart(contig.start) + .setEnd(contig.end))); + for (Annotation annotation : annotationIter) { + RangePosition pos = annotation.getPosition(); + long start = 0; + if (pos.getStart() != null) { + start = pos.getStart(); + } + annotationMap.put(Range.closedOpen(start, pos.getEnd()), annotation); + } + LOG.info(String.format("read %d variant annotations in %s (%.2f / s)", annotationMap.size(), + stopwatch, (double)annotationMap.size() / stopwatch.elapsed(TimeUnit.SECONDS))); + return annotationMap; + } + + private IntervalTree retrieveTranscripts(Genomics genomics, Contig contig) { + Stopwatch stopwatch = Stopwatch.createStarted(); + IntervalTree transcripts = new IntervalTree<>(); + Iterable transcriptIter = + Paginator.Annotations.create(genomics, ShardBoundary.OVERLAPS).search( + new SearchAnnotationsRequest() + .setAnnotationSetIds(transcriptSetIds) + .setRange(new QueryRange() + .setReferenceName(canonicalizeRefName(contig.referenceName)) + .setStart(contig.start) + .setEnd(contig.end))); + for (Annotation annotation : transcriptIter) { + RangePosition pos = annotation.getPosition(); + transcripts.put(pos.getStart().intValue(), pos.getEnd().intValue(), annotation); + } + LOG.info(String.format("read %d transcripts in %s (%.2f / s)", transcripts.size(), + stopwatch, (double)transcripts.size() / stopwatch.elapsed(TimeUnit.SECONDS))); + return transcripts; + } + + private String getCachedTranscriptBases(Genomics genomics, Annotation transcript) + throws IOException { + RangePosition pos = transcript.getPosition(); + Range rng = Range.closedOpen(pos.getStart(), pos.getEnd()); + if (!refBaseCache.containsKey(rng)) { + refBaseCache.put(rng, retrieveReferenceBases(genomics, pos)); + } + return refBaseCache.get(rng); + } + + private String retrieveReferenceBases(Genomics genomics, RangePosition pos) throws IOException { + StringBuilder b = new StringBuilder(); + String pageToken = ""; + while (true) { + // TODO: Support full request parameterization for Paginator.References.Bases. + ListBasesResponse response = genomics.references().bases() + .list(pos.getReferenceId()) + .setStart(pos.getStart()) + .setEnd(pos.getEnd()) + .setPageToken(pageToken) + .execute(); + b.append(response.getSequence()); + pageToken = response.getNextPageToken(); + if (Strings.isNullOrEmpty(pageToken)) { + break; + } + } + return b.toString(); + } + + private static String canonicalizeRefName(String refName) { + return refName.replace("chr", ""); + } + + public static void main(String[] args) throws Exception { + GenomicsDatasetOptions opts = PipelineOptionsFactory.fromArgs(args) + .withValidation().as(GenomicsDatasetOptions.class); + GenomicsFactory.OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(opts); + Genomics genomics = auth.getGenomics(auth.getDefaultFactory()); + + List callSetIds = ImmutableList.of(); + if (!Strings.isNullOrEmpty(opts.getCallSetIds().trim())) { + callSetIds = ImmutableList.copyOf(opts.getCallSetIds().split(",")); + } + List transcriptSetIds = + validateAnnotationSetsFlag(genomics, opts.getTranscriptSetIds(), "TRANSCRIPT"); + List variantAnnotationSetIds = + validateAnnotationSetsFlag(genomics, opts.getVariantAnnotationSetIds(), "VARIANT"); + validateRefsetForAnnotationSets(genomics, transcriptSetIds); + + Iterable contigs = opts.isAllReferences() + ? Contig.getContigsInVariantSet( + genomics, opts.getDatasetId(), SexChromosomeFilter.INCLUDE_XY) + : Contig.parseContigsFromCommandLine(opts.getReferences()); + + Pipeline p = Pipeline.create(opts); + DataflowWorkarounds.registerGenomicsCoders(p); + p.begin() + .apply(Create.of(contigs)) + .apply(ParDo.of(new AnnotateVariants(auth, + opts.getDatasetId(), callSetIds, transcriptSetIds, variantAnnotationSetIds))) + .apply(GroupByKey.create()) + .apply(ParDo.of(new DoFn>, String>() { + @Override + public void processElement(ProcessContext c) { + c.output(c.element().getKey() + ": " + c.element().getValue()); + } + })) + .apply(TextIO.Write.to(opts.getOutput())); + p.run(); + } + + private static void validateRefsetForAnnotationSets( + Genomics genomics, List annosetIds) throws IOException { + String refsetId = null; + for (String annosetId : annosetIds) { + String gotId = genomics.annotationSets().get(annosetId).execute().getReferenceSetId(); + if (refsetId == null) { + refsetId = gotId; + } else if (!refsetId.equals(gotId)) { + throw new IllegalArgumentException("want consistent reference sets across the provided " + + "annotation sets, got " + refsetId + " and " + gotId); + } + } + } + + private static List validateAnnotationSetsFlag( + Genomics genomics, String flagValue, String wantType) throws IOException { + List annosetIds = ImmutableList.copyOf(flagValue.split(",")); + for (String annosetId : annosetIds) { + AnnotationSet annoset = genomics.annotationSets().get(annosetId).execute(); + if (!wantType.equals(annoset.getType())) { + throw new IllegalArgumentException("annotation set " + annosetId + " has type " + + annoset.getType() + ", wanted type " + wantType); + } + } + return annosetIds; + } +} diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java new file mode 100644 index 0000000..a52adc0 --- /dev/null +++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2015 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.genomics.dataflow.utils; + +import com.google.api.client.util.Preconditions; +import com.google.api.services.genomics.model.Annotation; +import com.google.api.services.genomics.model.TranscriptExon; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Range; + +import htsjdk.samtools.util.SequenceUtil; + +import java.util.Map; +import java.util.logging.Logger; + +/** + * Utilities for dealing with genomics {@link Annotation}s. Provides some very + * basic functions for determining the effect of a given mutation on a + * transcript. + */ +public class AnnotationUtils { + private static final Logger LOG = Logger.getLogger(AnnotationUtils.class.getName()); + private static final char STOP = '*'; + private static final Map AMINO_ACIDS = + ImmutableMap.builder() + .put("TTT", 'F').put("TTC", 'F').put("TTA", 'L').put("TTG", 'L') + .put("TCT", 'S').put("TCC", 'S').put("TCA", 'S').put("TCG", 'S') + .put("TAT", 'Y').put("TAC", 'Y').put("TAA", STOP).put("TAG", STOP) + .put("TGT", 'C').put("TGC", 'C').put("TGA", STOP).put("TGG", 'W') + .put("CTT", 'L').put("CTC", 'L').put("CTA", 'L').put("CTG", 'L') + .put("CCT", 'P').put("CCC", 'P').put("CCA", 'P').put("CCG", 'P') + .put("CAT", 'H').put("CAC", 'H').put("CAA", 'Q').put("CAG", 'Q') + .put("CGT", 'R').put("CGC", 'R').put("CGA", 'R').put("CGG", 'R') + .put("ATT", 'I').put("ATC", 'I').put("ATA", 'I').put("ATG", 'M') + .put("ACT", 'T').put("ACC", 'T').put("ACA", 'T').put("ACG", 'T') + .put("AAT", 'N').put("AAC", 'N').put("AAA", 'K').put("AAG", 'K') + .put("AGT", 'S').put("AGC", 'S').put("AGA", 'R').put("AGG", 'R') + .put("GTT", 'V').put("GTC", 'V').put("GTA", 'V').put("GTG", 'V') + .put("GCT", 'A').put("GCC", 'A').put("GCA", 'A').put("GCG", 'A') + .put("GAT", 'D').put("GAC", 'D').put("GAA", 'E').put("GAG", 'E') + .put("GGT", 'G').put("GGC", 'G').put("GGA", 'G').put("GGG", 'G').build(); + + private AnnotationUtils() {} + + /** + * Describes the effect of a given variant on a transcript. Subset of the + * annotations API field {@code VariantAnnotation.effect}. + */ + public enum VariantEffect { SYNONYMOUS_SNP, STOP_GAIN, STOP_LOSS, NONSYNONYMOUS_SNP } + + /** + * Determines the effect of the given allele at the given position within a + * transcript. Currently, this routine is relatively primitive: it only works + * with SNPs and only computes effects on coding regions of the transcript. + * This utility currently does not handle any splice sites well, including + * splice site disruption and codons which span two exons. + * + * @param variantStart 0-based absolute start for the variant. + * @param allele Bases to substitute at {@code variantStart}. + * @param transcript The affected transcript. + * @param transcriptBases The reference bases which span the provided + * {@code transcript}. Must match the exact length of the + * {@code transcript.position}. + * @return The effect of this variant on the given transcript, or null if + * unknown. + */ + public static VariantEffect determineVariantTranscriptEffect( + long variantStart, String allele, Annotation transcript, String transcriptBases) { + long txLen = transcript.getPosition().getEnd() - transcript.getPosition().getStart(); + Preconditions.checkArgument(transcriptBases.length() == txLen, + "transcriptBases must have equal length to the transcript; got " + + transcriptBases.length() + " and " + txLen + ", respectively"); + if (allele.length() != 1) { + LOG.fine("determineVariantTranscriptEffects() only supports SNPs, ignoring " + allele); + return null; + } + if (transcript.getTranscript().getCodingSequence() == null) { + LOG.fine("determineVariantTranscriptEffects() only supports intersection with coding " + + "transcript, ignoring "); + return null; + } + + long variantEnd = variantStart + 1; + Range variantRange = Range.closedOpen(variantStart, variantEnd); + Range codingRange = Range.closedOpen( + transcript.getTranscript().getCodingSequence().getStart(), + transcript.getTranscript().getCodingSequence().getEnd()); + if (Boolean.TRUE.equals(transcript.getPosition().getReverseStrand())) { + allele = SequenceUtil.reverseComplement(allele); + } + for (TranscriptExon exon : transcript.getTranscript().getExons()) { + // For now, only compute effects on variants within the coding region of an exon. + Range exonRange = Range.closedOpen(exon.getStart(), exon.getEnd()); + if (exonRange.isConnected(codingRange) && + exonRange.intersection(codingRange).isConnected(variantRange) && + !exonRange.intersection(codingRange).intersection(variantRange).isEmpty()) { + // Get the bases which correspond to this exon. + int txOffset = transcript.getPosition().getStart().intValue(); + String exonBases = transcriptBases.substring( + exon.getStart().intValue() - txOffset, exon.getEnd().intValue() - txOffset); + int variantExonOffset = (int) (variantStart - exon.getStart()); + + if (Boolean.TRUE.equals(transcript.getPosition().getReverseStrand())) { + // Normalize the offset and bases to 5' -> 3'. + exonBases = SequenceUtil.reverseComplement(exonBases); + variantExonOffset = (int) (exon.getEnd() - variantEnd); + } + + // Determine the indices for the codon which contains this variant. + if (exon.getFrame() == null) { + LOG.fine("exon lacks frame data, cannot determine effect"); + return null; + } + int offsetWithinCodon = (variantExonOffset + exon.getFrame().getValue()) % 3; + int codonExonOffset = variantExonOffset - offsetWithinCodon; + if (codonExonOffset < 0 || exonBases.length() <= codonExonOffset+3) { + LOG.fine("variant codon spans multiple exons, this case is not yet handled"); + return null; + } + String fromCodon = exonBases.substring(codonExonOffset, codonExonOffset+3); + String toCodon = + fromCodon.substring(0, offsetWithinCodon) + allele + + fromCodon.substring(offsetWithinCodon + 1); + return codonChangeToEffect(fromCodon, toCodon); + } + } + return null; + } + + private static VariantEffect codonChangeToEffect(String fromCodon, String toCodon) { + Preconditions.checkArgument(AMINO_ACIDS.containsKey(fromCodon), "no such codon: " + fromCodon); + Preconditions.checkArgument(AMINO_ACIDS.containsKey(toCodon), "no such codon: " + toCodon); + char fromAA = AMINO_ACIDS.get(fromCodon); + char toAA = AMINO_ACIDS.get(toCodon); + if (fromAA == toAA) { + return VariantEffect.SYNONYMOUS_SNP; + } else if (toAA == STOP) { + return VariantEffect.STOP_GAIN; + } else if (fromAA == STOP) { + return VariantEffect.STOP_LOSS; + } else { + return VariantEffect.NONSYNONYMOUS_SNP; + } + } +} diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java index db9bf55..3156768 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java @@ -1,11 +1,11 @@ /* * Copyright (C) 2014 Google Inc. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under @@ -13,11 +13,6 @@ */ package com.google.cloud.genomics.dataflow.utils; -import java.io.IOException; -import java.security.GeneralSecurityException; -import java.util.List; -import java.util.logging.Logger; - import com.google.api.services.genomics.Genomics; import com.google.api.services.genomics.model.SearchVariantsRequest; import com.google.cloud.dataflow.sdk.options.Default; @@ -28,6 +23,11 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import java.io.IOException; +import java.security.GeneralSecurityException; +import java.util.List; +import java.util.logging.Logger; + /** * A common options class for all pipelines that operate over a single dataset and write their * analysis to a file. @@ -70,38 +70,50 @@ public static void validateOptions(GenomicsDatasetOptions options) { Preconditions.checkArgument(0 < options.getBinSize(), "binSize must be greater than zero"); GenomicsOptions.Methods.validateOptions(options); } - } @Description("The ID of the Google Genomics dataset this pipeline is accessing. " + "Defaults to 1000 Genomes.") @Default.String("10473108253681171589") String getDatasetId(); - void setDatasetId(String datasetId); + @Description("The IDs of the Google Genomics call sets this pipeline is working with, comma " + + "delimited.Defaults to 1000 Genomes HG00261.") + @Default.String("10473108253681171589-0") + String getCallSetIds(); + void setCallSetIds(String callSetIds); + @Description("Path of the file to which to write.") String getOutput(); - void setOutput(String output); + @Description("The IDs of the Google Genomics transcript sets this pipeline is working with, " + + "comma delimited. Defaults to UCSC refGene (hg19).") + @Default.String("CIjfoPXj9LqPlAEQr_GXvYPKvtPqAQ") + String getTranscriptSetIds(); + void setTranscriptSetIds(String transcriptSetIds); + + @Description("The IDs of the Google Genomics variant annotation sets this pipeline is working " + + "with, comma delimited. Defaults to ClinVar (GRCh37).") + @Default.String("CILSqfjtlY6tHxCu1Keyxtu472M") + String getVariantAnnotationSetIds(); + void setVariantAnnotationSetIds(String variantAnnotationSetIds); + @Description("By default, variants analyses will be run on BRCA1. Pass this flag to run on all " + "references present in the dataset. Note that certain jobs such as PCA and IBS " + "will automatically exclude X and Y chromosomes when this option is true.") boolean isAllReferences(); - void setAllReferences(boolean allReferences); @Description("Comma separated tuples of reference:start:end,... Defaults to " + Contig.BRCA1) @Default.String(Contig.BRCA1) String getReferences(); - void setReferences(String references); @Description("The maximum number of bases per shard.") @Default.Long(Contig.DEFAULT_NUMBER_OF_BASES_PER_SHARD) long getBasesPerShard(); - void setBasesPerShard(long basesPerShard); @Description("The maximum number of calls to return. However, at least one variant will always " @@ -117,13 +129,11 @@ public static void validateOptions(GenomicsDatasetOptions options) { + "takes into account non-variant segment records that overlap variants within the dataset.") @Default.Boolean(false) boolean getHasNonVariantSegments(); - void setHasNonVariantSegments(boolean hasNonVariantSegments); @Description("Genomic window \"bin\" size to use for data containing non-variant segments when " + "joining those non-variant segment records with variant records.") @Default.Integer(1000) int getBinSize(); - void setBinSize(int binSize); } diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java index f4c64bf..dca07f9 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java @@ -1,11 +1,11 @@ /* * Copyright (C) 2015 Google Inc. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under @@ -13,9 +13,6 @@ */ package com.google.cloud.genomics.dataflow.utils; -import java.util.Comparator; -import java.util.List; - import com.google.api.services.genomics.model.Call; import com.google.api.services.genomics.model.Variant; import com.google.common.base.Function; @@ -24,6 +21,9 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Ordering; +import java.util.Comparator; +import java.util.List; + public class VariantUtils { public static boolean isVariant(Variant variant) { @@ -31,6 +31,13 @@ public static boolean isVariant(Variant variant) { return !(null == alternateBases || alternateBases.isEmpty()); } + public static final Predicate IS_SNP = new Predicate() { + @Override + public boolean apply(Variant variant) { + return isSnp(variant); + } + }; + public static boolean isSnp(Variant variant) { return isVariant(variant) && LENGTH_IS_1.apply(variant.getReferenceBases()) && Iterables.all(variant.getAlternateBases(), LENGTH_IS_1); diff --git a/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java b/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java new file mode 100644 index 0000000..ac1fb4c --- /dev/null +++ b/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java @@ -0,0 +1,132 @@ +package com.google.cloud.genomics.dataflow.utils; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import com.google.api.services.genomics.model.Annotation; +import com.google.api.services.genomics.model.Int32Value; +import com.google.api.services.genomics.model.RangePosition; +import com.google.api.services.genomics.model.Transcript; +import com.google.api.services.genomics.model.TranscriptCodingSequence; +import com.google.api.services.genomics.model.TranscriptExon; +import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableList; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +import htsjdk.samtools.util.SequenceUtil; + +@RunWith(JUnit4.class) +public class AnnotationUtilsTest { + @Test + public void testDetermineVariantTranscriptEffect_simpleShort() { + Annotation transcript = new Annotation() + .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L)) + .setTranscript(new Transcript() + .setCodingSequence(new TranscriptCodingSequence().setStart(2L).setEnd(9L)) + .setExons(ImmutableList.of( + new TranscriptExon().setStart(2L).setEnd(9L).setFrame(new Int32Value().setValue(0))))); + + assertEquals("GATTACA -> GCTTACA, codon is GAT -> GCT, AA is D -> A", + VariantEffect.NONSYNONYMOUS_SNP, + AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA")); + assertEquals("ATGTGAA -> ATGTGGA, codon is TGA -> TGG, AA is STOP -> W", + VariantEffect.STOP_LOSS, + AnnotationUtils.determineVariantTranscriptEffect(7L, "G", transcript, "ATGTGAA")); + assertEquals("CCCAAAT -> CCCTAAT, codon is AAA -> TAA, AA is K -> STOP", + VariantEffect.STOP_GAIN, + AnnotationUtils.determineVariantTranscriptEffect(5L, "T", transcript, "CCCAAAT")); + assertEquals("GATTACA -> GACTACA, codon is GAT -> GAC, AA is D -> D", + VariantEffect.SYNONYMOUS_SNP, + AnnotationUtils.determineVariantTranscriptEffect(4L, "C", transcript, "GATTACA")); + assertNull("variant does not intersect transcript", + AnnotationUtils.determineVariantTranscriptEffect(123L, "C", transcript, "GATTACA")); + } + + @Test + public void testDetermineVariantTranscriptEffect_reverseStrand() { + Annotation transcript = new Annotation() + .setPosition( + new RangePosition().setReferenceName("1").setStart(2L).setEnd(20L).setReverseStrand(true)) + .setTranscript(new Transcript() + .setCodingSequence(new TranscriptCodingSequence().setStart(3L).setEnd(18L)) + .setExons(ImmutableList.of( + new TranscriptExon().setStart(2L).setEnd(7L).setFrame(new Int32Value().setValue(2)), + new TranscriptExon().setStart(10L).setEnd(20L).setFrame(new Int32Value().setValue(1))) + )); + + String bases = SequenceUtil.reverseComplement( + // First exon [10, 20) + "AC" + // 5' UTR + "ATG" + "ACG" + "GT" + + // intron + "CCC" + + // Second exon [2, 7) + "G" + "TAG" + + "G"); // 3' UTR + assertEquals("ATG -> ACG (reverse complement), AA is M -> T", + VariantEffect.NONSYNONYMOUS_SNP, + AnnotationUtils.determineVariantTranscriptEffect(16L, "G", transcript, bases)); + assertEquals("TAG -> CAG (reverse complement), AA is STOP -> Q", + VariantEffect.STOP_LOSS, + AnnotationUtils.determineVariantTranscriptEffect(5L, "G", transcript, bases)); + assertNull("mutates intron", + AnnotationUtils.determineVariantTranscriptEffect(9L, "C", transcript, bases)); + assertNull("mutates 5' UTR", + AnnotationUtils.determineVariantTranscriptEffect(19L, "C", transcript, bases)); + } + + @Test + public void testDetermineVariantTranscriptEffect_noncoding() { + Annotation transcript = new Annotation() + .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L)) + .setTranscript(new Transcript() + .setExons(ImmutableList.of(new TranscriptExon().setStart(2L).setEnd(9L)))); + + assertNull(AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA")); + assertNull(AnnotationUtils.determineVariantTranscriptEffect(11L, "C", transcript, "GATTACA")); + } + + @Test + public void testDetermineVariantTranscriptEffect_frameless() { + Annotation transcript = new Annotation() + .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L)) + .setTranscript(new Transcript() + .setCodingSequence(new TranscriptCodingSequence().setStart(2L).setEnd(9L)) + .setExons(ImmutableList.of(new TranscriptExon().setStart(2L).setEnd(9L)))); + + assertNull(AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA")); + assertNull(AnnotationUtils.determineVariantTranscriptEffect(11L, "C", transcript, "GATTACA")); + } + + @Test + public void testDetermineVariantTranscriptEffect_multiExon() { + String bases = Strings.repeat("ACTTGGGTCA", 60); + Annotation transcript = new Annotation() + .setPosition(new RangePosition().setReferenceName("1").setStart(100L).setEnd(700L)) + .setTranscript(new Transcript() + .setCodingSequence(new TranscriptCodingSequence().setStart(250L).setEnd(580L)) + .setExons(ImmutableList.of( + new TranscriptExon().setStart(100L).setEnd(180L), + new TranscriptExon().setStart(200L).setEnd(300L).setFrame(new Int32Value().setValue(2)), + new TranscriptExon().setStart(400L).setEnd(500L).setFrame(new Int32Value().setValue(1)), + new TranscriptExon().setStart(550L).setEnd(600L).setFrame(new Int32Value().setValue(0))) + )); + + assertNull("mutates noncoding exon", + AnnotationUtils.determineVariantTranscriptEffect(150L, "C", transcript, bases)); + assertNull("mutates noncoding region of coding exon", + AnnotationUtils.determineVariantTranscriptEffect(240L, "C", transcript, bases)); + assertNull("mutates intron", + AnnotationUtils.determineVariantTranscriptEffect(350L, "C", transcript, bases)); + assertEquals("mutates first coding base, ACT -> TCT", + VariantEffect.NONSYNONYMOUS_SNP, + AnnotationUtils.determineVariantTranscriptEffect(250L, "T", transcript, bases)); + assertEquals("mutates middle exon, TGG -> TCG", + VariantEffect.NONSYNONYMOUS_SNP, + AnnotationUtils.determineVariantTranscriptEffect(454L, "C", transcript, bases)); + } +}