diff --git a/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java
new file mode 100644
index 0000000..b05922b
--- /dev/null
+++ b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java
@@ -0,0 +1,316 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.cloud.genomics.dataflow.pipelines;
+
+import com.google.api.client.util.Strings;
+import com.google.api.services.genomics.Genomics;
+import com.google.api.services.genomics.model.Annotation;
+import com.google.api.services.genomics.model.AnnotationSet;
+import com.google.api.services.genomics.model.ListBasesResponse;
+import com.google.api.services.genomics.model.QueryRange;
+import com.google.api.services.genomics.model.RangePosition;
+import com.google.api.services.genomics.model.SearchAnnotationsRequest;
+import com.google.api.services.genomics.model.Variant;
+import com.google.api.services.genomics.model.VariantAnnotation;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.genomics.dataflow.utils.AnnotationUtils;
+import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect;
+import com.google.cloud.genomics.dataflow.utils.DataflowWorkarounds;
+import com.google.cloud.genomics.dataflow.utils.GenomicsDatasetOptions;
+import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
+import com.google.cloud.genomics.dataflow.utils.VariantUtils;
+import com.google.cloud.genomics.utils.Contig;
+import com.google.cloud.genomics.utils.Contig.SexChromosomeFilter;
+import com.google.cloud.genomics.utils.GenomicsFactory;
+import com.google.cloud.genomics.utils.Paginator;
+import com.google.cloud.genomics.utils.Paginator.ShardBoundary;
+import com.google.common.base.Joiner;
+import com.google.common.base.Stopwatch;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Range;
+
+import htsjdk.samtools.util.IntervalTree;
+import htsjdk.samtools.util.IntervalTree.Node;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Logger;
+
+/**
+ * Demonstrates a simple variant annotation program which takes
+ * {@code VariantSet}s and {@code AnnotationSet}s as input and emits
+ * {@code VariantAnnotation}s. This program is intended to serve as an example
+ * of how a variant annotation program can be structured to run in parallel,
+ * operating over Google Genomics input sources; the current output has limited
+ * biological significance.
+ *
+ * Currently only annotates SNPs and uses the following two methods:
+ *
+ * - Determines the effect of the provided variants on the provided
+ * transcripts, if any. A result is only emitted for a subset of cases where
+ * the variant appears to cause a coding change in the transcript.
+ *
- Performs an exact match join on the provided existing variant
+ * annotations, if any.
+ *
+ */
+public final class AnnotateVariants extends DoFn> {
+ private static final Logger LOG = Logger.getLogger(AnnotateVariants.class.getName());
+ private static final int VARIANTS_PAGE_SIZE = 5000;
+ private static final String VARIANT_FIELDS
+ = "nextPageToken,variants(id,referenceName,start,end,alternateBases,referenceBases)";
+
+ private final GenomicsFactory.OfflineAuth auth;
+ private final String varsetId;
+ private final List callSetIds, transcriptSetIds, variantAnnotationSetIds;
+ private final Map, String> refBaseCache;
+
+ public AnnotateVariants(GenomicsFactory.OfflineAuth auth, String varsetId,
+ List callSetIds, List transcriptSetIds,
+ List variantAnnotationSetIds) {
+ this.auth = auth;
+ this.varsetId = varsetId;
+ this.callSetIds = callSetIds;
+ this.transcriptSetIds = transcriptSetIds;
+ this.variantAnnotationSetIds = variantAnnotationSetIds;
+ refBaseCache = Maps.newHashMap();
+ }
+
+ @Override
+ public void processElement(
+ DoFn>.ProcessContext c) throws Exception {
+ Genomics genomics = auth.getGenomics(auth.getDefaultFactory());
+
+ Contig contig = c.element();
+ LOG.info("processing contig " + contig);
+ Iterable varIter = FluentIterable
+ .from(Paginator.Variants.create(genomics, ShardBoundary.STRICT).search(
+ contig.getVariantsRequest(varsetId)
+ // TODO: Variants-only retrieval is not well support currently. For now
+ // we parameterize by CallSet for performance.
+ .setCallSetIds(callSetIds)
+ .setPageSize(VARIANTS_PAGE_SIZE),
+ VARIANT_FIELDS))
+ .filter(VariantUtils.IS_SNP);
+ if (!varIter.iterator().hasNext()) {
+ LOG.info("region has no variants, skipping");
+ return;
+ }
+
+ IntervalTree transcripts = retrieveTranscripts(genomics, contig);
+ ListMultimap, Annotation> variantAnnotations =
+ retrieveVariantAnnotations(genomics, contig);
+
+ Stopwatch stopwatch = Stopwatch.createStarted();
+ int varCount = 0;
+ for (Variant variant : varIter) {
+ List alleles = ImmutableList.builder()
+ .addAll(variant.getAlternateBases())
+ .add(variant.getReferenceBases())
+ .build();
+ Range pos = Range.openClosed(variant.getStart(), variant.getEnd());
+ for (String allele : alleles) {
+ String outKey = Joiner.on(":").join(
+ variant.getReferenceName(), variant.getStart(), allele, variant.getId());
+ for (Annotation match : variantAnnotations.get(pos)) {
+ if (allele.equals(match.getVariant().getAlternateBases())) {
+ // Exact match to a known variant annotation; straightforward join.
+ c.output(KV.of(outKey, match.getVariant()));
+ }
+ }
+
+ Iterator> transcriptIter = transcripts.overlappers(
+ pos.lowerEndpoint().intValue(), pos.upperEndpoint().intValue() - 1); // Inclusive.
+ while (transcriptIter.hasNext()) {
+ // Calculate an effect of this allele on the coding region of the given transcript.
+ Annotation transcript = transcriptIter.next().getValue();
+ VariantEffect effect = AnnotationUtils.determineVariantTranscriptEffect(
+ variant.getStart(), allele, transcript,
+ getCachedTranscriptBases(genomics, transcript));
+ if (effect != null && !VariantEffect.SYNONYMOUS_SNP.equals(effect)) {
+ c.output(KV.of(outKey, new VariantAnnotation()
+ .setAlternateBases(allele)
+ .setType("SNP")
+ .setEffect(effect.toString())
+ .setGeneId(transcript.getTranscript().getGeneId())
+ .setTranscriptIds(ImmutableList.of(transcript.getId()))));
+ }
+ }
+ }
+ varCount++;
+ if (varCount%1e3 == 0) {
+ LOG.info(String.format("read %d variants (%.2f / s)",
+ varCount, (double)varCount / stopwatch.elapsed(TimeUnit.SECONDS)));
+ }
+ }
+ LOG.info("finished reading " + varCount + " variants in " + stopwatch);
+ }
+
+ private ListMultimap, Annotation> retrieveVariantAnnotations(
+ Genomics genomics, Contig contig) {
+ Stopwatch stopwatch = Stopwatch.createStarted();
+ ListMultimap, Annotation> annotationMap = ArrayListMultimap.create();
+ Iterable annotationIter =
+ Paginator.Annotations.create(genomics, ShardBoundary.OVERLAPS).search(
+ new SearchAnnotationsRequest()
+ .setAnnotationSetIds(variantAnnotationSetIds)
+ .setRange(new QueryRange()
+ .setReferenceName(canonicalizeRefName(contig.referenceName))
+ .setStart(contig.start)
+ .setEnd(contig.end)));
+ for (Annotation annotation : annotationIter) {
+ RangePosition pos = annotation.getPosition();
+ long start = 0;
+ if (pos.getStart() != null) {
+ start = pos.getStart();
+ }
+ annotationMap.put(Range.closedOpen(start, pos.getEnd()), annotation);
+ }
+ LOG.info(String.format("read %d variant annotations in %s (%.2f / s)", annotationMap.size(),
+ stopwatch, (double)annotationMap.size() / stopwatch.elapsed(TimeUnit.SECONDS)));
+ return annotationMap;
+ }
+
+ private IntervalTree retrieveTranscripts(Genomics genomics, Contig contig) {
+ Stopwatch stopwatch = Stopwatch.createStarted();
+ IntervalTree transcripts = new IntervalTree<>();
+ Iterable transcriptIter =
+ Paginator.Annotations.create(genomics, ShardBoundary.OVERLAPS).search(
+ new SearchAnnotationsRequest()
+ .setAnnotationSetIds(transcriptSetIds)
+ .setRange(new QueryRange()
+ .setReferenceName(canonicalizeRefName(contig.referenceName))
+ .setStart(contig.start)
+ .setEnd(contig.end)));
+ for (Annotation annotation : transcriptIter) {
+ RangePosition pos = annotation.getPosition();
+ transcripts.put(pos.getStart().intValue(), pos.getEnd().intValue(), annotation);
+ }
+ LOG.info(String.format("read %d transcripts in %s (%.2f / s)", transcripts.size(),
+ stopwatch, (double)transcripts.size() / stopwatch.elapsed(TimeUnit.SECONDS)));
+ return transcripts;
+ }
+
+ private String getCachedTranscriptBases(Genomics genomics, Annotation transcript)
+ throws IOException {
+ RangePosition pos = transcript.getPosition();
+ Range rng = Range.closedOpen(pos.getStart(), pos.getEnd());
+ if (!refBaseCache.containsKey(rng)) {
+ refBaseCache.put(rng, retrieveReferenceBases(genomics, pos));
+ }
+ return refBaseCache.get(rng);
+ }
+
+ private String retrieveReferenceBases(Genomics genomics, RangePosition pos) throws IOException {
+ StringBuilder b = new StringBuilder();
+ String pageToken = "";
+ while (true) {
+ // TODO: Support full request parameterization for Paginator.References.Bases.
+ ListBasesResponse response = genomics.references().bases()
+ .list(pos.getReferenceId())
+ .setStart(pos.getStart())
+ .setEnd(pos.getEnd())
+ .setPageToken(pageToken)
+ .execute();
+ b.append(response.getSequence());
+ pageToken = response.getNextPageToken();
+ if (Strings.isNullOrEmpty(pageToken)) {
+ break;
+ }
+ }
+ return b.toString();
+ }
+
+ private static String canonicalizeRefName(String refName) {
+ return refName.replace("chr", "");
+ }
+
+ public static void main(String[] args) throws Exception {
+ GenomicsDatasetOptions opts = PipelineOptionsFactory.fromArgs(args)
+ .withValidation().as(GenomicsDatasetOptions.class);
+ GenomicsFactory.OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(opts);
+ Genomics genomics = auth.getGenomics(auth.getDefaultFactory());
+
+ List callSetIds = ImmutableList.of();
+ if (!Strings.isNullOrEmpty(opts.getCallSetIds().trim())) {
+ callSetIds = ImmutableList.copyOf(opts.getCallSetIds().split(","));
+ }
+ List transcriptSetIds =
+ validateAnnotationSetsFlag(genomics, opts.getTranscriptSetIds(), "TRANSCRIPT");
+ List variantAnnotationSetIds =
+ validateAnnotationSetsFlag(genomics, opts.getVariantAnnotationSetIds(), "VARIANT");
+ validateRefsetForAnnotationSets(genomics, transcriptSetIds);
+
+ Iterable contigs = opts.isAllReferences()
+ ? Contig.getContigsInVariantSet(
+ genomics, opts.getDatasetId(), SexChromosomeFilter.INCLUDE_XY)
+ : Contig.parseContigsFromCommandLine(opts.getReferences());
+
+ Pipeline p = Pipeline.create(opts);
+ DataflowWorkarounds.registerGenomicsCoders(p);
+ p.begin()
+ .apply(Create.of(contigs))
+ .apply(ParDo.of(new AnnotateVariants(auth,
+ opts.getDatasetId(), callSetIds, transcriptSetIds, variantAnnotationSetIds)))
+ .apply(GroupByKey.create())
+ .apply(ParDo.of(new DoFn>, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(c.element().getKey() + ": " + c.element().getValue());
+ }
+ }))
+ .apply(TextIO.Write.to(opts.getOutput()));
+ p.run();
+ }
+
+ private static void validateRefsetForAnnotationSets(
+ Genomics genomics, List annosetIds) throws IOException {
+ String refsetId = null;
+ for (String annosetId : annosetIds) {
+ String gotId = genomics.annotationSets().get(annosetId).execute().getReferenceSetId();
+ if (refsetId == null) {
+ refsetId = gotId;
+ } else if (!refsetId.equals(gotId)) {
+ throw new IllegalArgumentException("want consistent reference sets across the provided " +
+ "annotation sets, got " + refsetId + " and " + gotId);
+ }
+ }
+ }
+
+ private static List validateAnnotationSetsFlag(
+ Genomics genomics, String flagValue, String wantType) throws IOException {
+ List annosetIds = ImmutableList.copyOf(flagValue.split(","));
+ for (String annosetId : annosetIds) {
+ AnnotationSet annoset = genomics.annotationSets().get(annosetId).execute();
+ if (!wantType.equals(annoset.getType())) {
+ throw new IllegalArgumentException("annotation set " + annosetId + " has type " +
+ annoset.getType() + ", wanted type " + wantType);
+ }
+ }
+ return annosetIds;
+ }
+}
diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java
new file mode 100644
index 0000000..a52adc0
--- /dev/null
+++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtils.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.cloud.genomics.dataflow.utils;
+
+import com.google.api.client.util.Preconditions;
+import com.google.api.services.genomics.model.Annotation;
+import com.google.api.services.genomics.model.TranscriptExon;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Range;
+
+import htsjdk.samtools.util.SequenceUtil;
+
+import java.util.Map;
+import java.util.logging.Logger;
+
+/**
+ * Utilities for dealing with genomics {@link Annotation}s. Provides some very
+ * basic functions for determining the effect of a given mutation on a
+ * transcript.
+ */
+public class AnnotationUtils {
+ private static final Logger LOG = Logger.getLogger(AnnotationUtils.class.getName());
+ private static final char STOP = '*';
+ private static final Map AMINO_ACIDS =
+ ImmutableMap.builder()
+ .put("TTT", 'F').put("TTC", 'F').put("TTA", 'L').put("TTG", 'L')
+ .put("TCT", 'S').put("TCC", 'S').put("TCA", 'S').put("TCG", 'S')
+ .put("TAT", 'Y').put("TAC", 'Y').put("TAA", STOP).put("TAG", STOP)
+ .put("TGT", 'C').put("TGC", 'C').put("TGA", STOP).put("TGG", 'W')
+ .put("CTT", 'L').put("CTC", 'L').put("CTA", 'L').put("CTG", 'L')
+ .put("CCT", 'P').put("CCC", 'P').put("CCA", 'P').put("CCG", 'P')
+ .put("CAT", 'H').put("CAC", 'H').put("CAA", 'Q').put("CAG", 'Q')
+ .put("CGT", 'R').put("CGC", 'R').put("CGA", 'R').put("CGG", 'R')
+ .put("ATT", 'I').put("ATC", 'I').put("ATA", 'I').put("ATG", 'M')
+ .put("ACT", 'T').put("ACC", 'T').put("ACA", 'T').put("ACG", 'T')
+ .put("AAT", 'N').put("AAC", 'N').put("AAA", 'K').put("AAG", 'K')
+ .put("AGT", 'S').put("AGC", 'S').put("AGA", 'R').put("AGG", 'R')
+ .put("GTT", 'V').put("GTC", 'V').put("GTA", 'V').put("GTG", 'V')
+ .put("GCT", 'A').put("GCC", 'A').put("GCA", 'A').put("GCG", 'A')
+ .put("GAT", 'D').put("GAC", 'D').put("GAA", 'E').put("GAG", 'E')
+ .put("GGT", 'G').put("GGC", 'G').put("GGA", 'G').put("GGG", 'G').build();
+
+ private AnnotationUtils() {}
+
+ /**
+ * Describes the effect of a given variant on a transcript. Subset of the
+ * annotations API field {@code VariantAnnotation.effect}.
+ */
+ public enum VariantEffect { SYNONYMOUS_SNP, STOP_GAIN, STOP_LOSS, NONSYNONYMOUS_SNP }
+
+ /**
+ * Determines the effect of the given allele at the given position within a
+ * transcript. Currently, this routine is relatively primitive: it only works
+ * with SNPs and only computes effects on coding regions of the transcript.
+ * This utility currently does not handle any splice sites well, including
+ * splice site disruption and codons which span two exons.
+ *
+ * @param variantStart 0-based absolute start for the variant.
+ * @param allele Bases to substitute at {@code variantStart}.
+ * @param transcript The affected transcript.
+ * @param transcriptBases The reference bases which span the provided
+ * {@code transcript}. Must match the exact length of the
+ * {@code transcript.position}.
+ * @return The effect of this variant on the given transcript, or null if
+ * unknown.
+ */
+ public static VariantEffect determineVariantTranscriptEffect(
+ long variantStart, String allele, Annotation transcript, String transcriptBases) {
+ long txLen = transcript.getPosition().getEnd() - transcript.getPosition().getStart();
+ Preconditions.checkArgument(transcriptBases.length() == txLen,
+ "transcriptBases must have equal length to the transcript; got " +
+ transcriptBases.length() + " and " + txLen + ", respectively");
+ if (allele.length() != 1) {
+ LOG.fine("determineVariantTranscriptEffects() only supports SNPs, ignoring " + allele);
+ return null;
+ }
+ if (transcript.getTranscript().getCodingSequence() == null) {
+ LOG.fine("determineVariantTranscriptEffects() only supports intersection with coding " +
+ "transcript, ignoring ");
+ return null;
+ }
+
+ long variantEnd = variantStart + 1;
+ Range variantRange = Range.closedOpen(variantStart, variantEnd);
+ Range codingRange = Range.closedOpen(
+ transcript.getTranscript().getCodingSequence().getStart(),
+ transcript.getTranscript().getCodingSequence().getEnd());
+ if (Boolean.TRUE.equals(transcript.getPosition().getReverseStrand())) {
+ allele = SequenceUtil.reverseComplement(allele);
+ }
+ for (TranscriptExon exon : transcript.getTranscript().getExons()) {
+ // For now, only compute effects on variants within the coding region of an exon.
+ Range exonRange = Range.closedOpen(exon.getStart(), exon.getEnd());
+ if (exonRange.isConnected(codingRange) &&
+ exonRange.intersection(codingRange).isConnected(variantRange) &&
+ !exonRange.intersection(codingRange).intersection(variantRange).isEmpty()) {
+ // Get the bases which correspond to this exon.
+ int txOffset = transcript.getPosition().getStart().intValue();
+ String exonBases = transcriptBases.substring(
+ exon.getStart().intValue() - txOffset, exon.getEnd().intValue() - txOffset);
+ int variantExonOffset = (int) (variantStart - exon.getStart());
+
+ if (Boolean.TRUE.equals(transcript.getPosition().getReverseStrand())) {
+ // Normalize the offset and bases to 5' -> 3'.
+ exonBases = SequenceUtil.reverseComplement(exonBases);
+ variantExonOffset = (int) (exon.getEnd() - variantEnd);
+ }
+
+ // Determine the indices for the codon which contains this variant.
+ if (exon.getFrame() == null) {
+ LOG.fine("exon lacks frame data, cannot determine effect");
+ return null;
+ }
+ int offsetWithinCodon = (variantExonOffset + exon.getFrame().getValue()) % 3;
+ int codonExonOffset = variantExonOffset - offsetWithinCodon;
+ if (codonExonOffset < 0 || exonBases.length() <= codonExonOffset+3) {
+ LOG.fine("variant codon spans multiple exons, this case is not yet handled");
+ return null;
+ }
+ String fromCodon = exonBases.substring(codonExonOffset, codonExonOffset+3);
+ String toCodon =
+ fromCodon.substring(0, offsetWithinCodon) + allele +
+ fromCodon.substring(offsetWithinCodon + 1);
+ return codonChangeToEffect(fromCodon, toCodon);
+ }
+ }
+ return null;
+ }
+
+ private static VariantEffect codonChangeToEffect(String fromCodon, String toCodon) {
+ Preconditions.checkArgument(AMINO_ACIDS.containsKey(fromCodon), "no such codon: " + fromCodon);
+ Preconditions.checkArgument(AMINO_ACIDS.containsKey(toCodon), "no such codon: " + toCodon);
+ char fromAA = AMINO_ACIDS.get(fromCodon);
+ char toAA = AMINO_ACIDS.get(toCodon);
+ if (fromAA == toAA) {
+ return VariantEffect.SYNONYMOUS_SNP;
+ } else if (toAA == STOP) {
+ return VariantEffect.STOP_GAIN;
+ } else if (fromAA == STOP) {
+ return VariantEffect.STOP_LOSS;
+ } else {
+ return VariantEffect.NONSYNONYMOUS_SNP;
+ }
+ }
+}
diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java
index db9bf55..3156768 100644
--- a/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java
+++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/GenomicsDatasetOptions.java
@@ -1,11 +1,11 @@
/*
* Copyright (C) 2014 Google Inc.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
@@ -13,11 +13,6 @@
*/
package com.google.cloud.genomics.dataflow.utils;
-import java.io.IOException;
-import java.security.GeneralSecurityException;
-import java.util.List;
-import java.util.logging.Logger;
-
import com.google.api.services.genomics.Genomics;
import com.google.api.services.genomics.model.SearchVariantsRequest;
import com.google.cloud.dataflow.sdk.options.Default;
@@ -28,6 +23,11 @@
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.security.GeneralSecurityException;
+import java.util.List;
+import java.util.logging.Logger;
+
/**
* A common options class for all pipelines that operate over a single dataset and write their
* analysis to a file.
@@ -70,38 +70,50 @@ public static void validateOptions(GenomicsDatasetOptions options) {
Preconditions.checkArgument(0 < options.getBinSize(), "binSize must be greater than zero");
GenomicsOptions.Methods.validateOptions(options);
}
-
}
@Description("The ID of the Google Genomics dataset this pipeline is accessing. "
+ "Defaults to 1000 Genomes.")
@Default.String("10473108253681171589")
String getDatasetId();
-
void setDatasetId(String datasetId);
+ @Description("The IDs of the Google Genomics call sets this pipeline is working with, comma "
+ + "delimited.Defaults to 1000 Genomes HG00261.")
+ @Default.String("10473108253681171589-0")
+ String getCallSetIds();
+ void setCallSetIds(String callSetIds);
+
@Description("Path of the file to which to write.")
String getOutput();
-
void setOutput(String output);
+ @Description("The IDs of the Google Genomics transcript sets this pipeline is working with, "
+ + "comma delimited. Defaults to UCSC refGene (hg19).")
+ @Default.String("CIjfoPXj9LqPlAEQr_GXvYPKvtPqAQ")
+ String getTranscriptSetIds();
+ void setTranscriptSetIds(String transcriptSetIds);
+
+ @Description("The IDs of the Google Genomics variant annotation sets this pipeline is working "
+ + "with, comma delimited. Defaults to ClinVar (GRCh37).")
+ @Default.String("CILSqfjtlY6tHxCu1Keyxtu472M")
+ String getVariantAnnotationSetIds();
+ void setVariantAnnotationSetIds(String variantAnnotationSetIds);
+
@Description("By default, variants analyses will be run on BRCA1. Pass this flag to run on all "
+ "references present in the dataset. Note that certain jobs such as PCA and IBS "
+ "will automatically exclude X and Y chromosomes when this option is true.")
boolean isAllReferences();
-
void setAllReferences(boolean allReferences);
@Description("Comma separated tuples of reference:start:end,... Defaults to " + Contig.BRCA1)
@Default.String(Contig.BRCA1)
String getReferences();
-
void setReferences(String references);
@Description("The maximum number of bases per shard.")
@Default.Long(Contig.DEFAULT_NUMBER_OF_BASES_PER_SHARD)
long getBasesPerShard();
-
void setBasesPerShard(long basesPerShard);
@Description("The maximum number of calls to return. However, at least one variant will always "
@@ -117,13 +129,11 @@ public static void validateOptions(GenomicsDatasetOptions options) {
+ "takes into account non-variant segment records that overlap variants within the dataset.")
@Default.Boolean(false)
boolean getHasNonVariantSegments();
-
void setHasNonVariantSegments(boolean hasNonVariantSegments);
@Description("Genomic window \"bin\" size to use for data containing non-variant segments when "
+ "joining those non-variant segment records with variant records.")
@Default.Integer(1000)
int getBinSize();
-
void setBinSize(int binSize);
}
diff --git a/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java b/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java
index f4c64bf..dca07f9 100644
--- a/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java
+++ b/src/main/java/com/google/cloud/genomics/dataflow/utils/VariantUtils.java
@@ -1,11 +1,11 @@
/*
* Copyright (C) 2015 Google Inc.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
@@ -13,9 +13,6 @@
*/
package com.google.cloud.genomics.dataflow.utils;
-import java.util.Comparator;
-import java.util.List;
-
import com.google.api.services.genomics.model.Call;
import com.google.api.services.genomics.model.Variant;
import com.google.common.base.Function;
@@ -24,6 +21,9 @@
import com.google.common.collect.Iterables;
import com.google.common.collect.Ordering;
+import java.util.Comparator;
+import java.util.List;
+
public class VariantUtils {
public static boolean isVariant(Variant variant) {
@@ -31,6 +31,13 @@ public static boolean isVariant(Variant variant) {
return !(null == alternateBases || alternateBases.isEmpty());
}
+ public static final Predicate IS_SNP = new Predicate() {
+ @Override
+ public boolean apply(Variant variant) {
+ return isSnp(variant);
+ }
+ };
+
public static boolean isSnp(Variant variant) {
return isVariant(variant) && LENGTH_IS_1.apply(variant.getReferenceBases())
&& Iterables.all(variant.getAlternateBases(), LENGTH_IS_1);
diff --git a/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java b/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java
new file mode 100644
index 0000000..ac1fb4c
--- /dev/null
+++ b/src/test/java/com/google/cloud/genomics/dataflow/utils/AnnotationUtilsTest.java
@@ -0,0 +1,132 @@
+package com.google.cloud.genomics.dataflow.utils;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import com.google.api.services.genomics.model.Annotation;
+import com.google.api.services.genomics.model.Int32Value;
+import com.google.api.services.genomics.model.RangePosition;
+import com.google.api.services.genomics.model.Transcript;
+import com.google.api.services.genomics.model.TranscriptCodingSequence;
+import com.google.api.services.genomics.model.TranscriptExon;
+import com.google.cloud.genomics.dataflow.utils.AnnotationUtils.VariantEffect;
+import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableList;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import htsjdk.samtools.util.SequenceUtil;
+
+@RunWith(JUnit4.class)
+public class AnnotationUtilsTest {
+ @Test
+ public void testDetermineVariantTranscriptEffect_simpleShort() {
+ Annotation transcript = new Annotation()
+ .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L))
+ .setTranscript(new Transcript()
+ .setCodingSequence(new TranscriptCodingSequence().setStart(2L).setEnd(9L))
+ .setExons(ImmutableList.of(
+ new TranscriptExon().setStart(2L).setEnd(9L).setFrame(new Int32Value().setValue(0)))));
+
+ assertEquals("GATTACA -> GCTTACA, codon is GAT -> GCT, AA is D -> A",
+ VariantEffect.NONSYNONYMOUS_SNP,
+ AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA"));
+ assertEquals("ATGTGAA -> ATGTGGA, codon is TGA -> TGG, AA is STOP -> W",
+ VariantEffect.STOP_LOSS,
+ AnnotationUtils.determineVariantTranscriptEffect(7L, "G", transcript, "ATGTGAA"));
+ assertEquals("CCCAAAT -> CCCTAAT, codon is AAA -> TAA, AA is K -> STOP",
+ VariantEffect.STOP_GAIN,
+ AnnotationUtils.determineVariantTranscriptEffect(5L, "T", transcript, "CCCAAAT"));
+ assertEquals("GATTACA -> GACTACA, codon is GAT -> GAC, AA is D -> D",
+ VariantEffect.SYNONYMOUS_SNP,
+ AnnotationUtils.determineVariantTranscriptEffect(4L, "C", transcript, "GATTACA"));
+ assertNull("variant does not intersect transcript",
+ AnnotationUtils.determineVariantTranscriptEffect(123L, "C", transcript, "GATTACA"));
+ }
+
+ @Test
+ public void testDetermineVariantTranscriptEffect_reverseStrand() {
+ Annotation transcript = new Annotation()
+ .setPosition(
+ new RangePosition().setReferenceName("1").setStart(2L).setEnd(20L).setReverseStrand(true))
+ .setTranscript(new Transcript()
+ .setCodingSequence(new TranscriptCodingSequence().setStart(3L).setEnd(18L))
+ .setExons(ImmutableList.of(
+ new TranscriptExon().setStart(2L).setEnd(7L).setFrame(new Int32Value().setValue(2)),
+ new TranscriptExon().setStart(10L).setEnd(20L).setFrame(new Int32Value().setValue(1)))
+ ));
+
+ String bases = SequenceUtil.reverseComplement(
+ // First exon [10, 20)
+ "AC" + // 5' UTR
+ "ATG" + "ACG" + "GT" +
+ // intron
+ "CCC" +
+ // Second exon [2, 7)
+ "G" + "TAG" +
+ "G"); // 3' UTR
+ assertEquals("ATG -> ACG (reverse complement), AA is M -> T",
+ VariantEffect.NONSYNONYMOUS_SNP,
+ AnnotationUtils.determineVariantTranscriptEffect(16L, "G", transcript, bases));
+ assertEquals("TAG -> CAG (reverse complement), AA is STOP -> Q",
+ VariantEffect.STOP_LOSS,
+ AnnotationUtils.determineVariantTranscriptEffect(5L, "G", transcript, bases));
+ assertNull("mutates intron",
+ AnnotationUtils.determineVariantTranscriptEffect(9L, "C", transcript, bases));
+ assertNull("mutates 5' UTR",
+ AnnotationUtils.determineVariantTranscriptEffect(19L, "C", transcript, bases));
+ }
+
+ @Test
+ public void testDetermineVariantTranscriptEffect_noncoding() {
+ Annotation transcript = new Annotation()
+ .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L))
+ .setTranscript(new Transcript()
+ .setExons(ImmutableList.of(new TranscriptExon().setStart(2L).setEnd(9L))));
+
+ assertNull(AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA"));
+ assertNull(AnnotationUtils.determineVariantTranscriptEffect(11L, "C", transcript, "GATTACA"));
+ }
+
+ @Test
+ public void testDetermineVariantTranscriptEffect_frameless() {
+ Annotation transcript = new Annotation()
+ .setPosition(new RangePosition().setReferenceName("1").setStart(2L).setEnd(9L))
+ .setTranscript(new Transcript()
+ .setCodingSequence(new TranscriptCodingSequence().setStart(2L).setEnd(9L))
+ .setExons(ImmutableList.of(new TranscriptExon().setStart(2L).setEnd(9L))));
+
+ assertNull(AnnotationUtils.determineVariantTranscriptEffect(3L, "C", transcript, "GATTACA"));
+ assertNull(AnnotationUtils.determineVariantTranscriptEffect(11L, "C", transcript, "GATTACA"));
+ }
+
+ @Test
+ public void testDetermineVariantTranscriptEffect_multiExon() {
+ String bases = Strings.repeat("ACTTGGGTCA", 60);
+ Annotation transcript = new Annotation()
+ .setPosition(new RangePosition().setReferenceName("1").setStart(100L).setEnd(700L))
+ .setTranscript(new Transcript()
+ .setCodingSequence(new TranscriptCodingSequence().setStart(250L).setEnd(580L))
+ .setExons(ImmutableList.of(
+ new TranscriptExon().setStart(100L).setEnd(180L),
+ new TranscriptExon().setStart(200L).setEnd(300L).setFrame(new Int32Value().setValue(2)),
+ new TranscriptExon().setStart(400L).setEnd(500L).setFrame(new Int32Value().setValue(1)),
+ new TranscriptExon().setStart(550L).setEnd(600L).setFrame(new Int32Value().setValue(0)))
+ ));
+
+ assertNull("mutates noncoding exon",
+ AnnotationUtils.determineVariantTranscriptEffect(150L, "C", transcript, bases));
+ assertNull("mutates noncoding region of coding exon",
+ AnnotationUtils.determineVariantTranscriptEffect(240L, "C", transcript, bases));
+ assertNull("mutates intron",
+ AnnotationUtils.determineVariantTranscriptEffect(350L, "C", transcript, bases));
+ assertEquals("mutates first coding base, ACT -> TCT",
+ VariantEffect.NONSYNONYMOUS_SNP,
+ AnnotationUtils.determineVariantTranscriptEffect(250L, "T", transcript, bases));
+ assertEquals("mutates middle exon, TGG -> TCG",
+ VariantEffect.NONSYNONYMOUS_SNP,
+ AnnotationUtils.determineVariantTranscriptEffect(454L, "C", transcript, bases));
+ }
+}