Skip to content
This repository was archived by the owner on Oct 29, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
<dependency>
<groupId>com.google.apis</groupId>
<artifactId>google-api-services-genomics</artifactId>
<version>v1beta2-rev9-1.19.0</version>
<version>v1beta2-rev25-1.19.1</version>
<exclusions>
<!-- Exclude an old version of guava which is being pulled
in by a transitive dependency google-api-client 1.19.0 -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ public static PCollection<Variant> joinVariantsTransform(
PCollection<SearchVariantsRequest> input, GenomicsFactory.OfflineAuth auth, String fields) {
for (String field : REQUIRED_FIELDS) {
Preconditions
.checkState(
.checkArgument(
fields.contains(field),
"Required field missing: %s Add this field to the list of Variants fields returned in the partial response.",
"Required field missing: '%s' Add this field to the list of Variants fields returned in the partial response.",
field);
}
return joinVariants(input, auth, fields);
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ private static PCollection<Read> getReads() throws IOException {
private static PCollection<Read> getReadsFromAPI() {
List<SearchReadsRequest> requests = getReadRequests(options);
PCollection<SearchReadsRequest> readRequests =
DataflowWorkarounds.getPCollection(requests, p, options.getNumWorkers());
DataflowWorkarounds.getPCollection(requests, p);
PCollection<Read> reads =
readRequests.apply(
ParDo.of(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

public class IdentityByState {
private static final String VARIANT_FIELDS =
"nextPageToken,variants(id,calls(genotype,callSetName))";
"nextPageToken,variants(start,calls(genotype,callSetName))";

public static void main(String[] args) throws IOException, GeneralSecurityException,
InstantiationException, IllegalAccessException {
Expand All @@ -64,7 +64,7 @@ public static void main(String[] args) throws IOException, GeneralSecurityExcept
Pipeline p = Pipeline.create(options);
DataflowWorkarounds.registerGenomicsCoders(p);
PCollection<SearchVariantsRequest> input =
DataflowWorkarounds.getPCollection(requests, p, options.getNumWorkers());
DataflowWorkarounds.getPCollection(requests, p);

PCollection<Variant> variants =
options.getHasNonVariantSegments()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
*/
public class TransmissionProbability {
private static final String VARIANT_FIELDS
= "nextPageToken,variants(id,names,calls(info,callSetName))";
= "nextPageToken,variants(id,start,names,calls(info,callSetName))";

public static void main(String[] args) throws IOException, GeneralSecurityException {
GenomicsDatasetOptions options = PipelineOptionsFactory.fromArgs(args)
Expand All @@ -65,7 +65,7 @@ public static void main(String[] args) throws IOException, GeneralSecurityExcept
// - Groups Transmission sources by Variant,
// - Calculate transmission Probability for each variant
// - Print calculated values to a file.
DataflowWorkarounds.getPCollection(requests, p, options.getNumWorkers())
DataflowWorkarounds.getPCollection(requests, p)
.apply(ParDo.named("VariantReader")
.of(new VariantReader(auth, ShardBoundary.STRICT, VARIANT_FIELDS)))
.apply(ParDo.named("ExtractFamilyVariantStatus")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
*/
public class VariantSimilarity {
private static final String VARIANT_FIELDS
= "nextPageToken,variants(id,calls(genotype,callSetName))";
= "nextPageToken,variants(start,calls(genotype,callSetName))";

public static void main(String[] args) throws IOException, GeneralSecurityException {
GenomicsDatasetOptions options = PipelineOptionsFactory.fromArgs(args)
Expand All @@ -54,7 +54,7 @@ public static void main(String[] args) throws IOException, GeneralSecurityExcept
Pipeline p = Pipeline.create(options);
DataflowWorkarounds.registerGenomicsCoders(p);
DataflowWorkarounds
.getPCollection(requests, p, options.getNumWorkers())
.getPCollection(requests, p)
.apply(
ParDo.named("VariantReader").of(
new VariantReader(auth, ShardBoundary.STRICT, VARIANT_FIELDS)))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

import com.google.api.client.json.GenericJson;
import com.google.api.services.genomics.Genomics;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.genomics.utils.GenomicsFactory;

import java.io.IOException;
Expand All @@ -31,18 +33,33 @@ public abstract class GenomicsApiReader<I extends GenericJson, O extends Generic
// Used for access to the genomics API
protected final GenomicsFactory.OfflineAuth auth;
protected final String fields;

protected Aggregator<Integer> initializedRequestsCount;
protected Aggregator<Integer> unsuccessfulResponsesCount;
protected Aggregator<Integer> ioExceptionsCount;
protected Aggregator<Long> itemCount;

public GenomicsApiReader(GenomicsFactory.OfflineAuth auth, String fields) {
this.auth = auth;
this.fields = fields;
}

@Override
public void startBundle(Context c) {
initializedRequestsCount = c.createAggregator("Genomics API Initialized Request Count", new Sum.SumIntegerFn());
unsuccessfulResponsesCount = c.createAggregator("Genomics API Unsuccessful Response Count", new Sum.SumIntegerFn());
ioExceptionsCount = c.createAggregator("Genomics API IOException Response Count", new Sum.SumIntegerFn());
itemCount = c.createAggregator("Genomics API Item Count", new Sum.SumLongFn());
}

@Override
public void processElement(ProcessContext c) {
try {
GenomicsFactory factory = auth.getDefaultFactory();
processApiCall(auth.getGenomics(factory), c, c.element());

initializedRequestsCount.addValue(factory.initializedRequestsCount());
unsuccessfulResponsesCount.addValue(factory.unsuccessfulResponsesCount());
ioExceptionsCount.addValue(factory.ioExceptionsCount());
LOG.info("ApiReader processed " + factory.initializedRequestsCount() + " requests ("
+ factory.unsuccessfulResponsesCount() + " server errors and "
+ factory.ioExceptionsCount() + " IO exceptions)");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ protected void processApiCall(Genomics genomics, ProcessContext c, SearchReadsRe

for (Read read : Paginator.Reads.create(genomics, shardBoundary).search(request, fields)) {
c.output(read);
itemCount.addValue(1L);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public class VariantReader extends GenomicsApiReader<SearchVariantsRequest, Vari

/**
* Create a VariantReader using a auth and fields parameter. All fields not specified under
* readFields will not be returned in the API response.
* variantFields will not be returned in the API response.
*
* @param auth Auth class containing credentials.
* @param variantFields Fields to return in responses.
Expand All @@ -52,12 +52,13 @@ public VariantReader(GenomicsFactory.OfflineAuth auth, ShardBoundary shardBounda
@Override
protected void processApiCall(Genomics genomics, ProcessContext c, SearchVariantsRequest request)
throws IOException {
LOG.info("Starting Variants read loop");
LOG.info("Starting Variants read loop: " + request);

int numberOfVariants = 0;
for (Variant variant : Paginator.Variants.create(genomics, shardBoundary).search(request, fields)) {
c.output(variant);
++numberOfVariants;
itemCount.addValue(1L);
}

LOG.info("Read " + numberOfVariants + " variants at: " + request.getReferenceName() + "-" + "["
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,20 @@
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRegistrar.Options;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionList;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.primitives.UnsignedInts;

import org.reflections.Reflections;
import org.reflections.scanners.ResourcesScanner;
import org.reflections.scanners.SubTypesScanner;
Expand Down Expand Up @@ -107,14 +113,32 @@ public boolean apply(URL url) {
* multiple workers. In the future, this shouldn't be necessary.
*/
public static <T> PCollection<T> getPCollection(List<T> shardOptions, Coder<T> coder,
Pipeline p, double numWorkers) {
Pipeline p) {

DataflowPipelineOptions options = (GenomicsOptions) p.getOptions();
int numWorkers = options.getNumWorkers();
String machineType = options.getMachineType();

if (machineType != null) {
String[] machineNameParts =
Iterables.toArray(Splitter.on('-').split(machineType), String.class);
if (3 == machineNameParts.length) {
try {
int numCores = UnsignedInts.parseUnsignedInt(machineNameParts[2]);
numWorkers *= numCores;
} catch (Exception e) {
LOG.warning("Assuming one core per worker: " + e);
}
}
}

LOG.info("Turning " + shardOptions.size() + " options into " + numWorkers + " workers");
numWorkers = Math.min(shardOptions.size(), numWorkers);

int optionsPerWorker = (int) Math.ceil(shardOptions.size() / numWorkers);
List<PCollection<T>> pCollections = Lists.newArrayList();


for (int i = 0; i < numWorkers; i++) {
int start = i * optionsPerWorker;
int end = Math.min(shardOptions.size(), start + optionsPerWorker);
Expand All @@ -137,7 +161,7 @@ public static <T> PCollection<T> getPCollection(List<T> shardOptions, Coder<T> c
}

public static <T> PCollection<T> getPCollection(
List<T> shardOptions, Pipeline p, double numWorkers) {
return getPCollection(shardOptions, null, p, numWorkers);
List<T> shardOptions, Pipeline p) {
return getPCollection(shardOptions, null, p);
}
}
Loading