Skip to content
This repository was archived by the owner on Oct 29, 2023. It is now read-only.

Commit 48b0bad

Browse files
committed
Merge pull request #45 from lbergelson/lb_ReadConverterFix
Fixing a few bugs in ReadConverter and adding a new test for Sam -> Read -> Sam conversion.
2 parents d8b04b5 + 0a20aa7 commit 48b0bad

File tree

4 files changed

+72
-20
lines changed

4 files changed

+72
-20
lines changed

pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@
207207
</exclusion>
208208
</exclusions>
209209
</dependency>
210+
<dependency>
211+
<groupId>com.google.cloud.genomics</groupId>
212+
<artifactId>gatk-tools-java</artifactId>
213+
<version>1.0</version>
214+
</dependency>
210215
</dependencies>
211216

212217
<profiles>

src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package com.google.cloud.genomics.dataflow.readers.bam;
1717

18+
import com.google.api.client.util.Maps;
1819
import com.google.api.services.genomics.model.CigarUnit;
1920
import com.google.api.services.genomics.model.LinearAlignment;
2021
import com.google.api.services.genomics.model.Position;
@@ -23,15 +24,12 @@
2324
import com.google.common.collect.BiMap;
2425
import com.google.common.collect.HashBiMap;
2526
import com.google.common.collect.Lists;
26-
27-
import htsjdk.samtools.CigarElement;
28-
import htsjdk.samtools.CigarOperator;
29-
import htsjdk.samtools.SAMException;
30-
import htsjdk.samtools.SAMRecord;
27+
import htsjdk.samtools.*;
3128
import htsjdk.samtools.util.SequenceUtil;
3229

3330
import java.util.ArrayList;
3431
import java.util.List;
32+
import java.util.Map;
3533

3634
/**
3735
* Converts SAMRecords to Reads.
@@ -62,7 +60,7 @@ public static final Read makeRead(final SAMRecord record) {
6260
read.setId(record.getReadName()); // TODO: make more unique
6361
read.setFragmentName(record.getReadName());
6462
read.setReadGroupId(getAttr(record, "RG"));
65-
read.setNumberReads(record.getReadPairedFlag() ? 1 : 2);
63+
read.setNumberReads(record.getReadPairedFlag() ? 2 : 1);
6664
read.setProperPlacement(record.getReadPairedFlag() && record.getProperPairFlag());
6765
if (!record.getReadUnmappedFlag() && record.getAlignmentStart() > 0) {
6866
LinearAlignment alignment = new LinearAlignment();
@@ -97,7 +95,7 @@ public CigarUnit apply(CigarElement c) {
9795
read.setAlignment(alignment);
9896
}
9997
read.setDuplicateFragment(record.getDuplicateReadFlag());
100-
read.setFragmentLength(record.getReadLength());
98+
read.setFragmentLength(record.getInferredInsertSize());
10199
if (record.getReadPairedFlag()) {
102100
if (record.getFirstOfPairFlag()) {
103101
read.setReadNumber(0);
@@ -126,6 +124,12 @@ public CigarUnit apply(CigarElement c) {
126124
read.setAlignedQuality(readBaseQualities);
127125
}
128126

127+
Map<String, List<String>> attributes = Maps.newHashMap();
128+
for( SAMRecord.SAMTagAndValue tagAndValue: record.getAttributes()) {
129+
attributes.put(tagAndValue.tag, Lists.newArrayList(tagAndValue.value.toString()));
130+
}
131+
read.setInfo(attributes);
132+
129133
return read;
130134
}
131135

src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,19 @@
11
package com.google.cloud.genomics.dataflow.readers.bam;
22

33
import com.google.api.services.genomics.model.Read;
4-
5-
import static org.junit.Assert.assertEquals;
6-
import static org.junit.Assert.assertFalse;
7-
import static org.junit.Assert.assertThat;
8-
import static org.junit.Assert.assertTrue;
9-
10-
import java.util.Arrays;
11-
import java.util.Comparator;
12-
import java.util.List;
13-
14-
import org.junit.Before;
4+
import com.google.cloud.genomics.gatk.common.GenomicsConverter;
5+
import htsjdk.samtools.SAMFileHeader;
6+
import htsjdk.samtools.SAMRecord;
7+
import htsjdk.samtools.SamReader;
8+
import htsjdk.samtools.SamReaderFactory;
159
import org.junit.Test;
1610
import org.junit.runner.RunWith;
1711
import org.junit.runners.JUnit4;
1812

19-
import htsjdk.samtools.SAMRecord;
13+
import java.io.File;
14+
import java.io.IOException;
15+
16+
import static org.junit.Assert.assertEquals;
2017

2118
@RunWith(JUnit4.class)
2219
public class ReadConverterTest {
@@ -41,5 +38,22 @@ public void testConversion() {
4138
assertEquals("chr20", read.getNextMatePosition().getReferenceName());
4239
assertEquals((Boolean)true, read.getNextMatePosition().getReverseStrand());
4340
}
44-
41+
42+
@Test
43+
public void SamToReadToSamTest() throws IOException {
44+
String filePath = "src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam";
45+
File samInput = new File(filePath);
46+
SamReader reads = SamReaderFactory.makeDefault().open(samInput);
47+
SAMFileHeader header = reads.getFileHeader();
48+
49+
int numReads = 0;
50+
for (SAMRecord sam : reads){
51+
Read read = ReadConverter.makeRead(sam);
52+
SAMRecord newSam = GenomicsConverter.makeSAMRecord(read, header );
53+
assertEquals(newSam.getSAMString(), sam.getSAMString());
54+
numReads++;
55+
}
56+
assertEquals(19, numReads);//sanity check to make sure we actually read the file
57+
}
58+
4559
}

src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
@HD VN:1.0 SO:coordinate
2+
@SQ SN:chr1 LN:101
3+
@SQ SN:chr2 LN:101
4+
@SQ SN:chr3 LN:101
5+
@SQ SN:chr4 LN:101
6+
@SQ SN:chr5 LN:101
7+
@SQ SN:chr6 LN:101
8+
@SQ SN:chr7 LN:202
9+
@SQ SN:chr8 LN:202
10+
@RG ID:0 SM:Hi,Momma! LB:whatever PU:me PL:ILLUMINA
11+
SL-XAV:1:1:0:764#0/1 89 chr1 1 255 101M * 0 0 TTCATGCTGANGCNCTCTTACGATCGTACAGATGCAAATATTAACANNCNTTNAAGNNCANNNNNNNNNCAATACAATANTAGAGTACGTNAACACTCCAN &/,&-.1/6/&&)&).)/,&0768)&/.,/874,&.4137572)&/&&,&1-&.0/&&*,&&&&&&&&&&18775799,&16:8775-56256/69::;0& RG:Z:0 NN:Z:Hello
12+
SL-XAV:1:1:0:1668#0/2 153 chr2 1 255 101M * 0 0 CATCTCTACANGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATACTTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTNGACACCTTTN (/,'-/'0////(1'&&1&&&&'2''-6/,/3-33653.6:1'.86/-++32.-4864653/5/583/346423203+28888644446688456/4880& RG:Z:0 NN:Z:Goodbye
13+
SL-XAV:1:1:0:1914#0/2 153 chr3 1 255 101M * 0 0 CGTATGCGCTNTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAATAAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGNAATGTGCAAN (0,7&&*/*0*,)10/).-*&.&*/6669.&-337599;3,&,6/.,5::999987893+387020775777547999::668997448:::9;999::0& RG:Z:0
14+
SL-XAV:1:1:0:1639#0/2 153 chr4 1 255 101M * 0 0 CGTGATACCANCTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATATTTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGNTTTGCAGCCN '.&.&&'.0+01'2(1'(''-)','+0041/.+032;:867115/5267-.0/)-5.&-26200224,,0+0/0275/5605688::646875568882*& RG:Z:0
15+
SL-XAV:1:1:0:68#0/2 137 chr5 1 255 101M * 0 0 NTCTCATTTANAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTTCATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCANGACGTTATCT &1<<999;;;;<<<87579:556972789977444.'.023.&,7621/54.49.)/53055-22--''+(.'-))6-168/(3&&0(<).))*&&&&&'0 RG:Z:0
16+
SL-XAV:1:1:0:700#0/2 137 chr6 1 255 101M * 0 0 NAATTGTTCTNAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACAATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACNAGTGTCGATC &0::887::::6/646::838388811/679:87640&./2+/-4/28:3,536/4''&&.78/(/554/./02*)*',-(57()&.6(6:(0601'/(,* RG:Z:0
17+
SL-XAV:1:1:0:1721#0/1 83 chr7 1 255 101M = 102 40 CAACAGAAGGNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCGAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
18+
SL-XAV:1:1:0:105#0/2 403 chr7 1 255 101M = 102 79 CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0
19+
SL-XAV:1:1:0:1721#0/2 163 chr7 102 255 101M = 1 -40 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0
20+
SL-XAV:1:1:0:105#0/2 147 chr8 1 255 101M = 102 79 CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0
21+
SL-XAV:1:1:0:105#0/1 99 chr8 102 255 101M = 1 -79 NCAGGTTCAANTGTGCAGCCCNTTTTGAGAGATNNNNNNNNTGNNCTGNAANANNGACACAGCTATTCCTAAGATGACAAGATCAGANAANAAGTCAAGCA &06665578::41.*/7577/&/77403-324.&&&&&&&&/.&&..&&.0&&&&',:9:/-/(55002020+3'12+2/&.2-&//&),&*&&&&&&&51 RG:Z:0
22+
SL-XAV:1:1:0:1300#0/1 77 * 0 0 * * 0 0 NAAACACAAGNNANAGTCTTANCNGCTATTCCNNNNNNNNNCTNNNCTNAGNANNACATACAACAGTATCCACACAAGTGTACTCGTNCANACATGTGAAC &*5535)*-,,&.&.*-1)*,&'&)&1&&.,)&&&&&&&&&)0&&&0'&&&&.&&*2'/4''0/**&)&,'-&*,&,&&&.0.&)&&&**&,.&&&')&&) RG:Z:0
23+
SL-XAV:1:1:0:1300#0/2 141 * 0 0 * * 0 0 NGATCATGGANGACTCTCCCCATCCCCCGCTCCAGCGCTCAGTTATATGCCTAGCCTCGGACACGTCACCAACATCTCACGCACTCTGCANAGTCTCTCAC &&'+''3*&-/)/1'26/*-2-/542-*&-&/'/*/&-'&)-')&.'-/&&2+122*'&+,(/-&)((,/-,,.'2(2'+)/&/&-66-&&/16&)&*&'3 RG:Z:0
24+
SL-XAV:1:1:0:1639#0/1 101 * 0 0 * chr1 1 0 NCCCTCTCAGNNTNTCTGCCANANCCTTAAGCNNNNNNNNNTANNNCTNAANCNNAAACTTTTGCCTCAGGCATCCGCAGAATGTTTNTCNGCCTATATCG &1::::::64/&/&0:3.280&/&087881,/&&&&&&&&&..&&&..&,,&-&&,265341-)/5680&-.5552-25/322/42/&)&&).421&-&-/ RG:Z:0
25+
SL-XAV:1:1:0:1668#0/1 101 * 0 0 * chr2 1 0 NATAGCATACNNTNCATTGGANTNCAGCACAANNNNNNNNNTGNNNCANTNNANNCCTTTGAGATCGGAAGAGCGGTTCAGCAGGAANNCNCAGACCGATC &1988998890&0&.8863//&.&.0-2875.&&&&&&&&&.)&&&..&.&&.&&.5782-2+262)&-0-0510*.332-2.-,0*&&*&'.&-2-)0., RG:Z:0
26+
SL-XAV:1:1:0:1914#0/1 101 * 0 0 * chr3 1 0 NTTTTTCTCCNNCNGTGCCTANTNTAGCCCCTNNNNNNNNNAANNNATNANNANNTTTACTTAAAAAACTGAAACTAGTAATGTGCANNANATCGNAAGAG &0::::<<;90&/&.244760&,&.414798/&&&&&&&&&00&&&0.&/&&-&&.4475687363504.&.557/.*)65.&/*./&&.&.+*)&..).& RG:Z:0
27+
SL-XAV:1:1:0:68#0/1 581 * 0 0 * chr4 1 0 NAATATTCATNNGNTCAGCCTNTNCATTAATTNNNNNNNNNTTNNNATNATNANNTTTTTTATAACCATTTATAAATGAGAGAGATCNTANCACAATATCA &0<<:::::</&&&.73'290&.&0;:::90&&&&&&&&&&..&&&0)&0-&0&&&.743799995253348597921.,.'050.*&.0&)*)&&&&*). RG:Z:0
28+
SL-XAV:1:1:0:700#0/1 581 * 0 0 * chr5 1 0 NGAAGCCCATNNTNGTGTTACNCNCCTGGAAGNNNNNNNNNACNNNGANACNTNNAACAATTCAGATCGGAAGAGCGGTTCAGCAGANNTNCCGAGACCGA &.88888:88/&0&,03189.&/&.8/))12/&&&&&&&&&./&&&&.&1.&)&&/35962/6432-3&),0&/2+0,),61&-6,&&&'&/,.0&...)0 RG:Z:0
29+
SL-XAV:1:1:0:764#0/2 165 * 0 0 * chr6 1 0 NACAGATGCANATATTAACAGGCTTTAAAGGACAGATGGACTGCAATACAATAATAGAGTACGTCAACACTCCACAGATCGCTAGAGCATNACATCGGTGT &/:5358::9999::99998255::7275,,/5567-'+387537857:54-4.51'31059547320;73/720+22.4(6.;((.;(;8()(''&&2&& RG:Z:0

0 commit comments

Comments
 (0)