jpayne@68
|
1 package bloom;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.ArrayList;
|
jpayne@68
|
4 import java.util.Locale;
|
jpayne@68
|
5
|
jpayne@68
|
6 import dna.AminoAcid;
|
jpayne@68
|
7 import fileIO.ReadWrite;
|
jpayne@68
|
8 import shared.Timer;
|
jpayne@68
|
9 import stream.ConcurrentGenericReadInputStream;
|
jpayne@68
|
10 import stream.FastqReadInputStream;
|
jpayne@68
|
11 import stream.Read;
|
jpayne@68
|
12 import structures.ListNum;
|
jpayne@68
|
13
|
jpayne@68
|
14 /**
|
jpayne@68
|
15 * @author Brian Bushnell
|
jpayne@68
|
16 * @date Jul 6, 2012
|
jpayne@68
|
17 *
|
jpayne@68
|
18 */
|
jpayne@68
|
19 public class LargeKmerCount {
|
jpayne@68
|
20
|
jpayne@68
|
21 public static void main(String[] args){
|
jpayne@68
|
22
|
jpayne@68
|
23 Timer t=new Timer();
|
jpayne@68
|
24
|
jpayne@68
|
25 String fname1=args[0];
|
jpayne@68
|
26 String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null);
|
jpayne@68
|
27 int indexbits=Integer.parseInt(args[args.length-3]);
|
jpayne@68
|
28 int cbits=Integer.parseInt(args[args.length-2]);
|
jpayne@68
|
29 int k=Integer.parseInt(args[args.length-1]);
|
jpayne@68
|
30
|
jpayne@68
|
31 KCountArray2 count=countFastq(fname1, fname2, indexbits, cbits, k);
|
jpayne@68
|
32 t.stop();
|
jpayne@68
|
33 System.out.println("Finished counting; time = "+t);
|
jpayne@68
|
34
|
jpayne@68
|
35 long[] freq=count.transformToFrequency();
|
jpayne@68
|
36
|
jpayne@68
|
37 // System.out.println(count+"\n");
|
jpayne@68
|
38 // System.out.println(Arrays.toString(freq)+"\n");
|
jpayne@68
|
39
|
jpayne@68
|
40 long sum=sum(freq);
|
jpayne@68
|
41 System.out.println("Kmer fraction:");
|
jpayne@68
|
42 int lim1=8, lim2=16;
|
jpayne@68
|
43 for(int i=0; i<lim1; i++){
|
jpayne@68
|
44 String prefix=i+"";
|
jpayne@68
|
45 while(prefix.length()<8){prefix=prefix+" ";}
|
jpayne@68
|
46 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
|
jpayne@68
|
47 }
|
jpayne@68
|
48 while(lim1<=freq.length){
|
jpayne@68
|
49 int x=0;
|
jpayne@68
|
50 for(int i=lim1; i<lim2; i++){
|
jpayne@68
|
51 x+=freq[i];
|
jpayne@68
|
52 }
|
jpayne@68
|
53 String prefix=lim1+"-"+(lim2-1);
|
jpayne@68
|
54 if(lim2>=freq.length){prefix=lim1+"+";}
|
jpayne@68
|
55 while(prefix.length()<8){prefix=prefix+" ";}
|
jpayne@68
|
56 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
|
jpayne@68
|
57 lim1*=2;
|
jpayne@68
|
58 lim2=min(lim2*2, freq.length);
|
jpayne@68
|
59 }
|
jpayne@68
|
60
|
jpayne@68
|
61 long sum2=sum-freq[0];
|
jpayne@68
|
62 long x=freq[1];
|
jpayne@68
|
63 System.out.println();
|
jpayne@68
|
64 System.out.println("Unique: \t \t"+sum2);
|
jpayne@68
|
65 System.out.println("CollisionsA:\t \t"+collisionsA);
|
jpayne@68
|
66 System.out.println("CollisionsB:\t \t"+collisionsB);
|
jpayne@68
|
67
|
jpayne@68
|
68 double modifier=(collisionsB)/(double)(32*collisionsA+8*collisionsB);
|
jpayne@68
|
69
|
jpayne@68
|
70 System.out.println("Estimate: \t \t"+(sum2+collisionsA+collisionsB-(long)(collisionsA*modifier)));
|
jpayne@68
|
71 System.out.println();
|
jpayne@68
|
72 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
|
jpayne@68
|
73 x=sum2-x;
|
jpayne@68
|
74 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
|
jpayne@68
|
75
|
jpayne@68
|
76 }
|
jpayne@68
|
77
|
jpayne@68
|
78 public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){
|
jpayne@68
|
79 assert(indexbits>=1 && indexbits<40);
|
jpayne@68
|
80 collisionsA=0;
|
jpayne@68
|
81 collisionsB=0;
|
jpayne@68
|
82 final long cells=1L<<indexbits;
|
jpayne@68
|
83 final int kbits=ROTATE_DIST*k;
|
jpayne@68
|
84 final int xorShift=kbits%64;
|
jpayne@68
|
85 final long[] rotMasks=makeRotMasks(xorShift);
|
jpayne@68
|
86 final int[] buffer=new int[k];
|
jpayne@68
|
87 if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);}
|
jpayne@68
|
88 if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));}
|
jpayne@68
|
89 final KCountArray2 count=new KCountArray2(cells, cbits);
|
jpayne@68
|
90
|
jpayne@68
|
91 FastqReadInputStream fris1=new FastqReadInputStream(reads1, false);
|
jpayne@68
|
92 FastqReadInputStream fris2=(reads2==null ? null : new FastqReadInputStream(reads2, false));
|
jpayne@68
|
93 ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, maxReads);
|
jpayne@68
|
94
|
jpayne@68
|
95 cris.start();
|
jpayne@68
|
96 System.err.println("Started cris");
|
jpayne@68
|
97 boolean paired=cris.paired();
|
jpayne@68
|
98 System.err.println("Paired: "+paired);
|
jpayne@68
|
99
|
jpayne@68
|
100 long kmer=0; //current kmer
|
jpayne@68
|
101 int len=0; //distance since last contig start or ambiguous base
|
jpayne@68
|
102
|
jpayne@68
|
103 {
|
jpayne@68
|
104 ListNum<Read> ln=cris.nextList();
|
jpayne@68
|
105 ArrayList<Read> reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
106
|
jpayne@68
|
107 if(reads!=null && !reads.isEmpty()){
|
jpayne@68
|
108 Read r=reads.get(0);
|
jpayne@68
|
109 assert(paired==(r.mate!=null));
|
jpayne@68
|
110 }
|
jpayne@68
|
111
|
jpayne@68
|
112 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
|
jpayne@68
|
113 //System.err.println("reads.size()="+reads.size());
|
jpayne@68
|
114 for(Read r : reads){
|
jpayne@68
|
115 readsProcessed++;
|
jpayne@68
|
116
|
jpayne@68
|
117 len=0;
|
jpayne@68
|
118 kmer=0;
|
jpayne@68
|
119 byte[] bases=r.bases;
|
jpayne@68
|
120 byte[] quals=r.quality;
|
jpayne@68
|
121
|
jpayne@68
|
122 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
123 byte b=bases[i];
|
jpayne@68
|
124 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
125 int x2=buffer[len%buffer.length];
|
jpayne@68
|
126 buffer[len%buffer.length]=x;
|
jpayne@68
|
127 if(x<0){
|
jpayne@68
|
128 len=0;
|
jpayne@68
|
129 kmer=0;
|
jpayne@68
|
130 }else{
|
jpayne@68
|
131 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
|
jpayne@68
|
132 len++;
|
jpayne@68
|
133 if(len>=k){
|
jpayne@68
|
134 if(len>k){kmer=kmer^rotMasks[x2];}
|
jpayne@68
|
135 long hashcode=kmer&0x7fffffffffffffffL;
|
jpayne@68
|
136 long code1=hashcode%(cells-3);
|
jpayne@68
|
137 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
|
jpayne@68
|
138 int value=count.increment2(code1, 1);
|
jpayne@68
|
139 long temp=count.read(code2);
|
jpayne@68
|
140 if(temp>0){
|
jpayne@68
|
141 if(value==0){collisionsA++;}
|
jpayne@68
|
142 else{collisionsB++;}
|
jpayne@68
|
143 }
|
jpayne@68
|
144 }
|
jpayne@68
|
145 }
|
jpayne@68
|
146 }
|
jpayne@68
|
147
|
jpayne@68
|
148
|
jpayne@68
|
149 if(r.mate!=null){
|
jpayne@68
|
150 len=0;
|
jpayne@68
|
151 kmer=0;
|
jpayne@68
|
152 bases=r.mate.bases;
|
jpayne@68
|
153 quals=r.mate.quality;
|
jpayne@68
|
154 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
155 byte b=bases[i];
|
jpayne@68
|
156 int x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
157 int x2=buffer[len%buffer.length];
|
jpayne@68
|
158 buffer[len%buffer.length]=x;
|
jpayne@68
|
159 if(x<0){
|
jpayne@68
|
160 len=0;
|
jpayne@68
|
161 kmer=0;
|
jpayne@68
|
162 }else{
|
jpayne@68
|
163 kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
|
jpayne@68
|
164 len++;
|
jpayne@68
|
165 if(len>=k){
|
jpayne@68
|
166 if(len>k){kmer=kmer^rotMasks[x2];}
|
jpayne@68
|
167 long hashcode=kmer&0x7fffffffffffffffL;
|
jpayne@68
|
168 long code1=hashcode%(cells-3);
|
jpayne@68
|
169 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
|
jpayne@68
|
170 int value=count.increment2(code1, 1);
|
jpayne@68
|
171 long temp=count.read(code2);
|
jpayne@68
|
172 if(temp>0){
|
jpayne@68
|
173 if(value==0){collisionsA++;}
|
jpayne@68
|
174 else{collisionsB++;}
|
jpayne@68
|
175 }
|
jpayne@68
|
176 }
|
jpayne@68
|
177 }
|
jpayne@68
|
178 }
|
jpayne@68
|
179 }
|
jpayne@68
|
180
|
jpayne@68
|
181 }
|
jpayne@68
|
182 //System.err.println("returning list");
|
jpayne@68
|
183 cris.returnList(ln);
|
jpayne@68
|
184 //System.err.println("fetching list");
|
jpayne@68
|
185 ln=cris.nextList();
|
jpayne@68
|
186 reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
187 }
|
jpayne@68
|
188 System.err.println("Finished reading");
|
jpayne@68
|
189 cris.returnList(ln);
|
jpayne@68
|
190 System.err.println("Returned list");
|
jpayne@68
|
191 ReadWrite.closeStream(cris);
|
jpayne@68
|
192 System.err.println("Closed stream");
|
jpayne@68
|
193 System.err.println("Processed "+readsProcessed+" reads.");
|
jpayne@68
|
194 }
|
jpayne@68
|
195
|
jpayne@68
|
196 return count;
|
jpayne@68
|
197 }
|
jpayne@68
|
198
|
jpayne@68
|
199 public static final long[] makeRotMasks(int rotDist){
|
jpayne@68
|
200 long[] masks=new long[4];
|
jpayne@68
|
201 for(long i=0; i<4; i++){
|
jpayne@68
|
202 masks[(int)i]=Long.rotateLeft(i, rotDist);
|
jpayne@68
|
203 }
|
jpayne@68
|
204 return masks;
|
jpayne@68
|
205 }
|
jpayne@68
|
206
|
jpayne@68
|
207 public static long[] transformToFrequency(int[] count){
|
jpayne@68
|
208 long[] freq=new long[2000];
|
jpayne@68
|
209 int max=freq.length-1;
|
jpayne@68
|
210 for(int i=0; i<count.length; i++){
|
jpayne@68
|
211 int x=count[i];
|
jpayne@68
|
212 x=min(x, max);
|
jpayne@68
|
213 freq[x]++;
|
jpayne@68
|
214 }
|
jpayne@68
|
215 return freq;
|
jpayne@68
|
216 }
|
jpayne@68
|
217
|
jpayne@68
|
218 public static long sum(int[] array){
|
jpayne@68
|
219 long x=0;
|
jpayne@68
|
220 for(int y : array){x+=y;}
|
jpayne@68
|
221 return x;
|
jpayne@68
|
222 }
|
jpayne@68
|
223
|
jpayne@68
|
224 public static long sum(long[] array){
|
jpayne@68
|
225 long x=0;
|
jpayne@68
|
226 for(long y : array){x+=y;}
|
jpayne@68
|
227 return x;
|
jpayne@68
|
228 }
|
jpayne@68
|
229
|
jpayne@68
|
230 public static final int min(int x, int y){return x<y ? x : y;}
|
jpayne@68
|
231 public static final int max(int x, int y){return x>y ? x : y;}
|
jpayne@68
|
232
|
jpayne@68
|
233 public static boolean verbose=true;
|
jpayne@68
|
234 public static byte minQuality=-5;
|
jpayne@68
|
235 public static long readsProcessed=0;
|
jpayne@68
|
236 public static long maxReads=1000000L;
|
jpayne@68
|
237 public static final int ROTATE_DIST=2;
|
jpayne@68
|
238
|
jpayne@68
|
239 public static long collisionsA=0;
|
jpayne@68
|
240 public static long collisionsB=0;
|
jpayne@68
|
241
|
jpayne@68
|
242 }
|