Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount4.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package bloom; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.Locale; | |
5 | |
6 import dna.AminoAcid; | |
7 import fileIO.FileFormat; | |
8 import shared.Timer; | |
9 import stream.ConcurrentReadInputStream; | |
10 import stream.FastaReadInputStream; | |
11 import stream.Read; | |
12 import structures.ListNum; | |
13 | |
14 /** | |
15 * @author Brian Bushnell | |
16 * @date Jul 5, 2012 | |
17 * | |
18 */ | |
19 public class KmerCount4 extends KmerCountAbstract { | |
20 | |
21 public static void main(String[] args){ | |
22 | |
23 Timer t=new Timer(); | |
24 | |
25 String fname1=args[0]; | |
26 String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null); | |
27 int k=14; | |
28 int cbits=16; | |
29 int gap=0; | |
30 | |
31 for(int i=(fname2==null ? 1 : 2); i<args.length; i++){ | |
32 final String arg=args[i]; | |
33 final String[] split=arg.split("="); | |
34 String a=split[0].toLowerCase(); | |
35 String b=split.length>1 ? split[1] : null; | |
36 | |
37 if(a.equals("k") || a.equals("kmer")){ | |
38 k=Integer.parseInt(b); | |
39 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ | |
40 cbits=Integer.parseInt(b); | |
41 }else if(a.startsWith("gap")){ | |
42 gap=Integer.parseInt(b); | |
43 }else{ | |
44 throw new RuntimeException("Unknown parameter "+args[i]); | |
45 } | |
46 } | |
47 | |
48 KCountArray2 count=null; | |
49 | |
50 if(fileIO.FileFormat.hasFastaExtension(fname1)){ | |
51 assert(!FastaReadInputStream.SPLIT_READS); | |
52 FastaReadInputStream.MIN_READ_LEN=k; | |
53 } | |
54 | |
55 if(gap==0){ | |
56 count=count(fname1, fname2, k, cbits, true); | |
57 }else{ | |
58 count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); | |
59 } | |
60 | |
61 | |
62 t.stop(); | |
63 System.out.println("Finished counting; time = "+t); | |
64 | |
65 printStatistics(count); | |
66 | |
67 } | |
68 | |
69 public static void printStatistics(KCountArray2 count){ | |
70 long[] freq=count.transformToFrequency(); | |
71 | |
72 // System.out.println(count+"\n"); | |
73 // System.out.println(Arrays.toString(freq)+"\n"); | |
74 | |
75 long sum=sum(freq); | |
76 System.out.println("Kmer fraction:"); | |
77 int lim1=8, lim2=16; | |
78 for(int i=0; i<lim1; i++){ | |
79 String prefix=i+""; | |
80 while(prefix.length()<8){prefix=prefix+" ";} | |
81 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); | |
82 } | |
83 while(lim1<=freq.length){ | |
84 int x=0; | |
85 for(int i=lim1; i<lim2; i++){ | |
86 x+=freq[i]; | |
87 } | |
88 String prefix=lim1+"-"+(lim2-1); | |
89 if(lim2>=freq.length){prefix=lim1+"+";} | |
90 while(prefix.length()<8){prefix=prefix+" ";} | |
91 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); | |
92 lim1*=2; | |
93 lim2=min(lim2*2, freq.length); | |
94 } | |
95 | |
96 long sum2=sum-freq[0]; | |
97 long x=freq[1]; | |
98 System.out.println(); | |
99 System.out.println("Keys Counted: \t \t"+keysCounted); | |
100 System.out.println("Unique: \t \t"+sum2); | |
101 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); | |
102 System.out.println(); | |
103 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
104 x=sum2-x; | |
105 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); | |
106 } | |
107 | |
108 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){ | |
109 return count(reads1, reads2, k, cbits, rcomp, null); | |
110 } | |
111 | |
112 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){ | |
113 assert(k>=1 && k<20); | |
114 final int kbits=2*k; | |
115 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); | |
116 | |
117 if(count==null){ | |
118 final long cells=1L<<kbits; | |
119 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} | |
120 count=new KCountArray2(cells, cbits); | |
121 } | |
122 | |
123 final ConcurrentReadInputStream cris; | |
124 { | |
125 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
126 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
127 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
128 cris.start(); //4567 | |
129 } | |
130 | |
131 assert(cris!=null) : reads1; | |
132 System.err.println("Started cris"); | |
133 boolean paired=cris.paired(); | |
134 if(verbose){System.err.println("Paired: "+paired);} | |
135 | |
136 ListNum<Read> ln=cris.nextList(); | |
137 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
138 | |
139 if(reads!=null && !reads.isEmpty()){ | |
140 Read r=reads.get(0); | |
141 assert(paired==(r.mate!=null)); | |
142 } | |
143 | |
144 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
145 //System.err.println("reads.size()="+reads.size()); | |
146 for(Read r : reads){ | |
147 readsProcessed++; | |
148 | |
149 addRead(r, count, k, mask, rcomp); | |
150 if(r.mate!=null){ | |
151 addRead(r.mate, count, k, mask, rcomp); | |
152 } | |
153 | |
154 } | |
155 //System.err.println("returning list"); | |
156 cris.returnList(ln); | |
157 //System.err.println("fetching list"); | |
158 ln=cris.nextList(); | |
159 reads=(ln!=null ? ln.list : null); | |
160 } | |
161 if(verbose){System.err.println("Finished reading");} | |
162 cris.returnList(ln); | |
163 if(verbose){System.err.println("Returned list");} | |
164 cris.close(); | |
165 if(verbose){System.err.println("Closed stream");} | |
166 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
167 | |
168 | |
169 return count; | |
170 } | |
171 | |
172 public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){ | |
173 assert(k1+k2>=1 && k1+k2<20); | |
174 assert(gap>=0); | |
175 final int kbits1=2*k1; | |
176 final int kbits2=2*k2; | |
177 final long mask1=~((-1L)<<(kbits1)); | |
178 final long mask2=~((-1L)<<(kbits2)); | |
179 | |
180 if(count==null){ | |
181 final long cells=1L<<(kbits1+kbits2); | |
182 if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ | |
183 ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} | |
184 count=new KCountArray2(cells, cbits, gap); | |
185 } | |
186 assert(count.gap==gap); | |
187 | |
188 final ConcurrentReadInputStream cris; | |
189 { | |
190 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); | |
191 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); | |
192 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); | |
193 cris.start(); //4567 | |
194 } | |
195 | |
196 assert(cris!=null) : reads1; | |
197 System.err.println("Started cris"); | |
198 boolean paired=cris.paired(); | |
199 if(verbose){System.err.println("Paired: "+paired);} | |
200 | |
201 ListNum<Read> ln=cris.nextList(); | |
202 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
203 | |
204 if(reads!=null && !reads.isEmpty()){ | |
205 Read r=reads.get(0); | |
206 assert(paired==(r.mate!=null)); | |
207 } | |
208 | |
209 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
210 //System.err.println("reads.size()="+reads.size()); | |
211 for(Read r : reads){ | |
212 readsProcessed++; | |
213 | |
214 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); | |
215 if(r.mate!=null){ | |
216 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); | |
217 } | |
218 | |
219 } | |
220 //System.err.println("returning list"); | |
221 cris.returnList(ln); | |
222 //System.err.println("fetching list"); | |
223 ln=cris.nextList(); | |
224 reads=(ln!=null ? ln.list : null); | |
225 } | |
226 if(verbose){System.err.println("Finished reading");} | |
227 cris.returnList(ln); | |
228 if(verbose){System.err.println("Returned list");} | |
229 cris.close(); | |
230 if(verbose){System.err.println("Closed stream");} | |
231 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} | |
232 | |
233 | |
234 return count; | |
235 } | |
236 | |
237 public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){ | |
238 int len=0; | |
239 long kmer=0; | |
240 byte[] bases=r.bases; | |
241 byte[] quals=r.quality; | |
242 for(int i=0; i<bases.length; i++){ | |
243 byte b=bases[i]; | |
244 int x=AminoAcid.baseToNumber[b]; | |
245 if(x<0 || (quals!=null && quals[i]<minQuality)){ | |
246 len=0; | |
247 kmer=0; | |
248 }else{ | |
249 kmer=((kmer<<2)|x)&mask; | |
250 len++; | |
251 if(len>=k){ | |
252 keysCounted++; | |
253 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
254 count.increment(kmer, 1); | |
255 // System.out.println(" -> "+count.read(kmer)); | |
256 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
257 // array[(int)kmer]++; | |
258 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
259 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
260 } | |
261 } | |
262 } | |
263 if(rcomp){ | |
264 r.reverseComplement(); | |
265 addRead(r, count, k, mask, false); | |
266 } | |
267 } | |
268 | |
269 public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
270 int len=0; | |
271 int shift=k2*2; | |
272 long kmer1=0; | |
273 long kmer2=0; | |
274 byte[] bases=r.bases; | |
275 byte[] quals=r.quality; | |
276 | |
277 assert(kmer1>=kmer2); | |
278 | |
279 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
280 | |
281 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
282 int x1=AminoAcid.baseToNumber[bases[i]]; | |
283 int x2=AminoAcid.baseToNumber[bases[j]]; | |
284 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
285 len=0; | |
286 kmer1=0; | |
287 kmer2=0; | |
288 }else{ | |
289 kmer1=((kmer1<<2)|x1)&mask1; | |
290 kmer2=((kmer2<<2)|x2)&mask2; | |
291 len++; | |
292 if(len>=k1){ | |
293 keysCounted++; | |
294 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
295 | |
296 long key=(kmer1<<shift)|kmer2; | |
297 // System.err.println(Long.toHexString(key)); | |
298 count.increment(key, 1); | |
299 // System.out.println(" -> "+count.read(kmer)); | |
300 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
301 // array[(int)kmer]++; | |
302 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
303 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
304 } | |
305 } | |
306 } | |
307 if(rcomp){ | |
308 r.reverseComplement(); | |
309 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); | |
310 } | |
311 } | |
312 | |
313 public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ | |
314 int len=0; | |
315 int shift=k2*2; | |
316 long kmer1=0; | |
317 long kmer2=0; | |
318 byte[] quals=null; | |
319 | |
320 assert(kmer1>=kmer2); | |
321 | |
322 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; | |
323 | |
324 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ | |
325 int x1=AminoAcid.baseToNumber[bases[i]]; | |
326 int x2=AminoAcid.baseToNumber[bases[j]]; | |
327 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ | |
328 len=0; | |
329 kmer1=0; | |
330 kmer2=0; | |
331 }else{ | |
332 kmer1=((kmer1<<2)|x1)&mask1; | |
333 kmer2=((kmer2<<2)|x2)&mask2; | |
334 len++; | |
335 if(len>=k1){ | |
336 keysCounted++; | |
337 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); | |
338 | |
339 long key=(kmer1<<shift)|kmer2; | |
340 System.out.println(Long.toHexString(kmer1)); | |
341 System.out.println(Long.toHexString(kmer2)); | |
342 System.out.println(Long.toHexString(key)); | |
343 count.increment(key, 1); | |
344 // System.out.println(" -> "+count.read(kmer)); | |
345 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); | |
346 // array[(int)kmer]++; | |
347 // System.out.println(" -> "+array[(int)kmer]+"\n"); | |
348 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); | |
349 } | |
350 } | |
351 } | |
352 if(rcomp){ | |
353 AminoAcid.reverseComplementBasesInPlace(bases); | |
354 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); | |
355 } | |
356 } | |
357 | |
358 } |