comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bloom/KmerCount4.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package bloom;
2
3 import java.util.ArrayList;
4 import java.util.Locale;
5
6 import dna.AminoAcid;
7 import fileIO.FileFormat;
8 import shared.Timer;
9 import stream.ConcurrentReadInputStream;
10 import stream.FastaReadInputStream;
11 import stream.Read;
12 import structures.ListNum;
13
14 /**
15 * @author Brian Bushnell
16 * @date Jul 5, 2012
17 *
18 */
19 public class KmerCount4 extends KmerCountAbstract {
20
21 public static void main(String[] args){
22
23 Timer t=new Timer();
24
25 String fname1=args[0];
26 String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null);
27 int k=14;
28 int cbits=16;
29 int gap=0;
30
31 for(int i=(fname2==null ? 1 : 2); i<args.length; i++){
32 final String arg=args[i];
33 final String[] split=arg.split("=");
34 String a=split[0].toLowerCase();
35 String b=split.length>1 ? split[1] : null;
36
37 if(a.equals("k") || a.equals("kmer")){
38 k=Integer.parseInt(b);
39 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
40 cbits=Integer.parseInt(b);
41 }else if(a.startsWith("gap")){
42 gap=Integer.parseInt(b);
43 }else{
44 throw new RuntimeException("Unknown parameter "+args[i]);
45 }
46 }
47
48 KCountArray2 count=null;
49
50 if(fileIO.FileFormat.hasFastaExtension(fname1)){
51 assert(!FastaReadInputStream.SPLIT_READS);
52 FastaReadInputStream.MIN_READ_LEN=k;
53 }
54
55 if(gap==0){
56 count=count(fname1, fname2, k, cbits, true);
57 }else{
58 count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null);
59 }
60
61
62 t.stop();
63 System.out.println("Finished counting; time = "+t);
64
65 printStatistics(count);
66
67 }
68
69 public static void printStatistics(KCountArray2 count){
70 long[] freq=count.transformToFrequency();
71
72 // System.out.println(count+"\n");
73 // System.out.println(Arrays.toString(freq)+"\n");
74
75 long sum=sum(freq);
76 System.out.println("Kmer fraction:");
77 int lim1=8, lim2=16;
78 for(int i=0; i<lim1; i++){
79 String prefix=i+"";
80 while(prefix.length()<8){prefix=prefix+" ";}
81 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
82 }
83 while(lim1<=freq.length){
84 int x=0;
85 for(int i=lim1; i<lim2; i++){
86 x+=freq[i];
87 }
88 String prefix=lim1+"-"+(lim2-1);
89 if(lim2>=freq.length){prefix=lim1+"+";}
90 while(prefix.length()<8){prefix=prefix+" ";}
91 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x);
92 lim1*=2;
93 lim2=min(lim2*2, freq.length);
94 }
95
96 long sum2=sum-freq[0];
97 long x=freq[1];
98 System.out.println();
99 System.out.println("Keys Counted: \t \t"+keysCounted);
100 System.out.println("Unique: \t \t"+sum2);
101 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2)));
102 System.out.println();
103 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
104 x=sum2-x;
105 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
106 }
107
108 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){
109 return count(reads1, reads2, k, cbits, rcomp, null);
110 }
111
112 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){
113 assert(k>=1 && k<20);
114 final int kbits=2*k;
115 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
116
117 if(count==null){
118 final long cells=1L<<kbits;
119 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
120 count=new KCountArray2(cells, cbits);
121 }
122
123 final ConcurrentReadInputStream cris;
124 {
125 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
126 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
127 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
128 cris.start(); //4567
129 }
130
131 assert(cris!=null) : reads1;
132 System.err.println("Started cris");
133 boolean paired=cris.paired();
134 if(verbose){System.err.println("Paired: "+paired);}
135
136 ListNum<Read> ln=cris.nextList();
137 ArrayList<Read> reads=(ln!=null ? ln.list : null);
138
139 if(reads!=null && !reads.isEmpty()){
140 Read r=reads.get(0);
141 assert(paired==(r.mate!=null));
142 }
143
144 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
145 //System.err.println("reads.size()="+reads.size());
146 for(Read r : reads){
147 readsProcessed++;
148
149 addRead(r, count, k, mask, rcomp);
150 if(r.mate!=null){
151 addRead(r.mate, count, k, mask, rcomp);
152 }
153
154 }
155 //System.err.println("returning list");
156 cris.returnList(ln);
157 //System.err.println("fetching list");
158 ln=cris.nextList();
159 reads=(ln!=null ? ln.list : null);
160 }
161 if(verbose){System.err.println("Finished reading");}
162 cris.returnList(ln);
163 if(verbose){System.err.println("Returned list");}
164 cris.close();
165 if(verbose){System.err.println("Closed stream");}
166 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
167
168
169 return count;
170 }
171
172 public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){
173 assert(k1+k2>=1 && k1+k2<20);
174 assert(gap>=0);
175 final int kbits1=2*k1;
176 final int kbits2=2*k2;
177 final long mask1=~((-1L)<<(kbits1));
178 final long mask2=~((-1L)<<(kbits2));
179
180 if(count==null){
181 final long cells=1L<<(kbits1+kbits2);
182 if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+
183 ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));}
184 count=new KCountArray2(cells, cbits, gap);
185 }
186 assert(count.gap==gap);
187
188 final ConcurrentReadInputStream cris;
189 {
190 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
191 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
192 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
193 cris.start(); //4567
194 }
195
196 assert(cris!=null) : reads1;
197 System.err.println("Started cris");
198 boolean paired=cris.paired();
199 if(verbose){System.err.println("Paired: "+paired);}
200
201 ListNum<Read> ln=cris.nextList();
202 ArrayList<Read> reads=(ln!=null ? ln.list : null);
203
204 if(reads!=null && !reads.isEmpty()){
205 Read r=reads.get(0);
206 assert(paired==(r.mate!=null));
207 }
208
209 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
210 //System.err.println("reads.size()="+reads.size());
211 for(Read r : reads){
212 readsProcessed++;
213
214 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
215 if(r.mate!=null){
216 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
217 }
218
219 }
220 //System.err.println("returning list");
221 cris.returnList(ln);
222 //System.err.println("fetching list");
223 ln=cris.nextList();
224 reads=(ln!=null ? ln.list : null);
225 }
226 if(verbose){System.err.println("Finished reading");}
227 cris.returnList(ln);
228 if(verbose){System.err.println("Returned list");}
229 cris.close();
230 if(verbose){System.err.println("Closed stream");}
231 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
232
233
234 return count;
235 }
236
237 public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){
238 int len=0;
239 long kmer=0;
240 byte[] bases=r.bases;
241 byte[] quals=r.quality;
242 for(int i=0; i<bases.length; i++){
243 byte b=bases[i];
244 int x=AminoAcid.baseToNumber[b];
245 if(x<0 || (quals!=null && quals[i]<minQuality)){
246 len=0;
247 kmer=0;
248 }else{
249 kmer=((kmer<<2)|x)&mask;
250 len++;
251 if(len>=k){
252 keysCounted++;
253 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
254 count.increment(kmer, 1);
255 // System.out.println(" -> "+count.read(kmer));
256 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
257 // array[(int)kmer]++;
258 // System.out.println(" -> "+array[(int)kmer]+"\n");
259 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
260 }
261 }
262 }
263 if(rcomp){
264 r.reverseComplement();
265 addRead(r, count, k, mask, false);
266 }
267 }
268
269 public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
270 int len=0;
271 int shift=k2*2;
272 long kmer1=0;
273 long kmer2=0;
274 byte[] bases=r.bases;
275 byte[] quals=r.quality;
276
277 assert(kmer1>=kmer2);
278
279 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
280
281 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
282 int x1=AminoAcid.baseToNumber[bases[i]];
283 int x2=AminoAcid.baseToNumber[bases[j]];
284 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
285 len=0;
286 kmer1=0;
287 kmer2=0;
288 }else{
289 kmer1=((kmer1<<2)|x1)&mask1;
290 kmer2=((kmer2<<2)|x2)&mask2;
291 len++;
292 if(len>=k1){
293 keysCounted++;
294 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
295
296 long key=(kmer1<<shift)|kmer2;
297 // System.err.println(Long.toHexString(key));
298 count.increment(key, 1);
299 // System.out.println(" -> "+count.read(kmer));
300 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
301 // array[(int)kmer]++;
302 // System.out.println(" -> "+array[(int)kmer]+"\n");
303 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
304 }
305 }
306 }
307 if(rcomp){
308 r.reverseComplement();
309 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
310 }
311 }
312
313 public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
314 int len=0;
315 int shift=k2*2;
316 long kmer1=0;
317 long kmer2=0;
318 byte[] quals=null;
319
320 assert(kmer1>=kmer2);
321
322 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
323
324 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
325 int x1=AminoAcid.baseToNumber[bases[i]];
326 int x2=AminoAcid.baseToNumber[bases[j]];
327 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
328 len=0;
329 kmer1=0;
330 kmer2=0;
331 }else{
332 kmer1=((kmer1<<2)|x1)&mask1;
333 kmer2=((kmer2<<2)|x2)&mask2;
334 len++;
335 if(len>=k1){
336 keysCounted++;
337 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
338
339 long key=(kmer1<<shift)|kmer2;
340 System.out.println(Long.toHexString(kmer1));
341 System.out.println(Long.toHexString(kmer2));
342 System.out.println(Long.toHexString(key));
343 count.increment(key, 1);
344 // System.out.println(" -> "+count.read(kmer));
345 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
346 // array[(int)kmer]++;
347 // System.out.println(" -> "+array[(int)kmer]+"\n");
348 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
349 }
350 }
351 }
352 if(rcomp){
353 AminoAcid.reverseComplementBasesInPlace(bases);
354 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
355 }
356 }
357
358 }