Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package clump; | |
2 | |
3 import java.io.File; | |
4 import java.io.PrintStream; | |
5 import java.util.ArrayList; | |
6 | |
7 import bloom.KCountArray; | |
8 import fileIO.ByteFile; | |
9 import fileIO.FileFormat; | |
10 import fileIO.ReadWrite; | |
11 import jgi.BBMerge; | |
12 import shared.KillSwitch; | |
13 import shared.Parse; | |
14 import shared.Parser; | |
15 import shared.PreParser; | |
16 import shared.ReadStats; | |
17 import shared.Shared; | |
18 import shared.Timer; | |
19 import shared.Tools; | |
20 import stream.ConcurrentReadInputStream; | |
21 import stream.ConcurrentReadOutputStream; | |
22 import stream.FASTQ; | |
23 import stream.FastaReadInputStream; | |
24 import stream.Read; | |
25 import structures.ListNum; | |
26 import structures.Quantizer; | |
27 | |
28 /** | |
29 * @author Brian Bushnell | |
30 * @date June 20, 2014 | |
31 * | |
32 */ | |
33 public class KmerSplit { | |
34 | |
35 /*--------------------------------------------------------------*/ | |
36 /*---------------- Initialization ----------------*/ | |
37 /*--------------------------------------------------------------*/ | |
38 | |
39 /** | |
40 * Code entrance from the command line. | |
41 * @param args Command line arguments | |
42 */ | |
43 public static void main(String[] args){ | |
44 final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ; | |
45 final boolean oldFInt=FASTQ.FORCE_INTERLEAVED, oldTInt=FASTQ.TEST_INTERLEAVED; | |
46 final int zl=ReadWrite.ZIPLEVEL; | |
47 final float ztd=ReadWrite.ZIP_THREAD_MULT; | |
48 final int mzt=ReadWrite.MAX_ZIP_THREADS; | |
49 Timer t=new Timer(); | |
50 KmerSplit x=new KmerSplit(args); | |
51 ReadWrite.ZIPLEVEL=Tools.min(ReadWrite.ZIPLEVEL, maxZipLevel); | |
52 x.process(t); | |
53 ReadWrite.USE_PIGZ=pigz; | |
54 ReadWrite.USE_UNPIGZ=unpigz; | |
55 ReadWrite.ZIPLEVEL=zl; | |
56 ReadWrite.ZIP_THREAD_MULT=ztd; | |
57 ReadWrite.MAX_ZIP_THREADS=mzt; | |
58 FASTQ.FORCE_INTERLEAVED=oldFInt; | |
59 FASTQ.TEST_INTERLEAVED=oldTInt; | |
60 | |
61 //Close the print stream if it was redirected | |
62 Shared.closeStream(x.outstream); | |
63 } | |
64 | |
65 /** | |
66 * Constructor. | |
67 * @param args Command line arguments | |
68 */ | |
69 public KmerSplit(String[] args){ | |
70 | |
71 {//Preparse block for help, config files, and outstream | |
72 PreParser pp=new PreParser(args, getClass(), false); | |
73 args=pp.args; | |
74 outstream=pp.outstream; | |
75 } | |
76 | |
77 ReadWrite.USE_PIGZ=false; | |
78 ReadWrite.USE_UNPIGZ=true; | |
79 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
80 | |
81 boolean setInterleaved=false; //Whether it was explicitly set. | |
82 Parser parser=new Parser(); | |
83 | |
84 for(int i=0; i<args.length; i++){ | |
85 String arg=args[i]; | |
86 String[] split=arg.split("="); | |
87 String a=split[0].toLowerCase(); | |
88 String b=split.length>1 ? split[1] : null; | |
89 | |
90 if(parser.parse(arg, a, b)){ | |
91 //do nothing | |
92 }else if(a.equals("verbose")){ | |
93 verbose=KmerComparator.verbose=Parse.parseBoolean(b); | |
94 }else if(a.equals("parse_flag_goes_here")){ | |
95 //Set a variable here | |
96 }else if(a.equals("k")){ | |
97 k=Integer.parseInt(b); | |
98 assert(k>0 && k<32); | |
99 }else if(a.equals("mincount") || a.equals("mincr")){ | |
100 minCount=Integer.parseInt(b); | |
101 }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){ | |
102 groups=Integer.parseInt(b); | |
103 }else if(a.equals("rename") || a.equals("addname")){ | |
104 //Do nothing | |
105 // addName=Parse.parseBoolean(b); | |
106 }else if(a.equals("shortname") || a.equals("shortnames")){ | |
107 if(b!=null && b.equals("shrink")){ | |
108 shrinkName=true; | |
109 }else{ | |
110 shrinkName=false; | |
111 shortName=Parse.parseBoolean(b); | |
112 } | |
113 }else if(a.equals("rcomp") || a.equals("reversecomplement")){ | |
114 //ignore rcomp=Parse.parseBoolean(b); | |
115 }else if(a.equals("condense") || a.equals("consensus") || a.equals("concensus")){//Note the last one is intentionally misspelled | |
116 //ignore | |
117 }else if(a.equals("correct") || a.equals("ecc")){ | |
118 //ignore | |
119 }else if(a.equals("passes")){ | |
120 int x=Integer.parseInt(b); | |
121 // if(x>1){outstream.println("Warning: KmerSplit does not support multiple passes.");} | |
122 } | |
123 | |
124 else if(a.equals("dedupe")){ | |
125 //ignore | |
126 }else if(a.equals("entryfilter")){ | |
127 //ignore | |
128 }else if(a.equals("markduplicates")){ | |
129 //ignore | |
130 }else if(a.equals("markall")){ | |
131 //ignore | |
132 }else if(a.equals("addcount") || a.equals("renamebycount")){ | |
133 //ignore | |
134 }else if(a.equals("optical") || a.equals("opticalonly")){ | |
135 //ignore | |
136 }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){ | |
137 //ignore | |
138 }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){ | |
139 //ignore | |
140 }else if(a.equals("scanlimit") || a.equals("scan")){ | |
141 //ignore | |
142 }else if(a.equals("removeallduplicates") || a.equals("allduplicates")){ | |
143 //ignore | |
144 }else if(a.equals("allowns")){ | |
145 //ignore | |
146 }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){ | |
147 //ignore | |
148 }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){ | |
149 //ignore | |
150 }else if(a.equals("printduplicates")){ | |
151 //ignore | |
152 }else if(a.equals("dupeidentity")){ | |
153 //ignore | |
154 }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){ | |
155 //ignore | |
156 } | |
157 | |
158 else if(a.equals("prefilter")){ | |
159 KmerReduce.prefilter=Parse.parseBoolean(b); | |
160 }else if(a.equals("ecco")){ | |
161 ecco=Parse.parseBoolean(b); | |
162 }else if(a.equals("seed")){ | |
163 KmerComparator.defaultSeed=Long.parseLong(b); | |
164 }else if(a.equals("hashes")){ | |
165 KmerComparator.setHashes(Integer.parseInt(b)); | |
166 }else if(a.equals("border")){ | |
167 KmerComparator.defaultBorder=Integer.parseInt(b); | |
168 }else if(a.equals("minprob")){ | |
169 KmerComparator.minProb=Float.parseFloat(b); | |
170 }else if(a.equals("unpair")){ | |
171 unpair=Parse.parseBoolean(b); | |
172 }else if(a.equals("repair")){ | |
173 //Do nothing | |
174 }else if(a.equals("namesort") || a.equals("sort")){ | |
175 //Do nothing | |
176 }else if(a.equals("fetchthreads")){ | |
177 //Do nothing | |
178 }else if(a.equals("reorder") || a.equals("reorderclumps")){ | |
179 //reorder=Parse.parseBoolean(b); | |
180 }else if(a.equals("reorderpaired") || a.equals("reorderclumpspaired")){ | |
181 // reorderpaired=Parse.parseBoolean(b); | |
182 } | |
183 | |
184 | |
185 else if(Clump.parseStatic(arg, a, b)){ | |
186 //Do nothing | |
187 } | |
188 | |
189 else{ | |
190 outstream.println("Unknown parameter "+args[i]); | |
191 assert(false) : "Unknown parameter "+args[i]; | |
192 // throw new RuntimeException("Unknown parameter "+args[i]); | |
193 } | |
194 } | |
195 | |
196 {//Process parser fields | |
197 Parser.processQuality(); | |
198 | |
199 maxReads=parser.maxReads; | |
200 | |
201 overwrite=ReadStats.overwrite=parser.overwrite; | |
202 append=ReadStats.append=parser.append; | |
203 | |
204 setInterleaved=parser.setInterleaved; | |
205 | |
206 in1=parser.in1; | |
207 in2=parser.in2; | |
208 | |
209 out1=parser.out1; | |
210 | |
211 extin=parser.extin; | |
212 extout=parser.extout; | |
213 } | |
214 | |
215 if(groups>2){ReadWrite.USE_PIGZ=false;} | |
216 | |
217 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ | |
218 in2=in1.replace("#", "2"); | |
219 in1=in1.replace("#", "1"); | |
220 } | |
221 if(in2!=null){ | |
222 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} | |
223 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; | |
224 } | |
225 | |
226 assert(FastaReadInputStream.settingsOK()); | |
227 | |
228 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} | |
229 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
230 ByteFile.FORCE_MODE_BF2=true; | |
231 } | |
232 | |
233 if(!setInterleaved){ | |
234 assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n"; | |
235 if(in2!=null){ //If there are 2 input streams. | |
236 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; | |
237 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); | |
238 } | |
239 } | |
240 | |
241 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} | |
242 | |
243 if(out1!=null){ | |
244 assert(out1.contains("%")); | |
245 outArray=new String[groups]; | |
246 for(int i=0; i<groups; i++){ | |
247 outArray[i]=out1.replaceFirst("%", ""+i); | |
248 } | |
249 if(!Tools.testOutputFiles(overwrite, append, false, outArray)){ | |
250 outstream.println((out1==null)+", "+out1); | |
251 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n"); | |
252 } | |
253 ffout=new FileFormat[groups]; | |
254 if(groups>1){ReadWrite.setZipThreadMult(Tools.min(0.5f, 2f/(groups+1)));} | |
255 for(int i=0; i<groups; i++){ | |
256 ffout[i]=FileFormat.testOutput(outArray[i], FileFormat.FASTQ, extout, groups<10, overwrite, append, false); | |
257 } | |
258 }else{ | |
259 outArray=null; | |
260 throw new RuntimeException("out is a required parameter."); | |
261 } | |
262 | |
263 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); | |
264 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); | |
265 } | |
266 | |
267 | |
268 /*--------------------------------------------------------------*/ | |
269 /*---------------- Outer Methods ----------------*/ | |
270 /*--------------------------------------------------------------*/ | |
271 | |
272 /** Count kmers */ | |
273 void preprocess(){ | |
274 if(minCount>1){ | |
275 table=ClumpTools.getTable(in1, in2, k, minCount); | |
276 } | |
277 } | |
278 | |
279 /** Create read streams and process all data */ | |
280 void process(Timer t){ | |
281 | |
282 preprocess(); | |
283 | |
284 final ConcurrentReadInputStream cris; | |
285 { | |
286 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null); | |
287 cris.start(); | |
288 if(verbose){outstream.println("Started cris");} | |
289 } | |
290 boolean paired=cris.paired(); | |
291 if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} | |
292 if(cris.paired() && (in1==null || !in1.contains(".sam") && !unpair)){ | |
293 outstream.println("Writing interleaved."); | |
294 } | |
295 | |
296 final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups]; | |
297 try { | |
298 for(int i=0; i<groups; i++){ | |
299 final int buff=8; | |
300 | |
301 assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; | |
302 | |
303 ros[i]=ConcurrentReadOutputStream.getStream(ffout[i], null, null, null, buff, null, false); | |
304 ros[i].start(); | |
305 } | |
306 } catch (OutOfMemoryError e) { | |
307 KillSwitch.memKill(e); | |
308 } | |
309 | |
310 readsProcessed=0; | |
311 basesProcessed=0; | |
312 | |
313 //Process the read stream | |
314 processInner(cris, ros); | |
315 | |
316 errorState|=ReadStats.writeAll(); | |
317 | |
318 t.stop(); | |
319 | |
320 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
321 | |
322 if(errorState){ | |
323 Clumpify.sharedErrorState=true; | |
324 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
325 } | |
326 } | |
327 | |
328 /** Collect and sort the reads */ | |
329 void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros){ | |
330 if(verbose){outstream.println("Making comparator.");} | |
331 KmerComparator kc=new KmerComparator(k, false, false); | |
332 if(verbose){outstream.println("Seed: "+kc.seed);} | |
333 | |
334 if(verbose){outstream.println("Splitting reads.");} | |
335 splitReads(cris, ros, kc); | |
336 lastMemProcessed=memProcessed; | |
337 | |
338 if(verbose){outstream.println("Done!");} | |
339 } | |
340 | |
341 public void splitReads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros, final KmerComparator kc){ | |
342 Timer t=new Timer(); | |
343 if(verbose){t.start("Making hash threads.");} | |
344 final int threads=Shared.threads(); | |
345 ArrayList<HashThread> alht=new ArrayList<HashThread>(threads); | |
346 for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, ros, kc));} | |
347 | |
348 if(verbose){outstream.println("Starting threads.");} | |
349 for(HashThread ht : alht){ht.start();} | |
350 | |
351 | |
352 if(verbose){outstream.println("Waiting for threads.");} | |
353 /* Wait for threads to die */ | |
354 for(HashThread ht : alht){ | |
355 | |
356 /* Wait for a thread to die */ | |
357 while(ht.getState()!=Thread.State.TERMINATED){ | |
358 try { | |
359 ht.join(); | |
360 } catch (InterruptedException e) { | |
361 e.printStackTrace(); | |
362 } | |
363 } | |
364 readsProcessed+=ht.readsProcessedT; | |
365 basesProcessed+=ht.basesProcessedT; | |
366 diskProcessed+=ht.diskProcessedT; | |
367 memProcessed+=ht.memProcessedT; | |
368 } | |
369 | |
370 if(verbose){outstream.println("Closing streams.");} | |
371 errorState=ReadWrite.closeStreams(cris, ros)|errorState; | |
372 if(verbose){t.stop("Split time: ");} | |
373 } | |
374 | |
375 /*--------------------------------------------------------------*/ | |
376 /*---------------- Inner Methods ----------------*/ | |
377 /*--------------------------------------------------------------*/ | |
378 | |
379 /*--------------------------------------------------------------*/ | |
380 /*---------------- Inner Classes ----------------*/ | |
381 /*--------------------------------------------------------------*/ | |
382 | |
383 private class HashThread extends Thread{ | |
384 | |
385 HashThread(int id_, ConcurrentReadInputStream cris_, ConcurrentReadOutputStream[] ros_, KmerComparator kc_){ | |
386 id=id_; | |
387 cris=cris_; | |
388 ros=ros_; | |
389 kc=kc_; | |
390 } | |
391 | |
392 @Override | |
393 public void run(){ | |
394 | |
395 final boolean paired=cris.paired(); | |
396 ListNum<Read> ln=cris.nextList(); | |
397 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
398 | |
399 ArrayList<Read>[] array=new ArrayList[groups]; | |
400 for(int i=0; i<groups; i++){ | |
401 array[i]=new ArrayList<Read>(buffer); | |
402 } | |
403 | |
404 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
405 | |
406 for(Read r : reads){ | |
407 if(!r.validated()){ | |
408 r.validate(true); | |
409 if(r.mate!=null){r.mate.validate(true);} | |
410 } | |
411 readsProcessedT+=1+r.mateCount(); | |
412 basesProcessedT+=r.length()+r.mateLength(); | |
413 diskProcessedT+=r.countFastqBytes()+r.countMateFastqBytes(); | |
414 memProcessedT+=r.countBytes()+r.countMateBytes()+ReadKey.overhead; | |
415 if(shrinkName){ | |
416 Clumpify.shrinkName(r); | |
417 Clumpify.shrinkName(r.mate); | |
418 }else if(shortName){ | |
419 Clumpify.shortName(r); | |
420 Clumpify.shortName(r.mate); | |
421 } | |
422 | |
423 if(quantizeQuality){ | |
424 Quantizer.quantize(r, r.mate); | |
425 } | |
426 } | |
427 | |
428 if(ecco){ | |
429 for(Read r : reads){ | |
430 if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);} | |
431 } | |
432 } | |
433 | |
434 ArrayList<Read> hashList=reads; | |
435 if(paired && unpair){ | |
436 hashList=new ArrayList<Read>(reads.size()*2); | |
437 for(Read r1 : reads){ | |
438 Read r2=r1.mate; | |
439 hashList.add(r1); | |
440 hashList.add(r2); | |
441 r1.mate=null; | |
442 r2.mate=null; | |
443 } | |
444 } | |
445 | |
446 kc.hash(hashList, table, minCount, true); | |
447 for(Read r : hashList){ | |
448 long kmer=((ReadKey)r.obj).kmer; | |
449 long code=kc.hash(kmer); | |
450 int code2=(int)(code%groups); | |
451 assert(code2>=0 && code2<array.length) : code2+", "+groups+", "+array.length+", "+kmer+", "+r.obj+"\n"+r; | |
452 array[code2].add(r); | |
453 if(array[code2].size()>=buffer){ | |
454 ros[code2].add(array[code2], 0); | |
455 array[code2]=new ArrayList<Read>(buffer); | |
456 } | |
457 } | |
458 cris.returnList(ln); | |
459 ln=cris.nextList(); | |
460 reads=(ln!=null ? ln.list : null); | |
461 } | |
462 if(ln!=null){ | |
463 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); | |
464 } | |
465 for(int i=0; i<groups; i++){ | |
466 if(!array[i].isEmpty()){ | |
467 ros[i].add(array[i], 0); | |
468 } | |
469 } | |
470 } | |
471 | |
472 final int id; | |
473 final ConcurrentReadInputStream cris; | |
474 final ConcurrentReadOutputStream[] ros; | |
475 final KmerComparator kc; | |
476 static final int buffer=200; | |
477 | |
478 protected long readsProcessedT=0; | |
479 protected long basesProcessedT=0; | |
480 protected long diskProcessedT=0; | |
481 protected long memProcessedT=0; | |
482 } | |
483 | |
484 /*--------------------------------------------------------------*/ | |
485 /*---------------- Fields ----------------*/ | |
486 /*--------------------------------------------------------------*/ | |
487 | |
// Kmer length; set by the "k" flag (asserted to be in 1..31) and passed to KmerComparator and ClumpTools.getTable.
488 private int k=31; | |
// Number of output groups; one output file and stream is created per group.
489 int groups=16; | |
// Passed to ClumpTools.getTable and kc.hash; the count table is only built when this is above 1.
490 int minCount=0; | |
491 | |
// Kmer count table filled by preprocess(); stays null when minCount is 1 or less.
492 KCountArray table=null; | |
493 | |
494 /*--------------------------------------------------------------*/ | |
495 /*---------------- I/O Fields ----------------*/ | |
496 /*--------------------------------------------------------------*/ | |
497 | |
// Input file names. in2 may be derived from a '#' in in1 by the constructor.
498 private String in1=null; | |
499 private String in2=null; | |
500 | |
// Output pattern; the constructor replaces its first '%' with the group number to fill outArray.
501 private String out1=null; | |
502 private String[] outArray=null; | |
503 | |
// Passed to FileFormat.testInput / testOutput; presumably extension overrides - TODO confirm.
504 private String extin=null; | |
505 private String extout=null; | |
506 | |
507 /*--------------------------------------------------------------*/ | |
508 | |
// Totals accumulated in splitReads from the per-thread HashThread counters.
509 protected long readsProcessed=0; | |
510 protected long basesProcessed=0; | |
511 protected long diskProcessed=0; | |
512 protected long memProcessed=0; | |
513 | |
// Copy of memProcessed written at the end of processInner.
514 protected static long lastMemProcessed=0; | |
515 | |
// Passed to ConcurrentReadInputStream.getReadInputStream; -1 presumably means no limit - TODO confirm.
516 private long maxReads=-1; | |
517 // private boolean addName=false; | |
// When set, HashThread calls Clumpify.shortName on each read and its mate (only if shrinkName is false).
518 boolean shortName=false; | |
// When set, HashThread calls Clumpify.shrinkName on each read and its mate.
519 boolean shrinkName=false; | |
// When set, HashThread calls BBMerge.findOverlapStrict on each read that has a mate.
520 boolean ecco=false; | |
// When set and the input is paired, HashThread detaches mates and hashes each read on its own.
521 boolean unpair=false; | |
522 | |
// Upper bound that main() applies to ReadWrite.ZIPLEVEL before processing.
523 static int maxZipLevel=2; | |
524 | |
// When set, HashThread calls Quantizer.quantize on each read and its mate.
525 static boolean quantizeQuality=false; | |
526 | |
527 /*--------------------------------------------------------------*/ | |
528 /*---------------- Final Fields ----------------*/ | |
529 /*--------------------------------------------------------------*/ | |
530 | |
// Input formats built by the constructor from in1 and in2.
531 private final FileFormat ffin1; | |
532 private final FileFormat ffin2; | |
533 | |
// One output format per group, built by the constructor from outArray.
534 private final FileFormat[] ffout; | |
535 | |
536 /*--------------------------------------------------------------*/ | |
537 /*---------------- Common Fields ----------------*/ | |
538 /*--------------------------------------------------------------*/ | |
539 | |
// Destination for messages; replaced by the PreParser's outstream in the constructor.
540 private PrintStream outstream=System.err; | |
// Enables progress messages; the "verbose" flag also copies it to KmerComparator.verbose.
541 public static boolean verbose=false; | |
// Set from ReadStats.writeAll and ReadWrite.closeStreams; process() throws when it is true.
542 public boolean errorState=false; | |
// Copied from the Parser and passed to Tools.testOutputFiles and FileFormat.testOutput.
543 private boolean overwrite=false; | |
544 private boolean append=false; | |
545 | |
546 } |