comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerSplit.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package clump;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6
7 import bloom.KCountArray;
8 import fileIO.ByteFile;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import jgi.BBMerge;
12 import shared.KillSwitch;
13 import shared.Parse;
14 import shared.Parser;
15 import shared.PreParser;
16 import shared.ReadStats;
17 import shared.Shared;
18 import shared.Timer;
19 import shared.Tools;
20 import stream.ConcurrentReadInputStream;
21 import stream.ConcurrentReadOutputStream;
22 import stream.FASTQ;
23 import stream.FastaReadInputStream;
24 import stream.Read;
25 import structures.ListNum;
26 import structures.Quantizer;
27
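/**
 * Splits a read stream into multiple output files ("groups").
 * Each read is assigned a kmer key via KmerComparator, and the hash of that
 * kmer modulo the group count selects the output file the read is written to.
 * @author Brian Bushnell
 * @date June 20, 2014
 *
 */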
33 public class KmerSplit {
34
35 /*--------------------------------------------------------------*/
36 /*---------------- Initialization ----------------*/
37 /*--------------------------------------------------------------*/
38
39 /**
40 * Code entrance from the command line.
41 * @param args Command line arguments
42 */
43 public static void main(String[] args){
44 final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
45 final boolean oldFInt=FASTQ.FORCE_INTERLEAVED, oldTInt=FASTQ.TEST_INTERLEAVED;
46 final int zl=ReadWrite.ZIPLEVEL;
47 final float ztd=ReadWrite.ZIP_THREAD_MULT;
48 final int mzt=ReadWrite.MAX_ZIP_THREADS;
49 Timer t=new Timer();
50 KmerSplit x=new KmerSplit(args);
51 ReadWrite.ZIPLEVEL=Tools.min(ReadWrite.ZIPLEVEL, maxZipLevel);
52 x.process(t);
53 ReadWrite.USE_PIGZ=pigz;
54 ReadWrite.USE_UNPIGZ=unpigz;
55 ReadWrite.ZIPLEVEL=zl;
56 ReadWrite.ZIP_THREAD_MULT=ztd;
57 ReadWrite.MAX_ZIP_THREADS=mzt;
58 FASTQ.FORCE_INTERLEAVED=oldFInt;
59 FASTQ.TEST_INTERLEAVED=oldTInt;
60
61 //Close the print stream if it was redirected
62 Shared.closeStream(x.outstream);
63 }
64
65 /**
66 * Constructor.
67 * @param args Command line arguments
68 */
69 public KmerSplit(String[] args){
70
71 {//Preparse block for help, config files, and outstream
72 PreParser pp=new PreParser(args, getClass(), false);
73 args=pp.args;
74 outstream=pp.outstream;
75 }
76
77 ReadWrite.USE_PIGZ=false;
78 ReadWrite.USE_UNPIGZ=true;
79 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
80
81 boolean setInterleaved=false; //Whether it was explicitly set.
82 Parser parser=new Parser();
83
84 for(int i=0; i<args.length; i++){
85 String arg=args[i];
86 String[] split=arg.split("=");
87 String a=split[0].toLowerCase();
88 String b=split.length>1 ? split[1] : null;
89
90 if(parser.parse(arg, a, b)){
91 //do nothing
92 }else if(a.equals("verbose")){
93 verbose=KmerComparator.verbose=Parse.parseBoolean(b);
94 }else if(a.equals("parse_flag_goes_here")){
95 //Set a variable here
96 }else if(a.equals("k")){
97 k=Integer.parseInt(b);
98 assert(k>0 && k<32);
99 }else if(a.equals("mincount") || a.equals("mincr")){
100 minCount=Integer.parseInt(b);
101 }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
102 groups=Integer.parseInt(b);
103 }else if(a.equals("rename") || a.equals("addname")){
104 //Do nothing
105 // addName=Parse.parseBoolean(b);
106 }else if(a.equals("shortname") || a.equals("shortnames")){
107 if(b!=null && b.equals("shrink")){
108 shrinkName=true;
109 }else{
110 shrinkName=false;
111 shortName=Parse.parseBoolean(b);
112 }
113 }else if(a.equals("rcomp") || a.equals("reversecomplement")){
114 //ignore rcomp=Parse.parseBoolean(b);
115 }else if(a.equals("condense") || a.equals("consensus") || a.equals("concensus")){//Note the last one is intentionally misspelled
116 //ignore
117 }else if(a.equals("correct") || a.equals("ecc")){
118 //ignore
119 }else if(a.equals("passes")){
120 int x=Integer.parseInt(b);
121 // if(x>1){outstream.println("Warning: KmerSplit does not support multiple passes.");}
122 }
123
124 else if(a.equals("dedupe")){
125 //ignore
126 }else if(a.equals("entryfilter")){
127 //ignore
128 }else if(a.equals("markduplicates")){
129 //ignore
130 }else if(a.equals("markall")){
131 //ignore
132 }else if(a.equals("addcount") || a.equals("renamebycount")){
133 //ignore
134 }else if(a.equals("optical") || a.equals("opticalonly")){
135 //ignore
136 }else if(a.equals("dupesubs") || a.equals("duplicatesubs") || a.equals("dsubs") || a.equals("subs") || a.equals("s")){
137 //ignore
138 }else if(a.equals("dupedist") || a.equals("duplicatedistance") || a.equals("ddist") || a.equals("dist") || a.equals("opticaldist") || a.equals("distance")){
139 //ignore
140 }else if(a.equals("scanlimit") || a.equals("scan")){
141 //ignore
142 }else if(a.equals("removeallduplicates") || a.equals("allduplicates")){
143 //ignore
144 }else if(a.equals("allowns")){
145 //ignore
146 }else if(a.equals("containment") || a.equals("absorbcontainment") || a.equals("ac") || a.equals("contains")){
147 //ignore
148 }else if(a.equalsIgnoreCase("prefixOrSuffix") || a.equalsIgnoreCase("suffixOrPrefix") || a.equals("affix") || a.equals("pos")){
149 //ignore
150 }else if(a.equals("printduplicates")){
151 //ignore
152 }else if(a.equals("dupeidentity")){
153 //ignore
154 }else if(a.equals("dupesubrate") || a.equals("dsr") || a.equals("subrate")){
155 //ignore
156 }
157
158 else if(a.equals("prefilter")){
159 KmerReduce.prefilter=Parse.parseBoolean(b);
160 }else if(a.equals("ecco")){
161 ecco=Parse.parseBoolean(b);
162 }else if(a.equals("seed")){
163 KmerComparator.defaultSeed=Long.parseLong(b);
164 }else if(a.equals("hashes")){
165 KmerComparator.setHashes(Integer.parseInt(b));
166 }else if(a.equals("border")){
167 KmerComparator.defaultBorder=Integer.parseInt(b);
168 }else if(a.equals("minprob")){
169 KmerComparator.minProb=Float.parseFloat(b);
170 }else if(a.equals("unpair")){
171 unpair=Parse.parseBoolean(b);
172 }else if(a.equals("repair")){
173 //Do nothing
174 }else if(a.equals("namesort") || a.equals("sort")){
175 //Do nothing
176 }else if(a.equals("fetchthreads")){
177 //Do nothing
178 }else if(a.equals("reorder") || a.equals("reorderclumps")){
179 //reorder=Parse.parseBoolean(b);
180 }else if(a.equals("reorderpaired") || a.equals("reorderclumpspaired")){
181 // reorderpaired=Parse.parseBoolean(b);
182 }
183
184
185 else if(Clump.parseStatic(arg, a, b)){
186 //Do nothing
187 }
188
189 else{
190 outstream.println("Unknown parameter "+args[i]);
191 assert(false) : "Unknown parameter "+args[i];
192 // throw new RuntimeException("Unknown parameter "+args[i]);
193 }
194 }
195
196 {//Process parser fields
197 Parser.processQuality();
198
199 maxReads=parser.maxReads;
200
201 overwrite=ReadStats.overwrite=parser.overwrite;
202 append=ReadStats.append=parser.append;
203
204 setInterleaved=parser.setInterleaved;
205
206 in1=parser.in1;
207 in2=parser.in2;
208
209 out1=parser.out1;
210
211 extin=parser.extin;
212 extout=parser.extout;
213 }
214
215 if(groups>2){ReadWrite.USE_PIGZ=false;}
216
217 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
218 in2=in1.replace("#", "2");
219 in1=in1.replace("#", "1");
220 }
221 if(in2!=null){
222 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
223 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
224 }
225
226 assert(FastaReadInputStream.settingsOK());
227
228 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
229 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
230 ByteFile.FORCE_MODE_BF2=true;
231 }
232
233 if(!setInterleaved){
234 assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n";
235 if(in2!=null){ //If there are 2 input streams.
236 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
237 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
238 }
239 }
240
241 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
242
243 if(out1!=null){
244 assert(out1.contains("%"));
245 outArray=new String[groups];
246 for(int i=0; i<groups; i++){
247 outArray[i]=out1.replaceFirst("%", ""+i);
248 }
249 if(!Tools.testOutputFiles(overwrite, append, false, outArray)){
250 outstream.println((out1==null)+", "+out1);
251 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
252 }
253 ffout=new FileFormat[groups];
254 if(groups>1){ReadWrite.setZipThreadMult(Tools.min(0.5f, 2f/(groups+1)));}
255 for(int i=0; i<groups; i++){
256 ffout[i]=FileFormat.testOutput(outArray[i], FileFormat.FASTQ, extout, groups<10, overwrite, append, false);
257 }
258 }else{
259 outArray=null;
260 throw new RuntimeException("out is a required parameter.");
261 }
262
263 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
264 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
265 }
266
267
268 /*--------------------------------------------------------------*/
269 /*---------------- Outer Methods ----------------*/
270 /*--------------------------------------------------------------*/
271
272 /** Count kmers */
273 void preprocess(){
274 if(minCount>1){
275 table=ClumpTools.getTable(in1, in2, k, minCount);
276 }
277 }
278
279 /** Create read streams and process all data */
280 void process(Timer t){
281
282 preprocess();
283
284 final ConcurrentReadInputStream cris;
285 {
286 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
287 cris.start();
288 if(verbose){outstream.println("Started cris");}
289 }
290 boolean paired=cris.paired();
291 if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
292 if(cris.paired() && (in1==null || !in1.contains(".sam") && !unpair)){
293 outstream.println("Writing interleaved.");
294 }
295
296 final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups];
297 try {
298 for(int i=0; i<groups; i++){
299 final int buff=8;
300
301 assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
302
303 ros[i]=ConcurrentReadOutputStream.getStream(ffout[i], null, null, null, buff, null, false);
304 ros[i].start();
305 }
306 } catch (OutOfMemoryError e) {
307 KillSwitch.memKill(e);
308 }
309
310 readsProcessed=0;
311 basesProcessed=0;
312
313 //Process the read stream
314 processInner(cris, ros);
315
316 errorState|=ReadStats.writeAll();
317
318 t.stop();
319
320 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
321
322 if(errorState){
323 Clumpify.sharedErrorState=true;
324 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
325 }
326 }
327
328 /** Collect and sort the reads */
329 void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros){
330 if(verbose){outstream.println("Making comparator.");}
331 KmerComparator kc=new KmerComparator(k, false, false);
332 if(verbose){outstream.println("Seed: "+kc.seed);}
333
334 if(verbose){outstream.println("Splitting reads.");}
335 splitReads(cris, ros, kc);
336 lastMemProcessed=memProcessed;
337
338 if(verbose){outstream.println("Done!");}
339 }
340
341 public void splitReads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros, final KmerComparator kc){
342 Timer t=new Timer();
343 if(verbose){t.start("Making hash threads.");}
344 final int threads=Shared.threads();
345 ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
346 for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, ros, kc));}
347
348 if(verbose){outstream.println("Starting threads.");}
349 for(HashThread ht : alht){ht.start();}
350
351
352 if(verbose){outstream.println("Waiting for threads.");}
353 /* Wait for threads to die */
354 for(HashThread ht : alht){
355
356 /* Wait for a thread to die */
357 while(ht.getState()!=Thread.State.TERMINATED){
358 try {
359 ht.join();
360 } catch (InterruptedException e) {
361 e.printStackTrace();
362 }
363 }
364 readsProcessed+=ht.readsProcessedT;
365 basesProcessed+=ht.basesProcessedT;
366 diskProcessed+=ht.diskProcessedT;
367 memProcessed+=ht.memProcessedT;
368 }
369
370 if(verbose){outstream.println("Closing streams.");}
371 errorState=ReadWrite.closeStreams(cris, ros)|errorState;
372 if(verbose){t.stop("Split time: ");}
373 }
374
375 /*--------------------------------------------------------------*/
376 /*---------------- Inner Methods ----------------*/
377 /*--------------------------------------------------------------*/
378
379 /*--------------------------------------------------------------*/
380 /*---------------- Inner Classes ----------------*/
381 /*--------------------------------------------------------------*/
382
383 private class HashThread extends Thread{
384
385 HashThread(int id_, ConcurrentReadInputStream cris_, ConcurrentReadOutputStream[] ros_, KmerComparator kc_){
386 id=id_;
387 cris=cris_;
388 ros=ros_;
389 kc=kc_;
390 }
391
392 @Override
393 public void run(){
394
395 final boolean paired=cris.paired();
396 ListNum<Read> ln=cris.nextList();
397 ArrayList<Read> reads=(ln!=null ? ln.list : null);
398
399 ArrayList<Read>[] array=new ArrayList[groups];
400 for(int i=0; i<groups; i++){
401 array[i]=new ArrayList<Read>(buffer);
402 }
403
404 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
405
406 for(Read r : reads){
407 if(!r.validated()){
408 r.validate(true);
409 if(r.mate!=null){r.mate.validate(true);}
410 }
411 readsProcessedT+=1+r.mateCount();
412 basesProcessedT+=r.length()+r.mateLength();
413 diskProcessedT+=r.countFastqBytes()+r.countMateFastqBytes();
414 memProcessedT+=r.countBytes()+r.countMateBytes()+ReadKey.overhead;
415 if(shrinkName){
416 Clumpify.shrinkName(r);
417 Clumpify.shrinkName(r.mate);
418 }else if(shortName){
419 Clumpify.shortName(r);
420 Clumpify.shortName(r.mate);
421 }
422
423 if(quantizeQuality){
424 Quantizer.quantize(r, r.mate);
425 }
426 }
427
428 if(ecco){
429 for(Read r : reads){
430 if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);}
431 }
432 }
433
434 ArrayList<Read> hashList=reads;
435 if(paired && unpair){
436 hashList=new ArrayList<Read>(reads.size()*2);
437 for(Read r1 : reads){
438 Read r2=r1.mate;
439 hashList.add(r1);
440 hashList.add(r2);
441 r1.mate=null;
442 r2.mate=null;
443 }
444 }
445
446 kc.hash(hashList, table, minCount, true);
447 for(Read r : hashList){
448 long kmer=((ReadKey)r.obj).kmer;
449 long code=kc.hash(kmer);
450 int code2=(int)(code%groups);
451 assert(code2>=0 && code2<array.length) : code2+", "+groups+", "+array.length+", "+kmer+", "+r.obj+"\n"+r;
452 array[code2].add(r);
453 if(array[code2].size()>=buffer){
454 ros[code2].add(array[code2], 0);
455 array[code2]=new ArrayList<Read>(buffer);
456 }
457 }
458 cris.returnList(ln);
459 ln=cris.nextList();
460 reads=(ln!=null ? ln.list : null);
461 }
462 if(ln!=null){
463 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
464 }
465 for(int i=0; i<groups; i++){
466 if(!array[i].isEmpty()){
467 ros[i].add(array[i], 0);
468 }
469 }
470 }
471
472 final int id;
473 final ConcurrentReadInputStream cris;
474 final ConcurrentReadOutputStream[] ros;
475 final KmerComparator kc;
476 static final int buffer=200;
477
478 protected long readsProcessedT=0;
479 protected long basesProcessedT=0;
480 protected long diskProcessedT=0;
481 protected long memProcessedT=0;
482 }
483
484 /*--------------------------------------------------------------*/
485 /*---------------- Fields ----------------*/
486 /*--------------------------------------------------------------*/
487
488 private int k=31;
489 int groups=16;
490 int minCount=0;
491
492 KCountArray table=null;
493
494 /*--------------------------------------------------------------*/
495 /*---------------- I/O Fields ----------------*/
496 /*--------------------------------------------------------------*/
497
498 private String in1=null;
499 private String in2=null;
500
501 private String out1=null;
502 private String[] outArray=null;
503
504 private String extin=null;
505 private String extout=null;
506
507 /*--------------------------------------------------------------*/
508
509 protected long readsProcessed=0;
510 protected long basesProcessed=0;
511 protected long diskProcessed=0;
512 protected long memProcessed=0;
513
514 protected static long lastMemProcessed=0;
515
516 private long maxReads=-1;
517 // private boolean addName=false;
518 boolean shortName=false;
519 boolean shrinkName=false;
520 boolean ecco=false;
521 boolean unpair=false;
522
523 static int maxZipLevel=2;
524
525 static boolean quantizeQuality=false;
526
527 /*--------------------------------------------------------------*/
528 /*---------------- Final Fields ----------------*/
529 /*--------------------------------------------------------------*/
530
531 private final FileFormat ffin1;
532 private final FileFormat ffin2;
533
534 private final FileFormat[] ffout;
535
536 /*--------------------------------------------------------------*/
537 /*---------------- Common Fields ----------------*/
538 /*--------------------------------------------------------------*/
539
540 private PrintStream outstream=System.err;
541 public static boolean verbose=false;
542 public boolean errorState=false;
543 private boolean overwrite=false;
544 private boolean append=false;
545
546 }