comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clumpify.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package clump;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Random;
7
8 import fileIO.FileFormat;
9 import fileIO.ReadWrite;
10 import jgi.BBMerge;
11 import shared.Parse;
12 import shared.Parser;
13 import shared.PreParser;
14 import shared.Shared;
15 import shared.Timer;
16 import shared.Tools;
17 import sort.SortByName;
18 import stream.FASTQ;
19 import stream.Read;
20 import structures.ByteBuilder;
21 import structures.Quantizer;
22
23 /**
24 * @author Brian Bushnell
25 * @date Nov 6, 2015
26 *
27 */
28 public class Clumpify {
29
30 /**
31 * Code entrance from the command line.
32 * @param args Command line arguments
33 */
34 public static void main(String[] args){
35 Timer t=new Timer();
36 ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6);
37
38 //Capture values of static variables that might be modified in case this is called by another class.
39 final boolean oldCQ=Read.CHANGE_QUALITY;
40 final boolean oldBgzip=ReadWrite.USE_BGZIP, oldPreferBgzip=ReadWrite.PREFER_BGZIP;
41
42 BBMerge.changeQuality=Read.CHANGE_QUALITY=false;
43 ReadWrite.USE_BGZIP=true;
44 ReadWrite.PREFER_BGZIP=true;
45
46 Clumpify x=new Clumpify(args);
47 x.process(t);
48
49 //Restore values of static variables.
50 // Shared.setBuffers(oldCap);
51 // ReadWrite.ZIPLEVEL=oldZl;
52 // ReadWrite.USE_PIGZ=oldPigz;
53 ReadWrite.USE_BGZIP=oldBgzip;
54 ReadWrite.PREFER_BGZIP=oldPreferBgzip;
55 // ReadWrite.USE_UNPIGZ=oldUnpigz;
56 // ReadWrite.MAX_ZIP_THREADS=oldZipThreads;
57 BBMerge.changeQuality=Read.CHANGE_QUALITY=oldCQ;
58
59 //Close the print stream if it was redirected
60 Shared.closeStream(x.outstream);
61 }
62
63 /**
64 * Constructor.
65 * @param args Command line arguments
66 */
67 public Clumpify(String[] args){
68
69 {//Preparse block for help, config files, and outstream
70 PreParser pp=new PreParser(args, getClass(), true);
71 args=pp.args;
72 outstream=pp.outstream;
73 }
74
75 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
76
77 args2=new ArrayList<String>();
78 args2.add("in1");
79 args2.add("in2");
80 args2.add("out1");
81 args2.add("out2");
82 args2.add("groups");
83 args2.add("ecco=f");
84 args2.add("rename=f");
85 args2.add("shortname=f");
86 args2.add("unpair=f");
87 args2.add("repair=f");
88 args2.add("namesort=f");
89 args2.add("overwrite=t");
90
91 String gString="auto";
92 for(int i=0; i<args.length; i++){
93 String arg=args[i];
94 String[] split=arg.split("=");
95 String a=split[0].toLowerCase();
96 String b=split.length>1 ? split[1] : null;
97
98 if(a.equals("in") || a.equals("in1")){
99 in1=b;
100 }else if(a.equals("in2")){
101 in2=b;
102 }else if(a.equals("out") || a.equals("out1")){
103 out1=b;
104 }else if(a.equals("out2")){
105 out2=b;
106 }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
107 gString=b;
108 }else if(a.equals("delete") || a.equals("deletetemp")){
109 delete=Parse.parseBoolean(b);
110 }else if(a.equals("deleteinput")){
111 deleteInput=Parse.parseBoolean(b);
112 }else if(a.equals("usetmpdir")){
113 useTmpdir=Parse.parseBoolean(b);
114 }else if(a.equals("ecco")){
115 ecco=Parse.parseBoolean(b);
116 }else if(a.equals("compresstemp") || a.equals("ct")){
117 if(b!=null && b.equalsIgnoreCase("auto")){forceCompressTemp=forceRawTemp=false;}
118 else{
119 forceCompressTemp=Parse.parseBoolean(b);
120 forceRawTemp=!forceCompressTemp;
121 }
122 }else if(a.equals("tmpdir")){
123 Shared.setTmpdir(b);
124 }else if(a.equals("rename") || a.equals("addname")){
125 addName=Parse.parseBoolean(b);
126 }else if(a.equals("shortname") || a.equals("shortnames")){
127 shortName=b;
128 }else if(a.equals("seed")){
129 KmerComparator.defaultSeed=Long.parseLong(b);
130 }else if(a.equals("hashes")){
131 KmerComparator.setHashes(Integer.parseInt(b));
132 }else if(a.equals("passes")){
133 passes=Integer.parseInt(b);
134 args2.add(arg);
135 // }else if(a.equals("k")){
136 // k=Integer.parseInt(b);
137 // args2.add(arg);
138 }else if(a.equals("border")){
139 KmerComparator.defaultBorder=Integer.parseInt(b);
140 }
141
142 else if(a.equals("unpair")){
143 unpair=Parse.parseBoolean(b);
144 }else if(a.equals("repair")){
145 repair=Parse.parseBoolean(b);
146 }else if(a.equals("namesort") || a.equals("sort")){
147 namesort=Parse.parseBoolean(b);
148 }else if(a.equals("overwrite")){
149 overwrite=Parse.parseBoolean(b);
150 }else if(a.equals("v1") || a.equals("kmersort1")){
151 boolean x=Parse.parseBoolean(b);
152 if(x){V2=V3=false;}
153 }else if(a.equals("v2") || a.equals("kmersort2")){
154 V2=Parse.parseBoolean(b);
155 if(V2){V3=false;}
156 }else if(a.equals("v3") || a.equals("kmersort3")){
157 V3=Parse.parseBoolean(b);
158 if(V3){V2=false;}
159 }else if(a.equals("fetchthreads")){
160 KmerSort3.fetchThreads=Integer.parseInt(b);
161 assert(KmerSort3.fetchThreads>0) : KmerSort3.fetchThreads+"\nFetch threads must be at least 1.";
162 }
163
164 else if(a.equals("comparesequence")){
165 KmerComparator.compareSequence=Parse.parseBoolean(b);
166 }else if(a.equals("allowadjacenttiles") || a.equals("spantiles")){
167 ReadKey.spanTilesX=ReadKey.spanTilesY=Parse.parseBoolean(b);
168 }else if(a.equals("spanx") || a.equals("spantilesx")){
169 ReadKey.spanTilesX=Parse.parseBoolean(b);
170 }else if(a.equals("spany") || a.equals("spantilesy")){
171 ReadKey.spanTilesY=Parse.parseBoolean(b);
172 }else if(a.equals("spanadjacent") || a.equals("spanadjacentonly") || a.equals("adjacentonly") || a.equals("adjacent")){
173 ReadKey.spanAdjacentOnly=Parse.parseBoolean(b);
174 }
175
176 // else if(a.equals("repair")){
177 // repair=Parse.parseBoolean(b);
178 // }else if(a.equals("namesort") || a.equals("sort")){
179 // namesort=Parse.parseBoolean(b);
180 // }
181
182 else if(a.equals("interleaved") || a.equals("int")){
183 if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);}
184 else{
185 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Parse.parseBoolean(b);
186 System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
187 }
188 }else if(a.equals("cq") || a.equals("changequality")){
189 BBMerge.changeQuality=Read.CHANGE_QUALITY=Parse.parseBoolean(b);
190 }else if(a.equals("quantize") || a.equals("quantizesticky")){
191 quantizeQuality=Quantizer.parse(arg, a, b);
192 }else if(a.equals("lowcomplexity")){
193 lowComplexity=Parse.parseBoolean(b);
194 }
195
196 else if(Clump.parseStatic(arg, a, b)){
197 //Do nothing
198 }else if(Parser.parseQuality(arg, a, b)){
199 //Do nothing
200 }
201
202 else{
203 args2.add(arg);
204 }
205 }
206
207 Clump.setXY();
208
209 KmerSplit.quantizeQuality=KmerSort1.quantizeQuality=quantizeQuality;
210
211 Parser.processQuality();
212
213 assert(!unpair || !KmerComparator.mergeFirst) : "Unpair and mergefirst may not be used together.";
214
215 if(in1==null){throw new RuntimeException("\nOne input file is required.\n");}
216
217 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
218 in2=in1.replace("#", "2");
219 in1=in1.replace("#", "1");
220 }
221 if(out1!=null && out2==null && out1.indexOf('#')>-1){
222 out2=out1.replace("#", "2");
223 out1=out1.replace("#", "1");
224 }
225
226 //Ensure input files can be read
227 if(!Tools.testInputFiles(false, true, in1)){
228 throw new RuntimeException("\nCan't read some input files.\n");
229 }
230
231 // assert(false) : ReadKey.spanTiles()+", "+ReadKey.spanTilesX+", "+ReadKey.spanTilesY+", "+Clump.sortX+", "+Clump.sortY;
232
233 autoSetGroups(gString);
234
235 if((in2!=null || out2!=null) && groups>1){FASTQ.FORCE_INTERLEAVED=true;} //Fix for crash with twin fasta files
236 }
237
238
239 /*--------------------------------------------------------------*/
240 /*---------------- Outer Methods ----------------*/
241 /*--------------------------------------------------------------*/
242
243 /** Create read streams and process all data */
244 public void process(Timer t){
245 String[] args=args2.toArray(new String[0]);
246 args[4]="groups="+groups;
247
248 useSharedHeader=(FileFormat.hasSamOrBamExtension(in1) && out1!=null
249 && FileFormat.hasSamOrBamExtension(out1));
250
251 if(groups==1){
252 args[0]="in1="+in1;
253 args[1]="in2="+in2;
254 args[2]="out1="+out1;
255 args[3]="out2="+out2;
256 args[5]="ecco="+ecco;
257 args[6]="rename="+addName;
258 args[7]="shortname="+shortName;
259 args[8]="unpair="+unpair;
260 args[9]="repair="+repair;
261 args[10]="namesort="+namesort;
262 args[11]="ow="+overwrite;
263 KmerSort1.main(args);
264 }else{
265 String pin1=in1, pin2=in2, temp;
266 final int conservativePasses=Clump.conservativeFlag ? passes : Tools.max(1, passes/2);
267 if(passes>1){Clump.setConservative(true);}
268 long fileMem=-1;
269 for(int pass=1; pass<=passes; pass++){
270 if(/*passes>1 &&*/ (V2 || V3)){
271 // System.err.println("Running pass with fileMem="+fileMem);
272 // out=(pass==passes ? out1 : getTempFname("clumpify_p"+(pass+1)+"_temp%_"));
273 temp=getTempFname("clumpify_p"+(pass+1)+"_temp%_");
274 if(pass==passes){
275 fileMem=runOnePass_v2(args, pass, pin1, pin2, out1, out2, fileMem);
276 }else{
277 fileMem=runOnePass_v2(args, pass, pin1, pin2, temp, null, fileMem);
278 }
279 // System.err.println("New fileMem="+fileMem);
280 }else{
281 // out=(pass==passes ? out1 : getTempFname("clumpify_temp_pass"+pass+"_"));
282 temp=getTempFname("clumpify_temp_pass"+pass+"_");
283 if(pass==passes){
284 runOnePass(args, pass, pin1, pin2, out1, out2);
285 }else{
286 runOnePass(args, pass, pin1, pin2, temp, null);
287 }
288 }
289 pin1=temp;
290 pin2=null;
291 KmerComparator.defaultBorder=Tools.max(0, KmerComparator.defaultBorder-1);
292 KmerComparator.defaultSeed++;
293 if(pass>=conservativePasses){Clump.setConservative(false);}
294 }
295 }
296
297 if(deleteInput && !sharedErrorState && out1!=null && in1!=null){
298 try {
299 new File(in1).delete();
300 if(in2!=null){new File(in2).delete();}
301 } catch (Exception e) {
302 System.err.println("WARNING: Failed to delete input files.");
303 }
304 }
305
306 t.stop();
307 System.err.println("Total time: \t"+t);
308
309 }
310
311 private void runOnePass(String[] args, int pass, String in1, String in2, String out1, String out2){
312 assert(groups>1);
313 if(pass>1){
314 ecco=false;
315 shortName="f";
316 addName=false;
317 }
318
319 String temp=getTempFname("clumpify_p"+pass+"_temp%_");
320
321 String temp2=temp.replace("%", "FINAL");
322 final boolean externalSort=(pass==passes && (repair || namesort));
323
324 args[0]="in1="+in1;
325 args[1]="in2="+in2;
326 args[2]="out="+temp;
327 args[3]="out2="+null;
328 args[5]="ecco="+ecco;
329 args[6]="addname=f";
330 args[7]="shortname="+shortName;
331 args[8]="unpair="+unpair;
332 args[9]="repair=f";
333 args[10]="namesort=f";
334 args[11]="ow="+overwrite;
335 KmerSplit.maxZipLevel=2;
336 KmerSplit.main(args);
337
338 FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false;
339 FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT;
340
341 args[0]="in="+temp;
342 args[1]="in2="+null;
343 args[2]="out="+(externalSort ? temp2 : out1);
344 args[3]="out2="+(externalSort ? "null" : out2);
345 args[5]="ecco=f";
346 args[6]="addname="+addName;
347 args[7]="shortname=f";
348 args[8]="unpair=f";
349 args[9]="repair="+(repair && externalSort);
350 args[10]="namesort="+(namesort && externalSort);
351 args[11]="ow="+overwrite;
352 if(unpair){
353 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
354 }
355 KmerSort1.main(args);
356
357 if(delete){
358 for(int i=0; i<groups; i++){
359 new File(temp.replaceFirst("%", ""+i)).delete();
360 }
361 if(pass>1){
362 assert(in2==null);
363 new File(in1).delete();
364 }
365 }
366
367 if(externalSort){
368 outstream.println();
369 String[] sortArgs=new String[] {"in="+temp2, "out="+out1, "ow="+overwrite};
370 if(out2!=null){sortArgs=new String[] {"in="+temp2, "out="+out1, "out2="+out2, "ow="+overwrite};}
371 SortByName.main(sortArgs);
372 if(delete){new File(temp2).delete();}
373 }
374 }
375
376 private long runOnePass_v2(String[] args, int pass, String in1, String in2, String out1, String out2, long fileMem){
377 assert(groups>1);
378 if(pass>1){
379 ecco=false;
380 shortName="f";
381 addName=false;
382 }
383
384 String temp=getTempFname("clumpify_p"+pass+"_temp%_");
385
386 // String temp2=temp.replace("%", "FINAL");
387 String namesorted=temp.replace("%", "namesorted_%");
388 final boolean externalSort=(pass==passes && (repair || namesort));
389
390 if(pass==1){
391 args[0]="in1="+in1;
392 args[1]="in2="+in2;
393 args[2]="out="+temp;
394 args[3]="out2="+null;
395 args[5]="ecco="+ecco;
396 args[6]="addname=f";
397 args[7]="shortname="+shortName;
398 args[8]="unpair="+unpair;
399 args[9]="repair=f";
400 args[10]="namesort=f";
401 args[11]="ow="+overwrite;
402 KmerSplit.maxZipLevel=2;
403 KmerSplit.main(args);
404 fileMem=KmerSplit.lastMemProcessed;
405
406 FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false;
407 FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT;
408 }
409
410 args[0]="in1="+(pass==1 ? temp : in1);
411 args[1]="in2="+null;
412 args[2]="out="+(externalSort ? namesorted : out1);
413 args[3]="out2="+(externalSort ? "null" : out2);
414 args[5]="ecco=f";
415 args[6]="addname="+addName;
416 args[7]="shortname=f";
417 args[8]="unpair=f";
418 args[9]="repair="+(repair && externalSort);
419 args[10]="namesort="+(namesort && externalSort);
420 args[11]="ow="+overwrite;
421 if(unpair){
422 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
423 }
424 if(externalSort){
425 KmerSort.doHashAndSplit=false;
426 }
427 if(V3){
428 KmerSort3.main(fileMem, pass, passes, args);
429 if(fileMem<1){fileMem=KmerSort3.lastMemProcessed;}
430 }else{KmerSort2.main(args);}
431
432 if(delete){
433 for(int i=0; i<groups; i++){
434 new File((pass==1 ? temp : in1).replaceFirst("%", ""+i)).delete();
435 }
436 }
437
438 if(externalSort){
439 outstream.println();
440
441 ArrayList<String> names=new ArrayList<String>();
442 for(int i=0; i<groups; i++){
443 names.add(namesorted.replaceFirst("%", ""+i));
444 }
445 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
446
447 ReadWrite.USE_PIGZ=true;
448 ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6);
449 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
450 FileFormat dest=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, false, false);
451 FileFormat dest2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, false, false);
452 SortByName.mergeAndDump(names, /*null, */dest, dest2, delete, useSharedHeader, false, outstream, 1000);
453 }
454
455 // if(externalSort){
456 // outstream.println();
457 // SortByName.main(new String[] {"in="+temp2, "out="+out, "ow="+overwrite});
458 // if(delete){new File(temp2).delete();}
459 // }
460 return fileMem;
461 }
462
463 /*--------------------------------------------------------------*/
464 /*---------------- Inner Methods ----------------*/
465 /*--------------------------------------------------------------*/
466
467 private void autoSetGroups(String s) {
468 if(s==null || s.equalsIgnoreCase("null")){return;}
469 if(Tools.isDigit(s.charAt(0))){
470 groups=Integer.parseInt(s);
471 return;
472 }
473 assert(s.equalsIgnoreCase("auto")) : "Unknown groups setting: "+s;
474
475 final long maxMem=Shared.memAvailable(1);
476 FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, false, false);
477 if(ff1==null || ff1.stdio()){return;}
478
479 // outstream.println("in1="+in1+", overhead="+(0.5*(ReadKey.overhead+Clump.overhead)));
480
481 double[] estimates=Tools.estimateFileMemory(in1, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity);
482 if(in2!=null){
483 double[] estimates2=Tools.estimateFileMemory(in2, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity);
484 estimates[0]+=estimates2[0];
485 estimates[1]+=estimates2[1];
486 estimates[4]+=estimates2[4];
487 }
488
489 // outstream.println(Arrays.toString(estimates));
490
491 double memEstimate=estimates==null ? 0 : estimates[0];
492 double diskEstimate=estimates==null ? 0 : estimates[1];
493 double readEstimate=estimates==null ? 0 : estimates[4];
494 double worstCase=memEstimate*1.5;
495
496 // outstream.println("Raw Disk Size Estimate: "+(long)(diskEstimate/(1024*1024))+" MB");
497 outstream.println("Read Estimate: "+(long)(readEstimate));
498 outstream.println("Memory Estimate: "+(long)(memEstimate/(1024*1024))+" MB");
499 outstream.println("Memory Available: "+(maxMem/(1024*1024))+" MB");
500
501 if(maxMem>worstCase && readEstimate<Integer.MAX_VALUE){
502 groups=1;
503 }else{
504 groups=Tools.max(11, (int)(3+(3*worstCase/maxMem)*(V3 ? KmerSort3.fetchThreads : 2)), (int)((2*readEstimate)/Integer.MAX_VALUE))|1;
505 }
506 outstream.println("Set groups to "+groups);
507 }
508
509 private String getTempFname(String core){
510 // outstream.println(core);
511 String temp;
512 String path="", extension=".fq";
513 if(out1!=null){
514 core=ReadWrite.stripToCore(out1)+"_"+core;
515 path=ReadWrite.getPath(out1);
516 extension=ReadWrite.getExtension(out1);
517 }
518
519 if(useTmpdir && Shared.tmpdir()!=null){
520 temp=Shared.tmpdir()+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension;
521 }else{
522 temp=path+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension;
523 }
524 // assert(false) : path+", "+temp+", "+core+", "+out1;
525
526 String comp=ReadWrite.compressionType(temp);
527 if(comp!=null){comp=".gz";} //Prevent bz2 temp files which cause a crash
528
529 if(forceCompressTemp && comp==null){
530 temp+=".gz";
531 }else if(comp!=null && forceRawTemp){
532 temp=temp.substring(0, temp.lastIndexOf('.'));
533 }
534 if(temp.endsWith(".bz2")){temp=temp.substring(0, temp.length()-4);} //Prevent bz2 temp files which cause a crash
535
536 // outstream.println(temp);
537 return temp;
538 }
539
540 public static void shrinkName(Read r) {
541 if(r==null){return;}
542 String s=r.id;
543 if(s.contains("HISEQ")){s=s.replace("HISEQ", "H");}
544 if(s.contains("MISEQ")){
545 s=s.replace("MISEQ", "M");
546 }
547 if(s.contains(":000000000-")){
548 s=s.replace(":000000000-", ":");
549 }
550 r.id=s;
551 }
552
553 public static void shortName(Read r) {
554 ByteBuilder sb=new ByteBuilder(14);
555 long x=r.numericID|1;
556
557 while(x<1000000000L){
558 x*=10;
559 sb.append('0');
560 }
561 sb.append(r.numericID);
562
563 // while(x<0x10000000L){
564 // x*=16;
565 // sb.append('0');
566 // }
567 // sb.append(Long.toHexString(r.numericID));
568
569 sb.append(r.pairnum()==0 ? " 1:" : " 2:");
570 r.id=sb.toString();
571 }
572
573 /*--------------------------------------------------------------*/
574 /*---------------- Fields ----------------*/
575 /*--------------------------------------------------------------*/
576
577 private boolean lowComplexity=false;
578
579 private boolean quantizeQuality=false;
580 private Random randy=new Random();
581 private int groups=31;
582 private int passes=1;
583 private boolean ecco=false;
584 private boolean addName=false;
585 private String shortName="f";
586 private boolean useTmpdir=false;
587 private boolean delete=true;
588 private boolean deleteInput=false;
589 private boolean useSharedHeader=false;
590 private boolean forceCompressTemp=false;
591 private boolean forceRawTemp=false;
592 private boolean overwrite=true;
593
594 private boolean unpair=false;
595 private boolean repair=false;
596 private boolean namesort=false;
597 private boolean V2=false;
598 private boolean V3=true;
599
600 private String in1=null;
601 private String in2=null;
602 private String out1=null;
603 private String out2=null;
604
605 ArrayList<String> args2=new ArrayList<String>();
606 private PrintStream outstream=System.err;
607
608 public static boolean sharedErrorState=false;
609
610 }