Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clumpify.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package clump; import java.io.File; import java.io.PrintStream; import java.util.ArrayList; import java.util.Random; import fileIO.FileFormat; import fileIO.ReadWrite; import jgi.BBMerge; import shared.Parse; import shared.Parser; import shared.PreParser; import shared.Shared; import shared.Timer; import shared.Tools; import sort.SortByName; import stream.FASTQ; import stream.Read; import structures.ByteBuilder; import structures.Quantizer; /** * @author Brian Bushnell * @date Nov 6, 2015 * */ public class Clumpify { /** * Code entrance from the command line. * @param args Command line arguments */ public static void main(String[] args){ Timer t=new Timer(); ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6); //Capture values of static variables that might be modified in case this is called by another class. final boolean oldCQ=Read.CHANGE_QUALITY; final boolean oldBgzip=ReadWrite.USE_BGZIP, oldPreferBgzip=ReadWrite.PREFER_BGZIP; BBMerge.changeQuality=Read.CHANGE_QUALITY=false; ReadWrite.USE_BGZIP=true; ReadWrite.PREFER_BGZIP=true; Clumpify x=new Clumpify(args); x.process(t); //Restore values of static variables. // Shared.setBuffers(oldCap); // ReadWrite.ZIPLEVEL=oldZl; // ReadWrite.USE_PIGZ=oldPigz; ReadWrite.USE_BGZIP=oldBgzip; ReadWrite.PREFER_BGZIP=oldPreferBgzip; // ReadWrite.USE_UNPIGZ=oldUnpigz; // ReadWrite.MAX_ZIP_THREADS=oldZipThreads; BBMerge.changeQuality=Read.CHANGE_QUALITY=oldCQ; //Close the print stream if it was redirected Shared.closeStream(x.outstream); } /** * Constructor. * @param args Command line arguments */ public Clumpify(String[] args){ {//Preparse block for help, config files, and outstream PreParser pp=new PreParser(args, getClass(), true); args=pp.args; outstream=pp.outstream; } Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; args2=new ArrayList<String>(); args2.add("in1"); args2.add("in2"); args2.add("out1"); args2.add("out2"); args2.add("groups"); args2.add("ecco=f"); args2.add("rename=f"); args2.add("shortname=f"); args2.add("unpair=f"); args2.add("repair=f"); args2.add("namesort=f"); args2.add("overwrite=t"); String gString="auto"; for(int i=0; i<args.length; i++){ String arg=args[i]; String[] split=arg.split("="); String a=split[0].toLowerCase(); String b=split.length>1 ? split[1] : null; if(a.equals("in") || a.equals("in1")){ in1=b; }else if(a.equals("in2")){ in2=b; }else if(a.equals("out") || a.equals("out1")){ out1=b; }else if(a.equals("out2")){ out2=b; }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){ gString=b; }else if(a.equals("delete") || a.equals("deletetemp")){ delete=Parse.parseBoolean(b); }else if(a.equals("deleteinput")){ deleteInput=Parse.parseBoolean(b); }else if(a.equals("usetmpdir")){ useTmpdir=Parse.parseBoolean(b); }else if(a.equals("ecco")){ ecco=Parse.parseBoolean(b); }else if(a.equals("compresstemp") || a.equals("ct")){ if(b!=null && b.equalsIgnoreCase("auto")){forceCompressTemp=forceRawTemp=false;} else{ forceCompressTemp=Parse.parseBoolean(b); forceRawTemp=!forceCompressTemp; } }else if(a.equals("tmpdir")){ Shared.setTmpdir(b); }else if(a.equals("rename") || a.equals("addname")){ addName=Parse.parseBoolean(b); }else if(a.equals("shortname") || a.equals("shortnames")){ shortName=b; }else if(a.equals("seed")){ KmerComparator.defaultSeed=Long.parseLong(b); }else if(a.equals("hashes")){ KmerComparator.setHashes(Integer.parseInt(b)); }else if(a.equals("passes")){ passes=Integer.parseInt(b); args2.add(arg); // }else if(a.equals("k")){ // k=Integer.parseInt(b); // args2.add(arg); }else if(a.equals("border")){ KmerComparator.defaultBorder=Integer.parseInt(b); } else if(a.equals("unpair")){ unpair=Parse.parseBoolean(b); }else if(a.equals("repair")){ repair=Parse.parseBoolean(b); }else if(a.equals("namesort") || a.equals("sort")){ namesort=Parse.parseBoolean(b); }else if(a.equals("overwrite")){ overwrite=Parse.parseBoolean(b); }else if(a.equals("v1") || a.equals("kmersort1")){ boolean x=Parse.parseBoolean(b); if(x){V2=V3=false;} }else if(a.equals("v2") || a.equals("kmersort2")){ V2=Parse.parseBoolean(b); if(V2){V3=false;} }else if(a.equals("v3") || a.equals("kmersort3")){ V3=Parse.parseBoolean(b); if(V3){V2=false;} }else if(a.equals("fetchthreads")){ KmerSort3.fetchThreads=Integer.parseInt(b); assert(KmerSort3.fetchThreads>0) : KmerSort3.fetchThreads+"\nFetch threads must be at least 1."; } else if(a.equals("comparesequence")){ KmerComparator.compareSequence=Parse.parseBoolean(b); }else if(a.equals("allowadjacenttiles") || a.equals("spantiles")){ ReadKey.spanTilesX=ReadKey.spanTilesY=Parse.parseBoolean(b); }else if(a.equals("spanx") || a.equals("spantilesx")){ ReadKey.spanTilesX=Parse.parseBoolean(b); }else if(a.equals("spany") || a.equals("spantilesy")){ ReadKey.spanTilesY=Parse.parseBoolean(b); }else if(a.equals("spanadjacent") || a.equals("spanadjacentonly") || a.equals("adjacentonly") || a.equals("adjacent")){ ReadKey.spanAdjacentOnly=Parse.parseBoolean(b); } // else if(a.equals("repair")){ // repair=Parse.parseBoolean(b); // }else if(a.equals("namesort") || a.equals("sort")){ // namesort=Parse.parseBoolean(b); // } else if(a.equals("interleaved") || a.equals("int")){ if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} else{ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Parse.parseBoolean(b); System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); } }else if(a.equals("cq") || a.equals("changequality")){ BBMerge.changeQuality=Read.CHANGE_QUALITY=Parse.parseBoolean(b); }else if(a.equals("quantize") || a.equals("quantizesticky")){ quantizeQuality=Quantizer.parse(arg, a, b); }else if(a.equals("lowcomplexity")){ lowComplexity=Parse.parseBoolean(b); } else if(Clump.parseStatic(arg, a, b)){ //Do nothing }else if(Parser.parseQuality(arg, a, b)){ //Do nothing } else{ args2.add(arg); } } Clump.setXY(); KmerSplit.quantizeQuality=KmerSort1.quantizeQuality=quantizeQuality; Parser.processQuality(); assert(!unpair || !KmerComparator.mergeFirst) : "Unpair and mergefirst may not be used together."; if(in1==null){throw new RuntimeException("\nOne input file is required.\n");} if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ in2=in1.replace("#", "2"); in1=in1.replace("#", "1"); } if(out1!=null && out2==null && out1.indexOf('#')>-1){ out2=out1.replace("#", "2"); out1=out1.replace("#", "1"); } //Ensure input files can be read if(!Tools.testInputFiles(false, true, in1)){ throw new RuntimeException("\nCan't read some input files.\n"); } // assert(false) : ReadKey.spanTiles()+", "+ReadKey.spanTilesX+", "+ReadKey.spanTilesY+", "+Clump.sortX+", "+Clump.sortY; autoSetGroups(gString); if((in2!=null || out2!=null) && groups>1){FASTQ.FORCE_INTERLEAVED=true;} //Fix for crash with twin fasta files } /*--------------------------------------------------------------*/ /*---------------- Outer Methods ----------------*/ /*--------------------------------------------------------------*/ /** Create read streams and process all data */ public void process(Timer t){ String[] args=args2.toArray(new String[0]); args[4]="groups="+groups; useSharedHeader=(FileFormat.hasSamOrBamExtension(in1) && out1!=null && FileFormat.hasSamOrBamExtension(out1)); if(groups==1){ args[0]="in1="+in1; args[1]="in2="+in2; args[2]="out1="+out1; args[3]="out2="+out2; args[5]="ecco="+ecco; args[6]="rename="+addName; args[7]="shortname="+shortName; args[8]="unpair="+unpair; args[9]="repair="+repair; args[10]="namesort="+namesort; args[11]="ow="+overwrite; KmerSort1.main(args); }else{ String pin1=in1, pin2=in2, temp; final int conservativePasses=Clump.conservativeFlag ? passes : Tools.max(1, passes/2); if(passes>1){Clump.setConservative(true);} long fileMem=-1; for(int pass=1; pass<=passes; pass++){ if(/*passes>1 &&*/ (V2 || V3)){ // System.err.println("Running pass with fileMem="+fileMem); // out=(pass==passes ? out1 : getTempFname("clumpify_p"+(pass+1)+"_temp%_")); temp=getTempFname("clumpify_p"+(pass+1)+"_temp%_"); if(pass==passes){ fileMem=runOnePass_v2(args, pass, pin1, pin2, out1, out2, fileMem); }else{ fileMem=runOnePass_v2(args, pass, pin1, pin2, temp, null, fileMem); } // System.err.println("New fileMem="+fileMem); }else{ // out=(pass==passes ? out1 : getTempFname("clumpify_temp_pass"+pass+"_")); temp=getTempFname("clumpify_temp_pass"+pass+"_"); if(pass==passes){ runOnePass(args, pass, pin1, pin2, out1, out2); }else{ runOnePass(args, pass, pin1, pin2, temp, null); } } pin1=temp; pin2=null; KmerComparator.defaultBorder=Tools.max(0, KmerComparator.defaultBorder-1); KmerComparator.defaultSeed++; if(pass>=conservativePasses){Clump.setConservative(false);} } } if(deleteInput && !sharedErrorState && out1!=null && in1!=null){ try { new File(in1).delete(); if(in2!=null){new File(in2).delete();} } catch (Exception e) { System.err.println("WARNING: Failed to delete input files."); } } t.stop(); System.err.println("Total time: \t"+t); } private void runOnePass(String[] args, int pass, String in1, String in2, String out1, String out2){ assert(groups>1); if(pass>1){ ecco=false; shortName="f"; addName=false; } String temp=getTempFname("clumpify_p"+pass+"_temp%_"); String temp2=temp.replace("%", "FINAL"); final boolean externalSort=(pass==passes && (repair || namesort)); args[0]="in1="+in1; args[1]="in2="+in2; args[2]="out="+temp; args[3]="out2="+null; args[5]="ecco="+ecco; args[6]="addname=f"; args[7]="shortname="+shortName; args[8]="unpair="+unpair; args[9]="repair=f"; args[10]="namesort=f"; args[11]="ow="+overwrite; KmerSplit.maxZipLevel=2; KmerSplit.main(args); FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false; FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT; args[0]="in="+temp; args[1]="in2="+null; args[2]="out="+(externalSort ? temp2 : out1); args[3]="out2="+(externalSort ? "null" : out2); args[5]="ecco=f"; args[6]="addname="+addName; args[7]="shortname=f"; args[8]="unpair=f"; args[9]="repair="+(repair && externalSort); args[10]="namesort="+(namesort && externalSort); args[11]="ow="+overwrite; if(unpair){ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; } KmerSort1.main(args); if(delete){ for(int i=0; i<groups; i++){ new File(temp.replaceFirst("%", ""+i)).delete(); } if(pass>1){ assert(in2==null); new File(in1).delete(); } } if(externalSort){ outstream.println(); String[] sortArgs=new String[] {"in="+temp2, "out="+out1, "ow="+overwrite}; if(out2!=null){sortArgs=new String[] {"in="+temp2, "out="+out1, "out2="+out2, "ow="+overwrite};} SortByName.main(sortArgs); if(delete){new File(temp2).delete();} } } private long runOnePass_v2(String[] args, int pass, String in1, String in2, String out1, String out2, long fileMem){ assert(groups>1); if(pass>1){ ecco=false; shortName="f"; addName=false; } String temp=getTempFname("clumpify_p"+pass+"_temp%_"); // String temp2=temp.replace("%", "FINAL"); String namesorted=temp.replace("%", "namesorted_%"); final boolean externalSort=(pass==passes && (repair || namesort)); if(pass==1){ args[0]="in1="+in1; args[1]="in2="+in2; args[2]="out="+temp; args[3]="out2="+null; args[5]="ecco="+ecco; args[6]="addname=f"; args[7]="shortname="+shortName; args[8]="unpair="+unpair; args[9]="repair=f"; args[10]="namesort=f"; args[11]="ow="+overwrite; KmerSplit.maxZipLevel=2; KmerSplit.main(args); fileMem=KmerSplit.lastMemProcessed; FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false; FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT; } args[0]="in1="+(pass==1 ? temp : in1); args[1]="in2="+null; args[2]="out="+(externalSort ? namesorted : out1); args[3]="out2="+(externalSort ? "null" : out2); args[5]="ecco=f"; args[6]="addname="+addName; args[7]="shortname=f"; args[8]="unpair=f"; args[9]="repair="+(repair && externalSort); args[10]="namesort="+(namesort && externalSort); args[11]="ow="+overwrite; if(unpair){ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; } if(externalSort){ KmerSort.doHashAndSplit=false; } if(V3){ KmerSort3.main(fileMem, pass, passes, args); if(fileMem<1){fileMem=KmerSort3.lastMemProcessed;} }else{KmerSort2.main(args);} if(delete){ for(int i=0; i<groups; i++){ new File((pass==1 ? temp : in1).replaceFirst("%", ""+i)).delete(); } } if(externalSort){ outstream.println(); ArrayList<String> names=new ArrayList<String>(); for(int i=0; i<groups; i++){ names.add(namesorted.replaceFirst("%", ""+i)); } ReadWrite.MAX_ZIP_THREADS=Shared.threads(); ReadWrite.USE_PIGZ=true; ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6); FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; FileFormat dest=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, false, false); FileFormat dest2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, false, false); SortByName.mergeAndDump(names, /*null, */dest, dest2, delete, useSharedHeader, false, outstream, 1000); } // if(externalSort){ // outstream.println(); // SortByName.main(new String[] {"in="+temp2, "out="+out, "ow="+overwrite}); // if(delete){new File(temp2).delete();} // } return fileMem; } /*--------------------------------------------------------------*/ /*---------------- Inner Methods ----------------*/ /*--------------------------------------------------------------*/ private void autoSetGroups(String s) { if(s==null || s.equalsIgnoreCase("null")){return;} if(Tools.isDigit(s.charAt(0))){ groups=Integer.parseInt(s); return; } assert(s.equalsIgnoreCase("auto")) : "Unknown groups setting: "+s; final long maxMem=Shared.memAvailable(1); FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, false, false); if(ff1==null || ff1.stdio()){return;} // outstream.println("in1="+in1+", overhead="+(0.5*(ReadKey.overhead+Clump.overhead))); double[] estimates=Tools.estimateFileMemory(in1, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity); if(in2!=null){ double[] estimates2=Tools.estimateFileMemory(in2, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity); estimates[0]+=estimates2[0]; estimates[1]+=estimates2[1]; estimates[4]+=estimates2[4]; } // outstream.println(Arrays.toString(estimates)); double memEstimate=estimates==null ? 0 : estimates[0]; double diskEstimate=estimates==null ? 0 : estimates[1]; double readEstimate=estimates==null ? 0 : estimates[4]; double worstCase=memEstimate*1.5; // outstream.println("Raw Disk Size Estimate: "+(long)(diskEstimate/(1024*1024))+" MB"); outstream.println("Read Estimate: "+(long)(readEstimate)); outstream.println("Memory Estimate: "+(long)(memEstimate/(1024*1024))+" MB"); outstream.println("Memory Available: "+(maxMem/(1024*1024))+" MB"); if(maxMem>worstCase && readEstimate<Integer.MAX_VALUE){ groups=1; }else{ groups=Tools.max(11, (int)(3+(3*worstCase/maxMem)*(V3 ? KmerSort3.fetchThreads : 2)), (int)((2*readEstimate)/Integer.MAX_VALUE))|1; } outstream.println("Set groups to "+groups); } private String getTempFname(String core){ // outstream.println(core); String temp; String path="", extension=".fq"; if(out1!=null){ core=ReadWrite.stripToCore(out1)+"_"+core; path=ReadWrite.getPath(out1); extension=ReadWrite.getExtension(out1); } if(useTmpdir && Shared.tmpdir()!=null){ temp=Shared.tmpdir()+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension; }else{ temp=path+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension; } // assert(false) : path+", "+temp+", "+core+", "+out1; String comp=ReadWrite.compressionType(temp); if(comp!=null){comp=".gz";} //Prevent bz2 temp files which cause a crash if(forceCompressTemp && comp==null){ temp+=".gz"; }else if(comp!=null && forceRawTemp){ temp=temp.substring(0, temp.lastIndexOf('.')); } if(temp.endsWith(".bz2")){temp=temp.substring(0, temp.length()-4);} //Prevent bz2 temp files which cause a crash // outstream.println(temp); return temp; } public static void shrinkName(Read r) { if(r==null){return;} String s=r.id; if(s.contains("HISEQ")){s=s.replace("HISEQ", "H");} if(s.contains("MISEQ")){ s=s.replace("MISEQ", "M"); } if(s.contains(":000000000-")){ s=s.replace(":000000000-", ":"); } r.id=s; } public static void shortName(Read r) { ByteBuilder sb=new ByteBuilder(14); long x=r.numericID|1; while(x<1000000000L){ x*=10; sb.append('0'); } sb.append(r.numericID); // while(x<0x10000000L){ // x*=16; // sb.append('0'); // } // sb.append(Long.toHexString(r.numericID)); sb.append(r.pairnum()==0 ? " 1:" : " 2:"); r.id=sb.toString(); } /*--------------------------------------------------------------*/ /*---------------- Fields ----------------*/ /*--------------------------------------------------------------*/ private boolean lowComplexity=false; private boolean quantizeQuality=false; private Random randy=new Random(); private int groups=31; private int passes=1; private boolean ecco=false; private boolean addName=false; private String shortName="f"; private boolean useTmpdir=false; private boolean delete=true; private boolean deleteInput=false; private boolean useSharedHeader=false; private boolean forceCompressTemp=false; private boolean forceRawTemp=false; private boolean overwrite=true; private boolean unpair=false; private boolean repair=false; private boolean namesort=false; private boolean V2=false; private boolean V3=true; private String in1=null; private String in2=null; private String out1=null; private String out2=null; ArrayList<String> args2=new ArrayList<String>(); private PrintStream outstream=System.err; public static boolean sharedErrorState=false; }