Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clumpify.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/Clumpify.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,610 @@ +package clump; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Random; + +import fileIO.FileFormat; +import fileIO.ReadWrite; +import jgi.BBMerge; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import sort.SortByName; +import stream.FASTQ; +import stream.Read; +import structures.ByteBuilder; +import structures.Quantizer; + +/** + * @author Brian Bushnell + * @date Nov 6, 2015 + * + */ +public class Clumpify { + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + Timer t=new Timer(); + ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6); + + //Capture values of static variables that might be modified in case this is called by another class. + final boolean oldCQ=Read.CHANGE_QUALITY; + final boolean oldBgzip=ReadWrite.USE_BGZIP, oldPreferBgzip=ReadWrite.PREFER_BGZIP; + + BBMerge.changeQuality=Read.CHANGE_QUALITY=false; + ReadWrite.USE_BGZIP=true; + ReadWrite.PREFER_BGZIP=true; + + Clumpify x=new Clumpify(args); + x.process(t); + + //Restore values of static variables. +// Shared.setBuffers(oldCap); +// ReadWrite.ZIPLEVEL=oldZl; +// ReadWrite.USE_PIGZ=oldPigz; + ReadWrite.USE_BGZIP=oldBgzip; + ReadWrite.PREFER_BGZIP=oldPreferBgzip; +// ReadWrite.USE_UNPIGZ=oldUnpigz; +// ReadWrite.MAX_ZIP_THREADS=oldZipThreads; + BBMerge.changeQuality=Read.CHANGE_QUALITY=oldCQ; + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public Clumpify(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), true); + args=pp.args; + outstream=pp.outstream; + } + + Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; + + args2=new ArrayList<String>(); + args2.add("in1"); + args2.add("in2"); + args2.add("out1"); + args2.add("out2"); + args2.add("groups"); + args2.add("ecco=f"); + args2.add("rename=f"); + args2.add("shortname=f"); + args2.add("unpair=f"); + args2.add("repair=f"); + args2.add("namesort=f"); + args2.add("overwrite=t"); + + String gString="auto"; + for(int i=0; i<args.length; i++){ + String arg=args[i]; + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + + if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out") || a.equals("out1")){ + out1=b; + }else if(a.equals("out2")){ + out2=b; + }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){ + gString=b; + }else if(a.equals("delete") || a.equals("deletetemp")){ + delete=Parse.parseBoolean(b); + }else if(a.equals("deleteinput")){ + deleteInput=Parse.parseBoolean(b); + }else if(a.equals("usetmpdir")){ + useTmpdir=Parse.parseBoolean(b); + }else if(a.equals("ecco")){ + ecco=Parse.parseBoolean(b); + }else if(a.equals("compresstemp") || a.equals("ct")){ + if(b!=null && b.equalsIgnoreCase("auto")){forceCompressTemp=forceRawTemp=false;} + else{ + forceCompressTemp=Parse.parseBoolean(b); + forceRawTemp=!forceCompressTemp; + } + }else if(a.equals("tmpdir")){ + Shared.setTmpdir(b); + }else if(a.equals("rename") || a.equals("addname")){ + addName=Parse.parseBoolean(b); + }else if(a.equals("shortname") || a.equals("shortnames")){ + shortName=b; + }else if(a.equals("seed")){ + KmerComparator.defaultSeed=Long.parseLong(b); + }else if(a.equals("hashes")){ + KmerComparator.setHashes(Integer.parseInt(b)); + }else if(a.equals("passes")){ + passes=Integer.parseInt(b); + args2.add(arg); +// }else if(a.equals("k")){ +// k=Integer.parseInt(b); +// args2.add(arg); + }else if(a.equals("border")){ + KmerComparator.defaultBorder=Integer.parseInt(b); + } + + else if(a.equals("unpair")){ + unpair=Parse.parseBoolean(b); + }else if(a.equals("repair")){ + repair=Parse.parseBoolean(b); + }else if(a.equals("namesort") || a.equals("sort")){ + namesort=Parse.parseBoolean(b); + }else if(a.equals("overwrite")){ + overwrite=Parse.parseBoolean(b); + }else if(a.equals("v1") || a.equals("kmersort1")){ + boolean x=Parse.parseBoolean(b); + if(x){V2=V3=false;} + }else if(a.equals("v2") || a.equals("kmersort2")){ + V2=Parse.parseBoolean(b); + if(V2){V3=false;} + }else if(a.equals("v3") || a.equals("kmersort3")){ + V3=Parse.parseBoolean(b); + if(V3){V2=false;} + }else if(a.equals("fetchthreads")){ + KmerSort3.fetchThreads=Integer.parseInt(b); + assert(KmerSort3.fetchThreads>0) : KmerSort3.fetchThreads+"\nFetch threads must be at least 1."; + } + + else if(a.equals("comparesequence")){ + KmerComparator.compareSequence=Parse.parseBoolean(b); + }else if(a.equals("allowadjacenttiles") || a.equals("spantiles")){ + ReadKey.spanTilesX=ReadKey.spanTilesY=Parse.parseBoolean(b); + }else if(a.equals("spanx") || a.equals("spantilesx")){ + ReadKey.spanTilesX=Parse.parseBoolean(b); + }else if(a.equals("spany") || a.equals("spantilesy")){ + ReadKey.spanTilesY=Parse.parseBoolean(b); + }else if(a.equals("spanadjacent") || a.equals("spanadjacentonly") || a.equals("adjacentonly") || a.equals("adjacent")){ + ReadKey.spanAdjacentOnly=Parse.parseBoolean(b); + } + +// else if(a.equals("repair")){ +// repair=Parse.parseBoolean(b); +// }else if(a.equals("namesort") || a.equals("sort")){ +// namesort=Parse.parseBoolean(b); +// } + + else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Parse.parseBoolean(b); + System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("cq") || a.equals("changequality")){ + BBMerge.changeQuality=Read.CHANGE_QUALITY=Parse.parseBoolean(b); + }else if(a.equals("quantize") || a.equals("quantizesticky")){ + quantizeQuality=Quantizer.parse(arg, a, b); + }else if(a.equals("lowcomplexity")){ + lowComplexity=Parse.parseBoolean(b); + } + + else if(Clump.parseStatic(arg, a, b)){ + //Do nothing + }else if(Parser.parseQuality(arg, a, b)){ + //Do nothing + } + + else{ + args2.add(arg); + } + } + + Clump.setXY(); + + KmerSplit.quantizeQuality=KmerSort1.quantizeQuality=quantizeQuality; + + Parser.processQuality(); + + assert(!unpair || !KmerComparator.mergeFirst) : "Unpair and mergefirst may not be used together."; + + if(in1==null){throw new RuntimeException("\nOne input file is required.\n");} + + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } + + //Ensure input files can be read + if(!Tools.testInputFiles(false, true, in1)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + +// assert(false) : ReadKey.spanTiles()+", "+ReadKey.spanTilesX+", "+ReadKey.spanTilesY+", "+Clump.sortX+", "+Clump.sortY; + + autoSetGroups(gString); + + if((in2!=null || out2!=null) && groups>1){FASTQ.FORCE_INTERLEAVED=true;} //Fix for crash with twin fasta files + } + + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Create read streams and process all data */ + public void process(Timer t){ + String[] args=args2.toArray(new String[0]); + args[4]="groups="+groups; + + useSharedHeader=(FileFormat.hasSamOrBamExtension(in1) && out1!=null + && FileFormat.hasSamOrBamExtension(out1)); + + if(groups==1){ + args[0]="in1="+in1; + args[1]="in2="+in2; + args[2]="out1="+out1; + args[3]="out2="+out2; + args[5]="ecco="+ecco; + args[6]="rename="+addName; + args[7]="shortname="+shortName; + args[8]="unpair="+unpair; + args[9]="repair="+repair; + args[10]="namesort="+namesort; + args[11]="ow="+overwrite; + KmerSort1.main(args); + }else{ + String pin1=in1, pin2=in2, temp; + final int conservativePasses=Clump.conservativeFlag ? passes : Tools.max(1, passes/2); + if(passes>1){Clump.setConservative(true);} + long fileMem=-1; + for(int pass=1; pass<=passes; pass++){ + if(/*passes>1 &&*/ (V2 || V3)){ +// System.err.println("Running pass with fileMem="+fileMem); +// out=(pass==passes ? out1 : getTempFname("clumpify_p"+(pass+1)+"_temp%_")); + temp=getTempFname("clumpify_p"+(pass+1)+"_temp%_"); + if(pass==passes){ + fileMem=runOnePass_v2(args, pass, pin1, pin2, out1, out2, fileMem); + }else{ + fileMem=runOnePass_v2(args, pass, pin1, pin2, temp, null, fileMem); + } +// System.err.println("New fileMem="+fileMem); + }else{ +// out=(pass==passes ? out1 : getTempFname("clumpify_temp_pass"+pass+"_")); + temp=getTempFname("clumpify_temp_pass"+pass+"_"); + if(pass==passes){ + runOnePass(args, pass, pin1, pin2, out1, out2); + }else{ + runOnePass(args, pass, pin1, pin2, temp, null); + } + } + pin1=temp; + pin2=null; + KmerComparator.defaultBorder=Tools.max(0, KmerComparator.defaultBorder-1); + KmerComparator.defaultSeed++; + if(pass>=conservativePasses){Clump.setConservative(false);} + } + } + + if(deleteInput && !sharedErrorState && out1!=null && in1!=null){ + try { + new File(in1).delete(); + if(in2!=null){new File(in2).delete();} + } catch (Exception e) { + System.err.println("WARNING: Failed to delete input files."); + } + } + + t.stop(); + System.err.println("Total time: \t"+t); + + } + + private void runOnePass(String[] args, int pass, String in1, String in2, String out1, String out2){ + assert(groups>1); + if(pass>1){ + ecco=false; + shortName="f"; + addName=false; + } + + String temp=getTempFname("clumpify_p"+pass+"_temp%_"); + + String temp2=temp.replace("%", "FINAL"); + final boolean externalSort=(pass==passes && (repair || namesort)); + + args[0]="in1="+in1; + args[1]="in2="+in2; + args[2]="out="+temp; + args[3]="out2="+null; + args[5]="ecco="+ecco; + args[6]="addname=f"; + args[7]="shortname="+shortName; + args[8]="unpair="+unpair; + args[9]="repair=f"; + args[10]="namesort=f"; + args[11]="ow="+overwrite; + KmerSplit.maxZipLevel=2; + KmerSplit.main(args); + + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false; + FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT; + + args[0]="in="+temp; + args[1]="in2="+null; + args[2]="out="+(externalSort ? temp2 : out1); + args[3]="out2="+(externalSort ? "null" : out2); + args[5]="ecco=f"; + args[6]="addname="+addName; + args[7]="shortname=f"; + args[8]="unpair=f"; + args[9]="repair="+(repair && externalSort); + args[10]="namesort="+(namesort && externalSort); + args[11]="ow="+overwrite; + if(unpair){ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + KmerSort1.main(args); + + if(delete){ + for(int i=0; i<groups; i++){ + new File(temp.replaceFirst("%", ""+i)).delete(); + } + if(pass>1){ + assert(in2==null); + new File(in1).delete(); + } + } + + if(externalSort){ + outstream.println(); + String[] sortArgs=new String[] {"in="+temp2, "out="+out1, "ow="+overwrite}; + if(out2!=null){sortArgs=new String[] {"in="+temp2, "out="+out1, "out2="+out2, "ow="+overwrite};} + SortByName.main(sortArgs); + if(delete){new File(temp2).delete();} + } + } + + private long runOnePass_v2(String[] args, int pass, String in1, String in2, String out1, String out2, long fileMem){ + assert(groups>1); + if(pass>1){ + ecco=false; + shortName="f"; + addName=false; + } + + String temp=getTempFname("clumpify_p"+pass+"_temp%_"); + +// String temp2=temp.replace("%", "FINAL"); + String namesorted=temp.replace("%", "namesorted_%"); + final boolean externalSort=(pass==passes && (repair || namesort)); + + if(pass==1){ + args[0]="in1="+in1; + args[1]="in2="+in2; + args[2]="out="+temp; + args[3]="out2="+null; + args[5]="ecco="+ecco; + args[6]="addname=f"; + args[7]="shortname="+shortName; + args[8]="unpair="+unpair; + args[9]="repair=f"; + args[10]="namesort=f"; + args[11]="ow="+overwrite; + KmerSplit.maxZipLevel=2; + KmerSplit.main(args); + fileMem=KmerSplit.lastMemProcessed; + + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=false; + FASTQ.ASCII_OFFSET=FASTQ.ASCII_OFFSET_OUT; + } + + args[0]="in1="+(pass==1 ? temp : in1); + args[1]="in2="+null; + args[2]="out="+(externalSort ? namesorted : out1); + args[3]="out2="+(externalSort ? "null" : out2); + args[5]="ecco=f"; + args[6]="addname="+addName; + args[7]="shortname=f"; + args[8]="unpair=f"; + args[9]="repair="+(repair && externalSort); + args[10]="namesort="+(namesort && externalSort); + args[11]="ow="+overwrite; + if(unpair){ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + if(externalSort){ + KmerSort.doHashAndSplit=false; + } + if(V3){ + KmerSort3.main(fileMem, pass, passes, args); + if(fileMem<1){fileMem=KmerSort3.lastMemProcessed;} + }else{KmerSort2.main(args);} + + if(delete){ + for(int i=0; i<groups; i++){ + new File((pass==1 ? temp : in1).replaceFirst("%", ""+i)).delete(); + } + } + + if(externalSort){ + outstream.println(); + + ArrayList<String> names=new ArrayList<String>(); + for(int i=0; i<groups; i++){ + names.add(namesorted.replaceFirst("%", ""+i)); + } + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + + ReadWrite.USE_PIGZ=true; + ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6); + FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; + FileFormat dest=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, false, false); + FileFormat dest2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, false, false); + SortByName.mergeAndDump(names, /*null, */dest, dest2, delete, useSharedHeader, false, outstream, 1000); + } + +// if(externalSort){ +// outstream.println(); +// SortByName.main(new String[] {"in="+temp2, "out="+out, "ow="+overwrite}); +// if(delete){new File(temp2).delete();} +// } + return fileMem; + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + private void autoSetGroups(String s) { + if(s==null || s.equalsIgnoreCase("null")){return;} + if(Tools.isDigit(s.charAt(0))){ + groups=Integer.parseInt(s); + return; + } + assert(s.equalsIgnoreCase("auto")) : "Unknown groups setting: "+s; + + final long maxMem=Shared.memAvailable(1); + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, false, false); + if(ff1==null || ff1.stdio()){return;} + +// outstream.println("in1="+in1+", overhead="+(0.5*(ReadKey.overhead+Clump.overhead))); + + double[] estimates=Tools.estimateFileMemory(in1, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity); + if(in2!=null){ + double[] estimates2=Tools.estimateFileMemory(in2, 1000, 0.5*(ReadKey.overhead+Clump.overhead), true, lowComplexity); + estimates[0]+=estimates2[0]; + estimates[1]+=estimates2[1]; + estimates[4]+=estimates2[4]; + } + +// outstream.println(Arrays.toString(estimates)); + + double memEstimate=estimates==null ? 0 : estimates[0]; + double diskEstimate=estimates==null ? 0 : estimates[1]; + double readEstimate=estimates==null ? 0 : estimates[4]; + double worstCase=memEstimate*1.5; + +// outstream.println("Raw Disk Size Estimate: "+(long)(diskEstimate/(1024*1024))+" MB"); + outstream.println("Read Estimate: "+(long)(readEstimate)); + outstream.println("Memory Estimate: "+(long)(memEstimate/(1024*1024))+" MB"); + outstream.println("Memory Available: "+(maxMem/(1024*1024))+" MB"); + + if(maxMem>worstCase && readEstimate<Integer.MAX_VALUE){ + groups=1; + }else{ + groups=Tools.max(11, (int)(3+(3*worstCase/maxMem)*(V3 ? KmerSort3.fetchThreads : 2)), (int)((2*readEstimate)/Integer.MAX_VALUE))|1; + } + outstream.println("Set groups to "+groups); + } + + private String getTempFname(String core){ +// outstream.println(core); + String temp; + String path="", extension=".fq"; + if(out1!=null){ + core=ReadWrite.stripToCore(out1)+"_"+core; + path=ReadWrite.getPath(out1); + extension=ReadWrite.getExtension(out1); + } + + if(useTmpdir && Shared.tmpdir()!=null){ + temp=Shared.tmpdir()+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension; + }else{ + temp=path+core+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension; + } +// assert(false) : path+", "+temp+", "+core+", "+out1; + + String comp=ReadWrite.compressionType(temp); + if(comp!=null){comp=".gz";} //Prevent bz2 temp files which cause a crash + + if(forceCompressTemp && comp==null){ + temp+=".gz"; + }else if(comp!=null && forceRawTemp){ + temp=temp.substring(0, temp.lastIndexOf('.')); + } + if(temp.endsWith(".bz2")){temp=temp.substring(0, temp.length()-4);} //Prevent bz2 temp files which cause a crash + +// outstream.println(temp); + return temp; + } + + public static void shrinkName(Read r) { + if(r==null){return;} + String s=r.id; + if(s.contains("HISEQ")){s=s.replace("HISEQ", "H");} + if(s.contains("MISEQ")){ + s=s.replace("MISEQ", "M"); + } + if(s.contains(":000000000-")){ + s=s.replace(":000000000-", ":"); + } + r.id=s; + } + + public static void shortName(Read r) { + ByteBuilder sb=new ByteBuilder(14); + long x=r.numericID|1; + + while(x<1000000000L){ + x*=10; + sb.append('0'); + } + sb.append(r.numericID); + +// while(x<0x10000000L){ +// x*=16; +// sb.append('0'); +// } +// sb.append(Long.toHexString(r.numericID)); + + sb.append(r.pairnum()==0 ? " 1:" : " 2:"); + r.id=sb.toString(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private boolean lowComplexity=false; + + private boolean quantizeQuality=false; + private Random randy=new Random(); + private int groups=31; + private int passes=1; + private boolean ecco=false; + private boolean addName=false; + private String shortName="f"; + private boolean useTmpdir=false; + private boolean delete=true; + private boolean deleteInput=false; + private boolean useSharedHeader=false; + private boolean forceCompressTemp=false; + private boolean forceRawTemp=false; + private boolean overwrite=true; + + private boolean unpair=false; + private boolean repair=false; + private boolean namesort=false; + private boolean V2=false; + private boolean V3=true; + + private String in1=null; + private String in2=null; + private String out1=null; + private String out2=null; + + ArrayList<String> args2=new ArrayList<String>(); + private PrintStream outstream=System.err; + + public static boolean sharedErrorState=false; + +}