Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/MergeRibo.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/MergeRibo.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,732 @@ +package prok; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentLinkedQueue; + +import aligner.SingleStateAlignerFlat2; +import consensus.BaseGraph; +import fileIO.ByteFile; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.ReadStats; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import stream.ConcurrentReadInputStream; +import stream.ConcurrentReadOutputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; +import structures.IntHashSet; +import structures.ListNum; +import tax.GiToTaxid; +import template.Accumulator; +import template.ThreadWaiter; + +/** + * Picks one ribosomal (16S) sequence per taxID. + * + * @author Brian Bushnell + * @date November 19, 2015 + * + */ +public class MergeRibo implements Accumulator<MergeRibo.ProcessThread> { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + MergeRibo x=new MergeRibo(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public MergeRibo(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + //Set shared static variables prior to parsing + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); +// Shared.capBufferLen(40);//This does not help; the slowness comes from unevenness in list length during pickBest. + //To fix it, long lists should be sorted to be first. + + BaseGraph.MAF_sub=0.251f; + BaseGraph.MAF_del=0.0f; + BaseGraph.MAF_ins=0.0f; + BaseGraph.MAF_noref=0.0f; + BaseGraph.trimDepthFraction=0.3f; + BaseGraph.trimNs=true; + + {//Parse the arguments + final Parser parser=parse(args); + Parser.processQuality(); + + maxReads=parser.maxReads; + overwrite=ReadStats.overwrite=parser.overwrite; + append=ReadStats.append=parser.append; + + extin=parser.extin; + + out1=parser.out1; + extout=parser.extout; + } + + validateParams(); + adjustInterleaving(); //Make sure interleaving agrees with number of input and output files + checkFileExistence(); //Ensure files can be read and written + checkStatics(); //Adjust file-related static fields as needed for this program + + //Create output FileFormat objects + ffout1=FileFormat.testOutput(out1, FileFormat.FASTA, extout, true, overwrite, append, ordered); + + //Create input FileFormat objects + ffin=new ArrayList<FileFormat>(in.size()); + ffalt=FileFormat.testInput(alt, FileFormat.FASTA, extin, true, true); + for(String s : in){ + FileFormat ff=FileFormat.testInput(s, FileFormat.FASTA, extin, true, true); + ffin.add(ff); + } + + //Determine how many threads may be used + threads=Shared.threads(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Helpers ----------------*/ + /*--------------------------------------------------------------*/ + + /** Parse arguments from the command line */ + private Parser parse(String[] args){ + + //Create a parser object + Parser parser=new Parser(); + + //Set any necessary Parser defaults here + //parser.foo=bar; + + //Parse each argument + for(int i=0; i<args.length; i++){ + String arg=args[i]; + + //Break arguments into their constituent parts, in the form of "a=b" + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); + }else if(a.equals("ordered")){ + ordered=Parse.parseBoolean(b); + }else if(a.equals("consensus")){ + useConsensus=Parse.parseBoolean(b); + }else if(a.equals("best")){ + useConsensus=!Parse.parseBoolean(b); + }else if(a.equals("fast")){ + fast=Parse.parseBoolean(b); + }else if(a.equals("minid")){ + minID=Float.parseFloat(b); + }else if(a.equals("maxns")){ + maxns=Integer.parseInt(b); + }else if(a.equals("minlen")){ + minlen=Integer.parseInt(b); + }else if(a.equals("maxlen")){ + maxlen=Integer.parseInt(b); + }else if(a.equals("in")){ + Tools.addFiles(b, in); + }else if(a.equals("alt")){ + alt=b; + }else if(a.equalsIgnoreCase("process16S") || a.equalsIgnoreCase("16S")){ + process16S=Parse.parseBoolean(b); + process18S=!process16S; + }else if(a.equalsIgnoreCase("process18S") || a.equalsIgnoreCase("18S")){ + process18S=Parse.parseBoolean(b); + process16S=!process18S; + }else if(a.equals("parse_flag_goes_here")){ + long fake_variable=Parse.parseKMG(b); + //Set a variable here + }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser + //do nothing + }else if(b==null && new File(arg).exists()){ + in.add(arg); + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + } + } + assert(!in.isEmpty()) : "No input file."; + return parser; + } + + /** Ensure files can be read and written */ + private void checkFileExistence(){ + //Ensure output files can be written + if(!Tools.testOutputFiles(overwrite, append, false, out1)){ + outstream.println((out1==null)+", "+out1); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n"); + } + + //Ensure input files can be read + if(!Tools.testInputFiles(false, true, in)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + +// //Ensure that no file was specified multiple times +// if(!Tools.testForDuplicateFiles(true, out1, in.toArray(new String[0]))){ +// throw new RuntimeException("\nSome file names were specified multiple times.\n"); +// } + } + + /** Make sure interleaving agrees with number of input and output files */ + private void adjustInterleaving(){ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + /** Adjust file-related static fields as needed for this program */ + private static void checkStatics(){ + //Adjust the number of threads for input file reading + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + + assert(FastaReadInputStream.settingsOK()); + } + + /** Ensure parameter ranges are within bounds and required parameters are set */ + private boolean validateParams(){ +// assert(minfoo>0 && minfoo<=maxfoo) : minfoo+", "+maxfoo; +// assert(false) : "TODO"; + assert(process16S || process18S) : "16S or 18S must be selected."; + assert(!process16S || !process18S) : "16S or 18S are both selected; only one may be active."; + return true; + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Create read streams and process all data */ + void process(Timer t){ + + if(process16S){ + Read[] data=ProkObject.loadConsensusSequenceType("16S", true, true); + consensus16S=data[0].bases; + if(verbose){System.err.println("process16S: Loaded 16S consensus, length "+consensus16S.length+": "+new String(consensus16S));} + } + if(process18S){ + Read[] data=ProkObject.loadConsensusSequenceType("18S", true, true); + consensus18S=data[0].bases; + if(verbose){System.err.println("process18S: Loaded 18S consensus, length "+consensus18S.length+": "+new String(consensus18S));} + } + + //Turn off read validation in the input threads to increase speed + final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; + Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; + + //Reset counters + readsProcessed=readsOut=0; + basesProcessed=basesOut=0; + + //Align everything to global consensus + for(FileFormat ff : ffin) { + //Create a read input stream + final ConcurrentReadInputStream cris=makeCris(ff); + + //Process the reads in separate threads + spawnThreads(cris, false); + errorState|=ReadWrite.closeStream(cris); + } + + if(ffalt!=null){ + //Create a read input stream + final ConcurrentReadInputStream cris=makeCris(ffalt); + + //Process the reads in separate threads + spawnThreads(cris, true); + errorState|=ReadWrite.closeStream(cris); + } + +// queue=new ConcurrentLinkedQueue<ArrayList<Ribo>>(); +// for(Entry<Integer, ArrayList<Ribo>> e : listMap.entrySet()){ +// queue.add(e.getValue()); +// } +// listMap=null; + queue=makeQueue(); + + //Run a second pass to pick the best SSU per taxID + spawnThreads(null, false); + + //Do anything necessary after processing + if(ffout1!=null){ + //Optionally create a read output stream + final ConcurrentReadOutputStream ros=makeCros(); + long num=0; + for(Ribo ribo : bestList){ + Read r=ribo.r; + readsOut++; + basesOut+=r.length(); + ArrayList<Read> list=new ArrayList<Read>(1); + list.add(r); + ros.add(list, num); + num++; + } + //Close the read streams + errorState|=ReadWrite.closeStream(ros); + } + + if(verbose){outstream.println("Finished; closing streams.");} + + //Write anything that was accumulated by ReadStats + errorState|=ReadStats.writeAll(); + + //Reset read validation + Read.VALIDATE_IN_CONSTRUCTOR=vic; + + //Report timing and results + t.stop(); + outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); + outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); + + //Throw an exception of there was an error in a thread + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + private ConcurrentLinkedQueue<ArrayList<Ribo>> makeQueue(){ + ArrayList<ArrayList<Ribo>> listList=new ArrayList<ArrayList<Ribo>>(listMap.size()); + for(Entry<Integer, ArrayList<Ribo>> e : listMap.entrySet()){ + listList.add(e.getValue()); + } + listMap=null; + Collections.sort(listList, new ListComparator()); + assert(listList.isEmpty() || listList.get(0).size()>=listList.get(listList.size()-1).size()); + ConcurrentLinkedQueue<ArrayList<Ribo>> q=new ConcurrentLinkedQueue<ArrayList<Ribo>>(); + for(ArrayList<Ribo> x : listList){ + q.add(x); + } + return q; + } + + private ConcurrentReadInputStream makeCris(FileFormat ff){ + ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null); + cris.start(); //Start the stream + if(verbose){outstream.println("Started cris");} + boolean paired=cris.paired(); + assert(!paired) : "This should not be paired input."; + return cris; + } + + private ConcurrentReadOutputStream makeCros(){ + if(ffout1==null){return null;} + + //Select output buffer size based on whether it needs to be ordered + final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); + + final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false); + ros.start(); //Start the stream + return ros; + } + + /*--------------------------------------------------------------*/ + /*---------------- Thread Management ----------------*/ + /*--------------------------------------------------------------*/ + + /** Spawn process threads */ + private void spawnThreads(final ConcurrentReadInputStream cris, boolean altData){ + + //Do anything necessary prior to processing + + //Fill a list with ProcessThreads + if(verbose){System.err.println("Spawning "+threads+" threads.");} + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){ + alpt.add(new ProcessThread(cris, i, altData)); + } + + //Start the threads and wait for them to finish + boolean success=ThreadWaiter.startAndWait(alpt, this); + if(verbose){System.err.println("Threads finished with success="+success+".");} + errorState&=!success; + } + + @Override + public final void accumulate(ProcessThread pt){ + readsProcessed+=pt.readsProcessedT; + basesProcessed+=pt.basesProcessedT; + errorState|=(!pt.success); + } + + @Override + public final boolean success(){return !errorState;} + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /*--------------------------------------------------------------*/ + /*---------------- Inner Classes ----------------*/ + /*--------------------------------------------------------------*/ + + /** This class is static to prevent accidental writing to shared variables. + * It is safe to remove the static modifier. */ + class ProcessThread extends Thread { + + //Constructor + ProcessThread(final ConcurrentReadInputStream cris_, final int tid_, boolean alt_){ + cris=cris_; + tid=tid_; + processInput=(cris!=null); + altData=alt_; + } + + //Called by start() + @Override + public void run(){ + //Do anything necessary prior to processing + + if(processInput){ + //Process the reads + processInner(); + }else{ + pickBest(); + } + + //Do anything necessary after processing + + //Indicate successful exit status + success=true; + } + + /** Iterate through the reads */ + void processInner(){ + if(verbose && tid==0){System.err.println("processInner() for tid="+tid);} + + //Grab the first ListNum of reads + ListNum<Read> ln=cris.nextList(); + + //Check to ensure pairing is as expected + if(ln!=null && !ln.isEmpty()){ + Read r=ln.get(0); +// assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access + } + + //As long as there is a nonempty read list... + while(ln!=null && ln.size()>0){ +// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access + + processInput(ln); + + //Notify the input stream that the list was used + cris.returnList(ln); +// if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access + + //Fetch a new list + ln=cris.nextList(); + } + + //Notify the input stream that the final list was used + if(ln!=null){ + cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); + } + } + + void processInput(ListNum<Read> ln){ + if(verbose && tid==0){System.err.println("processInput() for tid="+tid);} + + //Grab the actual read list from the ListNum + final ArrayList<Read> reads=ln.list; + + //Loop through each read in the list + for(int idx=0; idx<reads.size(); idx++){ + final Read r1=reads.get(idx); + + //Validate reads in worker threads + if(!r1.validated()){r1.validate(true);} + + //Track the initial length for statistics + final int initialLength1=r1.length(); + + //Increment counters + readsProcessedT++; + basesProcessedT+=initialLength1; + + processRead(r1); + } + } + + void pickBest(){ + if(verbose && tid==0){System.err.println("pickBest() for tid="+tid);} + for(ArrayList<Ribo> list=queue.poll(); list!=null; list=queue.poll()){ + Ribo best=pickBest(list); + list.clear(); + synchronized(bestList){ + bestList.add(best); + } + } + } + + Ribo pickBest(ArrayList<Ribo> list){ + if(verbose && tid==0){System.err.println("pickBest(list[="+list.size()+"]) for tid="+tid);} + assert(list!=null && list.size()>0); + if(list.size()==1){return list.get(0);} + Collections.sort(list); + Collections.reverse(list); + assert(list.get(0).product>=list.get(1).product); + if(list.size()<3 || fast){return list.get(0);} + + Ribo base=list.get(0); + int pad=Tools.max(10, (1600-base.r.length())); + BaseGraph bg=new BaseGraph(base.r.name(), base.r.bases, base.r.quality, base.r.numericID, pad); + for(Ribo r : list){ + bg.alignAndGenerateMatch(r.r, ssa); + } + Read consensus=bg.traverse(); + Ribo best; + if(useConsensus){ + best=new Ribo(consensus, base.tid, 1); + }else{ + for(Ribo r : list){ + float id=align(r.r.bases, consensus.bases); + r.identity=id; + r.product=score(r.length(), r.identity); + } + Collections.sort(list); + Collections.reverse(list); + assert(list.get(0).product>=list.get(1).product); + best=list.get(0); + } + return best; + } + + /** + * Process a read or a read pair. + * @return True if the reads should be kept, false if they should be discarded. + */ + void processRead(final Read r){ + if(verbose && tid==0){System.err.println("processRead()");} + if(r.length()<minlen || r.length()>maxlen){return;} + if(maxns>=0 && r.countNocalls()>maxns){return;} + Integer key=GiToTaxid.parseTaxidNumber(r.id, '|'); + if(verbose && tid==0){System.err.println("key="+key);} + if(key==null || key==-1 || (altData && seenTaxID.contains(key))){return;} + float id=align(r); + if(id<minID){return;} + Ribo ribo=new Ribo(r, key, id); + + synchronized(listMap){ + ArrayList<Ribo> list=listMap.get(key); + if(list==null){ + list=new ArrayList<Ribo>(8); + listMap.put(key, list); + } + list.add(ribo); + if(!altData){seenTaxID.add(key);} + } + } + + float align(Read r){ + float a=(process16S ? align(r.bases, consensus16S) : 0); + float b=(process18S ? align(r.bases, consensus18S) : 0); + if(verbose && tid==0){System.err.println("Aligned; a="+a+", b="+b);} + return Tools.max(a, b); + } + + float align(byte[] query, byte[] ref){ + int a=0, b=ref.length-1; + int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); + if(max==null){return 0;} + + final int rows=max[0]; + final int maxCol=max[1]; + final int maxState=max[2]; + final float id=ssa.tracebackIdentity(query, ref, a, b, rows, maxCol, maxState, null); + return id; + } + + SingleStateAlignerFlat2 ssa=new SingleStateAlignerFlat2(); + + /** Number of reads processed by this thread */ + protected long readsProcessedT=0; + /** Number of bases processed by this thread */ + protected long basesProcessedT=0; + + /** True only if this thread has completed successfully */ + boolean success=false; + + /** Shared input stream */ + private final ConcurrentReadInputStream cris; + /** Thread ID */ + final int tid; + + //Run mode + final boolean processInput; + final boolean altData; + } + + private class Ribo implements Comparable<Ribo>{ + Ribo(Read r_, int tid_, float identity_){ + r=r_; + tid=tid_; + identity=identity_; + product=score(r.length(), identity); + } + + @Override + public int compareTo(Ribo b) { + if(b.product>product){return -1;} + else if(b.product<product){return 1;} + else if(b.r.length()>r.length()){return -1;} + else if(b.r.length()<r.length()){return 1;} + return 0; + } + + int length(){return r.length();} + + Read r; + int tid; + float identity; + float product; + } + + private class ListComparator implements Comparator<ArrayList<Ribo>> { + + @Override + public int compare(ArrayList<Ribo> a, ArrayList<Ribo> b) { + return a.size()>b.size() ? -1 : a.size()<b.size() ? 1 : 0; + } + + } + + private float lengthMult(int len){ + int idealLength=idealLength(); + int max=Tools.max(len, idealLength, 1); + int min=Tools.min(len, idealLength); + return min/(float)max; + } + + private float score(int len, float identity){ + return lengthMult(len)*identity; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file path */ + private ArrayList<String> in=new ArrayList<String>(); + + /** Alternate input file path */ + private String alt=null; + + /** Primary output file path */ + private String out1=null; + + /** Override input file extension */ + private String extin=null; + /** Override output file extension */ + private String extout=null; + + ArrayList<Ribo> bestList=new ArrayList<Ribo>(); + HashMap<Integer, ArrayList<Ribo>> listMap=new HashMap<Integer, ArrayList<Ribo>>(100000); + ConcurrentLinkedQueue<ArrayList<Ribo>> queue; + + + IntHashSet seenTaxID=new IntHashSet(1000000); + + byte[] consensus16S; + byte[] consensus18S; + + int idealLength(){ + if(process16S){return consensus16S.length;} + return consensus18S.length; + } + + boolean useConsensus=false; + boolean fast=false; + int maxns=-1; + int minlen=1; + int maxlen=4000; + + /*--------------------------------------------------------------*/ + + /** Number of reads processed */ + protected long readsProcessed=0; + /** Number of bases processed */ + protected long basesProcessed=0; + + /** Number of reads retained */ + protected long readsOut=0; + /** Number of bases retained */ + protected long basesOut=0; + + /** Quit after processing this many input reads; -1 means no limit */ + private long maxReads=-1; + + private float minID=0.62f; + + private boolean process16S=true; + private boolean process18S=false; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file */ + private final ArrayList<FileFormat> ffin; + private final FileFormat ffalt; + + /** Primary output file */ + private final FileFormat ffout1; + + final int threads; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Print status messages to this output stream */ + private PrintStream outstream=System.err; + /** Print verbose messages */ + public static boolean verbose=false; + /** True if an error was encountered */ + public boolean errorState=false; + /** Overwrite existing output files */ + private boolean overwrite=false; + /** Append to existing output files */ + private boolean append=false; + /** Reads are output in input order */ + private boolean ordered=false; + +}