Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamFinder.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamFinder.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,1861 @@ +package icecream; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import aligner.AlignmentResult; +import aligner.FlatAligner; +import aligner.MultiStateAligner9PacBioAdapter2; +import aligner.SingleStateAlignerPacBioAdapter; +import dna.AminoAcid; +import dna.Data; +import fileIO.ByteFile; +import fileIO.ByteStreamWriter; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import json.JsonObject; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.ReadStats; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import shared.TrimRead; +import stream.ConcurrentReadOutputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; +import stream.SamLine; +import structures.ByteBuilder; +import structures.EntropyTracker; + +/** + * Detects inverted repeats in PacBio reads. + * Attempts to determine whether reads are chimeric + * due to a missing adapter. + * + * @author Brian Bushnell + * @date June 5, 2019 + * + */ +public final class IceCreamFinder { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + IceCreamFinder x=new IceCreamFinder(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public IceCreamFinder(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + Parser.setQuality(33); + + //Set shared static variables prior to parsing + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; + SamLine.SET_FROM_OK=true; + Shared.setBufferData(1000000); +// Shared.FASTA_WRAP=511; + Data.USE_SAMBAMBA=false;//Sambamba changes PacBio headers. + Read.CHANGE_QUALITY=false; + EntropyTracker.defaultK=3; + + {//Parse the arguments + final Parser parser=parse(args); + Parser.processQuality(); + + maxReads=parser.maxReads; + overwrite=ReadStats.overwrite=parser.overwrite; + append=ReadStats.append=parser.append; + + in1=parser.in1; + extin=parser.extin; + + if(outg==null){outg=parser.out1;} + extout=parser.extout; + } + + //Determine how many threads may be used + threads=Shared.threads(); + + //Ensure there is an input file + if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} + fixExtensions(); //Add or remove .gz or .bz2 as needed + checkFileExistence(); //Ensure files can be read and written + checkStatics(); //Adjust file-related static fields as needed for this program + + //Create output FileFormat objects + ffoutg=FileFormat.testOutput(outg, FileFormat.FASTQ, extout, true, overwrite, append, false); + ffouta=FileFormat.testOutput(outa, FileFormat.FASTQ, extout, true, overwrite, append, false); + ffoutb=FileFormat.testOutput(outb, FileFormat.FASTQ, extout, true, overwrite, append, false); + ffoutj=FileFormat.testOutput(outj, FileFormat.FASTA, extout, true, overwrite, append, false); + ffstats=FileFormat.testOutput(outstats, FileFormat.TXT, null, false, overwrite, append, false); + ffasrhist=FileFormat.testOutput(asrhist, FileFormat.TXT, null, false, overwrite, append, false); + ffirsrhist=FileFormat.testOutput(irsrhist, FileFormat.TXT, null, false, overwrite, append, false); + + if(!setAmbig && ffouta!=null){ + sendAmbigToGood=sendAmbigToBad=false; + } + + //Create input FileFormat objects + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + + final int alen=(adapter==null ? 20 : adapter.length); + stride=(int)(alen*1.9f); + window=(int)(alen*3.8f+10); + ALIGN_ROWS=alen+1; + ALIGN_COLUMNS=window+2; + + maxSwScore=aligner.MultiStateAligner9PacBioAdapter.maxQuality(alen); + minSwScore=(int)(maxSwScore*adapterRatio2); + minSwScoreSuspect=(int)(maxSwScore*Tools.min(adapterRatio2*suspectRatio, adapterRatio2-((1-suspectRatio)*.2f))); + maxImperfectSwScore=aligner.MultiStateAligner9PacBioAdapter.maxImperfectScore(alen); + suspectMidpoint=(minSwScoreSuspect+minSwScore)/2; + if(adapter==null){alignAdapter=false;} + } + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Helpers ----------------*/ + /*--------------------------------------------------------------*/ + + /** Parse arguments from the command line */ + private Parser parse(String[] args){ + + //Create a parser object + Parser parser=new Parser(); + + //Set any necessary Parser defaults here + //parser.foo=bar; + + //Parse each argument + for(int i=0; i<args.length; i++){ + String arg=args[i]; + + //Break arguments into their constituent parts, in the form of "a=b" + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); + }else if(a.equals("format")){ + if(b==null){ + assert(false) : arg; + }else if(Tools.isDigit(b.charAt(i))){ + format=Integer.parseInt(b); + }else if(b.equalsIgnoreCase("json")){ + format=FORMAT_JSON; + }else if(b.equalsIgnoreCase("text")){ + format=FORMAT_TEXT; + }else{ + assert(false) : arg; + } + assert(format>=1 && format<=2) : arg; + }else if(a.equals("json")){ + boolean x=Parse.parseBoolean(b); + format=(x ? FORMAT_JSON : FORMAT_TEXT); + }else if(a.equals("ss") || a.equals("samstreamer") || a.equals("streamer")){ + if(b!=null && Tools.isDigit(b.charAt(0))){ + ZMWStreamer.useStreamer=true; + assert(Integer.parseInt(b)==1) : "ZMWStreamer threads currently capped at 1."; +// ZMWStreamer.streamerThreads=Tools.max(1, Integer.parseInt(b)); + }else{ + ZMWStreamer.useStreamer=Parse.parseBoolean(b); + } + }else if(a.equals("icecreamonly") || a.equals("ico")){ + filterIceCreamOnly=Parse.parseBoolean(b); + }else if(a.equals("keepshortreads") || a.equals("ksr")){ + keepShortReads=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("keepzmwstogether") || a.equals("kzt") || a.equals("keepreadstogether") || a.equals("krt")){ + keepZMWsTogether=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("samplerate")){ + float f=Float.parseFloat(b); + assert(false) : "TODO"; //TODO + }else if(a.equals("modifyheader") || a.equals("modifyheaders") || a.equals("changeheader") || a.equals("changeheaders")){ + modifyHeader=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("ccs")){ + CCS=Parse.parseBoolean(b); + }else if(a.equals("npad")){ + npad=Integer.parseInt(b); + }else if(a.equals("minlength") || a.equals("minlen")){ + minLengthAfterTrimming=Integer.parseInt(b); + }else if(a.equals("realign")){ + realign=Parse.parseBoolean(b); + }else if(a.equals("aligninverse") || a.equals("alignrc") || a.equals("findicecream")){ + alignRC=Parse.parseBoolean(b); + }else if(a.equals("alignadapter")){ + alignAdapter=Parse.parseBoolean(b); + }else if(a.equals("timeless")){ + timeless=Parse.parseBoolean(b); + }else if(a.equals("flaglongreads")){ + flagLongReads=Parse.parseBoolean(b); + }else if(a.equals("trimreads") || a.equals("trim")){ + trimReads=Parse.parseBoolean(b); + }else if(a.equals("adapter")){ + adapter=b==null ? null : b.getBytes(); + }else if(a.equals("targetqlen") || a.equals("qlen")){ + targetQlen=Integer.parseInt(b); + }else if(a.equals("maxqlenfraction") || a.equals("maxfraction") || a.equals("qlenfraction")){ + maxQlenFraction=Float.parseFloat(b); + }else if(a.equals("junctionfraction") || a.equals("shortfraction")){ + minJunctionFraction=Float.parseFloat(b); + }else if(a.equals("minratio1") || a.equals("ratio1") || a.equals("id1")){ + minRatio1=Float.parseFloat(b); + }else if(a.equals("minratio2") || a.equals("ratio2") || a.equals("id2")){ + minRatio2=Float.parseFloat(b); + }else if(a.equals("minratio") || a.equals("ratio") || a.equals("id")){ + minRatio1=minRatio2=Float.parseFloat(b); + }else if(a.equals("adapterratio") || a.equals("adapterratio1") || a.equals("ratior") || a.equals("ratior1")){ + adapterRatio=Float.parseFloat(b); + }else if(a.equals("adapterratio2") || a.equals("ratior2")){ + adapterRatio2=Float.parseFloat(b); + }else if(a.equals("suspectratio")){ + suspectRatio=Float.parseFloat(b); + }else if(a.equals("minqlen")){ + minQlen=Integer.parseInt(b); + }else if(a.equals("minscore")){ + minScore=Integer.parseInt(b); + }else if(a.equals("parsecustom")){ + parseCustom=Parse.parseBoolean(b); + }else if(a.equals("printtiming") || a.equals("extended")){ + printTiming=Parse.parseBoolean(b); + }else if(a.equals("queuelen") || a.equals("qlen")){ + queuelen=Integer.parseInt(b); + }else if(a.equals("outg") || a.equals("outgood")){ + outg=b; + }else if(a.equals("outa") || a.equals("outambig")){ + outa=b; + }else if(a.equals("outb") || a.equals("outbad")){ + outb=b; + }else if(a.equals("outj") || a.equals("outjunctions") || a.equals("junctions")){ + outj=b; + }else if(a.equals("outs") || a.equals("outstats") || a.equals("stats")){ + outstats=b; + }else if(a.equals("asrhist") || a.equals("ahist")){ + asrhist=b; + }else if(a.equals("irsrhist") || a.equals("irhist")){ + irsrhist=b; + }else if(a.equals("ambig")){ + sendAmbigToGood=sendAmbigToBad=false; + if(b!=null){ + String[] split2=b.split(","); + for(String s2 : split2){ + if(s2.equalsIgnoreCase("good")){sendAmbigToGood=true;} + else if(s2.equalsIgnoreCase("bad") || s2.equalsIgnoreCase("toss")){sendAmbigToBad=true;} + else if(s2.equalsIgnoreCase("ambig")){} + else{assert(false) : "Bad argument: '"+s2+"' in '"+arg+"'; should be good or bad";} + } + } + setAmbig=true; + }else if(a.equalsIgnoreCase("trimpolya")){//Parse standard flags in the parser + trimPolyA=Parse.parseBoolean(b); + }else if(PolymerTrimmer.parse(arg, a, b)){ + //do nothing + }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter") || a.equals("efilter")){ + if(b==null || Character.isLetter(b.charAt(0))){ + if(Parse.parseBoolean(b)){ + entropyCutoff=0.55f; + }else{ + entropyCutoff=-1; + } + }else{ + entropyCutoff=Float.parseFloat(b); + } + }else if(a.equals("entropyblock") || a.equals("entropylength") || a.equals("entropylen") || a.equals("entlen")){ + entropyLength=Parse.parseIntKMG(b); + }else if(a.equals("entropyfraction") || a.equals("entfraction")){ + entropyFraction=Float.parseFloat(b); + }else if(a.equals("monomerfraction") || a.equals("maxmonomerfraction") || a.equals("mmf")){ + maxMonomerFraction=Float.parseFloat(b); + }else if(a.equals("parse_flag_goes_here")){ + long fake_variable=Parse.parseKMG(b); + //Set a variable here + }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser + //do nothing + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + } + } + + return parser; + } + + /** Add or remove .gz or .bz2 as needed */ + private void fixExtensions(){ + in1=Tools.fixExtension(in1); + } + + /** Ensure files can be read and written */ + private void checkFileExistence(){ + + //Ensure output files can be written + if(!Tools.testOutputFiles(overwrite, append, false, outg, outa, outb, outj, outstats, asrhist, irsrhist)){ + outstream.println((outg==null)+", "+(outb==null)+", "+outg+", "+outa+", "+outb+", "+outj+", "+outstats); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+outg+", "+outa+", "+outb+", "+outj+"\n"); + } + + //Ensure input files can be read + if(!Tools.testInputFiles(false, true, in1)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + + //Ensure that no file was specified multiple times + if(!Tools.testForDuplicateFiles(true, in1, outg, outa, outb, outj, outstats, asrhist, irsrhist)){ + throw new RuntimeException("\nSome file names were specified multiple times.\n"); + } + } + + /** Adjust file-related static fields as needed for this program */ + private static void checkStatics(){ + //Adjust the number of threads for input file reading + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + + assert(FastaReadInputStream.settingsOK()); + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Create read streams and process all data */ + void process(Timer t){ + + //Turn off read validation in the input threads to increase speed + final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; + Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; + + //Create a read input stream + ZMWStreamer zstream=new ZMWStreamer(ffin1, Shared.threads(), maxReads, -1); + + //Optionally create read output streams + final ConcurrentReadOutputStream rosg=makeCros(ffoutg); + final ConcurrentReadOutputStream rosa=makeCros(ffouta); + final ConcurrentReadOutputStream rosb=makeCros(ffoutb); + final ConcurrentReadOutputStream rosj=makeCros(ffoutj); + + //Reset counters + readsProcessed=readsOut=0; + basesProcessed=basesOut=0; + junctionsOut=0; + + //Process the reads in separate threads + spawnThreads(zstream, rosg, rosa, rosb, rosj); + + if(verbose){outstream.println("Finished; closing streams.");} + + //Write anything that was accumulated by ReadStats + errorState|=ReadStats.writeAll(); + //Close the read streams + errorState|=ReadWrite.closeStreams(null, rosg, rosa, rosb, rosj); + + //Reset read validation + Read.VALIDATE_IN_CONSTRUCTOR=vic; + + writeScoreRatioHistogram(ffasrhist, adapterScores); + writeScoreRatioHistogram(ffirsrhist, repeatScores); + + //Report timing and results + t.stop(); + + String stats=null; + if(format==FORMAT_TEXT){ + ByteBuilder bb=toText(t); + stats=bb.nl().toString(); + }else if(format==FORMAT_JSON){ + JsonObject jo=toJson(t); + stats=jo.toStringln(); + }else{ + assert(false) : "Bad format: "+format; + } + if(ffstats==null){ + outstream.print(stats); + }else{ + ReadWrite.writeString(stats, outstats); + } + + //Throw an exception of there was an error in a thread + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + private ByteBuilder toText(Timer t){ + ByteBuilder bb=new ByteBuilder(); + + bb.appendln(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); + bb.appendln(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); + + long readsFiltered=readsProcessed-readsOut; + bb.appendln(Tools.numberPercent("Reads Filtered:", readsFiltered, readsFiltered*100.0/(readsProcessed), 3, 8)); + if(trimReads || trimPolyA){ + bb.appendln(Tools.numberPercent("Reads Trimmed:", readsTrimmed, readsTrimmed*100.0/(readsProcessed), 3, 8)); + bb.appendln(Tools.numberPercent("Bases Trimmed:", basesTrimmed, basesTrimmed*100.0/(basesProcessed), 3, 8)); + } + bb.appendln(Tools.number("Total ZMWs:", ZMWs, 8)); + bb.appendln(Tools.numberPercent("Bad ZMWs:", iceCreamZMWs, iceCreamZMWs*100.0/(ZMWs), 3, 8)); + bb.appendln(Tools.numberPercent("Absent Adapter:", missingAdapterZMWs, missingAdapterZMWs*100.0/(ZMWs), 3, 8)); + bb.appendln(Tools.numberPercent("Hidden Adapter:", hiddenAdapterZMWs, hiddenAdapterZMWs*100.0/(ZMWs), 3, 8)); + bb.appendln(Tools.numberPercent("Ambiguous IR:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 3, 8)); +// bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyReads, lowEntropyReads*100.0/(readsProcessed), 3, 8)); + bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyZMWs, lowEntropyZMWs*100.0/(ZMWs), 3, 8)); + + bb.appendln(Tools.number("Avg Score Ratio:", iceCreamRatio/ratiosCounted, 3, 8)); + bb.appendln(Tools.number("Score Cutoff:", minRatio2, 3, 8)); + + if(printTiming){ + bb.appendln("Iterations: "+alignmentIters/1000000+"m"); + bb.appendln("Short Iterations: "+alignmentItersShort/1000000+"m"); + bb.appendln("Elapsed: "+elapsed/1000000+"ms"); + bb.appendln("Elapsed Short: "+elapsedShort/1000000+"ms"); + } + + if(parseCustom){ + { + bb.appendln("\nReads:"); + bb.appendln(Tools.numberPercent("True Positive:", truePositiveReads, truePositiveReads*100.0/(readsProcessed), 2, 8)); + bb.appendln(Tools.numberPercent("True Negative:", trueNegativeReads, trueNegativeReads*100.0/(readsProcessed), 2, 8)); + bb.appendln(Tools.numberPercent("False Positive:", falsePositiveReads, falsePositiveReads*100.0/(readsProcessed), 2, 8)); + bb.appendln(Tools.numberPercent("False Negative:", falseNegativeReads, falseNegativeReads*100.0/(readsProcessed), 2, 8)); + bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousReads, ambiguousReads*100.0/(readsProcessed), 2, 8)); + + double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads); + snr=10*Math.log10(snr); + bb.appendln(Tools.number("SNR:", snr, 2, 8)); + } + { + bb.appendln("\nZMWs:"); + bb.appendln(Tools.numberPercent("True Positive:", truePositiveZMWs, truePositiveZMWs*100.0/(ZMWs), 2, 8)); + bb.appendln(Tools.numberPercent("True Negative:", trueNegativeZMWs, trueNegativeZMWs*100.0/(ZMWs), 2, 8)); + bb.appendln(Tools.numberPercent("False Positive:", falsePositiveZMWs, falsePositiveZMWs*100.0/(ZMWs), 2, 8)); + bb.appendln(Tools.numberPercent("False Negative:", falseNegativeZMWs, falseNegativeZMWs*100.0/(ZMWs), 2, 8)); + bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 2, 8)); + + double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousReads)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs); + snr=10*Math.log10(snr); + bb.appendln(Tools.number("SNR:", snr, 2, 8)); + } + } + return bb; + } + + private JsonObject toJson(Timer t){ + JsonObject jo=new JsonObject(); + long readsFiltered=readsProcessed-readsOut; + + jo.add("Time", t.timeInSeconds()); + jo.add("Reads_Processed", readsProcessed); + jo.add("Bases_Processed", basesProcessed); + jo.add("Reads_Out", readsOut); + jo.add("Bases_Out", basesOut); + jo.add("Reads_Filtered", readsFiltered); + jo.add("Reads_Filtered_Pct", readsFiltered*100.0/(readsProcessed)); + if(trimReads){ + jo.add("Reads_Trimmed", readsTrimmed); + jo.add("Reads_Trimmed_Pct", readsTrimmed*100.0/(readsProcessed)); + jo.add("Bases_Trimmed", basesTrimmed); + jo.add("Bases_Trimmed_Pct", basesTrimmed*100.0/(basesProcessed)); + } + jo.add("Total_ZMWs", ZMWs); + jo.add("Bad_ZMWs", iceCreamZMWs); + jo.add("Bad_ZMWs_Pct", iceCreamZMWs*100.0/(ZMWs)); + jo.add("Absent_Adapter", missingAdapterZMWs); + jo.add("Absent_Adapter_Pct", missingAdapterZMWs*100.0/(ZMWs)); + jo.add("Hidden_Adapter", hiddenAdapterZMWs); + jo.add("Hidden_Adapter_Pct", hiddenAdapterZMWs*100.0/(ZMWs)); +// jo.add("Low_Entropy", lowEntropyReads); +// jo.add("Low_Entropy_Pct", lowEntropyReads*100.0/(readsProcessed)); + jo.add("Low_Entropy", lowEntropyZMWs); + jo.add("Low_Entropy_Pct", lowEntropyZMWs*100.0/(ZMWs)); + jo.add("Ambiguous_Inverted_Repeat", ambiguousZMWs); + jo.add("Ambiguous_Inverted_Repeat_Pct", ambiguousZMWs*100.0/(ZMWs)); + jo.add("Avg_Score_Ratio_IR", iceCreamRatio/ratiosCounted); + jo.add("Score_Cutoff_IR", minRatio2); + + jo.add("Alignment_Iterations_IR", alignmentIters); + jo.add("Short_Alignment_Iterations_IR", alignmentItersShort); + + if(parseCustom){ + { + double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads); + snr=10*Math.log10(snr); + jo.add("TP_Reads", truePositiveReads); + jo.add("TN_Reads", trueNegativeReads); + jo.add("FP_Reads", falsePositiveReads); + jo.add("FN_Reads", falseNegativeReads); + jo.add("AM_Reads", ambiguousReads); + + jo.add("TP_Reads_Pct", truePositiveReads*100.0/(readsProcessed)); + jo.add("TN_Reads_Pct", trueNegativeReads*100.0/(readsProcessed)); + jo.add("FP_Reads_Pct", falsePositiveReads*100.0/(readsProcessed)); + jo.add("FN_Reads_Pct", falseNegativeReads*100.0/(readsProcessed)); + jo.add("AM_Reads_Pct", ambiguousReads*100.0/(readsProcessed)); + + jo.add("SNR_Reads", snr); + } + { + double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousZMWs)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs); + snr=10*Math.log10(snr); + jo.add("TP_ZMWs", truePositiveZMWs); + jo.add("TN_ZMWs", trueNegativeZMWs); + jo.add("FP_ZMWs", falsePositiveZMWs); + jo.add("FN_ZMWs", falseNegativeZMWs); + jo.add("AM_ZMWs", ambiguousZMWs); + + jo.add("TP_ZMWs_Pct", truePositiveZMWs*100.0/(ZMWs)); + jo.add("TN_ZMWs_Pct", trueNegativeZMWs*100.0/(ZMWs)); + jo.add("FP_ZMWs_Pct", falsePositiveZMWs*100.0/(ZMWs)); + jo.add("FN_ZMWs_Pct", falseNegativeZMWs*100.0/(ZMWs)); + jo.add("AM_ZMWs_Pct", ambiguousZMWs*100.0/(ZMWs)); + + jo.add("SNR_ZMWs", snr); + } + } + return jo; + } + + private static void writeScoreRatioHistogram(FileFormat ff, long[] hist){ + if(ff==null){return;} + final ByteStreamWriter bsw=new ByteStreamWriter(ff); + bsw.start(); + final float mult=1.0f/(hist.length-1); + + bsw.print("#Counted\t").println(Tools.sum(hist)); + bsw.print("#Mean\t").println(Tools.averageHistogram(hist)*mult, 3); + bsw.print("#Median\t").println(Tools.medianHistogram(hist)*mult, 3); + bsw.print("#Mode\t").println(Tools.calcModeHistogram(hist)*mult, 3); + bsw.print("#STDev\t").println(Tools.standardDeviationHistogram(hist)*mult, 3); + bsw.print("#Value\tOccurances\n"); + + for(int i=0; i<hist.length; i++){ + bsw.print(i*mult, 3).tab().println(hist[i]); + } + bsw.poisonAndWait(); + } + + private ConcurrentReadOutputStream makeCros(FileFormat ff){ + if(ff==null){return null;} + + //Select output buffer size based on whether it needs to be ordered + final int buff=16; + + final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, buff, null, ff.samOrBam() && ffin1.samOrBam()); + ros.start(); //Start the stream + return ros; + } + + /** Spawn process threads */ + private void spawnThreads(final ZMWStreamer zstream, + final ConcurrentReadOutputStream rosg, final ConcurrentReadOutputStream rosa, + final ConcurrentReadOutputStream rosb, final ConcurrentReadOutputStream rosj){ + + //Do anything necessary prior to processing + + //Fill a list with ProcessThreads + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){ + alpt.add(new ProcessThread(zstream, rosg, rosa, rosb, rosj, i)); + } + + //Start the threads + for(ProcessThread pt : alpt){ + pt.start(); + } + + zstream.runStreamer(false); + + //Wait for threads to finish + waitForThreads(alpt); + + //Do anything necessary after processing + ZMWs=zstream.ZMWs; + } + + private void waitForThreads(ArrayList<ProcessThread> alpt){ + + //Wait for completion of all threads + boolean success=true; + for(ProcessThread pt : alpt){ + + //Wait until this thread has terminated + while(pt.getState()!=Thread.State.TERMINATED){ + try { + //Attempt a join operation + pt.join(); + } catch (InterruptedException e) { + //Potentially handle this, if it is expected to occur + e.printStackTrace(); + } + } + + //Accumulate per-thread statistics + readsProcessed+=pt.readsProcessedT; + basesProcessed+=pt.basesProcessedT; + + truePositiveReads+=pt.truePositiveReadsT; + trueNegativeReads+=pt.trueNegativeReadsT; + falsePositiveReads+=pt.falsePositiveReadsT; + falseNegativeReads+=pt.falseNegativeReadsT; + ambiguousReads+=pt.ambiguousReadsT; + + truePositiveZMWs+=pt.truePositiveZMWsT; + trueNegativeZMWs+=pt.trueNegativeZMWsT; + falsePositiveZMWs+=pt.falsePositiveZMWsT; + falseNegativeZMWs+=pt.falseNegativeZMWsT; + ambiguousZMWs+=pt.ambiguousZMWsT; + + readsOut+=pt.readsOutT; + basesOut+=pt.basesOutT; + junctionsOut+=pt.junctionsOutT; + + alignmentIters+=pt.ica.iters(); + alignmentItersShort+=pt.ica.itersShort(); + elapsed+=pt.elapsedT; + elapsedShort+=pt.elapsedShortT; + + iceCreamReads+=pt.iceCreamReadsT; + iceCreamBases+=pt.iceCreamBasesT; + iceCreamZMWs+=pt.iceCreamZMWsT; + iceCreamRatio+=pt.iceCreamRatioT; + ratiosCounted+=pt.ratiosCountedT; + missingAdapterZMWs+=pt.missingAdapterZMWsT; + hiddenAdapterZMWs+=pt.hiddenAdapterZMWsT; + lowEntropyZMWs+=pt.lowEntropyZMWsT; + lowEntropyReads+=pt.lowEntropyReadsT; + + basesTrimmed+=pt.basesTrimmedT; + readsTrimmed+=pt.readsTrimmedT; + + for(int i=0; i<adapterScores.length; i++){ + adapterScores[i]+=pt.adapterScoresT[i]; + repeatScores[i]+=pt.repeatScoresT[i]; + } + + success&=pt.success; + } + + //Track whether any threads failed + if(!success){errorState=true;} + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /*--------------------------------------------------------------*/ + /*---------------- Inner Classes ----------------*/ + /*--------------------------------------------------------------*/ + + /** Processes reads. */ + private class ProcessThread extends Thread { + + //Constructor + ProcessThread(final ZMWStreamer zstream_, + final ConcurrentReadOutputStream ros_, final ConcurrentReadOutputStream rosa_, + final ConcurrentReadOutputStream rosb_, final ConcurrentReadOutputStream rosj_, final int tid_){ + zstream=zstream_; + rosg=ros_; + rosa=rosa_; + rosb=rosb_; + rosj=rosj_; + tid=tid_; + + Arrays.fill(tipBufferLeft, (byte)'N'); + Arrays.fill(tipBufferRight, (byte)'N'); + + if(entropyCutoff>=0){ + eTracker=new EntropyTracker(false, entropyCutoff, true); + }else{ + eTracker=null; + } + } + + //Called by start() + @Override + public void run(){ + //Do anything necessary prior to processing + + //Process the reads + processInner(); + + //Do anything necessary after processing + + //Indicate successful exit status + success=true; + } + + /** Iterate through the reads */ + void processInner(){ + + //As long as there is a nonempty read list... + for(ZMW reads=zstream.nextZMW(); reads!=null; reads=zstream.nextZMW()){ +// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access + + processList(reads); + } + } + + int flagLowEntropyReads(final ZMW reads, final float minEnt, + final int minLen0, final float minFract){ + int found=0; + for(Read r : reads){ + if(!r.discarded()){ + int minLen=Tools.min((int)(r.length()*minFract), minLen0); + int maxBlock=eTracker.longestLowEntropyBlock(r.bases, false, maxMonomerFraction); + if(maxBlock>=minLen){ + r.setDiscarded(true); + r.setJunk(true); + found++; +// System.err.println(r.toFasta()); + } + } + } + return found; + } + + int flagLongReads(final ZMW reads, int median){ + int found=0; + for(Read r : reads){ + if(r.length()>longReadMult*median){ + r.setDiscarded(true); + r.setHasAdapter(true); + found++; + } + } + return found; + } + + /** Each list is presumed to be all reads from a ZMW, in order */ + void processList(final ZMW reads){ + long numBases=0; + + //Loop through each read in the list for statistics + for(int idx=0; idx<reads.size(); idx++){ + final Read r1=reads.get(idx); + + //Validate reads in worker threads + if(!r1.validated()){r1.validate(true);} + + //Track the initial length for statistics + final int initialLength1=r1.length(); + + //Increment counters + readsProcessedT++; + basesProcessedT+=initialLength1; + numBases+=initialLength1; + } + final boolean fullPass=CCS || reads.size()>=3; + + if(trimReads || trimPolyA){ + int removed=0; + for(int i=0; i<reads.size(); i++){ + Read r=reads.get(i); + byte a=r.bases[0], b=r.bases[r.length()-1]; + int trimmed=0; + if(!AminoAcid.isFullyDefined(a) || !AminoAcid.isFullyDefined(b)){ + trimmed+=trimRead(r, 80); + } + + if(trimReads && adapter!=null){ + int leftAdapterBases=alignLeftTipAdapter(r); + int rightAdapterBases=alignRightTipAdapter(r); + if(leftAdapterBases+rightAdapterBases>0){ + trimmed+=trimRead(r, adapterTipLen); + r.setTrimmed(true); + } + } + if(trimPolyA){ + int x=trimPolyA(r); + trimmed+=x; + if(x>0){r.setTrimmed(true);} + } + + if(trimmed>0){ + basesTrimmedT+=trimmed; + readsTrimmedT++; + } + + //TODO: Note again, removing intermediate reads messes up forward-rev ordering + if(r.length()<minLengthAfterTrimming){//Discard short trash + reads.set(i, null); + removed++; + } + } + if(removed>0){ + Tools.condenseStrict(reads); + } + } + + if(entropyCutoff>0){ + int bad=flagLowEntropyReads(reads, entropyCutoff, entropyLength, entropyFraction); + if(bad>0){ + lowEntropyZMWsT++; + lowEntropyReadsT+=bad; + if(bad>=reads.size()){ + if(!reads.isEmpty()){outputReads(reads, null);} + return; //No point in continuing... + } + } + } + + if(reads.isEmpty()){return;} + + Read sample=null;//Read that will be searched for inverted repeat, typically median length + Read shortest=null;//shortest read in the middle, or overall if no middle reads + final int medianLength=reads.medianLength(true); + boolean foundInvertedRepeat=false; + int longReads=0; + int adapterReads=0; + int maxAdapters=0; + int sampleNum=0; + + if(reads.size()>=3){ + if(flagLongReads){longReads=flagLongReads(reads, medianLength);} + for(int i=1; i<reads.size()-1; i++){ + Read r=reads.get(i); + if(sample==null && r.length()==medianLength){ + sample=r; + sampleNum=i; + } + if(shortest==null || r.length()<shortest.length()){shortest=r;} + } + }else{ + for(int i=0; i<reads.size(); i++){ + Read r=reads.get(i); + if(sample==null || sample.length()<r.length()){ + sample=r; + sampleNum=i; + } + if(shortest==null || r.length()<shortest.length()){shortest=r;} + } + } + assert(sample!=null); + final AlignmentResult ar=align(sample, fullPass, reads.size(), sampleNum); + + if(ar!=null){ + foundInvertedRepeat=true; + sample.setInvertedRepeat(true); + if(ar.icecream || !filterIceCreamOnly){ + sample.setDiscarded(true); + }else if(ar.ambiguous){ + sample.setAmbiguous(true); + } + } + + if(alignAdapter){ + double mult=foundInvertedRepeat ? 0.9 : 1.0; + if(needsAdapterTest(sample)){ + int x=lookForAdapter(sample, mult); + adapterReads+=(x>0 ? 1 : 0); + maxAdapters=Tools.max(x, maxAdapters); + } + + if(reads.size()>2){ + Read a=reads.get(0), b=reads.get(reads.size()-1); + Read r=a.length()>b.length() ? a : b; + if(needsAdapterTest(r)){ + int x=lookForAdapter(r, mult); + adapterReads+=(x>0 ? 1 : 0); + maxAdapters=Tools.max(x, maxAdapters); + } + } + + for(Read r : reads){ + if((r.length()>=shortReadMult*shortest.length() || adapterReads>0 || longReads>0 || foundInvertedRepeat) + && needsAdapterTest(r)){ + int x=lookForAdapter(r, mult); + adapterReads+=(x>0 ? 1 : 0); + maxAdapters=Tools.max(x, maxAdapters); + } + } + } + + if(ar!=null && ar.icecream){ + iceCreamRatioT+=ar.ratio; + ratiosCountedT++; + int idx=(int)(ar.ratio*200); + repeatScoresT[idx]++; + if(longReads+adapterReads==0){missingAdapterZMWsT++;} + } + if(longReads+adapterReads>0){ + hiddenAdapterZMWsT++; + } + + if(keepShortReads && maxAdapters<2){ + if(foundInvertedRepeat && !sample.hasAdapter()){ + if(reads.size()>2){ + for(int i=1; i<reads.size()-1; i++){ + reads.get(i).setDiscarded(true); + } + Read r=reads.get(0); + if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);} + r=reads.get(reads.size()-1); + if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);} + }else if(reads.size()==2){ + for(Read r : reads){ + if(r.length()>=veryShortMult*sample.length()){ + if(ar.icecream){ + r.setDiscarded(true); + }else if(ar.ambiguous){ + r.setAmbiguous(true); + } + } + } + } + } + }else if(sample.discarded() || (longReads+adapterReads>0)){ + for(Read r : reads){ + r.setDiscarded(true); + } + } + + ArrayList<Read> junctions=null; + if(ar!=null){ + if(rosj!=null && !sample.hasAdapter()){ + Read r=ar.alignedRead; + int width=Tools.min(200, ar.junctionLoc, r.length()-ar.junctionLoc); + int a=ar.junctionLoc-width, b=ar.junctionLoc+width; + byte[] bases=Arrays.copyOfRange(r.bases, a, b); + Read junction=new Read(bases, null, r.id+"\tjunction:"+a+"-"+b, r.numericID); + junctions=new ArrayList<Read>(1); + junctions.add(junction); + junctionsOutT++; + } + if(modifyHeader){ + sample.id=sample.id+"\tratio="+ar.ratio+"\tjunction="+ar.junctionLoc+ + "\tIR="+sample.invertedRepeat()+"\tAD="+sample.hasAdapter()+"\tFP="+fullPass+"\tsubreads="+reads.size(); + } + } + + removeShortTrash(reads); + + if(!reads.isEmpty()){outputReads(reads, junctions);} + } + + //TODO: Now that I think about it. The order of the reads is important. + //Since they go forward-rev-forward-rev it's imprudent to discard inner reads. + private void removeShortTrash(ZMW reads) { + int removed=0; + for(int i=0; i<reads.size(); i++){ + Read r=reads.get(i); + if(r.length()<minLengthAfterTrimming){//Discard short trash + reads.set(i, null); + removed++; + } + } + if(removed>0){Tools.condenseStrict(reads);} + } + + int trimRead(Read r, int lookahead){ + final byte[] bases=r.bases; + + int left=calcLeftTrim(bases, lookahead); + int right=calcRightTrim(bases, lookahead); + int trimmed=0; + + if(left+right>0){ +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + trimmed=TrimRead.trimByAmount(r, left, right, 1, false); +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + ZMW.fixReadHeader(r, left, right); +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + } + + if(r.samline!=null){ + r.samline.seq=r.bases; + r.samline.qual=r.quality; + } + return trimmed; + } + + int trimPolyA(Read r){ + final byte[] bases=r.bases; + + int left=Tools.max(PolymerTrimmer.testLeft(bases, 'A'), PolymerTrimmer.testLeft(bases, 'T')); + int right=Tools.max(PolymerTrimmer.testRight(bases, 'A'), PolymerTrimmer.testRight(bases, 'T')); + int trimmed=0; + + if(left+right>0){ +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + trimmed=TrimRead.trimByAmount(r, left, right, 1, false); +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + ZMW.fixReadHeader(r, left, right); +// System.err.println(r.length()+", "+left+", "+right+", "+r.id); + } + + if(r.samline!=null){ + r.samline.seq=r.bases; + r.samline.qual=r.quality; + } + return trimmed; + } + + final int calcLeftTrim(final byte[] bases, int lookahead){ + final int len=bases.length; + int lastUndef=-1; + for(int i=0, defined=0; i<len && defined<lookahead; i++){ + if(AminoAcid.isFullyDefined(bases[i])){ + defined++; + }else{ + lastUndef=i; + defined=0; + } + } + return lastUndef+1; + } + + final int calcRightTrim(final byte[] bases, int lookahead){ + final int len=bases.length; + int lastUndef=len; + for(int i=len-1, defined=0; i>=0 && defined<lookahead; i--){ + if(AminoAcid.isFullyDefined(bases[i])){ + defined++; + }else{ + lastUndef=i; + defined=0; + } + } + return len-lastUndef-1; + } + + final int alignLeftTipAdapter(Read r){ + assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen + if(r.length()<adapterTipLen){return 0;} + final byte[] array=tipBufferLeft; + + for(int i=adapterTipPad, j=0; i<array.length; i++, j++){array[i]=r.bases[j];} + int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore); + + if(rvec==null || rvec[0]<minSwScore){return 0;} + final int score=rvec[0]; + final int start=Tools.max(0, rvec[1]-adapterTipPad); + final int stop=rvec[2]-adapterTipPad; + for(int i=start; i<=stop; i++){r.bases[i]='X';} + return stop-start+1; + } + + final int alignRightTipAdapter(Read r){ + final byte[] bases=r.bases; + assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen + if(r.length()<adapterTipLen){return 0;} + final byte[] array=tipBufferRight; + + for(int i=0, j=bases.length-adapterTipLen; i<adapterTipLen; i++, j++){array[i]=bases[j];} + int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore); + + if(rvec==null || rvec[0]<minSwScore){return 0;} + final int score=rvec[0]; + final int start=Tools.max(0, rvec[1]-adapterTipPad); + final int stop=rvec[2]-adapterTipPad; + for(int i=start; i<=stop; i++){r.bases[i]='X';} + return stop-start+1; + } + + boolean needsAdapterTest(Read r){ + if(r.tested() || r.hasAdapter()){return false;} + if(adapterRatio<=0 || r.discarded()){return true;} + AlignmentResult aa=fla.alignForwardShort(adapter, r.bases, 0, r.bases.length-1, adapterRatio); + return aa!=null; + } + + private void outputReads(ZMW reads, ArrayList<Read> junctions){ + final int size=reads.size(); + final ArrayList<Read> good=(rosg==null ? null : new ArrayList<Read>(size)); + final ArrayList<Read> ambig=(rosa==null ? null : new ArrayList<Read>(size)); + final ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(size)); + + int discardedReads=0; + int ambigReads=0; + int trimmedReads=0; + boolean sendAllToDiscarded=false; + boolean sendAllToAmbiguous=false; + + //Check to see if any reads are discarded or ambiguous + for(Read r : reads){ + if(r.discarded()){ + discardedReads++; + }else if(r.ambiguous()){ + ambigReads++; + }else if(r.trimmed()){ + trimmedReads++; + } + } + + //Unify flags on all reads + if(keepZMWsTogether){ + if(discardedReads>0){sendAllToDiscarded=true;} + else if(ambigReads>0){sendAllToAmbiguous=true;} + } +// if(discardedReads>0 || ambigReads>0){System.err.println("\nd="+discardedReads+", a="+ambigReads);} + if(discardedReads>0){iceCreamZMWsT++;} + + int trueIceCreamReads=0; + for(Read r : reads){ + boolean trueIceCream=(parseCustom ? ReadBuilder.isIceCream(r.id) : false); + trueIceCreamReads+=(trueIceCream ? 1 : 0); + if(r.discarded() || sendAllToDiscarded){ +// assert(false); + if(bad!=null){bad.add(r);} + if(trueIceCream){truePositiveReadsT++;} + else{falsePositiveReadsT++;} +// System.err.println("d:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size()); + }else if(r.ambiguous() || sendAllToAmbiguous){ + if(ambig!=null){ambig.add(r);} + if(sendAmbigToGood){ + readsOutT++; + basesOutT+=r.length(); + if(good!=null) {good.add(r);} + } + if(sendAmbigToBad && bad!=null){bad.add(r);} + ambiguousReadsT++; +// System.err.println("a*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size()); + }else{ + if(good!=null){ + good.add(r); + } + readsOutT++; + basesOutT+=r.length(); + if(trueIceCream && !r.trimmed()){falseNegativeReadsT++;} + else{trueNegativeReadsT++;} +// if(discardedReads>0 || ambigReads>0){System.err.println("g*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size());} + } + } + + if(trueIceCreamReads>0){ + if(discardedReads>0 || trimmedReads>0){ + truePositiveZMWsT++; + }else if(ambigReads>0){ + ambiguousZMWs++; + }else{ + falseNegativeZMWsT++; +// StringBuilder sb=new StringBuilder(); +// for(Read r : reads) {sb.append("\n").append(r.id);} +// System.err.println(sb); + } + }else{ + if(discardedReads>0){ + falsePositiveZMWsT++; + }else if(ambigReads>0){ + ambiguousZMWs++; + }else{ + trueNegativeZMWsT++; + } + } + + if(rosg!=null && good!=null && !good.isEmpty()){rosg.add(good, 0);} + if(rosa!=null && ambig!=null && !ambig.isEmpty()){rosa.add(ambig, 0);} + if(rosb!=null && bad!=null && !bad.isEmpty()){rosb.add(bad, 0);} + if(rosj!=null && junctions!=null && !junctions.isEmpty()){rosj.add(junctions, 0);} + } + + /** + * Align part of a read to itself to look for inverted repeats. + */ + AlignmentResult align(final Read r, boolean fullPass, int passes, int readNum){ + if(!alignRC){return null;} + final byte[] bases=r.bases; + + int qlen=(int)Tools.max(minQlen, Tools.min(targetQlen, bases.length*maxQlenFraction)); + if(qlen>0.45f*bases.length){return null;}//Ignore short stuff + + //Perform an initial scan using the tips of the reads to look for an inverted repeat + boolean tipOnly=filterIceCreamOnly && fullPass; +// System.err.println(filterIceCreamOnly+", "+fullPass+", "+tipOnly); + AlignmentResult a=alignLeft(bases, qlen, minRatio1, true, tipOnly); + AlignmentResult b=(fullPass && false ? null : alignRight(bases, qlen, minRatio1, true, tipOnly));//A second alignment is not needed for a full pass. + AlignmentResult ar=(a==null ? b : b==null ? a : a.maxScore>=b.maxScore ? a : b); + + //If nothing was detected, return + if(ar==null){return null;} + ar.alignedRead=r; + + //At this point, the read contains an inverted repeat of length qlen. + final int expectedJunction=ar.rLen/2; + + if(ar.left){ + ar.junctionLoc=ar.maxRpos/2; + }else{ + int innerLeft=ar.maxRpos; + int innerRight=ar.rLen-ar.qLen; + ar.junctionLoc=(innerLeft+innerRight)/2; + } + +// if(fullPass){//This code doesn't seem to have any effect and it's not clear why it is present +// int dif=Tools.absdif(expectedJunction, ar.junctionLoc); +// if(dif>expectedJunction*0.1) { +// if(filterIceCreamOnly){return ar;} +// } +// } + + if(realign){ + if(ar.junctionLoc<expectedJunction){ + int qlen2=(int)(ar.junctionLoc*0.9); + if(qlen2>=qlen){ + ar=alignLeft(bases, qlen2, minRatio2, false, false); + } + }else{ + int qlen2=(int)((ar.rLen-ar.junctionLoc)*0.9); + if(qlen2>=qlen){ + ar=alignRight(bases, qlen2, minRatio2, false, false); + } + } + if(ar==null){return null;} + ar.alignedRead=r; + } + + //At this point, the read contains an inverted repeat mirrored across a junction. + final float junctionFraction; + if(ar.left){ + ar.junctionLoc=ar.maxRpos/2; + junctionFraction=ar.junctionLoc/(float)ar.rLen; + }else{ + int innerLeft=ar.maxRpos; + int innerRight=ar.rLen-ar.qLen; + ar.junctionLoc=(innerLeft+innerRight)/2; + junctionFraction=(ar.rLen-ar.junctionLoc)/(float)ar.rLen; + } + + final int dif=Tools.absdif(expectedJunction, ar.junctionLoc); + if(fullPass){ + if(junctionFraction<minJunctionFraction){ + ar.icecream=false; + }else{ + ar.icecream=true; + } + }else if(passes==2){ + if(junctionFraction<minJunctionFraction){ + if(readNum==0){ + //First read, the junction should be closer to the beginning for real ice cream + if(ar.junctionLoc>expectedJunction){ + ar.icecream=false; + }else{ + ar.ambiguous=true; + } + }else{ + //Last read, the junction should be closer to the end for real ice cream + assert(readNum==1) : readNum; + if(ar.junctionLoc<expectedJunction){ + ar.icecream=false; + }else{ + ar.ambiguous=true; + } + } + return ar; + }else{ + ar.icecream=true; + } + }else{ + if(junctionFraction<minJunctionFraction){ + ar.icecream=true; + }else{ + ar.ambiguous=true; + } + } + return ar; + } + + /** Align the left qlen bases to the rest of the read. */ + private AlignmentResult alignLeft(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){ + final byte[] query=Arrays.copyOfRange(bases, 0, qlen); + AminoAcid.reverseComplementBasesInPlace(query); + final AlignmentResult ar; + final int rstop=bases.length-1; + final int rstart=(tipOnly ? Tools.max(qlen, rstop-(int)(tipRatio*qlen)) : qlen); + if(v2){ +// elapsedShortT-=System.nanoTime(); + ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio); +// elapsedShortT+=System.nanoTime(); + }else{ +// elapsedT-=System.nanoTime(); + ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio); +// elapsedT+=System.nanoTime(); + } + if(ar!=null){ar.left=true;} + return ar; + } + + /** Align the right qlen bases to the rest of the read. */ + private AlignmentResult alignRight(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){ + final byte[] query=Arrays.copyOfRange(bases, bases.length-qlen, bases.length); + AminoAcid.reverseComplementBasesInPlace(query); + final AlignmentResult ar; + final int rstop=(tipOnly ? Tools.min((int)(tipRatio*qlen), bases.length-qlen-1) : bases.length-qlen-1); + final int rstart=0; + if(v2){ +// elapsedShortT-=System.nanoTime(); + ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio); +// elapsedShortT+=System.nanoTime(); + }else{ +// elapsedT-=System.nanoTime(); + ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio); +// elapsedT+=System.nanoTime(); + } + if(ar!=null){ar.left=false;} + return ar; + } + + /** Align the adapter sequence to the read, piecewise, to count matches. */ + private int lookForAdapter(Read r, double mult) { + assert(!r.hasAdapter() && !r.tested()); + int begin=0; + while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N' + if(begin>=r.length()){return 0;} + + int suspectThresh=(int)(minSwScoreSuspect*mult); + int scoreThresh=(int)(minSwScore*mult); + +// final byte[] array=npad(r.bases, pad ? npad : 0); + final byte[] array=npad(r.bases, npad); + +// assert(array==r.bases) : npad; + + int lim=array.length-npad-stride; + + int found=0; + + int lastSuspect=-1; + int lastConfirmed=-1; + int maxScore=0; + +// final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f)); + for(int i=begin; i<lim; i+=stride){ + int j=Tools.min(i+window, array.length-1); + if(j-i<window){ + lim=0; //Last loop cycle +// i=Tools.max(0, array.length-2*query1.length); + } + + { + + int[] rvec; +// if(timeless){//A speed-optimized version + rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh); +// }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);} + if(rvec!=null && rvec[0]>=suspectThresh){ + final int score=rvec[0]; + final int start=rvec[1]; + final int stop=rvec[2]; + assert(score>=suspectThresh); + if((i==0 || start>i) && (j==array.length-1 || stop<j)){ + boolean kill=(score>=scoreThresh || + (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) || + (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance)); + + if(!kill && useLocality && array.length-stop>window){//Look ahead + rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh); + if(rvec!=null){ + if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;} + else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;} + } + } + + if(!kill && useAltMsa){//Try alternate msa + rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh); + if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;} + } + + if(kill){ + maxScore=Tools.max(maxScore, score); +// if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));} +// System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop); + found++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} +// assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop; + if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);} + } + } + } + + r.setTested(true); + if(found>0){ + r.setHasAdapter(true); + r.setDiscarded(true); + r.setAmbiguous(false); + + int idx=(int)((maxScore*200.0)/maxSwScore); + adapterScoresT[idx]++; + }else{ + r.setHasAdapter(false); + } + + return found; + } + + /** Align the adapter sequence to the read ends, and trim if needed. */ + private int trimTerminalAdapters(Read r, double mult) { + assert(!r.hasAdapter() && !r.tested()); + int begin=0; + while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N' + if(begin>=r.length()){return 0;} + + int suspectThresh=(int)(minSwScoreSuspect*mult); + int scoreThresh=(int)(minSwScore*mult); + +// final byte[] array=npad(r.bases, pad ? npad : 0); + final byte[] array=npad(r.bases, npad); + +// assert(array==r.bases) : npad; + + int lim=array.length-npad-stride; + + int found=0; + + int lastSuspect=-1; + int lastConfirmed=-1; + int maxScore=0; + +// final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f)); + for(int i=begin; i<lim; i+=stride){ + int j=Tools.min(i+window, array.length-1); + if(j-i<window){ + lim=0; //Last loop cycle +// i=Tools.max(0, array.length-2*query1.length); + } + + { + + int[] rvec; +// if(timeless){//A speed-optimized version + rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh); +// }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);} + if(rvec!=null && rvec[0]>=suspectThresh){ + final int score=rvec[0]; + final int start=rvec[1]; + final int stop=rvec[2]; + assert(score>=suspectThresh); + if((i==0 || start>i) && (j==array.length-1 || stop<j)){ + boolean kill=(score>=scoreThresh || + (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) || + (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance)); + + if(!kill && useLocality && array.length-stop>window){//Look ahead + rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh); + if(rvec!=null){ + if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;} + else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;} + } + } + + if(!kill && useAltMsa){//Try alternate msa + rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh); + if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;} + } + + if(kill){ + maxScore=Tools.max(maxScore, score); +// if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));} +// System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop); + found++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} +// assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop; + if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);} + } + } + } + + r.setTested(true); + if(found>0){ + r.setHasAdapter(true); + r.setDiscarded(true); + r.setAmbiguous(false); + + int idx=(int)((maxScore*200.0)/maxSwScore); + adapterScoresT[idx]++; + }else{ + r.setHasAdapter(false); + } + + return found; + } + + private byte[] npad(final byte[] array, final int pad){ + if(pad<=0){return array;} + final int len=array.length+2*pad; + byte[] r=new byte[len]; + for(int i=0; i<pad; i++){r[i]='N';} + for(int i=pad, j=0; j<array.length; i++, j++){r[i]=array[j];} + for(int i=array.length+pad, limit=Tools.min(r.length, len+50); i<limit; i++){r[i]='N';} + return r; + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Class Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Number of reads processed by this thread */ + protected long readsProcessedT=0; + /** Number of bases processed by this thread */ + protected long basesProcessedT=0; + + + protected long truePositiveReadsT=0; + protected long falsePositiveReadsT=0; + protected long trueNegativeReadsT=0; + protected long falseNegativeReadsT=0; + protected long ambiguousReadsT=0; + + protected long truePositiveZMWsT=0; + protected long falsePositiveZMWsT=0; + protected long trueNegativeZMWsT=0; + protected long falseNegativeZMWsT=0; + protected long ambiguousZMWsT=0; + + + protected long elapsedT=0; + protected long elapsedShortT=0; + + //Unused + protected long iceCreamReadsT=0; + protected long iceCreamBasesT=0; + + protected long iceCreamZMWsT=0; + protected double iceCreamRatioT=0; + protected long ratiosCountedT=0; + protected long missingAdapterZMWsT=0; + protected long hiddenAdapterZMWsT=0; + + protected long basesTrimmedT=0; + protected long readsTrimmedT=0; + + /** Number of reads retained by this thread */ + protected long readsOutT=0; + /** Number of bases retained by this thread */ + protected long basesOutT=0; + /** Number of junctions detected in IR reads by this thread */ + protected long junctionsOutT=0; + + protected long lowEntropyZMWsT=0; + protected long lowEntropyReadsT=0; + + /** True only if this thread has completed successfully */ + boolean success=false; + + /** Shared output stream */ + private final ConcurrentReadOutputStream rosg; + /** Shared output stream for ambiguous reads */ + private final ConcurrentReadOutputStream rosa; + /** Shared output stream for bad reads */ + private final ConcurrentReadOutputStream rosb; + /** Shared output stream for junctions */ + private final ConcurrentReadOutputStream rosj; + /** Thread ID */ + final int tid; + + /* Aligners for this thread */ + + /** For inverted repeat alignment */ + final IceCreamAligner ica=IceCreamAligner.makeAligner(32); + /** For quickly aligning adapter sequence to whole read */ + final FlatAligner fla=new FlatAligner(); +// final MultiStateAligner9PacBioAdapter msa=alignAdapter ? new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS) : null; + /** For slowly aligning adapter sequence sectionwise */ + final SingleStateAlignerPacBioAdapter ssa=(alignAdapter || trimReads) ? new SingleStateAlignerPacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS, adapter.length) : null; + /** Alternate scoring for slow adapter alignment */ + final MultiStateAligner9PacBioAdapter2 msa2=(alignAdapter || trimReads) ? new MultiStateAligner9PacBioAdapter2() : null; + + final ZMWStreamer zstream; + final EntropyTracker eTracker; + + final long[] adapterScoresT=new long[201]; + final long[] repeatScoresT=new long[201]; + final byte[] tipBufferLeft=new byte[adapterTipLen+adapterTipPad]; + final byte[] tipBufferRight=new byte[adapterTipLen+adapterTipPad]; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file path */ + private String in1=null; + + /** Primary output file path */ + private String outg=null; + /** Ambiguous output file path */ + private String outa=null; + /** Bad output file path */ + private String outb=null; + /** Junction output file path */ + private String outj=null; + /** Stats (screen) output file path */ + private String outstats=null; + + /** Adapter score ratio histogram */ + private String asrhist=null; + + /** Inverted repeat score ratio histogram */ + private String irsrhist=null; + + /** Override input file extension */ + private String extin=null; + /** Override output file extension */ + private String extout=null; + + private int targetQlen=352; + private int minQlen=100; + + /** Make a query at most this fraction of read length */ + private float maxQlenFraction=0.15f; + + /** Exit alignment early if score drops below this. + * An aggressive optimization that may miss some low-quality inverted repeats. + * -700 seems safe. */ + private int minScore=-800; + + /** Fraction of maximum alignment score to consider as matching for initial alignment */ + private float minRatio1=0.59f; + + /** Fraction of maximum alignment score to consider as matching for realignment */ + private float minRatio2=0.64f; + + private float adapterRatio=0.18f; //.18 for fa, or 0.57/0.6 for ica + private float adapterRatio2=0.325f; //0.31f normal, 0.325 timeless + private float suspectRatio=0.85f; + private boolean useLocality=true; + private boolean useAltMsa=true; + + + private float tipRatio=1.5f; + + private float longReadMult=1.5f; + + private float shortReadMult=1.5f; + + private float veryShortMult=0.35f; + + /** Short half of inverted repeat must be at least this fraction of read length to be classed as a triangle */ + private float minJunctionFraction=0.4f; + + /** Only filter triangle reads, not all inverted repeats */ + private boolean filterIceCreamOnly=true; + + /** Align again once the junction position is provisionally identified */ + private boolean realign=true; + + /** For internal read array transfers */ + private int queuelen=80; + + /** For grading synthetic data */ + private boolean parseCustom; + + /** Input reads are CCS (full pass) */ + private boolean CCS; + + private boolean modifyHeader=false; + + private boolean sendAmbigToGood=true; + private boolean sendAmbigToBad=false; + private boolean setAmbig=false; + + //Note: These flags are very similar... they need to be better-defined or merged. + private boolean keepZMWsTogether=false; + private boolean keepShortReads=true; + + private int format=FORMAT_TEXT; + private static final int FORMAT_TEXT=1, FORMAT_JSON=2; + + /*--------------------------------------------------------------*/ + + /** Alignment iterations for inverted repeat calculation with ref columns and query rows */ + protected long alignmentIters=0; + + /** Alignment iterations for inverted repeat calculation with query columns and ref rows */ + protected long alignmentItersShort=0; + + /** Time spent in long iterations */ + protected long elapsed=0; + + /** Time spent in short iterations */ + protected long elapsedShort=0; + + /** Print iteration time statistics */ + protected boolean printTiming=false; + + /** Number of reads processed */ + protected long readsProcessed=0; + /** Number of bases processed */ + protected long basesProcessed=0; + + /** Number of reads retained */ + protected long readsOut=0; + /** Number of bases retained */ + protected long basesOut=0; + /** Number of junctions detected in IR reads */ + protected long junctionsOut=0; + + /** Quit after processing this many input reads; -1 means no limit */ + private long maxReads=-1; + + //Unused + protected long iceCreamReads=0; + protected long iceCreamBases=0; + + /** ZMWs with discarded reads */ + protected long iceCreamZMWs=0; + + /** Sum of IR alignment ratios for IR reads not containing adapters */ + protected double iceCreamRatio=0; + + /** Number of ratios in iceCreamRatio */ + protected long ratiosCounted=0; + + /** Histogram */ + protected final long[] adapterScores=new long[201]; + + /** Histogram */ + protected final long[] repeatScores=new long[201]; + + /** ZMWs with a triangle read but no hidden adapters */ + protected long missingAdapterZMWs=0; + + /** ZMWs with hidden adapters */ + protected long hiddenAdapterZMWs=0; + + protected long basesTrimmed=0; + protected long readsTrimmed=0; + + protected long lowEntropyZMWs=0; + protected long lowEntropyReads=0; + + /** Total ZMWs observed */ + protected long ZMWs=0; + + protected long truePositiveReads=0; + protected long falsePositiveReads=0; + protected long trueNegativeReads=0; + protected long falseNegativeReads=0; + protected long ambiguousReads=0; + + protected long truePositiveZMWs=0; + protected long falsePositiveZMWs=0; + protected long trueNegativeZMWs=0; + protected long falseNegativeZMWs=0; + protected long ambiguousZMWs=0; + + protected final int stride; + protected final int window; + protected final int ALIGN_ROWS; + protected final int ALIGN_COLUMNS; + + private boolean timeless=true; + protected final int maxSwScore; + protected final int minSwScore; + protected final int minSwScoreSuspect; + protected final int maxImperfectSwScore; + protected final int suspectMidpoint; + protected final int suspectDistance=100; + protected int npad=0; //This is for catching adapters on the tips, which are not very relevant to ice cream. Set to 35 for tip apdapters. + + private byte[] adapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes(); //This one seems to be correct +// private byte[] adapter="ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT".getBytes(); + private boolean alignAdapter=true; + private boolean alignRC=true; + private boolean flagLongReads=true; + private boolean trimReads=true; + private int minLengthAfterTrimming=40; + private int adapterTipLen=100; + private int adapterTipPad=35; + + boolean trimPolyA=false; + + /*--------------------------------------------------------------*/ + /*---------------- Entropy Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Minimum entropy to be considered "complex", on a scale of 0-1 */ + float entropyCutoff=-1; //suggested 0.55 + /** Minimum length of a low-entropy block to fail a read */ + int entropyLength=350; + /** Minimum length of a low-entropy block as a fraction of read length; + * the minimum of the two will be used */ + float entropyFraction=0.5f; + + float maxMonomerFraction=0.74f; //Suggested... 0.74 + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Primary input file */ + private final FileFormat ffin1; + + /** Primary output file */ + private final FileFormat ffoutg; + + /** Ambiguous output file */ + private final FileFormat ffouta; + + /** Bad output file */ + private final FileFormat ffoutb; + + /** Junction output file */ + private final FileFormat ffoutj; + + /** Stats output file */ + private final FileFormat ffstats; + + /** Adapter score ratio histogram */ + private final FileFormat ffasrhist; + + /** Inverted repeat score ratio histogram */ + private final FileFormat ffirsrhist; + + private final int threads; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Print status messages to this output stream */ + private PrintStream outstream=System.err; + /** Print verbose messages */ + public static boolean verbose=false; + /** True if an error was encountered */ + public boolean errorState=false; + /** Overwrite existing output files */ + private boolean overwrite=false; + /** Append to existing output files */ + private boolean append=false; + +}