comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamFinder.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package icecream;
2
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6
7 import aligner.AlignmentResult;
8 import aligner.FlatAligner;
9 import aligner.MultiStateAligner9PacBioAdapter2;
10 import aligner.SingleStateAlignerPacBioAdapter;
11 import dna.AminoAcid;
12 import dna.Data;
13 import fileIO.ByteFile;
14 import fileIO.ByteStreamWriter;
15 import fileIO.FileFormat;
16 import fileIO.ReadWrite;
17 import json.JsonObject;
18 import shared.Parse;
19 import shared.Parser;
20 import shared.PreParser;
21 import shared.ReadStats;
22 import shared.Shared;
23 import shared.Timer;
24 import shared.Tools;
25 import shared.TrimRead;
26 import stream.ConcurrentReadOutputStream;
27 import stream.FASTQ;
28 import stream.FastaReadInputStream;
29 import stream.Read;
30 import stream.SamLine;
31 import structures.ByteBuilder;
32 import structures.EntropyTracker;
33
34 /**
35 * Detects inverted repeats in PacBio reads.
36 * Attempts to determine whether reads are chimeric
37 * due to a missing adapter.
38 *
39 * @author Brian Bushnell
40 * @date June 5, 2019
41 *
42 */
43 public final class IceCreamFinder {
44
45 /*--------------------------------------------------------------*/
46 /*---------------- Initialization ----------------*/
47 /*--------------------------------------------------------------*/
48
49 /**
50 * Code entrance from the command line.
51 * @param args Command line arguments
52 */
53 public static void main(String[] args){
54 //Start a timer immediately upon code entrance.
55 Timer t=new Timer();
56
57 //Create an instance of this class
58 IceCreamFinder x=new IceCreamFinder(args);
59
60 //Run the object
61 x.process(t);
62
63 //Close the print stream if it was redirected
64 Shared.closeStream(x.outstream);
65 }
66
67 /**
68 * Constructor.
69 * @param args Command line arguments
70 */
71 public IceCreamFinder(String[] args){
72
73 {//Preparse block for help, config files, and outstream
74 PreParser pp=new PreParser(args, getClass(), false);
75 args=pp.args;
76 outstream=pp.outstream;
77 }
78
79 Parser.setQuality(33);
80
81 //Set shared static variables prior to parsing
82 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
83 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;
84 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
85 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
86 SamLine.SET_FROM_OK=true;
87 Shared.setBufferData(1000000);
88 // Shared.FASTA_WRAP=511;
89 Data.USE_SAMBAMBA=false;//Sambamba changes PacBio headers.
90 Read.CHANGE_QUALITY=false;
91 EntropyTracker.defaultK=3;
92
93 {//Parse the arguments
94 final Parser parser=parse(args);
95 Parser.processQuality();
96
97 maxReads=parser.maxReads;
98 overwrite=ReadStats.overwrite=parser.overwrite;
99 append=ReadStats.append=parser.append;
100
101 in1=parser.in1;
102 extin=parser.extin;
103
104 if(outg==null){outg=parser.out1;}
105 extout=parser.extout;
106 }
107
108 //Determine how many threads may be used
109 threads=Shared.threads();
110
111 //Ensure there is an input file
112 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
113 fixExtensions(); //Add or remove .gz or .bz2 as needed
114 checkFileExistence(); //Ensure files can be read and written
115 checkStatics(); //Adjust file-related static fields as needed for this program
116
117 //Create output FileFormat objects
118 ffoutg=FileFormat.testOutput(outg, FileFormat.FASTQ, extout, true, overwrite, append, false);
119 ffouta=FileFormat.testOutput(outa, FileFormat.FASTQ, extout, true, overwrite, append, false);
120 ffoutb=FileFormat.testOutput(outb, FileFormat.FASTQ, extout, true, overwrite, append, false);
121 ffoutj=FileFormat.testOutput(outj, FileFormat.FASTA, extout, true, overwrite, append, false);
122 ffstats=FileFormat.testOutput(outstats, FileFormat.TXT, null, false, overwrite, append, false);
123 ffasrhist=FileFormat.testOutput(asrhist, FileFormat.TXT, null, false, overwrite, append, false);
124 ffirsrhist=FileFormat.testOutput(irsrhist, FileFormat.TXT, null, false, overwrite, append, false);
125
126 if(!setAmbig && ffouta!=null){
127 sendAmbigToGood=sendAmbigToBad=false;
128 }
129
130 //Create input FileFormat objects
131 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
132
133 final int alen=(adapter==null ? 20 : adapter.length);
134 stride=(int)(alen*1.9f);
135 window=(int)(alen*3.8f+10);
136 ALIGN_ROWS=alen+1;
137 ALIGN_COLUMNS=window+2;
138
139 maxSwScore=aligner.MultiStateAligner9PacBioAdapter.maxQuality(alen);
140 minSwScore=(int)(maxSwScore*adapterRatio2);
141 minSwScoreSuspect=(int)(maxSwScore*Tools.min(adapterRatio2*suspectRatio, adapterRatio2-((1-suspectRatio)*.2f)));
142 maxImperfectSwScore=aligner.MultiStateAligner9PacBioAdapter.maxImperfectScore(alen);
143 suspectMidpoint=(minSwScoreSuspect+minSwScore)/2;
144 if(adapter==null){alignAdapter=false;}
145 }
146
147 /*--------------------------------------------------------------*/
148 /*---------------- Initialization Helpers ----------------*/
149 /*--------------------------------------------------------------*/
150
151 /** Parse arguments from the command line */
152 private Parser parse(String[] args){
153
154 //Create a parser object
155 Parser parser=new Parser();
156
157 //Set any necessary Parser defaults here
158 //parser.foo=bar;
159
160 //Parse each argument
161 for(int i=0; i<args.length; i++){
162 String arg=args[i];
163
164 //Break arguments into their constituent parts, in the form of "a=b"
165 String[] split=arg.split("=");
166 String a=split[0].toLowerCase();
167 String b=split.length>1 ? split[1] : null;
168 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
169
170 if(a.equals("verbose")){
171 verbose=Parse.parseBoolean(b);
172 }else if(a.equals("format")){
173 if(b==null){
174 assert(false) : arg;
175 }else if(Tools.isDigit(b.charAt(i))){
176 format=Integer.parseInt(b);
177 }else if(b.equalsIgnoreCase("json")){
178 format=FORMAT_JSON;
179 }else if(b.equalsIgnoreCase("text")){
180 format=FORMAT_TEXT;
181 }else{
182 assert(false) : arg;
183 }
184 assert(format>=1 && format<=2) : arg;
185 }else if(a.equals("json")){
186 boolean x=Parse.parseBoolean(b);
187 format=(x ? FORMAT_JSON : FORMAT_TEXT);
188 }else if(a.equals("ss") || a.equals("samstreamer") || a.equals("streamer")){
189 if(b!=null && Tools.isDigit(b.charAt(0))){
190 ZMWStreamer.useStreamer=true;
191 assert(Integer.parseInt(b)==1) : "ZMWStreamer threads currently capped at 1.";
192 // ZMWStreamer.streamerThreads=Tools.max(1, Integer.parseInt(b));
193 }else{
194 ZMWStreamer.useStreamer=Parse.parseBoolean(b);
195 }
196 }else if(a.equals("icecreamonly") || a.equals("ico")){
197 filterIceCreamOnly=Parse.parseBoolean(b);
198 }else if(a.equals("keepshortreads") || a.equals("ksr")){
199 keepShortReads=Parse.parseBoolean(b);
200 }else if(a.equalsIgnoreCase("keepzmwstogether") || a.equals("kzt") || a.equals("keepreadstogether") || a.equals("krt")){
201 keepZMWsTogether=Parse.parseBoolean(b);
202 }else if(a.equalsIgnoreCase("samplerate")){
203 float f=Float.parseFloat(b);
204 assert(false) : "TODO"; //TODO
205 }else if(a.equals("modifyheader") || a.equals("modifyheaders") || a.equals("changeheader") || a.equals("changeheaders")){
206 modifyHeader=Parse.parseBoolean(b);
207 }else if(a.equalsIgnoreCase("ccs")){
208 CCS=Parse.parseBoolean(b);
209 }else if(a.equals("npad")){
210 npad=Integer.parseInt(b);
211 }else if(a.equals("minlength") || a.equals("minlen")){
212 minLengthAfterTrimming=Integer.parseInt(b);
213 }else if(a.equals("realign")){
214 realign=Parse.parseBoolean(b);
215 }else if(a.equals("aligninverse") || a.equals("alignrc") || a.equals("findicecream")){
216 alignRC=Parse.parseBoolean(b);
217 }else if(a.equals("alignadapter")){
218 alignAdapter=Parse.parseBoolean(b);
219 }else if(a.equals("timeless")){
220 timeless=Parse.parseBoolean(b);
221 }else if(a.equals("flaglongreads")){
222 flagLongReads=Parse.parseBoolean(b);
223 }else if(a.equals("trimreads") || a.equals("trim")){
224 trimReads=Parse.parseBoolean(b);
225 }else if(a.equals("adapter")){
226 adapter=b==null ? null : b.getBytes();
227 }else if(a.equals("targetqlen") || a.equals("qlen")){
228 targetQlen=Integer.parseInt(b);
229 }else if(a.equals("maxqlenfraction") || a.equals("maxfraction") || a.equals("qlenfraction")){
230 maxQlenFraction=Float.parseFloat(b);
231 }else if(a.equals("junctionfraction") || a.equals("shortfraction")){
232 minJunctionFraction=Float.parseFloat(b);
233 }else if(a.equals("minratio1") || a.equals("ratio1") || a.equals("id1")){
234 minRatio1=Float.parseFloat(b);
235 }else if(a.equals("minratio2") || a.equals("ratio2") || a.equals("id2")){
236 minRatio2=Float.parseFloat(b);
237 }else if(a.equals("minratio") || a.equals("ratio") || a.equals("id")){
238 minRatio1=minRatio2=Float.parseFloat(b);
239 }else if(a.equals("adapterratio") || a.equals("adapterratio1") || a.equals("ratior") || a.equals("ratior1")){
240 adapterRatio=Float.parseFloat(b);
241 }else if(a.equals("adapterratio2") || a.equals("ratior2")){
242 adapterRatio2=Float.parseFloat(b);
243 }else if(a.equals("suspectratio")){
244 suspectRatio=Float.parseFloat(b);
245 }else if(a.equals("minqlen")){
246 minQlen=Integer.parseInt(b);
247 }else if(a.equals("minscore")){
248 minScore=Integer.parseInt(b);
249 }else if(a.equals("parsecustom")){
250 parseCustom=Parse.parseBoolean(b);
251 }else if(a.equals("printtiming") || a.equals("extended")){
252 printTiming=Parse.parseBoolean(b);
253 }else if(a.equals("queuelen") || a.equals("qlen")){
254 queuelen=Integer.parseInt(b);
255 }else if(a.equals("outg") || a.equals("outgood")){
256 outg=b;
257 }else if(a.equals("outa") || a.equals("outambig")){
258 outa=b;
259 }else if(a.equals("outb") || a.equals("outbad")){
260 outb=b;
261 }else if(a.equals("outj") || a.equals("outjunctions") || a.equals("junctions")){
262 outj=b;
263 }else if(a.equals("outs") || a.equals("outstats") || a.equals("stats")){
264 outstats=b;
265 }else if(a.equals("asrhist") || a.equals("ahist")){
266 asrhist=b;
267 }else if(a.equals("irsrhist") || a.equals("irhist")){
268 irsrhist=b;
269 }else if(a.equals("ambig")){
270 sendAmbigToGood=sendAmbigToBad=false;
271 if(b!=null){
272 String[] split2=b.split(",");
273 for(String s2 : split2){
274 if(s2.equalsIgnoreCase("good")){sendAmbigToGood=true;}
275 else if(s2.equalsIgnoreCase("bad") || s2.equalsIgnoreCase("toss")){sendAmbigToBad=true;}
276 else if(s2.equalsIgnoreCase("ambig")){}
277 else{assert(false) : "Bad argument: '"+s2+"' in '"+arg+"'; should be good or bad";}
278 }
279 }
280 setAmbig=true;
281 }else if(a.equalsIgnoreCase("trimpolya")){//Parse standard flags in the parser
282 trimPolyA=Parse.parseBoolean(b);
283 }else if(PolymerTrimmer.parse(arg, a, b)){
284 //do nothing
285 }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter") || a.equals("efilter")){
286 if(b==null || Character.isLetter(b.charAt(0))){
287 if(Parse.parseBoolean(b)){
288 entropyCutoff=0.55f;
289 }else{
290 entropyCutoff=-1;
291 }
292 }else{
293 entropyCutoff=Float.parseFloat(b);
294 }
295 }else if(a.equals("entropyblock") || a.equals("entropylength") || a.equals("entropylen") || a.equals("entlen")){
296 entropyLength=Parse.parseIntKMG(b);
297 }else if(a.equals("entropyfraction") || a.equals("entfraction")){
298 entropyFraction=Float.parseFloat(b);
299 }else if(a.equals("monomerfraction") || a.equals("maxmonomerfraction") || a.equals("mmf")){
300 maxMonomerFraction=Float.parseFloat(b);
301 }else if(a.equals("parse_flag_goes_here")){
302 long fake_variable=Parse.parseKMG(b);
303 //Set a variable here
304 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
305 //do nothing
306 }else{
307 outstream.println("Unknown parameter "+args[i]);
308 assert(false) : "Unknown parameter "+args[i];
309 }
310 }
311
312 return parser;
313 }
314
315 /** Add or remove .gz or .bz2 as needed */
316 private void fixExtensions(){
317 in1=Tools.fixExtension(in1);
318 }
319
320 /** Ensure files can be read and written */
321 private void checkFileExistence(){
322
323 //Ensure output files can be written
324 if(!Tools.testOutputFiles(overwrite, append, false, outg, outa, outb, outj, outstats, asrhist, irsrhist)){
325 outstream.println((outg==null)+", "+(outb==null)+", "+outg+", "+outa+", "+outb+", "+outj+", "+outstats);
326 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+outg+", "+outa+", "+outb+", "+outj+"\n");
327 }
328
329 //Ensure input files can be read
330 if(!Tools.testInputFiles(false, true, in1)){
331 throw new RuntimeException("\nCan't read some input files.\n");
332 }
333
334 //Ensure that no file was specified multiple times
335 if(!Tools.testForDuplicateFiles(true, in1, outg, outa, outb, outj, outstats, asrhist, irsrhist)){
336 throw new RuntimeException("\nSome file names were specified multiple times.\n");
337 }
338 }
339
340 /** Adjust file-related static fields as needed for this program */
341 private static void checkStatics(){
342 //Adjust the number of threads for input file reading
343 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
344 ByteFile.FORCE_MODE_BF2=true;
345 }
346
347 assert(FastaReadInputStream.settingsOK());
348 }
349
350 /*--------------------------------------------------------------*/
351 /*---------------- Outer Methods ----------------*/
352 /*--------------------------------------------------------------*/
353
354 /** Create read streams and process all data */
355 void process(Timer t){
356
357 //Turn off read validation in the input threads to increase speed
358 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
359 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
360
361 //Create a read input stream
362 ZMWStreamer zstream=new ZMWStreamer(ffin1, Shared.threads(), maxReads, -1);
363
364 //Optionally create read output streams
365 final ConcurrentReadOutputStream rosg=makeCros(ffoutg);
366 final ConcurrentReadOutputStream rosa=makeCros(ffouta);
367 final ConcurrentReadOutputStream rosb=makeCros(ffoutb);
368 final ConcurrentReadOutputStream rosj=makeCros(ffoutj);
369
370 //Reset counters
371 readsProcessed=readsOut=0;
372 basesProcessed=basesOut=0;
373 junctionsOut=0;
374
375 //Process the reads in separate threads
376 spawnThreads(zstream, rosg, rosa, rosb, rosj);
377
378 if(verbose){outstream.println("Finished; closing streams.");}
379
380 //Write anything that was accumulated by ReadStats
381 errorState|=ReadStats.writeAll();
382 //Close the read streams
383 errorState|=ReadWrite.closeStreams(null, rosg, rosa, rosb, rosj);
384
385 //Reset read validation
386 Read.VALIDATE_IN_CONSTRUCTOR=vic;
387
388 writeScoreRatioHistogram(ffasrhist, adapterScores);
389 writeScoreRatioHistogram(ffirsrhist, repeatScores);
390
391 //Report timing and results
392 t.stop();
393
394 String stats=null;
395 if(format==FORMAT_TEXT){
396 ByteBuilder bb=toText(t);
397 stats=bb.nl().toString();
398 }else if(format==FORMAT_JSON){
399 JsonObject jo=toJson(t);
400 stats=jo.toStringln();
401 }else{
402 assert(false) : "Bad format: "+format;
403 }
404 if(ffstats==null){
405 outstream.print(stats);
406 }else{
407 ReadWrite.writeString(stats, outstats);
408 }
409
410 //Throw an exception of there was an error in a thread
411 if(errorState){
412 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
413 }
414 }
415
416 private ByteBuilder toText(Timer t){
417 ByteBuilder bb=new ByteBuilder();
418
419 bb.appendln(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
420 bb.appendln(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
421
422 long readsFiltered=readsProcessed-readsOut;
423 bb.appendln(Tools.numberPercent("Reads Filtered:", readsFiltered, readsFiltered*100.0/(readsProcessed), 3, 8));
424 if(trimReads || trimPolyA){
425 bb.appendln(Tools.numberPercent("Reads Trimmed:", readsTrimmed, readsTrimmed*100.0/(readsProcessed), 3, 8));
426 bb.appendln(Tools.numberPercent("Bases Trimmed:", basesTrimmed, basesTrimmed*100.0/(basesProcessed), 3, 8));
427 }
428 bb.appendln(Tools.number("Total ZMWs:", ZMWs, 8));
429 bb.appendln(Tools.numberPercent("Bad ZMWs:", iceCreamZMWs, iceCreamZMWs*100.0/(ZMWs), 3, 8));
430 bb.appendln(Tools.numberPercent("Absent Adapter:", missingAdapterZMWs, missingAdapterZMWs*100.0/(ZMWs), 3, 8));
431 bb.appendln(Tools.numberPercent("Hidden Adapter:", hiddenAdapterZMWs, hiddenAdapterZMWs*100.0/(ZMWs), 3, 8));
432 bb.appendln(Tools.numberPercent("Ambiguous IR:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 3, 8));
433 // bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyReads, lowEntropyReads*100.0/(readsProcessed), 3, 8));
434 bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyZMWs, lowEntropyZMWs*100.0/(ZMWs), 3, 8));
435
436 bb.appendln(Tools.number("Avg Score Ratio:", iceCreamRatio/ratiosCounted, 3, 8));
437 bb.appendln(Tools.number("Score Cutoff:", minRatio2, 3, 8));
438
439 if(printTiming){
440 bb.appendln("Iterations: "+alignmentIters/1000000+"m");
441 bb.appendln("Short Iterations: "+alignmentItersShort/1000000+"m");
442 bb.appendln("Elapsed: "+elapsed/1000000+"ms");
443 bb.appendln("Elapsed Short: "+elapsedShort/1000000+"ms");
444 }
445
446 if(parseCustom){
447 {
448 bb.appendln("\nReads:");
449 bb.appendln(Tools.numberPercent("True Positive:", truePositiveReads, truePositiveReads*100.0/(readsProcessed), 2, 8));
450 bb.appendln(Tools.numberPercent("True Negative:", trueNegativeReads, trueNegativeReads*100.0/(readsProcessed), 2, 8));
451 bb.appendln(Tools.numberPercent("False Positive:", falsePositiveReads, falsePositiveReads*100.0/(readsProcessed), 2, 8));
452 bb.appendln(Tools.numberPercent("False Negative:", falseNegativeReads, falseNegativeReads*100.0/(readsProcessed), 2, 8));
453 bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousReads, ambiguousReads*100.0/(readsProcessed), 2, 8));
454
455 double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads);
456 snr=10*Math.log10(snr);
457 bb.appendln(Tools.number("SNR:", snr, 2, 8));
458 }
459 {
460 bb.appendln("\nZMWs:");
461 bb.appendln(Tools.numberPercent("True Positive:", truePositiveZMWs, truePositiveZMWs*100.0/(ZMWs), 2, 8));
462 bb.appendln(Tools.numberPercent("True Negative:", trueNegativeZMWs, trueNegativeZMWs*100.0/(ZMWs), 2, 8));
463 bb.appendln(Tools.numberPercent("False Positive:", falsePositiveZMWs, falsePositiveZMWs*100.0/(ZMWs), 2, 8));
464 bb.appendln(Tools.numberPercent("False Negative:", falseNegativeZMWs, falseNegativeZMWs*100.0/(ZMWs), 2, 8));
465 bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 2, 8));
466
467 double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousReads)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs);
468 snr=10*Math.log10(snr);
469 bb.appendln(Tools.number("SNR:", snr, 2, 8));
470 }
471 }
472 return bb;
473 }
474
475 private JsonObject toJson(Timer t){
476 JsonObject jo=new JsonObject();
477 long readsFiltered=readsProcessed-readsOut;
478
479 jo.add("Time", t.timeInSeconds());
480 jo.add("Reads_Processed", readsProcessed);
481 jo.add("Bases_Processed", basesProcessed);
482 jo.add("Reads_Out", readsOut);
483 jo.add("Bases_Out", basesOut);
484 jo.add("Reads_Filtered", readsFiltered);
485 jo.add("Reads_Filtered_Pct", readsFiltered*100.0/(readsProcessed));
486 if(trimReads){
487 jo.add("Reads_Trimmed", readsTrimmed);
488 jo.add("Reads_Trimmed_Pct", readsTrimmed*100.0/(readsProcessed));
489 jo.add("Bases_Trimmed", basesTrimmed);
490 jo.add("Bases_Trimmed_Pct", basesTrimmed*100.0/(basesProcessed));
491 }
492 jo.add("Total_ZMWs", ZMWs);
493 jo.add("Bad_ZMWs", iceCreamZMWs);
494 jo.add("Bad_ZMWs_Pct", iceCreamZMWs*100.0/(ZMWs));
495 jo.add("Absent_Adapter", missingAdapterZMWs);
496 jo.add("Absent_Adapter_Pct", missingAdapterZMWs*100.0/(ZMWs));
497 jo.add("Hidden_Adapter", hiddenAdapterZMWs);
498 jo.add("Hidden_Adapter_Pct", hiddenAdapterZMWs*100.0/(ZMWs));
499 // jo.add("Low_Entropy", lowEntropyReads);
500 // jo.add("Low_Entropy_Pct", lowEntropyReads*100.0/(readsProcessed));
501 jo.add("Low_Entropy", lowEntropyZMWs);
502 jo.add("Low_Entropy_Pct", lowEntropyZMWs*100.0/(ZMWs));
503 jo.add("Ambiguous_Inverted_Repeat", ambiguousZMWs);
504 jo.add("Ambiguous_Inverted_Repeat_Pct", ambiguousZMWs*100.0/(ZMWs));
505 jo.add("Avg_Score_Ratio_IR", iceCreamRatio/ratiosCounted);
506 jo.add("Score_Cutoff_IR", minRatio2);
507
508 jo.add("Alignment_Iterations_IR", alignmentIters);
509 jo.add("Short_Alignment_Iterations_IR", alignmentItersShort);
510
511 if(parseCustom){
512 {
513 double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads);
514 snr=10*Math.log10(snr);
515 jo.add("TP_Reads", truePositiveReads);
516 jo.add("TN_Reads", trueNegativeReads);
517 jo.add("FP_Reads", falsePositiveReads);
518 jo.add("FN_Reads", falseNegativeReads);
519 jo.add("AM_Reads", ambiguousReads);
520
521 jo.add("TP_Reads_Pct", truePositiveReads*100.0/(readsProcessed));
522 jo.add("TN_Reads_Pct", trueNegativeReads*100.0/(readsProcessed));
523 jo.add("FP_Reads_Pct", falsePositiveReads*100.0/(readsProcessed));
524 jo.add("FN_Reads_Pct", falseNegativeReads*100.0/(readsProcessed));
525 jo.add("AM_Reads_Pct", ambiguousReads*100.0/(readsProcessed));
526
527 jo.add("SNR_Reads", snr);
528 }
529 {
530 double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousZMWs)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs);
531 snr=10*Math.log10(snr);
532 jo.add("TP_ZMWs", truePositiveZMWs);
533 jo.add("TN_ZMWs", trueNegativeZMWs);
534 jo.add("FP_ZMWs", falsePositiveZMWs);
535 jo.add("FN_ZMWs", falseNegativeZMWs);
536 jo.add("AM_ZMWs", ambiguousZMWs);
537
538 jo.add("TP_ZMWs_Pct", truePositiveZMWs*100.0/(ZMWs));
539 jo.add("TN_ZMWs_Pct", trueNegativeZMWs*100.0/(ZMWs));
540 jo.add("FP_ZMWs_Pct", falsePositiveZMWs*100.0/(ZMWs));
541 jo.add("FN_ZMWs_Pct", falseNegativeZMWs*100.0/(ZMWs));
542 jo.add("AM_ZMWs_Pct", ambiguousZMWs*100.0/(ZMWs));
543
544 jo.add("SNR_ZMWs", snr);
545 }
546 }
547 return jo;
548 }
549
550 private static void writeScoreRatioHistogram(FileFormat ff, long[] hist){
551 if(ff==null){return;}
552 final ByteStreamWriter bsw=new ByteStreamWriter(ff);
553 bsw.start();
554 final float mult=1.0f/(hist.length-1);
555
556 bsw.print("#Counted\t").println(Tools.sum(hist));
557 bsw.print("#Mean\t").println(Tools.averageHistogram(hist)*mult, 3);
558 bsw.print("#Median\t").println(Tools.medianHistogram(hist)*mult, 3);
559 bsw.print("#Mode\t").println(Tools.calcModeHistogram(hist)*mult, 3);
560 bsw.print("#STDev\t").println(Tools.standardDeviationHistogram(hist)*mult, 3);
561 bsw.print("#Value\tOccurances\n");
562
563 for(int i=0; i<hist.length; i++){
564 bsw.print(i*mult, 3).tab().println(hist[i]);
565 }
566 bsw.poisonAndWait();
567 }
568
569 private ConcurrentReadOutputStream makeCros(FileFormat ff){
570 if(ff==null){return null;}
571
572 //Select output buffer size based on whether it needs to be ordered
573 final int buff=16;
574
575 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, buff, null, ff.samOrBam() && ffin1.samOrBam());
576 ros.start(); //Start the stream
577 return ros;
578 }
579
580 /** Spawn process threads */
581 private void spawnThreads(final ZMWStreamer zstream,
582 final ConcurrentReadOutputStream rosg, final ConcurrentReadOutputStream rosa,
583 final ConcurrentReadOutputStream rosb, final ConcurrentReadOutputStream rosj){
584
585 //Do anything necessary prior to processing
586
587 //Fill a list with ProcessThreads
588 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
589 for(int i=0; i<threads; i++){
590 alpt.add(new ProcessThread(zstream, rosg, rosa, rosb, rosj, i));
591 }
592
593 //Start the threads
594 for(ProcessThread pt : alpt){
595 pt.start();
596 }
597
598 zstream.runStreamer(false);
599
600 //Wait for threads to finish
601 waitForThreads(alpt);
602
603 //Do anything necessary after processing
604 ZMWs=zstream.ZMWs;
605 }
606
607 private void waitForThreads(ArrayList<ProcessThread> alpt){
608
609 //Wait for completion of all threads
610 boolean success=true;
611 for(ProcessThread pt : alpt){
612
613 //Wait until this thread has terminated
614 while(pt.getState()!=Thread.State.TERMINATED){
615 try {
616 //Attempt a join operation
617 pt.join();
618 } catch (InterruptedException e) {
619 //Potentially handle this, if it is expected to occur
620 e.printStackTrace();
621 }
622 }
623
624 //Accumulate per-thread statistics
625 readsProcessed+=pt.readsProcessedT;
626 basesProcessed+=pt.basesProcessedT;
627
628 truePositiveReads+=pt.truePositiveReadsT;
629 trueNegativeReads+=pt.trueNegativeReadsT;
630 falsePositiveReads+=pt.falsePositiveReadsT;
631 falseNegativeReads+=pt.falseNegativeReadsT;
632 ambiguousReads+=pt.ambiguousReadsT;
633
634 truePositiveZMWs+=pt.truePositiveZMWsT;
635 trueNegativeZMWs+=pt.trueNegativeZMWsT;
636 falsePositiveZMWs+=pt.falsePositiveZMWsT;
637 falseNegativeZMWs+=pt.falseNegativeZMWsT;
638 ambiguousZMWs+=pt.ambiguousZMWsT;
639
640 readsOut+=pt.readsOutT;
641 basesOut+=pt.basesOutT;
642 junctionsOut+=pt.junctionsOutT;
643
644 alignmentIters+=pt.ica.iters();
645 alignmentItersShort+=pt.ica.itersShort();
646 elapsed+=pt.elapsedT;
647 elapsedShort+=pt.elapsedShortT;
648
649 iceCreamReads+=pt.iceCreamReadsT;
650 iceCreamBases+=pt.iceCreamBasesT;
651 iceCreamZMWs+=pt.iceCreamZMWsT;
652 iceCreamRatio+=pt.iceCreamRatioT;
653 ratiosCounted+=pt.ratiosCountedT;
654 missingAdapterZMWs+=pt.missingAdapterZMWsT;
655 hiddenAdapterZMWs+=pt.hiddenAdapterZMWsT;
656 lowEntropyZMWs+=pt.lowEntropyZMWsT;
657 lowEntropyReads+=pt.lowEntropyReadsT;
658
659 basesTrimmed+=pt.basesTrimmedT;
660 readsTrimmed+=pt.readsTrimmedT;
661
662 for(int i=0; i<adapterScores.length; i++){
663 adapterScores[i]+=pt.adapterScoresT[i];
664 repeatScores[i]+=pt.repeatScoresT[i];
665 }
666
667 success&=pt.success;
668 }
669
670 //Track whether any threads failed
671 if(!success){errorState=true;}
672 }
673
674 /*--------------------------------------------------------------*/
675 /*---------------- Inner Methods ----------------*/
676 /*--------------------------------------------------------------*/
677
678 /*--------------------------------------------------------------*/
679 /*---------------- Inner Classes ----------------*/
680 /*--------------------------------------------------------------*/
681
682 /** Processes reads. */
683 private class ProcessThread extends Thread {
684
685 //Constructor
686 ProcessThread(final ZMWStreamer zstream_,
687 final ConcurrentReadOutputStream ros_, final ConcurrentReadOutputStream rosa_,
688 final ConcurrentReadOutputStream rosb_, final ConcurrentReadOutputStream rosj_, final int tid_){
689 zstream=zstream_;
690 rosg=ros_;
691 rosa=rosa_;
692 rosb=rosb_;
693 rosj=rosj_;
694 tid=tid_;
695
696 Arrays.fill(tipBufferLeft, (byte)'N');
697 Arrays.fill(tipBufferRight, (byte)'N');
698
699 if(entropyCutoff>=0){
700 eTracker=new EntropyTracker(false, entropyCutoff, true);
701 }else{
702 eTracker=null;
703 }
704 }
705
706 //Called by start()
707 @Override
708 public void run(){
709 //Do anything necessary prior to processing
710
711 //Process the reads
712 processInner();
713
714 //Do anything necessary after processing
715
716 //Indicate successful exit status
717 success=true;
718 }
719
720 /** Iterate through the reads */
721 void processInner(){
722
723 //As long as there is a nonempty read list...
724 for(ZMW reads=zstream.nextZMW(); reads!=null; reads=zstream.nextZMW()){
725 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
726
727 processList(reads);
728 }
729 }
730
731 int flagLowEntropyReads(final ZMW reads, final float minEnt,
732 final int minLen0, final float minFract){
733 int found=0;
734 for(Read r : reads){
735 if(!r.discarded()){
736 int minLen=Tools.min((int)(r.length()*minFract), minLen0);
737 int maxBlock=eTracker.longestLowEntropyBlock(r.bases, false, maxMonomerFraction);
738 if(maxBlock>=minLen){
739 r.setDiscarded(true);
740 r.setJunk(true);
741 found++;
742 // System.err.println(r.toFasta());
743 }
744 }
745 }
746 return found;
747 }
748
749 int flagLongReads(final ZMW reads, int median){
750 int found=0;
751 for(Read r : reads){
752 if(r.length()>longReadMult*median){
753 r.setDiscarded(true);
754 r.setHasAdapter(true);
755 found++;
756 }
757 }
758 return found;
759 }
760
761 /** Each list is presumed to be all reads from a ZMW, in order */
762 void processList(final ZMW reads){
763 long numBases=0;
764
765 //Loop through each read in the list for statistics
766 for(int idx=0; idx<reads.size(); idx++){
767 final Read r1=reads.get(idx);
768
769 //Validate reads in worker threads
770 if(!r1.validated()){r1.validate(true);}
771
772 //Track the initial length for statistics
773 final int initialLength1=r1.length();
774
775 //Increment counters
776 readsProcessedT++;
777 basesProcessedT+=initialLength1;
778 numBases+=initialLength1;
779 }
780 final boolean fullPass=CCS || reads.size()>=3;
781
782 if(trimReads || trimPolyA){
783 int removed=0;
784 for(int i=0; i<reads.size(); i++){
785 Read r=reads.get(i);
786 byte a=r.bases[0], b=r.bases[r.length()-1];
787 int trimmed=0;
788 if(!AminoAcid.isFullyDefined(a) || !AminoAcid.isFullyDefined(b)){
789 trimmed+=trimRead(r, 80);
790 }
791
792 if(trimReads && adapter!=null){
793 int leftAdapterBases=alignLeftTipAdapter(r);
794 int rightAdapterBases=alignRightTipAdapter(r);
795 if(leftAdapterBases+rightAdapterBases>0){
796 trimmed+=trimRead(r, adapterTipLen);
797 r.setTrimmed(true);
798 }
799 }
800 if(trimPolyA){
801 int x=trimPolyA(r);
802 trimmed+=x;
803 if(x>0){r.setTrimmed(true);}
804 }
805
806 if(trimmed>0){
807 basesTrimmedT+=trimmed;
808 readsTrimmedT++;
809 }
810
811 //TODO: Note again, removing intermediate reads messes up forward-rev ordering
812 if(r.length()<minLengthAfterTrimming){//Discard short trash
813 reads.set(i, null);
814 removed++;
815 }
816 }
817 if(removed>0){
818 Tools.condenseStrict(reads);
819 }
820 }
821
822 if(entropyCutoff>0){
823 int bad=flagLowEntropyReads(reads, entropyCutoff, entropyLength, entropyFraction);
824 if(bad>0){
825 lowEntropyZMWsT++;
826 lowEntropyReadsT+=bad;
827 if(bad>=reads.size()){
828 if(!reads.isEmpty()){outputReads(reads, null);}
829 return; //No point in continuing...
830 }
831 }
832 }
833
834 if(reads.isEmpty()){return;}
835
836 Read sample=null;//Read that will be searched for inverted repeat, typically median length
837 Read shortest=null;//shortest read in the middle, or overall if no middle reads
838 final int medianLength=reads.medianLength(true);
839 boolean foundInvertedRepeat=false;
840 int longReads=0;
841 int adapterReads=0;
842 int maxAdapters=0;
843 int sampleNum=0;
844
845 if(reads.size()>=3){
846 if(flagLongReads){longReads=flagLongReads(reads, medianLength);}
847 for(int i=1; i<reads.size()-1; i++){
848 Read r=reads.get(i);
849 if(sample==null && r.length()==medianLength){
850 sample=r;
851 sampleNum=i;
852 }
853 if(shortest==null || r.length()<shortest.length()){shortest=r;}
854 }
855 }else{
856 for(int i=0; i<reads.size(); i++){
857 Read r=reads.get(i);
858 if(sample==null || sample.length()<r.length()){
859 sample=r;
860 sampleNum=i;
861 }
862 if(shortest==null || r.length()<shortest.length()){shortest=r;}
863 }
864 }
865 assert(sample!=null);
866 final AlignmentResult ar=align(sample, fullPass, reads.size(), sampleNum);
867
868 if(ar!=null){
869 foundInvertedRepeat=true;
870 sample.setInvertedRepeat(true);
871 if(ar.icecream || !filterIceCreamOnly){
872 sample.setDiscarded(true);
873 }else if(ar.ambiguous){
874 sample.setAmbiguous(true);
875 }
876 }
877
878 if(alignAdapter){
879 double mult=foundInvertedRepeat ? 0.9 : 1.0;
880 if(needsAdapterTest(sample)){
881 int x=lookForAdapter(sample, mult);
882 adapterReads+=(x>0 ? 1 : 0);
883 maxAdapters=Tools.max(x, maxAdapters);
884 }
885
886 if(reads.size()>2){
887 Read a=reads.get(0), b=reads.get(reads.size()-1);
888 Read r=a.length()>b.length() ? a : b;
889 if(needsAdapterTest(r)){
890 int x=lookForAdapter(r, mult);
891 adapterReads+=(x>0 ? 1 : 0);
892 maxAdapters=Tools.max(x, maxAdapters);
893 }
894 }
895
896 for(Read r : reads){
897 if((r.length()>=shortReadMult*shortest.length() || adapterReads>0 || longReads>0 || foundInvertedRepeat)
898 && needsAdapterTest(r)){
899 int x=lookForAdapter(r, mult);
900 adapterReads+=(x>0 ? 1 : 0);
901 maxAdapters=Tools.max(x, maxAdapters);
902 }
903 }
904 }
905
906 if(ar!=null && ar.icecream){
907 iceCreamRatioT+=ar.ratio;
908 ratiosCountedT++;
909 int idx=(int)(ar.ratio*200);
910 repeatScoresT[idx]++;
911 if(longReads+adapterReads==0){missingAdapterZMWsT++;}
912 }
913 if(longReads+adapterReads>0){
914 hiddenAdapterZMWsT++;
915 }
916
917 if(keepShortReads && maxAdapters<2){
918 if(foundInvertedRepeat && !sample.hasAdapter()){
919 if(reads.size()>2){
920 for(int i=1; i<reads.size()-1; i++){
921 reads.get(i).setDiscarded(true);
922 }
923 Read r=reads.get(0);
924 if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);}
925 r=reads.get(reads.size()-1);
926 if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);}
927 }else if(reads.size()==2){
928 for(Read r : reads){
929 if(r.length()>=veryShortMult*sample.length()){
930 if(ar.icecream){
931 r.setDiscarded(true);
932 }else if(ar.ambiguous){
933 r.setAmbiguous(true);
934 }
935 }
936 }
937 }
938 }
939 }else if(sample.discarded() || (longReads+adapterReads>0)){
940 for(Read r : reads){
941 r.setDiscarded(true);
942 }
943 }
944
945 ArrayList<Read> junctions=null;
946 if(ar!=null){
947 if(rosj!=null && !sample.hasAdapter()){
948 Read r=ar.alignedRead;
949 int width=Tools.min(200, ar.junctionLoc, r.length()-ar.junctionLoc);
950 int a=ar.junctionLoc-width, b=ar.junctionLoc+width;
951 byte[] bases=Arrays.copyOfRange(r.bases, a, b);
952 Read junction=new Read(bases, null, r.id+"\tjunction:"+a+"-"+b, r.numericID);
953 junctions=new ArrayList<Read>(1);
954 junctions.add(junction);
955 junctionsOutT++;
956 }
957 if(modifyHeader){
958 sample.id=sample.id+"\tratio="+ar.ratio+"\tjunction="+ar.junctionLoc+
959 "\tIR="+sample.invertedRepeat()+"\tAD="+sample.hasAdapter()+"\tFP="+fullPass+"\tsubreads="+reads.size();
960 }
961 }
962
963 removeShortTrash(reads);
964
965 if(!reads.isEmpty()){outputReads(reads, junctions);}
966 }
967
968 //TODO: Now that I think about it. The order of the reads is important.
969 //Since they go forward-rev-forward-rev it's imprudent to discard inner reads.
970 private void removeShortTrash(ZMW reads) {
971 int removed=0;
972 for(int i=0; i<reads.size(); i++){
973 Read r=reads.get(i);
974 if(r.length()<minLengthAfterTrimming){//Discard short trash
975 reads.set(i, null);
976 removed++;
977 }
978 }
979 if(removed>0){Tools.condenseStrict(reads);}
980 }
981
982 int trimRead(Read r, int lookahead){
983 final byte[] bases=r.bases;
984
985 int left=calcLeftTrim(bases, lookahead);
986 int right=calcRightTrim(bases, lookahead);
987 int trimmed=0;
988
989 if(left+right>0){
990 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
991 trimmed=TrimRead.trimByAmount(r, left, right, 1, false);
992 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
993 ZMW.fixReadHeader(r, left, right);
994 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
995 }
996
997 if(r.samline!=null){
998 r.samline.seq=r.bases;
999 r.samline.qual=r.quality;
1000 }
1001 return trimmed;
1002 }
1003
1004 int trimPolyA(Read r){
1005 final byte[] bases=r.bases;
1006
1007 int left=Tools.max(PolymerTrimmer.testLeft(bases, 'A'), PolymerTrimmer.testLeft(bases, 'T'));
1008 int right=Tools.max(PolymerTrimmer.testRight(bases, 'A'), PolymerTrimmer.testRight(bases, 'T'));
1009 int trimmed=0;
1010
1011 if(left+right>0){
1012 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
1013 trimmed=TrimRead.trimByAmount(r, left, right, 1, false);
1014 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
1015 ZMW.fixReadHeader(r, left, right);
1016 // System.err.println(r.length()+", "+left+", "+right+", "+r.id);
1017 }
1018
1019 if(r.samline!=null){
1020 r.samline.seq=r.bases;
1021 r.samline.qual=r.quality;
1022 }
1023 return trimmed;
1024 }
1025
1026 final int calcLeftTrim(final byte[] bases, int lookahead){
1027 final int len=bases.length;
1028 int lastUndef=-1;
1029 for(int i=0, defined=0; i<len && defined<lookahead; i++){
1030 if(AminoAcid.isFullyDefined(bases[i])){
1031 defined++;
1032 }else{
1033 lastUndef=i;
1034 defined=0;
1035 }
1036 }
1037 return lastUndef+1;
1038 }
1039
1040 final int calcRightTrim(final byte[] bases, int lookahead){
1041 final int len=bases.length;
1042 int lastUndef=len;
1043 for(int i=len-1, defined=0; i>=0 && defined<lookahead; i--){
1044 if(AminoAcid.isFullyDefined(bases[i])){
1045 defined++;
1046 }else{
1047 lastUndef=i;
1048 defined=0;
1049 }
1050 }
1051 return len-lastUndef-1;
1052 }
1053
1054 final int alignLeftTipAdapter(Read r){
1055 assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen
1056 if(r.length()<adapterTipLen){return 0;}
1057 final byte[] array=tipBufferLeft;
1058
1059 for(int i=adapterTipPad, j=0; i<array.length; i++, j++){array[i]=r.bases[j];}
1060 int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore);
1061
1062 if(rvec==null || rvec[0]<minSwScore){return 0;}
1063 final int score=rvec[0];
1064 final int start=Tools.max(0, rvec[1]-adapterTipPad);
1065 final int stop=rvec[2]-adapterTipPad;
1066 for(int i=start; i<=stop; i++){r.bases[i]='X';}
1067 return stop-start+1;
1068 }
1069
1070 final int alignRightTipAdapter(Read r){
1071 final byte[] bases=r.bases;
1072 assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen
1073 if(r.length()<adapterTipLen){return 0;}
1074 final byte[] array=tipBufferRight;
1075
1076 for(int i=0, j=bases.length-adapterTipLen; i<adapterTipLen; i++, j++){array[i]=bases[j];}
1077 int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore);
1078
1079 if(rvec==null || rvec[0]<minSwScore){return 0;}
1080 final int score=rvec[0];
1081 final int start=Tools.max(0, rvec[1]-adapterTipPad);
1082 final int stop=rvec[2]-adapterTipPad;
1083 for(int i=start; i<=stop; i++){r.bases[i]='X';}
1084 return stop-start+1;
1085 }
1086
1087 boolean needsAdapterTest(Read r){
1088 if(r.tested() || r.hasAdapter()){return false;}
1089 if(adapterRatio<=0 || r.discarded()){return true;}
1090 AlignmentResult aa=fla.alignForwardShort(adapter, r.bases, 0, r.bases.length-1, adapterRatio);
1091 return aa!=null;
1092 }
1093
1094 private void outputReads(ZMW reads, ArrayList<Read> junctions){
1095 final int size=reads.size();
1096 final ArrayList<Read> good=(rosg==null ? null : new ArrayList<Read>(size));
1097 final ArrayList<Read> ambig=(rosa==null ? null : new ArrayList<Read>(size));
1098 final ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(size));
1099
1100 int discardedReads=0;
1101 int ambigReads=0;
1102 int trimmedReads=0;
1103 boolean sendAllToDiscarded=false;
1104 boolean sendAllToAmbiguous=false;
1105
1106 //Check to see if any reads are discarded or ambiguous
1107 for(Read r : reads){
1108 if(r.discarded()){
1109 discardedReads++;
1110 }else if(r.ambiguous()){
1111 ambigReads++;
1112 }else if(r.trimmed()){
1113 trimmedReads++;
1114 }
1115 }
1116
1117 //Unify flags on all reads
1118 if(keepZMWsTogether){
1119 if(discardedReads>0){sendAllToDiscarded=true;}
1120 else if(ambigReads>0){sendAllToAmbiguous=true;}
1121 }
1122 // if(discardedReads>0 || ambigReads>0){System.err.println("\nd="+discardedReads+", a="+ambigReads);}
1123 if(discardedReads>0){iceCreamZMWsT++;}
1124
1125 int trueIceCreamReads=0;
1126 for(Read r : reads){
1127 boolean trueIceCream=(parseCustom ? ReadBuilder.isIceCream(r.id) : false);
1128 trueIceCreamReads+=(trueIceCream ? 1 : 0);
1129 if(r.discarded() || sendAllToDiscarded){
1130 // assert(false);
1131 if(bad!=null){bad.add(r);}
1132 if(trueIceCream){truePositiveReadsT++;}
1133 else{falsePositiveReadsT++;}
1134 // System.err.println("d:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size());
1135 }else if(r.ambiguous() || sendAllToAmbiguous){
1136 if(ambig!=null){ambig.add(r);}
1137 if(sendAmbigToGood){
1138 readsOutT++;
1139 basesOutT+=r.length();
1140 if(good!=null) {good.add(r);}
1141 }
1142 if(sendAmbigToBad && bad!=null){bad.add(r);}
1143 ambiguousReadsT++;
1144 // System.err.println("a*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size());
1145 }else{
1146 if(good!=null){
1147 good.add(r);
1148 }
1149 readsOutT++;
1150 basesOutT+=r.length();
1151 if(trueIceCream && !r.trimmed()){falseNegativeReadsT++;}
1152 else{trueNegativeReadsT++;}
1153 // if(discardedReads>0 || ambigReads>0){System.err.println("g*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size());}
1154 }
1155 }
1156
1157 if(trueIceCreamReads>0){
1158 if(discardedReads>0 || trimmedReads>0){
1159 truePositiveZMWsT++;
1160 }else if(ambigReads>0){
1161 ambiguousZMWs++;
1162 }else{
1163 falseNegativeZMWsT++;
1164 // StringBuilder sb=new StringBuilder();
1165 // for(Read r : reads) {sb.append("\n").append(r.id);}
1166 // System.err.println(sb);
1167 }
1168 }else{
1169 if(discardedReads>0){
1170 falsePositiveZMWsT++;
1171 }else if(ambigReads>0){
1172 ambiguousZMWs++;
1173 }else{
1174 trueNegativeZMWsT++;
1175 }
1176 }
1177
1178 if(rosg!=null && good!=null && !good.isEmpty()){rosg.add(good, 0);}
1179 if(rosa!=null && ambig!=null && !ambig.isEmpty()){rosa.add(ambig, 0);}
1180 if(rosb!=null && bad!=null && !bad.isEmpty()){rosb.add(bad, 0);}
1181 if(rosj!=null && junctions!=null && !junctions.isEmpty()){rosj.add(junctions, 0);}
1182 }
1183
1184 /**
1185 * Align part of a read to itself to look for inverted repeats.
1186 */
1187 AlignmentResult align(final Read r, boolean fullPass, int passes, int readNum){
1188 if(!alignRC){return null;}
1189 final byte[] bases=r.bases;
1190
1191 int qlen=(int)Tools.max(minQlen, Tools.min(targetQlen, bases.length*maxQlenFraction));
1192 if(qlen>0.45f*bases.length){return null;}//Ignore short stuff
1193
1194 //Perform an initial scan using the tips of the reads to look for an inverted repeat
1195 boolean tipOnly=filterIceCreamOnly && fullPass;
1196 // System.err.println(filterIceCreamOnly+", "+fullPass+", "+tipOnly);
1197 AlignmentResult a=alignLeft(bases, qlen, minRatio1, true, tipOnly);
1198 AlignmentResult b=(fullPass && false ? null : alignRight(bases, qlen, minRatio1, true, tipOnly));//A second alignment is not needed for a full pass.
1199 AlignmentResult ar=(a==null ? b : b==null ? a : a.maxScore>=b.maxScore ? a : b);
1200
1201 //If nothing was detected, return
1202 if(ar==null){return null;}
1203 ar.alignedRead=r;
1204
1205 //At this point, the read contains an inverted repeat of length qlen.
1206 final int expectedJunction=ar.rLen/2;
1207
1208 if(ar.left){
1209 ar.junctionLoc=ar.maxRpos/2;
1210 }else{
1211 int innerLeft=ar.maxRpos;
1212 int innerRight=ar.rLen-ar.qLen;
1213 ar.junctionLoc=(innerLeft+innerRight)/2;
1214 }
1215
1216 // if(fullPass){//This code doesn't seem to have any effect and it's not clear why it is present
1217 // int dif=Tools.absdif(expectedJunction, ar.junctionLoc);
1218 // if(dif>expectedJunction*0.1) {
1219 // if(filterIceCreamOnly){return ar;}
1220 // }
1221 // }
1222
1223 if(realign){
1224 if(ar.junctionLoc<expectedJunction){
1225 int qlen2=(int)(ar.junctionLoc*0.9);
1226 if(qlen2>=qlen){
1227 ar=alignLeft(bases, qlen2, minRatio2, false, false);
1228 }
1229 }else{
1230 int qlen2=(int)((ar.rLen-ar.junctionLoc)*0.9);
1231 if(qlen2>=qlen){
1232 ar=alignRight(bases, qlen2, minRatio2, false, false);
1233 }
1234 }
1235 if(ar==null){return null;}
1236 ar.alignedRead=r;
1237 }
1238
1239 //At this point, the read contains an inverted repeat mirrored across a junction.
1240 final float junctionFraction;
1241 if(ar.left){
1242 ar.junctionLoc=ar.maxRpos/2;
1243 junctionFraction=ar.junctionLoc/(float)ar.rLen;
1244 }else{
1245 int innerLeft=ar.maxRpos;
1246 int innerRight=ar.rLen-ar.qLen;
1247 ar.junctionLoc=(innerLeft+innerRight)/2;
1248 junctionFraction=(ar.rLen-ar.junctionLoc)/(float)ar.rLen;
1249 }
1250
1251 final int dif=Tools.absdif(expectedJunction, ar.junctionLoc);
1252 if(fullPass){
1253 if(junctionFraction<minJunctionFraction){
1254 ar.icecream=false;
1255 }else{
1256 ar.icecream=true;
1257 }
1258 }else if(passes==2){
1259 if(junctionFraction<minJunctionFraction){
1260 if(readNum==0){
1261 //First read, the junction should be closer to the beginning for real ice cream
1262 if(ar.junctionLoc>expectedJunction){
1263 ar.icecream=false;
1264 }else{
1265 ar.ambiguous=true;
1266 }
1267 }else{
1268 //Last read, the junction should be closer to the end for real ice cream
1269 assert(readNum==1) : readNum;
1270 if(ar.junctionLoc<expectedJunction){
1271 ar.icecream=false;
1272 }else{
1273 ar.ambiguous=true;
1274 }
1275 }
1276 return ar;
1277 }else{
1278 ar.icecream=true;
1279 }
1280 }else{
1281 if(junctionFraction<minJunctionFraction){
1282 ar.icecream=true;
1283 }else{
1284 ar.ambiguous=true;
1285 }
1286 }
1287 return ar;
1288 }
1289
1290 /** Align the left qlen bases to the rest of the read. */
1291 private AlignmentResult alignLeft(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){
1292 final byte[] query=Arrays.copyOfRange(bases, 0, qlen);
1293 AminoAcid.reverseComplementBasesInPlace(query);
1294 final AlignmentResult ar;
1295 final int rstop=bases.length-1;
1296 final int rstart=(tipOnly ? Tools.max(qlen, rstop-(int)(tipRatio*qlen)) : qlen);
1297 if(v2){
1298 // elapsedShortT-=System.nanoTime();
1299 ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio);
1300 // elapsedShortT+=System.nanoTime();
1301 }else{
1302 // elapsedT-=System.nanoTime();
1303 ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio);
1304 // elapsedT+=System.nanoTime();
1305 }
1306 if(ar!=null){ar.left=true;}
1307 return ar;
1308 }
1309
1310 /** Align the right qlen bases to the rest of the read. */
1311 private AlignmentResult alignRight(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){
1312 final byte[] query=Arrays.copyOfRange(bases, bases.length-qlen, bases.length);
1313 AminoAcid.reverseComplementBasesInPlace(query);
1314 final AlignmentResult ar;
1315 final int rstop=(tipOnly ? Tools.min((int)(tipRatio*qlen), bases.length-qlen-1) : bases.length-qlen-1);
1316 final int rstart=0;
1317 if(v2){
1318 // elapsedShortT-=System.nanoTime();
1319 ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio);
1320 // elapsedShortT+=System.nanoTime();
1321 }else{
1322 // elapsedT-=System.nanoTime();
1323 ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio);
1324 // elapsedT+=System.nanoTime();
1325 }
1326 if(ar!=null){ar.left=false;}
1327 return ar;
1328 }
1329
1330 /** Align the adapter sequence to the read, piecewise, to count matches. */
1331 private int lookForAdapter(Read r, double mult) {
1332 assert(!r.hasAdapter() && !r.tested());
1333 int begin=0;
1334 while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N'
1335 if(begin>=r.length()){return 0;}
1336
1337 int suspectThresh=(int)(minSwScoreSuspect*mult);
1338 int scoreThresh=(int)(minSwScore*mult);
1339
1340 // final byte[] array=npad(r.bases, pad ? npad : 0);
1341 final byte[] array=npad(r.bases, npad);
1342
1343 // assert(array==r.bases) : npad;
1344
1345 int lim=array.length-npad-stride;
1346
1347 int found=0;
1348
1349 int lastSuspect=-1;
1350 int lastConfirmed=-1;
1351 int maxScore=0;
1352
1353 // final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f));
1354 for(int i=begin; i<lim; i+=stride){
1355 int j=Tools.min(i+window, array.length-1);
1356 if(j-i<window){
1357 lim=0; //Last loop cycle
1358 // i=Tools.max(0, array.length-2*query1.length);
1359 }
1360
1361 {
1362
1363 int[] rvec;
1364 // if(timeless){//A speed-optimized version
1365 rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);
1366 // }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);}
1367 if(rvec!=null && rvec[0]>=suspectThresh){
1368 final int score=rvec[0];
1369 final int start=rvec[1];
1370 final int stop=rvec[2];
1371 assert(score>=suspectThresh);
1372 if((i==0 || start>i) && (j==array.length-1 || stop<j)){
1373 boolean kill=(score>=scoreThresh ||
1374 (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) ||
1375 (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance));
1376
1377 if(!kill && useLocality && array.length-stop>window){//Look ahead
1378 rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh);
1379 if(rvec!=null){
1380 if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;}
1381 else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;}
1382 }
1383 }
1384
1385 if(!kill && useAltMsa){//Try alternate msa
1386 rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh);
1387 if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;}
1388 }
1389
1390 if(kill){
1391 maxScore=Tools.max(maxScore, score);
1392 // if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));}
1393 // System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop);
1394 found++;
1395 for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';}
1396 // assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop;
1397 if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);}
1398 }
1399 }
1400 // System.out.println("Set lastSuspect="+stop+" on score "+score);
1401 if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);}
1402 }
1403 }
1404 }
1405
1406 r.setTested(true);
1407 if(found>0){
1408 r.setHasAdapter(true);
1409 r.setDiscarded(true);
1410 r.setAmbiguous(false);
1411
1412 int idx=(int)((maxScore*200.0)/maxSwScore);
1413 adapterScoresT[idx]++;
1414 }else{
1415 r.setHasAdapter(false);
1416 }
1417
1418 return found;
1419 }
1420
1421 /** Align the adapter sequence to the read ends, and trim if needed. */
1422 private int trimTerminalAdapters(Read r, double mult) {
1423 assert(!r.hasAdapter() && !r.tested());
1424 int begin=0;
1425 while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N'
1426 if(begin>=r.length()){return 0;}
1427
1428 int suspectThresh=(int)(minSwScoreSuspect*mult);
1429 int scoreThresh=(int)(minSwScore*mult);
1430
1431 // final byte[] array=npad(r.bases, pad ? npad : 0);
1432 final byte[] array=npad(r.bases, npad);
1433
1434 // assert(array==r.bases) : npad;
1435
1436 int lim=array.length-npad-stride;
1437
1438 int found=0;
1439
1440 int lastSuspect=-1;
1441 int lastConfirmed=-1;
1442 int maxScore=0;
1443
1444 // final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f));
1445 for(int i=begin; i<lim; i+=stride){
1446 int j=Tools.min(i+window, array.length-1);
1447 if(j-i<window){
1448 lim=0; //Last loop cycle
1449 // i=Tools.max(0, array.length-2*query1.length);
1450 }
1451
1452 {
1453
1454 int[] rvec;
1455 // if(timeless){//A speed-optimized version
1456 rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);
1457 // }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);}
1458 if(rvec!=null && rvec[0]>=suspectThresh){
1459 final int score=rvec[0];
1460 final int start=rvec[1];
1461 final int stop=rvec[2];
1462 assert(score>=suspectThresh);
1463 if((i==0 || start>i) && (j==array.length-1 || stop<j)){
1464 boolean kill=(score>=scoreThresh ||
1465 (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) ||
1466 (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance));
1467
1468 if(!kill && useLocality && array.length-stop>window){//Look ahead
1469 rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh);
1470 if(rvec!=null){
1471 if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;}
1472 else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;}
1473 }
1474 }
1475
1476 if(!kill && useAltMsa){//Try alternate msa
1477 rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh);
1478 if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;}
1479 }
1480
1481 if(kill){
1482 maxScore=Tools.max(maxScore, score);
1483 // if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));}
1484 // System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop);
1485 found++;
1486 for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';}
1487 // assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop;
1488 if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);}
1489 }
1490 }
1491 // System.out.println("Set lastSuspect="+stop+" on score "+score);
1492 if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);}
1493 }
1494 }
1495 }
1496
1497 r.setTested(true);
1498 if(found>0){
1499 r.setHasAdapter(true);
1500 r.setDiscarded(true);
1501 r.setAmbiguous(false);
1502
1503 int idx=(int)((maxScore*200.0)/maxSwScore);
1504 adapterScoresT[idx]++;
1505 }else{
1506 r.setHasAdapter(false);
1507 }
1508
1509 return found;
1510 }
1511
1512 private byte[] npad(final byte[] array, final int pad){
1513 if(pad<=0){return array;}
1514 final int len=array.length+2*pad;
1515 byte[] r=new byte[len];
1516 for(int i=0; i<pad; i++){r[i]='N';}
1517 for(int i=pad, j=0; j<array.length; i++, j++){r[i]=array[j];}
1518 for(int i=array.length+pad, limit=Tools.min(r.length, len+50); i<limit; i++){r[i]='N';}
1519 return r;
1520 }
1521
1522 /*--------------------------------------------------------------*/
1523 /*---------------- Inner Class Fields ----------------*/
1524 /*--------------------------------------------------------------*/
1525
1526 /** Number of reads processed by this thread */
1527 protected long readsProcessedT=0;
1528 /** Number of bases processed by this thread */
1529 protected long basesProcessedT=0;
1530
1531
1532 protected long truePositiveReadsT=0;
1533 protected long falsePositiveReadsT=0;
1534 protected long trueNegativeReadsT=0;
1535 protected long falseNegativeReadsT=0;
1536 protected long ambiguousReadsT=0;
1537
1538 protected long truePositiveZMWsT=0;
1539 protected long falsePositiveZMWsT=0;
1540 protected long trueNegativeZMWsT=0;
1541 protected long falseNegativeZMWsT=0;
1542 protected long ambiguousZMWsT=0;
1543
1544
1545 protected long elapsedT=0;
1546 protected long elapsedShortT=0;
1547
1548 //Unused
1549 protected long iceCreamReadsT=0;
1550 protected long iceCreamBasesT=0;
1551
1552 protected long iceCreamZMWsT=0;
1553 protected double iceCreamRatioT=0;
1554 protected long ratiosCountedT=0;
1555 protected long missingAdapterZMWsT=0;
1556 protected long hiddenAdapterZMWsT=0;
1557
1558 protected long basesTrimmedT=0;
1559 protected long readsTrimmedT=0;
1560
1561 /** Number of reads retained by this thread */
1562 protected long readsOutT=0;
1563 /** Number of bases retained by this thread */
1564 protected long basesOutT=0;
1565 /** Number of junctions detected in IR reads by this thread */
1566 protected long junctionsOutT=0;
1567
1568 protected long lowEntropyZMWsT=0;
1569 protected long lowEntropyReadsT=0;
1570
1571 /** True only if this thread has completed successfully */
1572 boolean success=false;
1573
1574 /** Shared output stream */
1575 private final ConcurrentReadOutputStream rosg;
1576 /** Shared output stream for ambiguous reads */
1577 private final ConcurrentReadOutputStream rosa;
1578 /** Shared output stream for bad reads */
1579 private final ConcurrentReadOutputStream rosb;
1580 /** Shared output stream for junctions */
1581 private final ConcurrentReadOutputStream rosj;
1582 /** Thread ID */
1583 final int tid;
1584
1585 /* Aligners for this thread */
1586
1587 /** For inverted repeat alignment */
1588 final IceCreamAligner ica=IceCreamAligner.makeAligner(32);
1589 /** For quickly aligning adapter sequence to whole read */
1590 final FlatAligner fla=new FlatAligner();
1591 // final MultiStateAligner9PacBioAdapter msa=alignAdapter ? new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS) : null;
1592 /** For slowly aligning adapter sequence sectionwise */
1593 final SingleStateAlignerPacBioAdapter ssa=(alignAdapter || trimReads) ? new SingleStateAlignerPacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS, adapter.length) : null;
1594 /** Alternate scoring for slow adapter alignment */
1595 final MultiStateAligner9PacBioAdapter2 msa2=(alignAdapter || trimReads) ? new MultiStateAligner9PacBioAdapter2() : null;
1596
1597 final ZMWStreamer zstream;
1598 final EntropyTracker eTracker;
1599
1600 final long[] adapterScoresT=new long[201];
1601 final long[] repeatScoresT=new long[201];
1602 final byte[] tipBufferLeft=new byte[adapterTipLen+adapterTipPad];
1603 final byte[] tipBufferRight=new byte[adapterTipLen+adapterTipPad];
1604 }
1605
1606 /*--------------------------------------------------------------*/
1607 /*---------------- Fields ----------------*/
1608 /*--------------------------------------------------------------*/
1609
1610 /** Primary input file path */
1611 private String in1=null;
1612
1613 /** Primary output file path */
1614 private String outg=null;
1615 /** Ambiguous output file path */
1616 private String outa=null;
1617 /** Bad output file path */
1618 private String outb=null;
1619 /** Junction output file path */
1620 private String outj=null;
1621 /** Stats (screen) output file path */
1622 private String outstats=null;
1623
1624 /** Adapter score ratio histogram */
1625 private String asrhist=null;
1626
1627 /** Inverted repeat score ratio histogram */
1628 private String irsrhist=null;
1629
1630 /** Override input file extension */
1631 private String extin=null;
1632 /** Override output file extension */
1633 private String extout=null;
1634
1635 private int targetQlen=352;
1636 private int minQlen=100;
1637
1638 /** Make a query at most this fraction of read length */
1639 private float maxQlenFraction=0.15f;
1640
1641 /** Exit alignment early if score drops below this.
1642 * An aggressive optimization that may miss some low-quality inverted repeats.
1643 * -700 seems safe. */
1644 private int minScore=-800;
1645
1646 /** Fraction of maximum alignment score to consider as matching for initial alignment */
1647 private float minRatio1=0.59f;
1648
1649 /** Fraction of maximum alignment score to consider as matching for realignment */
1650 private float minRatio2=0.64f;
1651
1652 private float adapterRatio=0.18f; //.18 for fa, or 0.57/0.6 for ica
1653 private float adapterRatio2=0.325f; //0.31f normal, 0.325 timeless
1654 private float suspectRatio=0.85f;
1655 private boolean useLocality=true;
1656 private boolean useAltMsa=true;
1657
1658
1659 private float tipRatio=1.5f;
1660
1661 private float longReadMult=1.5f;
1662
1663 private float shortReadMult=1.5f;
1664
1665 private float veryShortMult=0.35f;
1666
1667 /** Short half of inverted repeat must be at least this fraction of read length to be classed as a triangle */
1668 private float minJunctionFraction=0.4f;
1669
1670 /** Only filter triangle reads, not all inverted repeats */
1671 private boolean filterIceCreamOnly=true;
1672
1673 /** Align again once the junction position is provisionally identified */
1674 private boolean realign=true;
1675
1676 /** For internal read array transfers */
1677 private int queuelen=80;
1678
1679 /** For grading synthetic data */
1680 private boolean parseCustom;
1681
1682 /** Input reads are CCS (full pass) */
1683 private boolean CCS;
1684
1685 private boolean modifyHeader=false;
1686
1687 private boolean sendAmbigToGood=true;
1688 private boolean sendAmbigToBad=false;
1689 private boolean setAmbig=false;
1690
1691 //Note: These flags are very similar... they need to be better-defined or merged.
1692 private boolean keepZMWsTogether=false;
1693 private boolean keepShortReads=true;
1694
1695 private int format=FORMAT_TEXT;
1696 private static final int FORMAT_TEXT=1, FORMAT_JSON=2;
1697
1698 /*--------------------------------------------------------------*/
1699
1700 /** Alignment iterations for inverted repeat calculation with ref columns and query rows */
1701 protected long alignmentIters=0;
1702
1703 /** Alignment iterations for inverted repeat calculation with query columns and ref rows */
1704 protected long alignmentItersShort=0;
1705
1706 /** Time spent in long iterations */
1707 protected long elapsed=0;
1708
1709 /** Time spent in short iterations */
1710 protected long elapsedShort=0;
1711
1712 /** Print iteration time statistics */
1713 protected boolean printTiming=false;
1714
1715 /** Number of reads processed */
1716 protected long readsProcessed=0;
1717 /** Number of bases processed */
1718 protected long basesProcessed=0;
1719
1720 /** Number of reads retained */
1721 protected long readsOut=0;
1722 /** Number of bases retained */
1723 protected long basesOut=0;
1724 /** Number of junctions detected in IR reads */
1725 protected long junctionsOut=0;
1726
1727 /** Quit after processing this many input reads; -1 means no limit */
1728 private long maxReads=-1;
1729
1730 //Unused
1731 protected long iceCreamReads=0;
1732 protected long iceCreamBases=0;
1733
1734 /** ZMWs with discarded reads */
1735 protected long iceCreamZMWs=0;
1736
1737 /** Sum of IR alignment ratios for IR reads not containing adapters */
1738 protected double iceCreamRatio=0;
1739
1740 /** Number of ratios in iceCreamRatio */
1741 protected long ratiosCounted=0;
1742
1743 /** Histogram */
1744 protected final long[] adapterScores=new long[201];
1745
1746 /** Histogram */
1747 protected final long[] repeatScores=new long[201];
1748
1749 /** ZMWs with a triangle read but no hidden adapters */
1750 protected long missingAdapterZMWs=0;
1751
1752 /** ZMWs with hidden adapters */
1753 protected long hiddenAdapterZMWs=0;
1754
1755 protected long basesTrimmed=0;
1756 protected long readsTrimmed=0;
1757
1758 protected long lowEntropyZMWs=0;
1759 protected long lowEntropyReads=0;
1760
1761 /** Total ZMWs observed */
1762 protected long ZMWs=0;
1763
1764 protected long truePositiveReads=0;
1765 protected long falsePositiveReads=0;
1766 protected long trueNegativeReads=0;
1767 protected long falseNegativeReads=0;
1768 protected long ambiguousReads=0;
1769
1770 protected long truePositiveZMWs=0;
1771 protected long falsePositiveZMWs=0;
1772 protected long trueNegativeZMWs=0;
1773 protected long falseNegativeZMWs=0;
1774 protected long ambiguousZMWs=0;
1775
1776 protected final int stride;
1777 protected final int window;
1778 protected final int ALIGN_ROWS;
1779 protected final int ALIGN_COLUMNS;
1780
1781 private boolean timeless=true;
1782 protected final int maxSwScore;
1783 protected final int minSwScore;
1784 protected final int minSwScoreSuspect;
1785 protected final int maxImperfectSwScore;
1786 protected final int suspectMidpoint;
1787 protected final int suspectDistance=100;
1788 protected int npad=0; //This is for catching adapters on the tips, which are not very relevant to ice cream. Set to 35 for tip apdapters.
1789
1790 private byte[] adapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes(); //This one seems to be correct
1791 // private byte[] adapter="ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT".getBytes();
1792 private boolean alignAdapter=true;
1793 private boolean alignRC=true;
1794 private boolean flagLongReads=true;
1795 private boolean trimReads=true;
1796 private int minLengthAfterTrimming=40;
1797 private int adapterTipLen=100;
1798 private int adapterTipPad=35;
1799
1800 boolean trimPolyA=false;
1801
1802 /*--------------------------------------------------------------*/
1803 /*---------------- Entropy Fields ----------------*/
1804 /*--------------------------------------------------------------*/
1805
1806 /** Minimum entropy to be considered "complex", on a scale of 0-1 */
1807 float entropyCutoff=-1; //suggested 0.55
1808 /** Minimum length of a low-entropy block to fail a read */
1809 int entropyLength=350;
1810 /** Minimum length of a low-entropy block as a fraction of read length;
1811 * the minimum of the two will be used */
1812 float entropyFraction=0.5f;
1813
1814 float maxMonomerFraction=0.74f; //Suggested... 0.74
1815
1816 /*--------------------------------------------------------------*/
1817 /*---------------- Final Fields ----------------*/
1818 /*--------------------------------------------------------------*/
1819
1820 /** Primary input file */
1821 private final FileFormat ffin1;
1822
1823 /** Primary output file */
1824 private final FileFormat ffoutg;
1825
1826 /** Ambiguous output file */
1827 private final FileFormat ffouta;
1828
1829 /** Bad output file */
1830 private final FileFormat ffoutb;
1831
1832 /** Junction output file */
1833 private final FileFormat ffoutj;
1834
1835 /** Stats output file */
1836 private final FileFormat ffstats;
1837
1838 /** Adapter score ratio histogram */
1839 private final FileFormat ffasrhist;
1840
1841 /** Inverted repeat score ratio histogram */
1842 private final FileFormat ffirsrhist;
1843
1844 private final int threads;
1845
1846 /*--------------------------------------------------------------*/
1847 /*---------------- Common Fields ----------------*/
1848 /*--------------------------------------------------------------*/
1849
1850 /** Print status messages to this output stream */
1851 private PrintStream outstream=System.err;
1852 /** Print verbose messages */
1853 public static boolean verbose=false;
1854 /** True if an error was encountered */
1855 public boolean errorState=false;
1856 /** Overwrite existing output files */
1857 private boolean overwrite=false;
1858 /** Append to existing output files */
1859 private boolean append=false;
1860
1861 }