Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamFinder.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package icecream; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 | |
7 import aligner.AlignmentResult; | |
8 import aligner.FlatAligner; | |
9 import aligner.MultiStateAligner9PacBioAdapter2; | |
10 import aligner.SingleStateAlignerPacBioAdapter; | |
11 import dna.AminoAcid; | |
12 import dna.Data; | |
13 import fileIO.ByteFile; | |
14 import fileIO.ByteStreamWriter; | |
15 import fileIO.FileFormat; | |
16 import fileIO.ReadWrite; | |
17 import json.JsonObject; | |
18 import shared.Parse; | |
19 import shared.Parser; | |
20 import shared.PreParser; | |
21 import shared.ReadStats; | |
22 import shared.Shared; | |
23 import shared.Timer; | |
24 import shared.Tools; | |
25 import shared.TrimRead; | |
26 import stream.ConcurrentReadOutputStream; | |
27 import stream.FASTQ; | |
28 import stream.FastaReadInputStream; | |
29 import stream.Read; | |
30 import stream.SamLine; | |
31 import structures.ByteBuilder; | |
32 import structures.EntropyTracker; | |
33 | |
34 /** | |
35 * Detects inverted repeats in PacBio reads. | |
36 * Attempts to determine whether reads are chimeric | |
37 * due to a missing adapter. | |
38 * | |
39 * @author Brian Bushnell | |
40 * @date June 5, 2019 | |
41 * | |
42 */ | |
43 public final class IceCreamFinder { | |
44 | |
45 /*--------------------------------------------------------------*/ | |
46 /*---------------- Initialization ----------------*/ | |
47 /*--------------------------------------------------------------*/ | |
48 | |
49 /** | |
50 * Code entrance from the command line. | |
51 * @param args Command line arguments | |
52 */ | |
53 public static void main(String[] args){ | |
54 //Start a timer immediately upon code entrance. | |
55 Timer t=new Timer(); | |
56 | |
57 //Create an instance of this class | |
58 IceCreamFinder x=new IceCreamFinder(args); | |
59 | |
60 //Run the object | |
61 x.process(t); | |
62 | |
63 //Close the print stream if it was redirected | |
64 Shared.closeStream(x.outstream); | |
65 } | |
66 | |
67 /** | |
68 * Constructor. | |
69 * @param args Command line arguments | |
70 */ | |
71 public IceCreamFinder(String[] args){ | |
72 | |
73 {//Preparse block for help, config files, and outstream | |
74 PreParser pp=new PreParser(args, getClass(), false); | |
75 args=pp.args; | |
76 outstream=pp.outstream; | |
77 } | |
78 | |
79 Parser.setQuality(33); | |
80 | |
81 //Set shared static variables prior to parsing | |
82 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
83 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true; | |
84 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
85 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; | |
86 SamLine.SET_FROM_OK=true; | |
87 Shared.setBufferData(1000000); | |
88 // Shared.FASTA_WRAP=511; | |
89 Data.USE_SAMBAMBA=false;//Sambamba changes PacBio headers. | |
90 Read.CHANGE_QUALITY=false; | |
91 EntropyTracker.defaultK=3; | |
92 | |
93 {//Parse the arguments | |
94 final Parser parser=parse(args); | |
95 Parser.processQuality(); | |
96 | |
97 maxReads=parser.maxReads; | |
98 overwrite=ReadStats.overwrite=parser.overwrite; | |
99 append=ReadStats.append=parser.append; | |
100 | |
101 in1=parser.in1; | |
102 extin=parser.extin; | |
103 | |
104 if(outg==null){outg=parser.out1;} | |
105 extout=parser.extout; | |
106 } | |
107 | |
108 //Determine how many threads may be used | |
109 threads=Shared.threads(); | |
110 | |
111 //Ensure there is an input file | |
112 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} | |
113 fixExtensions(); //Add or remove .gz or .bz2 as needed | |
114 checkFileExistence(); //Ensure files can be read and written | |
115 checkStatics(); //Adjust file-related static fields as needed for this program | |
116 | |
117 //Create output FileFormat objects | |
118 ffoutg=FileFormat.testOutput(outg, FileFormat.FASTQ, extout, true, overwrite, append, false); | |
119 ffouta=FileFormat.testOutput(outa, FileFormat.FASTQ, extout, true, overwrite, append, false); | |
120 ffoutb=FileFormat.testOutput(outb, FileFormat.FASTQ, extout, true, overwrite, append, false); | |
121 ffoutj=FileFormat.testOutput(outj, FileFormat.FASTA, extout, true, overwrite, append, false); | |
122 ffstats=FileFormat.testOutput(outstats, FileFormat.TXT, null, false, overwrite, append, false); | |
123 ffasrhist=FileFormat.testOutput(asrhist, FileFormat.TXT, null, false, overwrite, append, false); | |
124 ffirsrhist=FileFormat.testOutput(irsrhist, FileFormat.TXT, null, false, overwrite, append, false); | |
125 | |
126 if(!setAmbig && ffouta!=null){ | |
127 sendAmbigToGood=sendAmbigToBad=false; | |
128 } | |
129 | |
130 //Create input FileFormat objects | |
131 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); | |
132 | |
133 final int alen=(adapter==null ? 20 : adapter.length); | |
134 stride=(int)(alen*1.9f); | |
135 window=(int)(alen*3.8f+10); | |
136 ALIGN_ROWS=alen+1; | |
137 ALIGN_COLUMNS=window+2; | |
138 | |
139 maxSwScore=aligner.MultiStateAligner9PacBioAdapter.maxQuality(alen); | |
140 minSwScore=(int)(maxSwScore*adapterRatio2); | |
141 minSwScoreSuspect=(int)(maxSwScore*Tools.min(adapterRatio2*suspectRatio, adapterRatio2-((1-suspectRatio)*.2f))); | |
142 maxImperfectSwScore=aligner.MultiStateAligner9PacBioAdapter.maxImperfectScore(alen); | |
143 suspectMidpoint=(minSwScoreSuspect+minSwScore)/2; | |
144 if(adapter==null){alignAdapter=false;} | |
145 } | |
146 | |
147 /*--------------------------------------------------------------*/ | |
148 /*---------------- Initialization Helpers ----------------*/ | |
149 /*--------------------------------------------------------------*/ | |
150 | |
151 /** Parse arguments from the command line */ | |
152 private Parser parse(String[] args){ | |
153 | |
154 //Create a parser object | |
155 Parser parser=new Parser(); | |
156 | |
157 //Set any necessary Parser defaults here | |
158 //parser.foo=bar; | |
159 | |
160 //Parse each argument | |
161 for(int i=0; i<args.length; i++){ | |
162 String arg=args[i]; | |
163 | |
164 //Break arguments into their constituent parts, in the form of "a=b" | |
165 String[] split=arg.split("="); | |
166 String a=split[0].toLowerCase(); | |
167 String b=split.length>1 ? split[1] : null; | |
168 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
169 | |
170 if(a.equals("verbose")){ | |
171 verbose=Parse.parseBoolean(b); | |
172 }else if(a.equals("format")){ | |
173 if(b==null){ | |
174 assert(false) : arg; | |
175 }else if(Tools.isDigit(b.charAt(i))){ | |
176 format=Integer.parseInt(b); | |
177 }else if(b.equalsIgnoreCase("json")){ | |
178 format=FORMAT_JSON; | |
179 }else if(b.equalsIgnoreCase("text")){ | |
180 format=FORMAT_TEXT; | |
181 }else{ | |
182 assert(false) : arg; | |
183 } | |
184 assert(format>=1 && format<=2) : arg; | |
185 }else if(a.equals("json")){ | |
186 boolean x=Parse.parseBoolean(b); | |
187 format=(x ? FORMAT_JSON : FORMAT_TEXT); | |
188 }else if(a.equals("ss") || a.equals("samstreamer") || a.equals("streamer")){ | |
189 if(b!=null && Tools.isDigit(b.charAt(0))){ | |
190 ZMWStreamer.useStreamer=true; | |
191 assert(Integer.parseInt(b)==1) : "ZMWStreamer threads currently capped at 1."; | |
192 // ZMWStreamer.streamerThreads=Tools.max(1, Integer.parseInt(b)); | |
193 }else{ | |
194 ZMWStreamer.useStreamer=Parse.parseBoolean(b); | |
195 } | |
196 }else if(a.equals("icecreamonly") || a.equals("ico")){ | |
197 filterIceCreamOnly=Parse.parseBoolean(b); | |
198 }else if(a.equals("keepshortreads") || a.equals("ksr")){ | |
199 keepShortReads=Parse.parseBoolean(b); | |
200 }else if(a.equalsIgnoreCase("keepzmwstogether") || a.equals("kzt") || a.equals("keepreadstogether") || a.equals("krt")){ | |
201 keepZMWsTogether=Parse.parseBoolean(b); | |
202 }else if(a.equalsIgnoreCase("samplerate")){ | |
203 float f=Float.parseFloat(b); | |
204 assert(false) : "TODO"; //TODO | |
205 }else if(a.equals("modifyheader") || a.equals("modifyheaders") || a.equals("changeheader") || a.equals("changeheaders")){ | |
206 modifyHeader=Parse.parseBoolean(b); | |
207 }else if(a.equalsIgnoreCase("ccs")){ | |
208 CCS=Parse.parseBoolean(b); | |
209 }else if(a.equals("npad")){ | |
210 npad=Integer.parseInt(b); | |
211 }else if(a.equals("minlength") || a.equals("minlen")){ | |
212 minLengthAfterTrimming=Integer.parseInt(b); | |
213 }else if(a.equals("realign")){ | |
214 realign=Parse.parseBoolean(b); | |
215 }else if(a.equals("aligninverse") || a.equals("alignrc") || a.equals("findicecream")){ | |
216 alignRC=Parse.parseBoolean(b); | |
217 }else if(a.equals("alignadapter")){ | |
218 alignAdapter=Parse.parseBoolean(b); | |
219 }else if(a.equals("timeless")){ | |
220 timeless=Parse.parseBoolean(b); | |
221 }else if(a.equals("flaglongreads")){ | |
222 flagLongReads=Parse.parseBoolean(b); | |
223 }else if(a.equals("trimreads") || a.equals("trim")){ | |
224 trimReads=Parse.parseBoolean(b); | |
225 }else if(a.equals("adapter")){ | |
226 adapter=b==null ? null : b.getBytes(); | |
227 }else if(a.equals("targetqlen") || a.equals("qlen")){ | |
228 targetQlen=Integer.parseInt(b); | |
229 }else if(a.equals("maxqlenfraction") || a.equals("maxfraction") || a.equals("qlenfraction")){ | |
230 maxQlenFraction=Float.parseFloat(b); | |
231 }else if(a.equals("junctionfraction") || a.equals("shortfraction")){ | |
232 minJunctionFraction=Float.parseFloat(b); | |
233 }else if(a.equals("minratio1") || a.equals("ratio1") || a.equals("id1")){ | |
234 minRatio1=Float.parseFloat(b); | |
235 }else if(a.equals("minratio2") || a.equals("ratio2") || a.equals("id2")){ | |
236 minRatio2=Float.parseFloat(b); | |
237 }else if(a.equals("minratio") || a.equals("ratio") || a.equals("id")){ | |
238 minRatio1=minRatio2=Float.parseFloat(b); | |
239 }else if(a.equals("adapterratio") || a.equals("adapterratio1") || a.equals("ratior") || a.equals("ratior1")){ | |
240 adapterRatio=Float.parseFloat(b); | |
241 }else if(a.equals("adapterratio2") || a.equals("ratior2")){ | |
242 adapterRatio2=Float.parseFloat(b); | |
243 }else if(a.equals("suspectratio")){ | |
244 suspectRatio=Float.parseFloat(b); | |
245 }else if(a.equals("minqlen")){ | |
246 minQlen=Integer.parseInt(b); | |
247 }else if(a.equals("minscore")){ | |
248 minScore=Integer.parseInt(b); | |
249 }else if(a.equals("parsecustom")){ | |
250 parseCustom=Parse.parseBoolean(b); | |
251 }else if(a.equals("printtiming") || a.equals("extended")){ | |
252 printTiming=Parse.parseBoolean(b); | |
253 }else if(a.equals("queuelen") || a.equals("qlen")){ | |
254 queuelen=Integer.parseInt(b); | |
255 }else if(a.equals("outg") || a.equals("outgood")){ | |
256 outg=b; | |
257 }else if(a.equals("outa") || a.equals("outambig")){ | |
258 outa=b; | |
259 }else if(a.equals("outb") || a.equals("outbad")){ | |
260 outb=b; | |
261 }else if(a.equals("outj") || a.equals("outjunctions") || a.equals("junctions")){ | |
262 outj=b; | |
263 }else if(a.equals("outs") || a.equals("outstats") || a.equals("stats")){ | |
264 outstats=b; | |
265 }else if(a.equals("asrhist") || a.equals("ahist")){ | |
266 asrhist=b; | |
267 }else if(a.equals("irsrhist") || a.equals("irhist")){ | |
268 irsrhist=b; | |
269 }else if(a.equals("ambig")){ | |
270 sendAmbigToGood=sendAmbigToBad=false; | |
271 if(b!=null){ | |
272 String[] split2=b.split(","); | |
273 for(String s2 : split2){ | |
274 if(s2.equalsIgnoreCase("good")){sendAmbigToGood=true;} | |
275 else if(s2.equalsIgnoreCase("bad") || s2.equalsIgnoreCase("toss")){sendAmbigToBad=true;} | |
276 else if(s2.equalsIgnoreCase("ambig")){} | |
277 else{assert(false) : "Bad argument: '"+s2+"' in '"+arg+"'; should be good or bad";} | |
278 } | |
279 } | |
280 setAmbig=true; | |
281 }else if(a.equalsIgnoreCase("trimpolya")){//Parse standard flags in the parser | |
282 trimPolyA=Parse.parseBoolean(b); | |
283 }else if(PolymerTrimmer.parse(arg, a, b)){ | |
284 //do nothing | |
285 }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter") || a.equals("efilter")){ | |
286 if(b==null || Character.isLetter(b.charAt(0))){ | |
287 if(Parse.parseBoolean(b)){ | |
288 entropyCutoff=0.55f; | |
289 }else{ | |
290 entropyCutoff=-1; | |
291 } | |
292 }else{ | |
293 entropyCutoff=Float.parseFloat(b); | |
294 } | |
295 }else if(a.equals("entropyblock") || a.equals("entropylength") || a.equals("entropylen") || a.equals("entlen")){ | |
296 entropyLength=Parse.parseIntKMG(b); | |
297 }else if(a.equals("entropyfraction") || a.equals("entfraction")){ | |
298 entropyFraction=Float.parseFloat(b); | |
299 }else if(a.equals("monomerfraction") || a.equals("maxmonomerfraction") || a.equals("mmf")){ | |
300 maxMonomerFraction=Float.parseFloat(b); | |
301 }else if(a.equals("parse_flag_goes_here")){ | |
302 long fake_variable=Parse.parseKMG(b); | |
303 //Set a variable here | |
304 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
305 //do nothing | |
306 }else{ | |
307 outstream.println("Unknown parameter "+args[i]); | |
308 assert(false) : "Unknown parameter "+args[i]; | |
309 } | |
310 } | |
311 | |
312 return parser; | |
313 } | |
314 | |
315 /** Add or remove .gz or .bz2 as needed */ | |
316 private void fixExtensions(){ | |
317 in1=Tools.fixExtension(in1); | |
318 } | |
319 | |
320 /** Ensure files can be read and written */ | |
321 private void checkFileExistence(){ | |
322 | |
323 //Ensure output files can be written | |
324 if(!Tools.testOutputFiles(overwrite, append, false, outg, outa, outb, outj, outstats, asrhist, irsrhist)){ | |
325 outstream.println((outg==null)+", "+(outb==null)+", "+outg+", "+outa+", "+outb+", "+outj+", "+outstats); | |
326 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+outg+", "+outa+", "+outb+", "+outj+"\n"); | |
327 } | |
328 | |
329 //Ensure input files can be read | |
330 if(!Tools.testInputFiles(false, true, in1)){ | |
331 throw new RuntimeException("\nCan't read some input files.\n"); | |
332 } | |
333 | |
334 //Ensure that no file was specified multiple times | |
335 if(!Tools.testForDuplicateFiles(true, in1, outg, outa, outb, outj, outstats, asrhist, irsrhist)){ | |
336 throw new RuntimeException("\nSome file names were specified multiple times.\n"); | |
337 } | |
338 } | |
339 | |
340 /** Adjust file-related static fields as needed for this program */ | |
341 private static void checkStatics(){ | |
342 //Adjust the number of threads for input file reading | |
343 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
344 ByteFile.FORCE_MODE_BF2=true; | |
345 } | |
346 | |
347 assert(FastaReadInputStream.settingsOK()); | |
348 } | |
349 | |
350 /*--------------------------------------------------------------*/ | |
351 /*---------------- Outer Methods ----------------*/ | |
352 /*--------------------------------------------------------------*/ | |
353 | |
354 /** Create read streams and process all data */ | |
355 void process(Timer t){ | |
356 | |
357 //Turn off read validation in the input threads to increase speed | |
358 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; | |
359 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; | |
360 | |
361 //Create a read input stream | |
362 ZMWStreamer zstream=new ZMWStreamer(ffin1, Shared.threads(), maxReads, -1); | |
363 | |
364 //Optionally create read output streams | |
365 final ConcurrentReadOutputStream rosg=makeCros(ffoutg); | |
366 final ConcurrentReadOutputStream rosa=makeCros(ffouta); | |
367 final ConcurrentReadOutputStream rosb=makeCros(ffoutb); | |
368 final ConcurrentReadOutputStream rosj=makeCros(ffoutj); | |
369 | |
370 //Reset counters | |
371 readsProcessed=readsOut=0; | |
372 basesProcessed=basesOut=0; | |
373 junctionsOut=0; | |
374 | |
375 //Process the reads in separate threads | |
376 spawnThreads(zstream, rosg, rosa, rosb, rosj); | |
377 | |
378 if(verbose){outstream.println("Finished; closing streams.");} | |
379 | |
380 //Write anything that was accumulated by ReadStats | |
381 errorState|=ReadStats.writeAll(); | |
382 //Close the read streams | |
383 errorState|=ReadWrite.closeStreams(null, rosg, rosa, rosb, rosj); | |
384 | |
385 //Reset read validation | |
386 Read.VALIDATE_IN_CONSTRUCTOR=vic; | |
387 | |
388 writeScoreRatioHistogram(ffasrhist, adapterScores); | |
389 writeScoreRatioHistogram(ffirsrhist, repeatScores); | |
390 | |
391 //Report timing and results | |
392 t.stop(); | |
393 | |
394 String stats=null; | |
395 if(format==FORMAT_TEXT){ | |
396 ByteBuilder bb=toText(t); | |
397 stats=bb.nl().toString(); | |
398 }else if(format==FORMAT_JSON){ | |
399 JsonObject jo=toJson(t); | |
400 stats=jo.toStringln(); | |
401 }else{ | |
402 assert(false) : "Bad format: "+format; | |
403 } | |
404 if(ffstats==null){ | |
405 outstream.print(stats); | |
406 }else{ | |
407 ReadWrite.writeString(stats, outstats); | |
408 } | |
409 | |
410 //Throw an exception of there was an error in a thread | |
411 if(errorState){ | |
412 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
413 } | |
414 } | |
415 | |
416 private ByteBuilder toText(Timer t){ | |
417 ByteBuilder bb=new ByteBuilder(); | |
418 | |
419 bb.appendln(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
420 bb.appendln(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); | |
421 | |
422 long readsFiltered=readsProcessed-readsOut; | |
423 bb.appendln(Tools.numberPercent("Reads Filtered:", readsFiltered, readsFiltered*100.0/(readsProcessed), 3, 8)); | |
424 if(trimReads || trimPolyA){ | |
425 bb.appendln(Tools.numberPercent("Reads Trimmed:", readsTrimmed, readsTrimmed*100.0/(readsProcessed), 3, 8)); | |
426 bb.appendln(Tools.numberPercent("Bases Trimmed:", basesTrimmed, basesTrimmed*100.0/(basesProcessed), 3, 8)); | |
427 } | |
428 bb.appendln(Tools.number("Total ZMWs:", ZMWs, 8)); | |
429 bb.appendln(Tools.numberPercent("Bad ZMWs:", iceCreamZMWs, iceCreamZMWs*100.0/(ZMWs), 3, 8)); | |
430 bb.appendln(Tools.numberPercent("Absent Adapter:", missingAdapterZMWs, missingAdapterZMWs*100.0/(ZMWs), 3, 8)); | |
431 bb.appendln(Tools.numberPercent("Hidden Adapter:", hiddenAdapterZMWs, hiddenAdapterZMWs*100.0/(ZMWs), 3, 8)); | |
432 bb.appendln(Tools.numberPercent("Ambiguous IR:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 3, 8)); | |
433 // bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyReads, lowEntropyReads*100.0/(readsProcessed), 3, 8)); | |
434 bb.appendln(Tools.numberPercent("Low Entropy:", lowEntropyZMWs, lowEntropyZMWs*100.0/(ZMWs), 3, 8)); | |
435 | |
436 bb.appendln(Tools.number("Avg Score Ratio:", iceCreamRatio/ratiosCounted, 3, 8)); | |
437 bb.appendln(Tools.number("Score Cutoff:", minRatio2, 3, 8)); | |
438 | |
439 if(printTiming){ | |
440 bb.appendln("Iterations: "+alignmentIters/1000000+"m"); | |
441 bb.appendln("Short Iterations: "+alignmentItersShort/1000000+"m"); | |
442 bb.appendln("Elapsed: "+elapsed/1000000+"ms"); | |
443 bb.appendln("Elapsed Short: "+elapsedShort/1000000+"ms"); | |
444 } | |
445 | |
446 if(parseCustom){ | |
447 { | |
448 bb.appendln("\nReads:"); | |
449 bb.appendln(Tools.numberPercent("True Positive:", truePositiveReads, truePositiveReads*100.0/(readsProcessed), 2, 8)); | |
450 bb.appendln(Tools.numberPercent("True Negative:", trueNegativeReads, trueNegativeReads*100.0/(readsProcessed), 2, 8)); | |
451 bb.appendln(Tools.numberPercent("False Positive:", falsePositiveReads, falsePositiveReads*100.0/(readsProcessed), 2, 8)); | |
452 bb.appendln(Tools.numberPercent("False Negative:", falseNegativeReads, falseNegativeReads*100.0/(readsProcessed), 2, 8)); | |
453 bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousReads, ambiguousReads*100.0/(readsProcessed), 2, 8)); | |
454 | |
455 double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads); | |
456 snr=10*Math.log10(snr); | |
457 bb.appendln(Tools.number("SNR:", snr, 2, 8)); | |
458 } | |
459 { | |
460 bb.appendln("\nZMWs:"); | |
461 bb.appendln(Tools.numberPercent("True Positive:", truePositiveZMWs, truePositiveZMWs*100.0/(ZMWs), 2, 8)); | |
462 bb.appendln(Tools.numberPercent("True Negative:", trueNegativeZMWs, trueNegativeZMWs*100.0/(ZMWs), 2, 8)); | |
463 bb.appendln(Tools.numberPercent("False Positive:", falsePositiveZMWs, falsePositiveZMWs*100.0/(ZMWs), 2, 8)); | |
464 bb.appendln(Tools.numberPercent("False Negative:", falseNegativeZMWs, falseNegativeZMWs*100.0/(ZMWs), 2, 8)); | |
465 bb.appendln(Tools.numberPercent("Ambiguous:", ambiguousZMWs, ambiguousZMWs*100.0/(ZMWs), 2, 8)); | |
466 | |
467 double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousReads)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs); | |
468 snr=10*Math.log10(snr); | |
469 bb.appendln(Tools.number("SNR:", snr, 2, 8)); | |
470 } | |
471 } | |
472 return bb; | |
473 } | |
474 | |
475 private JsonObject toJson(Timer t){ | |
476 JsonObject jo=new JsonObject(); | |
477 long readsFiltered=readsProcessed-readsOut; | |
478 | |
479 jo.add("Time", t.timeInSeconds()); | |
480 jo.add("Reads_Processed", readsProcessed); | |
481 jo.add("Bases_Processed", basesProcessed); | |
482 jo.add("Reads_Out", readsOut); | |
483 jo.add("Bases_Out", basesOut); | |
484 jo.add("Reads_Filtered", readsFiltered); | |
485 jo.add("Reads_Filtered_Pct", readsFiltered*100.0/(readsProcessed)); | |
486 if(trimReads){ | |
487 jo.add("Reads_Trimmed", readsTrimmed); | |
488 jo.add("Reads_Trimmed_Pct", readsTrimmed*100.0/(readsProcessed)); | |
489 jo.add("Bases_Trimmed", basesTrimmed); | |
490 jo.add("Bases_Trimmed_Pct", basesTrimmed*100.0/(basesProcessed)); | |
491 } | |
492 jo.add("Total_ZMWs", ZMWs); | |
493 jo.add("Bad_ZMWs", iceCreamZMWs); | |
494 jo.add("Bad_ZMWs_Pct", iceCreamZMWs*100.0/(ZMWs)); | |
495 jo.add("Absent_Adapter", missingAdapterZMWs); | |
496 jo.add("Absent_Adapter_Pct", missingAdapterZMWs*100.0/(ZMWs)); | |
497 jo.add("Hidden_Adapter", hiddenAdapterZMWs); | |
498 jo.add("Hidden_Adapter_Pct", hiddenAdapterZMWs*100.0/(ZMWs)); | |
499 // jo.add("Low_Entropy", lowEntropyReads); | |
500 // jo.add("Low_Entropy_Pct", lowEntropyReads*100.0/(readsProcessed)); | |
501 jo.add("Low_Entropy", lowEntropyZMWs); | |
502 jo.add("Low_Entropy_Pct", lowEntropyZMWs*100.0/(ZMWs)); | |
503 jo.add("Ambiguous_Inverted_Repeat", ambiguousZMWs); | |
504 jo.add("Ambiguous_Inverted_Repeat_Pct", ambiguousZMWs*100.0/(ZMWs)); | |
505 jo.add("Avg_Score_Ratio_IR", iceCreamRatio/ratiosCounted); | |
506 jo.add("Score_Cutoff_IR", minRatio2); | |
507 | |
508 jo.add("Alignment_Iterations_IR", alignmentIters); | |
509 jo.add("Short_Alignment_Iterations_IR", alignmentItersShort); | |
510 | |
511 if(parseCustom){ | |
512 { | |
513 double snr=(truePositiveReads+trueNegativeReads+ambiguousReads)/Tools.max(1, falsePositiveReads+falseNegativeReads+ambiguousReads); | |
514 snr=10*Math.log10(snr); | |
515 jo.add("TP_Reads", truePositiveReads); | |
516 jo.add("TN_Reads", trueNegativeReads); | |
517 jo.add("FP_Reads", falsePositiveReads); | |
518 jo.add("FN_Reads", falseNegativeReads); | |
519 jo.add("AM_Reads", ambiguousReads); | |
520 | |
521 jo.add("TP_Reads_Pct", truePositiveReads*100.0/(readsProcessed)); | |
522 jo.add("TN_Reads_Pct", trueNegativeReads*100.0/(readsProcessed)); | |
523 jo.add("FP_Reads_Pct", falsePositiveReads*100.0/(readsProcessed)); | |
524 jo.add("FN_Reads_Pct", falseNegativeReads*100.0/(readsProcessed)); | |
525 jo.add("AM_Reads_Pct", ambiguousReads*100.0/(readsProcessed)); | |
526 | |
527 jo.add("SNR_Reads", snr); | |
528 } | |
529 { | |
530 double snr=(truePositiveZMWs+trueNegativeZMWs+ambiguousZMWs)/Tools.max(1, falsePositiveZMWs+falseNegativeZMWs+ambiguousZMWs); | |
531 snr=10*Math.log10(snr); | |
532 jo.add("TP_ZMWs", truePositiveZMWs); | |
533 jo.add("TN_ZMWs", trueNegativeZMWs); | |
534 jo.add("FP_ZMWs", falsePositiveZMWs); | |
535 jo.add("FN_ZMWs", falseNegativeZMWs); | |
536 jo.add("AM_ZMWs", ambiguousZMWs); | |
537 | |
538 jo.add("TP_ZMWs_Pct", truePositiveZMWs*100.0/(ZMWs)); | |
539 jo.add("TN_ZMWs_Pct", trueNegativeZMWs*100.0/(ZMWs)); | |
540 jo.add("FP_ZMWs_Pct", falsePositiveZMWs*100.0/(ZMWs)); | |
541 jo.add("FN_ZMWs_Pct", falseNegativeZMWs*100.0/(ZMWs)); | |
542 jo.add("AM_ZMWs_Pct", ambiguousZMWs*100.0/(ZMWs)); | |
543 | |
544 jo.add("SNR_ZMWs", snr); | |
545 } | |
546 } | |
547 return jo; | |
548 } | |
549 | |
550 private static void writeScoreRatioHistogram(FileFormat ff, long[] hist){ | |
551 if(ff==null){return;} | |
552 final ByteStreamWriter bsw=new ByteStreamWriter(ff); | |
553 bsw.start(); | |
554 final float mult=1.0f/(hist.length-1); | |
555 | |
556 bsw.print("#Counted\t").println(Tools.sum(hist)); | |
557 bsw.print("#Mean\t").println(Tools.averageHistogram(hist)*mult, 3); | |
558 bsw.print("#Median\t").println(Tools.medianHistogram(hist)*mult, 3); | |
559 bsw.print("#Mode\t").println(Tools.calcModeHistogram(hist)*mult, 3); | |
560 bsw.print("#STDev\t").println(Tools.standardDeviationHistogram(hist)*mult, 3); | |
561 bsw.print("#Value\tOccurances\n"); | |
562 | |
563 for(int i=0; i<hist.length; i++){ | |
564 bsw.print(i*mult, 3).tab().println(hist[i]); | |
565 } | |
566 bsw.poisonAndWait(); | |
567 } | |
568 | |
569 private ConcurrentReadOutputStream makeCros(FileFormat ff){ | |
570 if(ff==null){return null;} | |
571 | |
572 //Select output buffer size based on whether it needs to be ordered | |
573 final int buff=16; | |
574 | |
575 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, buff, null, ff.samOrBam() && ffin1.samOrBam()); | |
576 ros.start(); //Start the stream | |
577 return ros; | |
578 } | |
579 | |
580 /** Spawn process threads */ | |
581 private void spawnThreads(final ZMWStreamer zstream, | |
582 final ConcurrentReadOutputStream rosg, final ConcurrentReadOutputStream rosa, | |
583 final ConcurrentReadOutputStream rosb, final ConcurrentReadOutputStream rosj){ | |
584 | |
585 //Do anything necessary prior to processing | |
586 | |
587 //Fill a list with ProcessThreads | |
588 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
589 for(int i=0; i<threads; i++){ | |
590 alpt.add(new ProcessThread(zstream, rosg, rosa, rosb, rosj, i)); | |
591 } | |
592 | |
593 //Start the threads | |
594 for(ProcessThread pt : alpt){ | |
595 pt.start(); | |
596 } | |
597 | |
598 zstream.runStreamer(false); | |
599 | |
600 //Wait for threads to finish | |
601 waitForThreads(alpt); | |
602 | |
603 //Do anything necessary after processing | |
604 ZMWs=zstream.ZMWs; | |
605 } | |
606 | |
607 private void waitForThreads(ArrayList<ProcessThread> alpt){ | |
608 | |
609 //Wait for completion of all threads | |
610 boolean success=true; | |
611 for(ProcessThread pt : alpt){ | |
612 | |
613 //Wait until this thread has terminated | |
614 while(pt.getState()!=Thread.State.TERMINATED){ | |
615 try { | |
616 //Attempt a join operation | |
617 pt.join(); | |
618 } catch (InterruptedException e) { | |
619 //Potentially handle this, if it is expected to occur | |
620 e.printStackTrace(); | |
621 } | |
622 } | |
623 | |
624 //Accumulate per-thread statistics | |
625 readsProcessed+=pt.readsProcessedT; | |
626 basesProcessed+=pt.basesProcessedT; | |
627 | |
628 truePositiveReads+=pt.truePositiveReadsT; | |
629 trueNegativeReads+=pt.trueNegativeReadsT; | |
630 falsePositiveReads+=pt.falsePositiveReadsT; | |
631 falseNegativeReads+=pt.falseNegativeReadsT; | |
632 ambiguousReads+=pt.ambiguousReadsT; | |
633 | |
634 truePositiveZMWs+=pt.truePositiveZMWsT; | |
635 trueNegativeZMWs+=pt.trueNegativeZMWsT; | |
636 falsePositiveZMWs+=pt.falsePositiveZMWsT; | |
637 falseNegativeZMWs+=pt.falseNegativeZMWsT; | |
638 ambiguousZMWs+=pt.ambiguousZMWsT; | |
639 | |
640 readsOut+=pt.readsOutT; | |
641 basesOut+=pt.basesOutT; | |
642 junctionsOut+=pt.junctionsOutT; | |
643 | |
644 alignmentIters+=pt.ica.iters(); | |
645 alignmentItersShort+=pt.ica.itersShort(); | |
646 elapsed+=pt.elapsedT; | |
647 elapsedShort+=pt.elapsedShortT; | |
648 | |
649 iceCreamReads+=pt.iceCreamReadsT; | |
650 iceCreamBases+=pt.iceCreamBasesT; | |
651 iceCreamZMWs+=pt.iceCreamZMWsT; | |
652 iceCreamRatio+=pt.iceCreamRatioT; | |
653 ratiosCounted+=pt.ratiosCountedT; | |
654 missingAdapterZMWs+=pt.missingAdapterZMWsT; | |
655 hiddenAdapterZMWs+=pt.hiddenAdapterZMWsT; | |
656 lowEntropyZMWs+=pt.lowEntropyZMWsT; | |
657 lowEntropyReads+=pt.lowEntropyReadsT; | |
658 | |
659 basesTrimmed+=pt.basesTrimmedT; | |
660 readsTrimmed+=pt.readsTrimmedT; | |
661 | |
662 for(int i=0; i<adapterScores.length; i++){ | |
663 adapterScores[i]+=pt.adapterScoresT[i]; | |
664 repeatScores[i]+=pt.repeatScoresT[i]; | |
665 } | |
666 | |
667 success&=pt.success; | |
668 } | |
669 | |
670 //Track whether any threads failed | |
671 if(!success){errorState=true;} | |
672 } | |
673 | |
674 /*--------------------------------------------------------------*/ | |
675 /*---------------- Inner Methods ----------------*/ | |
676 /*--------------------------------------------------------------*/ | |
677 | |
678 /*--------------------------------------------------------------*/ | |
679 /*---------------- Inner Classes ----------------*/ | |
680 /*--------------------------------------------------------------*/ | |
681 | |
682 /** Processes reads. */ | |
683 private class ProcessThread extends Thread { | |
684 | |
685 //Constructor | |
686 ProcessThread(final ZMWStreamer zstream_, | |
687 final ConcurrentReadOutputStream ros_, final ConcurrentReadOutputStream rosa_, | |
688 final ConcurrentReadOutputStream rosb_, final ConcurrentReadOutputStream rosj_, final int tid_){ | |
689 zstream=zstream_; | |
690 rosg=ros_; | |
691 rosa=rosa_; | |
692 rosb=rosb_; | |
693 rosj=rosj_; | |
694 tid=tid_; | |
695 | |
696 Arrays.fill(tipBufferLeft, (byte)'N'); | |
697 Arrays.fill(tipBufferRight, (byte)'N'); | |
698 | |
699 if(entropyCutoff>=0){ | |
700 eTracker=new EntropyTracker(false, entropyCutoff, true); | |
701 }else{ | |
702 eTracker=null; | |
703 } | |
704 } | |
705 | |
706 //Called by start() | |
707 @Override | |
708 public void run(){ | |
709 //Do anything necessary prior to processing | |
710 | |
711 //Process the reads | |
712 processInner(); | |
713 | |
714 //Do anything necessary after processing | |
715 | |
716 //Indicate successful exit status | |
717 success=true; | |
718 } | |
719 | |
720 /** Iterate through the reads */ | |
721 void processInner(){ | |
722 | |
723 //As long as there is a nonempty read list... | |
724 for(ZMW reads=zstream.nextZMW(); reads!=null; reads=zstream.nextZMW()){ | |
725 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access | |
726 | |
727 processList(reads); | |
728 } | |
729 } | |
730 | |
731 int flagLowEntropyReads(final ZMW reads, final float minEnt, | |
732 final int minLen0, final float minFract){ | |
733 int found=0; | |
734 for(Read r : reads){ | |
735 if(!r.discarded()){ | |
736 int minLen=Tools.min((int)(r.length()*minFract), minLen0); | |
737 int maxBlock=eTracker.longestLowEntropyBlock(r.bases, false, maxMonomerFraction); | |
738 if(maxBlock>=minLen){ | |
739 r.setDiscarded(true); | |
740 r.setJunk(true); | |
741 found++; | |
742 // System.err.println(r.toFasta()); | |
743 } | |
744 } | |
745 } | |
746 return found; | |
747 } | |
748 | |
749 int flagLongReads(final ZMW reads, int median){ | |
750 int found=0; | |
751 for(Read r : reads){ | |
752 if(r.length()>longReadMult*median){ | |
753 r.setDiscarded(true); | |
754 r.setHasAdapter(true); | |
755 found++; | |
756 } | |
757 } | |
758 return found; | |
759 } | |
760 | |
761 /** Each list is presumed to be all reads from a ZMW, in order */ | |
762 void processList(final ZMW reads){ | |
763 long numBases=0; | |
764 | |
765 //Loop through each read in the list for statistics | |
766 for(int idx=0; idx<reads.size(); idx++){ | |
767 final Read r1=reads.get(idx); | |
768 | |
769 //Validate reads in worker threads | |
770 if(!r1.validated()){r1.validate(true);} | |
771 | |
772 //Track the initial length for statistics | |
773 final int initialLength1=r1.length(); | |
774 | |
775 //Increment counters | |
776 readsProcessedT++; | |
777 basesProcessedT+=initialLength1; | |
778 numBases+=initialLength1; | |
779 } | |
780 final boolean fullPass=CCS || reads.size()>=3; | |
781 | |
782 if(trimReads || trimPolyA){ | |
783 int removed=0; | |
784 for(int i=0; i<reads.size(); i++){ | |
785 Read r=reads.get(i); | |
786 byte a=r.bases[0], b=r.bases[r.length()-1]; | |
787 int trimmed=0; | |
788 if(!AminoAcid.isFullyDefined(a) || !AminoAcid.isFullyDefined(b)){ | |
789 trimmed+=trimRead(r, 80); | |
790 } | |
791 | |
792 if(trimReads && adapter!=null){ | |
793 int leftAdapterBases=alignLeftTipAdapter(r); | |
794 int rightAdapterBases=alignRightTipAdapter(r); | |
795 if(leftAdapterBases+rightAdapterBases>0){ | |
796 trimmed+=trimRead(r, adapterTipLen); | |
797 r.setTrimmed(true); | |
798 } | |
799 } | |
800 if(trimPolyA){ | |
801 int x=trimPolyA(r); | |
802 trimmed+=x; | |
803 if(x>0){r.setTrimmed(true);} | |
804 } | |
805 | |
806 if(trimmed>0){ | |
807 basesTrimmedT+=trimmed; | |
808 readsTrimmedT++; | |
809 } | |
810 | |
811 //TODO: Note again, removing intermediate reads messes up forward-rev ordering | |
812 if(r.length()<minLengthAfterTrimming){//Discard short trash | |
813 reads.set(i, null); | |
814 removed++; | |
815 } | |
816 } | |
817 if(removed>0){ | |
818 Tools.condenseStrict(reads); | |
819 } | |
820 } | |
821 | |
822 if(entropyCutoff>0){ | |
823 int bad=flagLowEntropyReads(reads, entropyCutoff, entropyLength, entropyFraction); | |
824 if(bad>0){ | |
825 lowEntropyZMWsT++; | |
826 lowEntropyReadsT+=bad; | |
827 if(bad>=reads.size()){ | |
828 if(!reads.isEmpty()){outputReads(reads, null);} | |
829 return; //No point in continuing... | |
830 } | |
831 } | |
832 } | |
833 | |
834 if(reads.isEmpty()){return;} | |
835 | |
836 Read sample=null;//Read that will be searched for inverted repeat, typically median length | |
837 Read shortest=null;//shortest read in the middle, or overall if no middle reads | |
838 final int medianLength=reads.medianLength(true); | |
839 boolean foundInvertedRepeat=false; | |
840 int longReads=0; | |
841 int adapterReads=0; | |
842 int maxAdapters=0; | |
843 int sampleNum=0; | |
844 | |
845 if(reads.size()>=3){ | |
846 if(flagLongReads){longReads=flagLongReads(reads, medianLength);} | |
847 for(int i=1; i<reads.size()-1; i++){ | |
848 Read r=reads.get(i); | |
849 if(sample==null && r.length()==medianLength){ | |
850 sample=r; | |
851 sampleNum=i; | |
852 } | |
853 if(shortest==null || r.length()<shortest.length()){shortest=r;} | |
854 } | |
855 }else{ | |
856 for(int i=0; i<reads.size(); i++){ | |
857 Read r=reads.get(i); | |
858 if(sample==null || sample.length()<r.length()){ | |
859 sample=r; | |
860 sampleNum=i; | |
861 } | |
862 if(shortest==null || r.length()<shortest.length()){shortest=r;} | |
863 } | |
864 } | |
865 assert(sample!=null); | |
866 final AlignmentResult ar=align(sample, fullPass, reads.size(), sampleNum); | |
867 | |
868 if(ar!=null){ | |
869 foundInvertedRepeat=true; | |
870 sample.setInvertedRepeat(true); | |
871 if(ar.icecream || !filterIceCreamOnly){ | |
872 sample.setDiscarded(true); | |
873 }else if(ar.ambiguous){ | |
874 sample.setAmbiguous(true); | |
875 } | |
876 } | |
877 | |
878 if(alignAdapter){ | |
879 double mult=foundInvertedRepeat ? 0.9 : 1.0; | |
880 if(needsAdapterTest(sample)){ | |
881 int x=lookForAdapter(sample, mult); | |
882 adapterReads+=(x>0 ? 1 : 0); | |
883 maxAdapters=Tools.max(x, maxAdapters); | |
884 } | |
885 | |
886 if(reads.size()>2){ | |
887 Read a=reads.get(0), b=reads.get(reads.size()-1); | |
888 Read r=a.length()>b.length() ? a : b; | |
889 if(needsAdapterTest(r)){ | |
890 int x=lookForAdapter(r, mult); | |
891 adapterReads+=(x>0 ? 1 : 0); | |
892 maxAdapters=Tools.max(x, maxAdapters); | |
893 } | |
894 } | |
895 | |
896 for(Read r : reads){ | |
897 if((r.length()>=shortReadMult*shortest.length() || adapterReads>0 || longReads>0 || foundInvertedRepeat) | |
898 && needsAdapterTest(r)){ | |
899 int x=lookForAdapter(r, mult); | |
900 adapterReads+=(x>0 ? 1 : 0); | |
901 maxAdapters=Tools.max(x, maxAdapters); | |
902 } | |
903 } | |
904 } | |
905 | |
906 if(ar!=null && ar.icecream){ | |
907 iceCreamRatioT+=ar.ratio; | |
908 ratiosCountedT++; | |
909 int idx=(int)(ar.ratio*200); | |
910 repeatScoresT[idx]++; | |
911 if(longReads+adapterReads==0){missingAdapterZMWsT++;} | |
912 } | |
913 if(longReads+adapterReads>0){ | |
914 hiddenAdapterZMWsT++; | |
915 } | |
916 | |
917 if(keepShortReads && maxAdapters<2){ | |
918 if(foundInvertedRepeat && !sample.hasAdapter()){ | |
919 if(reads.size()>2){ | |
920 for(int i=1; i<reads.size()-1; i++){ | |
921 reads.get(i).setDiscarded(true); | |
922 } | |
923 Read r=reads.get(0); | |
924 if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);} | |
925 r=reads.get(reads.size()-1); | |
926 if(r.length()>=veryShortMult*medianLength){r.setDiscarded(true);} | |
927 }else if(reads.size()==2){ | |
928 for(Read r : reads){ | |
929 if(r.length()>=veryShortMult*sample.length()){ | |
930 if(ar.icecream){ | |
931 r.setDiscarded(true); | |
932 }else if(ar.ambiguous){ | |
933 r.setAmbiguous(true); | |
934 } | |
935 } | |
936 } | |
937 } | |
938 } | |
939 }else if(sample.discarded() || (longReads+adapterReads>0)){ | |
940 for(Read r : reads){ | |
941 r.setDiscarded(true); | |
942 } | |
943 } | |
944 | |
945 ArrayList<Read> junctions=null; | |
946 if(ar!=null){ | |
947 if(rosj!=null && !sample.hasAdapter()){ | |
948 Read r=ar.alignedRead; | |
949 int width=Tools.min(200, ar.junctionLoc, r.length()-ar.junctionLoc); | |
950 int a=ar.junctionLoc-width, b=ar.junctionLoc+width; | |
951 byte[] bases=Arrays.copyOfRange(r.bases, a, b); | |
952 Read junction=new Read(bases, null, r.id+"\tjunction:"+a+"-"+b, r.numericID); | |
953 junctions=new ArrayList<Read>(1); | |
954 junctions.add(junction); | |
955 junctionsOutT++; | |
956 } | |
957 if(modifyHeader){ | |
958 sample.id=sample.id+"\tratio="+ar.ratio+"\tjunction="+ar.junctionLoc+ | |
959 "\tIR="+sample.invertedRepeat()+"\tAD="+sample.hasAdapter()+"\tFP="+fullPass+"\tsubreads="+reads.size(); | |
960 } | |
961 } | |
962 | |
963 removeShortTrash(reads); | |
964 | |
965 if(!reads.isEmpty()){outputReads(reads, junctions);} | |
966 } | |
967 | |
968 //TODO: Now that I think about it. The order of the reads is important. | |
969 //Since they go forward-rev-forward-rev it's imprudent to discard inner reads. | |
970 private void removeShortTrash(ZMW reads) { | |
971 int removed=0; | |
972 for(int i=0; i<reads.size(); i++){ | |
973 Read r=reads.get(i); | |
974 if(r.length()<minLengthAfterTrimming){//Discard short trash | |
975 reads.set(i, null); | |
976 removed++; | |
977 } | |
978 } | |
979 if(removed>0){Tools.condenseStrict(reads);} | |
980 } | |
981 | |
982 int trimRead(Read r, int lookahead){ | |
983 final byte[] bases=r.bases; | |
984 | |
985 int left=calcLeftTrim(bases, lookahead); | |
986 int right=calcRightTrim(bases, lookahead); | |
987 int trimmed=0; | |
988 | |
989 if(left+right>0){ | |
990 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
991 trimmed=TrimRead.trimByAmount(r, left, right, 1, false); | |
992 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
993 ZMW.fixReadHeader(r, left, right); | |
994 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
995 } | |
996 | |
997 if(r.samline!=null){ | |
998 r.samline.seq=r.bases; | |
999 r.samline.qual=r.quality; | |
1000 } | |
1001 return trimmed; | |
1002 } | |
1003 | |
1004 int trimPolyA(Read r){ | |
1005 final byte[] bases=r.bases; | |
1006 | |
1007 int left=Tools.max(PolymerTrimmer.testLeft(bases, 'A'), PolymerTrimmer.testLeft(bases, 'T')); | |
1008 int right=Tools.max(PolymerTrimmer.testRight(bases, 'A'), PolymerTrimmer.testRight(bases, 'T')); | |
1009 int trimmed=0; | |
1010 | |
1011 if(left+right>0){ | |
1012 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
1013 trimmed=TrimRead.trimByAmount(r, left, right, 1, false); | |
1014 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
1015 ZMW.fixReadHeader(r, left, right); | |
1016 // System.err.println(r.length()+", "+left+", "+right+", "+r.id); | |
1017 } | |
1018 | |
1019 if(r.samline!=null){ | |
1020 r.samline.seq=r.bases; | |
1021 r.samline.qual=r.quality; | |
1022 } | |
1023 return trimmed; | |
1024 } | |
1025 | |
1026 final int calcLeftTrim(final byte[] bases, int lookahead){ | |
1027 final int len=bases.length; | |
1028 int lastUndef=-1; | |
1029 for(int i=0, defined=0; i<len && defined<lookahead; i++){ | |
1030 if(AminoAcid.isFullyDefined(bases[i])){ | |
1031 defined++; | |
1032 }else{ | |
1033 lastUndef=i; | |
1034 defined=0; | |
1035 } | |
1036 } | |
1037 return lastUndef+1; | |
1038 } | |
1039 | |
1040 final int calcRightTrim(final byte[] bases, int lookahead){ | |
1041 final int len=bases.length; | |
1042 int lastUndef=len; | |
1043 for(int i=len-1, defined=0; i>=0 && defined<lookahead; i--){ | |
1044 if(AminoAcid.isFullyDefined(bases[i])){ | |
1045 defined++; | |
1046 }else{ | |
1047 lastUndef=i; | |
1048 defined=0; | |
1049 } | |
1050 } | |
1051 return len-lastUndef-1; | |
1052 } | |
1053 | |
1054 final int alignLeftTipAdapter(Read r){ | |
1055 assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen | |
1056 if(r.length()<adapterTipLen){return 0;} | |
1057 final byte[] array=tipBufferLeft; | |
1058 | |
1059 for(int i=adapterTipPad, j=0; i<array.length; i++, j++){array[i]=r.bases[j];} | |
1060 int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore); | |
1061 | |
1062 if(rvec==null || rvec[0]<minSwScore){return 0;} | |
1063 final int score=rvec[0]; | |
1064 final int start=Tools.max(0, rvec[1]-adapterTipPad); | |
1065 final int stop=rvec[2]-adapterTipPad; | |
1066 for(int i=start; i<=stop; i++){r.bases[i]='X';} | |
1067 return stop-start+1; | |
1068 } | |
1069 | |
1070 final int alignRightTipAdapter(Read r){ | |
1071 final byte[] bases=r.bases; | |
1072 assert(adapter.length<adapterTipLen); //Time to increase adapterTipLen | |
1073 if(r.length()<adapterTipLen){return 0;} | |
1074 final byte[] array=tipBufferRight; | |
1075 | |
1076 for(int i=0, j=bases.length-adapterTipLen; i<adapterTipLen; i++, j++){array[i]=bases[j];} | |
1077 int[] rvec=ssa.fillAndScoreLimited(adapter, array, 0, array.length, minSwScore); | |
1078 | |
1079 if(rvec==null || rvec[0]<minSwScore){return 0;} | |
1080 final int score=rvec[0]; | |
1081 final int start=Tools.max(0, rvec[1]-adapterTipPad); | |
1082 final int stop=rvec[2]-adapterTipPad; | |
1083 for(int i=start; i<=stop; i++){r.bases[i]='X';} | |
1084 return stop-start+1; | |
1085 } | |
1086 | |
1087 boolean needsAdapterTest(Read r){ | |
1088 if(r.tested() || r.hasAdapter()){return false;} | |
1089 if(adapterRatio<=0 || r.discarded()){return true;} | |
1090 AlignmentResult aa=fla.alignForwardShort(adapter, r.bases, 0, r.bases.length-1, adapterRatio); | |
1091 return aa!=null; | |
1092 } | |
1093 | |
1094 private void outputReads(ZMW reads, ArrayList<Read> junctions){ | |
1095 final int size=reads.size(); | |
1096 final ArrayList<Read> good=(rosg==null ? null : new ArrayList<Read>(size)); | |
1097 final ArrayList<Read> ambig=(rosa==null ? null : new ArrayList<Read>(size)); | |
1098 final ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(size)); | |
1099 | |
1100 int discardedReads=0; | |
1101 int ambigReads=0; | |
1102 int trimmedReads=0; | |
1103 boolean sendAllToDiscarded=false; | |
1104 boolean sendAllToAmbiguous=false; | |
1105 | |
1106 //Check to see if any reads are discarded or ambiguous | |
1107 for(Read r : reads){ | |
1108 if(r.discarded()){ | |
1109 discardedReads++; | |
1110 }else if(r.ambiguous()){ | |
1111 ambigReads++; | |
1112 }else if(r.trimmed()){ | |
1113 trimmedReads++; | |
1114 } | |
1115 } | |
1116 | |
1117 //Unify flags on all reads | |
1118 if(keepZMWsTogether){ | |
1119 if(discardedReads>0){sendAllToDiscarded=true;} | |
1120 else if(ambigReads>0){sendAllToAmbiguous=true;} | |
1121 } | |
1122 // if(discardedReads>0 || ambigReads>0){System.err.println("\nd="+discardedReads+", a="+ambigReads);} | |
1123 if(discardedReads>0){iceCreamZMWsT++;} | |
1124 | |
1125 int trueIceCreamReads=0; | |
1126 for(Read r : reads){ | |
1127 boolean trueIceCream=(parseCustom ? ReadBuilder.isIceCream(r.id) : false); | |
1128 trueIceCreamReads+=(trueIceCream ? 1 : 0); | |
1129 if(r.discarded() || sendAllToDiscarded){ | |
1130 // assert(false); | |
1131 if(bad!=null){bad.add(r);} | |
1132 if(trueIceCream){truePositiveReadsT++;} | |
1133 else{falsePositiveReadsT++;} | |
1134 // System.err.println("d:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size()); | |
1135 }else if(r.ambiguous() || sendAllToAmbiguous){ | |
1136 if(ambig!=null){ambig.add(r);} | |
1137 if(sendAmbigToGood){ | |
1138 readsOutT++; | |
1139 basesOutT+=r.length(); | |
1140 if(good!=null) {good.add(r);} | |
1141 } | |
1142 if(sendAmbigToBad && bad!=null){bad.add(r);} | |
1143 ambiguousReadsT++; | |
1144 // System.err.println("a*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size()); | |
1145 }else{ | |
1146 if(good!=null){ | |
1147 good.add(r); | |
1148 } | |
1149 readsOutT++; | |
1150 basesOutT+=r.length(); | |
1151 if(trueIceCream && !r.trimmed()){falseNegativeReadsT++;} | |
1152 else{trueNegativeReadsT++;} | |
1153 // if(discardedReads>0 || ambigReads>0){System.err.println("g*:t="+r.tested()+",ad="+r.hasAdapter()+",ir="+r.invertedRepeat()+","+r.id+", reads="+reads.size());} | |
1154 } | |
1155 } | |
1156 | |
1157 if(trueIceCreamReads>0){ | |
1158 if(discardedReads>0 || trimmedReads>0){ | |
1159 truePositiveZMWsT++; | |
1160 }else if(ambigReads>0){ | |
1161 ambiguousZMWs++; | |
1162 }else{ | |
1163 falseNegativeZMWsT++; | |
1164 // StringBuilder sb=new StringBuilder(); | |
1165 // for(Read r : reads) {sb.append("\n").append(r.id);} | |
1166 // System.err.println(sb); | |
1167 } | |
1168 }else{ | |
1169 if(discardedReads>0){ | |
1170 falsePositiveZMWsT++; | |
1171 }else if(ambigReads>0){ | |
1172 ambiguousZMWs++; | |
1173 }else{ | |
1174 trueNegativeZMWsT++; | |
1175 } | |
1176 } | |
1177 | |
1178 if(rosg!=null && good!=null && !good.isEmpty()){rosg.add(good, 0);} | |
1179 if(rosa!=null && ambig!=null && !ambig.isEmpty()){rosa.add(ambig, 0);} | |
1180 if(rosb!=null && bad!=null && !bad.isEmpty()){rosb.add(bad, 0);} | |
1181 if(rosj!=null && junctions!=null && !junctions.isEmpty()){rosj.add(junctions, 0);} | |
1182 } | |
1183 | |
1184 /** | |
1185 * Align part of a read to itself to look for inverted repeats. | |
1186 */ | |
1187 AlignmentResult align(final Read r, boolean fullPass, int passes, int readNum){ | |
1188 if(!alignRC){return null;} | |
1189 final byte[] bases=r.bases; | |
1190 | |
1191 int qlen=(int)Tools.max(minQlen, Tools.min(targetQlen, bases.length*maxQlenFraction)); | |
1192 if(qlen>0.45f*bases.length){return null;}//Ignore short stuff | |
1193 | |
1194 //Perform an initial scan using the tips of the reads to look for an inverted repeat | |
1195 boolean tipOnly=filterIceCreamOnly && fullPass; | |
1196 // System.err.println(filterIceCreamOnly+", "+fullPass+", "+tipOnly); | |
1197 AlignmentResult a=alignLeft(bases, qlen, minRatio1, true, tipOnly); | |
1198 AlignmentResult b=(fullPass && false ? null : alignRight(bases, qlen, minRatio1, true, tipOnly));//A second alignment is not needed for a full pass. | |
1199 AlignmentResult ar=(a==null ? b : b==null ? a : a.maxScore>=b.maxScore ? a : b); | |
1200 | |
1201 //If nothing was detected, return | |
1202 if(ar==null){return null;} | |
1203 ar.alignedRead=r; | |
1204 | |
1205 //At this point, the read contains an inverted repeat of length qlen. | |
1206 final int expectedJunction=ar.rLen/2; | |
1207 | |
1208 if(ar.left){ | |
1209 ar.junctionLoc=ar.maxRpos/2; | |
1210 }else{ | |
1211 int innerLeft=ar.maxRpos; | |
1212 int innerRight=ar.rLen-ar.qLen; | |
1213 ar.junctionLoc=(innerLeft+innerRight)/2; | |
1214 } | |
1215 | |
1216 // if(fullPass){//This code doesn't seem to have any effect and it's not clear why it is present | |
1217 // int dif=Tools.absdif(expectedJunction, ar.junctionLoc); | |
1218 // if(dif>expectedJunction*0.1) { | |
1219 // if(filterIceCreamOnly){return ar;} | |
1220 // } | |
1221 // } | |
1222 | |
1223 if(realign){ | |
1224 if(ar.junctionLoc<expectedJunction){ | |
1225 int qlen2=(int)(ar.junctionLoc*0.9); | |
1226 if(qlen2>=qlen){ | |
1227 ar=alignLeft(bases, qlen2, minRatio2, false, false); | |
1228 } | |
1229 }else{ | |
1230 int qlen2=(int)((ar.rLen-ar.junctionLoc)*0.9); | |
1231 if(qlen2>=qlen){ | |
1232 ar=alignRight(bases, qlen2, minRatio2, false, false); | |
1233 } | |
1234 } | |
1235 if(ar==null){return null;} | |
1236 ar.alignedRead=r; | |
1237 } | |
1238 | |
1239 //At this point, the read contains an inverted repeat mirrored across a junction. | |
1240 final float junctionFraction; | |
1241 if(ar.left){ | |
1242 ar.junctionLoc=ar.maxRpos/2; | |
1243 junctionFraction=ar.junctionLoc/(float)ar.rLen; | |
1244 }else{ | |
1245 int innerLeft=ar.maxRpos; | |
1246 int innerRight=ar.rLen-ar.qLen; | |
1247 ar.junctionLoc=(innerLeft+innerRight)/2; | |
1248 junctionFraction=(ar.rLen-ar.junctionLoc)/(float)ar.rLen; | |
1249 } | |
1250 | |
1251 final int dif=Tools.absdif(expectedJunction, ar.junctionLoc); | |
1252 if(fullPass){ | |
1253 if(junctionFraction<minJunctionFraction){ | |
1254 ar.icecream=false; | |
1255 }else{ | |
1256 ar.icecream=true; | |
1257 } | |
1258 }else if(passes==2){ | |
1259 if(junctionFraction<minJunctionFraction){ | |
1260 if(readNum==0){ | |
1261 //First read, the junction should be closer to the beginning for real ice cream | |
1262 if(ar.junctionLoc>expectedJunction){ | |
1263 ar.icecream=false; | |
1264 }else{ | |
1265 ar.ambiguous=true; | |
1266 } | |
1267 }else{ | |
1268 //Last read, the junction should be closer to the end for real ice cream | |
1269 assert(readNum==1) : readNum; | |
1270 if(ar.junctionLoc<expectedJunction){ | |
1271 ar.icecream=false; | |
1272 }else{ | |
1273 ar.ambiguous=true; | |
1274 } | |
1275 } | |
1276 return ar; | |
1277 }else{ | |
1278 ar.icecream=true; | |
1279 } | |
1280 }else{ | |
1281 if(junctionFraction<minJunctionFraction){ | |
1282 ar.icecream=true; | |
1283 }else{ | |
1284 ar.ambiguous=true; | |
1285 } | |
1286 } | |
1287 return ar; | |
1288 } | |
1289 | |
1290 /** Align the left qlen bases to the rest of the read. */ | |
1291 private AlignmentResult alignLeft(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){ | |
1292 final byte[] query=Arrays.copyOfRange(bases, 0, qlen); | |
1293 AminoAcid.reverseComplementBasesInPlace(query); | |
1294 final AlignmentResult ar; | |
1295 final int rstop=bases.length-1; | |
1296 final int rstart=(tipOnly ? Tools.max(qlen, rstop-(int)(tipRatio*qlen)) : qlen); | |
1297 if(v2){ | |
1298 // elapsedShortT-=System.nanoTime(); | |
1299 ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio); | |
1300 // elapsedShortT+=System.nanoTime(); | |
1301 }else{ | |
1302 // elapsedT-=System.nanoTime(); | |
1303 ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio); | |
1304 // elapsedT+=System.nanoTime(); | |
1305 } | |
1306 if(ar!=null){ar.left=true;} | |
1307 return ar; | |
1308 } | |
1309 | |
1310 /** Align the right qlen bases to the rest of the read. */ | |
1311 private AlignmentResult alignRight(final byte[] bases, final int qlen, final float minRatio, boolean v2, boolean tipOnly){ | |
1312 final byte[] query=Arrays.copyOfRange(bases, bases.length-qlen, bases.length); | |
1313 AminoAcid.reverseComplementBasesInPlace(query); | |
1314 final AlignmentResult ar; | |
1315 final int rstop=(tipOnly ? Tools.min((int)(tipRatio*qlen), bases.length-qlen-1) : bases.length-qlen-1); | |
1316 final int rstart=0; | |
1317 if(v2){ | |
1318 // elapsedShortT-=System.nanoTime(); | |
1319 ar=ica.alignForwardShort(query, bases, rstart, rstop, minScore, minRatio); | |
1320 // elapsedShortT+=System.nanoTime(); | |
1321 }else{ | |
1322 // elapsedT-=System.nanoTime(); | |
1323 ar=ica.alignForward(query, bases, rstart, rstop, minScore, minRatio); | |
1324 // elapsedT+=System.nanoTime(); | |
1325 } | |
1326 if(ar!=null){ar.left=false;} | |
1327 return ar; | |
1328 } | |
1329 | |
1330 /** Align the adapter sequence to the read, piecewise, to count matches. */ | |
1331 private int lookForAdapter(Read r, double mult) { | |
1332 assert(!r.hasAdapter() && !r.tested()); | |
1333 int begin=0; | |
1334 while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N' | |
1335 if(begin>=r.length()){return 0;} | |
1336 | |
1337 int suspectThresh=(int)(minSwScoreSuspect*mult); | |
1338 int scoreThresh=(int)(minSwScore*mult); | |
1339 | |
1340 // final byte[] array=npad(r.bases, pad ? npad : 0); | |
1341 final byte[] array=npad(r.bases, npad); | |
1342 | |
1343 // assert(array==r.bases) : npad; | |
1344 | |
1345 int lim=array.length-npad-stride; | |
1346 | |
1347 int found=0; | |
1348 | |
1349 int lastSuspect=-1; | |
1350 int lastConfirmed=-1; | |
1351 int maxScore=0; | |
1352 | |
1353 // final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f)); | |
1354 for(int i=begin; i<lim; i+=stride){ | |
1355 int j=Tools.min(i+window, array.length-1); | |
1356 if(j-i<window){ | |
1357 lim=0; //Last loop cycle | |
1358 // i=Tools.max(0, array.length-2*query1.length); | |
1359 } | |
1360 | |
1361 { | |
1362 | |
1363 int[] rvec; | |
1364 // if(timeless){//A speed-optimized version | |
1365 rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh); | |
1366 // }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);} | |
1367 if(rvec!=null && rvec[0]>=suspectThresh){ | |
1368 final int score=rvec[0]; | |
1369 final int start=rvec[1]; | |
1370 final int stop=rvec[2]; | |
1371 assert(score>=suspectThresh); | |
1372 if((i==0 || start>i) && (j==array.length-1 || stop<j)){ | |
1373 boolean kill=(score>=scoreThresh || | |
1374 (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) || | |
1375 (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance)); | |
1376 | |
1377 if(!kill && useLocality && array.length-stop>window){//Look ahead | |
1378 rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh); | |
1379 if(rvec!=null){ | |
1380 if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;} | |
1381 else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;} | |
1382 } | |
1383 } | |
1384 | |
1385 if(!kill && useAltMsa){//Try alternate msa | |
1386 rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh); | |
1387 if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;} | |
1388 } | |
1389 | |
1390 if(kill){ | |
1391 maxScore=Tools.max(maxScore, score); | |
1392 // if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));} | |
1393 // System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop); | |
1394 found++; | |
1395 for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} | |
1396 // assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop; | |
1397 if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);} | |
1398 } | |
1399 } | |
1400 // System.out.println("Set lastSuspect="+stop+" on score "+score); | |
1401 if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);} | |
1402 } | |
1403 } | |
1404 } | |
1405 | |
1406 r.setTested(true); | |
1407 if(found>0){ | |
1408 r.setHasAdapter(true); | |
1409 r.setDiscarded(true); | |
1410 r.setAmbiguous(false); | |
1411 | |
1412 int idx=(int)((maxScore*200.0)/maxSwScore); | |
1413 adapterScoresT[idx]++; | |
1414 }else{ | |
1415 r.setHasAdapter(false); | |
1416 } | |
1417 | |
1418 return found; | |
1419 } | |
1420 | |
1421 /** Align the adapter sequence to the read ends, and trim if needed. */ | |
1422 private int trimTerminalAdapters(Read r, double mult) { | |
1423 assert(!r.hasAdapter() && !r.tested()); | |
1424 int begin=0; | |
1425 while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N' | |
1426 if(begin>=r.length()){return 0;} | |
1427 | |
1428 int suspectThresh=(int)(minSwScoreSuspect*mult); | |
1429 int scoreThresh=(int)(minSwScore*mult); | |
1430 | |
1431 // final byte[] array=npad(r.bases, pad ? npad : 0); | |
1432 final byte[] array=npad(r.bases, npad); | |
1433 | |
1434 // assert(array==r.bases) : npad; | |
1435 | |
1436 int lim=array.length-npad-stride; | |
1437 | |
1438 int found=0; | |
1439 | |
1440 int lastSuspect=-1; | |
1441 int lastConfirmed=-1; | |
1442 int maxScore=0; | |
1443 | |
1444 // final int revisedStride=(r.length()>1800 ? stride : (int)(stride*0.6f)); | |
1445 for(int i=begin; i<lim; i+=stride){ | |
1446 int j=Tools.min(i+window, array.length-1); | |
1447 if(j-i<window){ | |
1448 lim=0; //Last loop cycle | |
1449 // i=Tools.max(0, array.length-2*query1.length); | |
1450 } | |
1451 | |
1452 { | |
1453 | |
1454 int[] rvec; | |
1455 // if(timeless){//A speed-optimized version | |
1456 rvec=ssa.fillAndScoreLimited(adapter, array, i, j, suspectThresh); | |
1457 // }else{rvec=msa.fillAndScoreLimited(adapter, array, i, j, suspectThresh);} | |
1458 if(rvec!=null && rvec[0]>=suspectThresh){ | |
1459 final int score=rvec[0]; | |
1460 final int start=rvec[1]; | |
1461 final int stop=rvec[2]; | |
1462 assert(score>=suspectThresh); | |
1463 if((i==0 || start>i) && (j==array.length-1 || stop<j)){ | |
1464 boolean kill=(score>=scoreThresh || | |
1465 (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) || | |
1466 (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance)); | |
1467 | |
1468 if(!kill && useLocality && array.length-stop>window){//Look ahead | |
1469 rvec=ssa.fillAndScoreLimited(adapter, array, stop, stop+window, suspectThresh); | |
1470 if(rvec!=null){ | |
1471 if(score>=suspectMidpoint && rvec[0]>=suspectThresh && rvec[1]-stop<suspectDistance){kill=true;} | |
1472 else if(score>=suspectThresh && rvec[0]>=scoreThresh && rvec[1]-stop<suspectDistance){kill=true;} | |
1473 } | |
1474 } | |
1475 | |
1476 if(!kill && useAltMsa){//Try alternate msa | |
1477 rvec=msa2.fillAndScoreLimited(adapter, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), suspectThresh); | |
1478 if(rvec!=null && rvec[0]>=(scoreThresh)){kill=true;} | |
1479 } | |
1480 | |
1481 if(kill){ | |
1482 maxScore=Tools.max(maxScore, score); | |
1483 // if(print) {System.err.println("Found adapter at distance "+Tools.min(start, array.length-stop));} | |
1484 // System.out.println("-:"+score+", "+scoreThresh+", "+suspectThresh+"\t"+lastSuspect+", "+start+", "+stop); | |
1485 found++; | |
1486 for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} | |
1487 // assert(Shared.threads()!=1) : new String(array)+"\n"+start+", "+stop; | |
1488 if(useLocality && score>=scoreThresh){lastConfirmed=Tools.max(lastConfirmed, stop);} | |
1489 } | |
1490 } | |
1491 // System.out.println("Set lastSuspect="+stop+" on score "+score); | |
1492 if(useLocality){lastSuspect=Tools.max(lastSuspect, stop);} | |
1493 } | |
1494 } | |
1495 } | |
1496 | |
1497 r.setTested(true); | |
1498 if(found>0){ | |
1499 r.setHasAdapter(true); | |
1500 r.setDiscarded(true); | |
1501 r.setAmbiguous(false); | |
1502 | |
1503 int idx=(int)((maxScore*200.0)/maxSwScore); | |
1504 adapterScoresT[idx]++; | |
1505 }else{ | |
1506 r.setHasAdapter(false); | |
1507 } | |
1508 | |
1509 return found; | |
1510 } | |
1511 | |
1512 private byte[] npad(final byte[] array, final int pad){ | |
1513 if(pad<=0){return array;} | |
1514 final int len=array.length+2*pad; | |
1515 byte[] r=new byte[len]; | |
1516 for(int i=0; i<pad; i++){r[i]='N';} | |
1517 for(int i=pad, j=0; j<array.length; i++, j++){r[i]=array[j];} | |
1518 for(int i=array.length+pad, limit=Tools.min(r.length, len+50); i<limit; i++){r[i]='N';} | |
1519 return r; | |
1520 } | |
1521 | |
1522 /*--------------------------------------------------------------*/ | |
1523 /*---------------- Inner Class Fields ----------------*/ | |
1524 /*--------------------------------------------------------------*/ | |
1525 | |
1526 /** Number of reads processed by this thread */ | |
1527 protected long readsProcessedT=0; | |
1528 /** Number of bases processed by this thread */ | |
1529 protected long basesProcessedT=0; | |
1530 | |
1531 | |
1532 protected long truePositiveReadsT=0; | |
1533 protected long falsePositiveReadsT=0; | |
1534 protected long trueNegativeReadsT=0; | |
1535 protected long falseNegativeReadsT=0; | |
1536 protected long ambiguousReadsT=0; | |
1537 | |
1538 protected long truePositiveZMWsT=0; | |
1539 protected long falsePositiveZMWsT=0; | |
1540 protected long trueNegativeZMWsT=0; | |
1541 protected long falseNegativeZMWsT=0; | |
1542 protected long ambiguousZMWsT=0; | |
1543 | |
1544 | |
1545 protected long elapsedT=0; | |
1546 protected long elapsedShortT=0; | |
1547 | |
1548 //Unused | |
1549 protected long iceCreamReadsT=0; | |
1550 protected long iceCreamBasesT=0; | |
1551 | |
1552 protected long iceCreamZMWsT=0; | |
1553 protected double iceCreamRatioT=0; | |
1554 protected long ratiosCountedT=0; | |
1555 protected long missingAdapterZMWsT=0; | |
1556 protected long hiddenAdapterZMWsT=0; | |
1557 | |
1558 protected long basesTrimmedT=0; | |
1559 protected long readsTrimmedT=0; | |
1560 | |
1561 /** Number of reads retained by this thread */ | |
1562 protected long readsOutT=0; | |
1563 /** Number of bases retained by this thread */ | |
1564 protected long basesOutT=0; | |
1565 /** Number of junctions detected in IR reads by this thread */ | |
1566 protected long junctionsOutT=0; | |
1567 | |
1568 protected long lowEntropyZMWsT=0; | |
1569 protected long lowEntropyReadsT=0; | |
1570 | |
1571 /** True only if this thread has completed successfully */ | |
1572 boolean success=false; | |
1573 | |
1574 /** Shared output stream */ | |
1575 private final ConcurrentReadOutputStream rosg; | |
1576 /** Shared output stream for ambiguous reads */ | |
1577 private final ConcurrentReadOutputStream rosa; | |
1578 /** Shared output stream for bad reads */ | |
1579 private final ConcurrentReadOutputStream rosb; | |
1580 /** Shared output stream for junctions */ | |
1581 private final ConcurrentReadOutputStream rosj; | |
1582 /** Thread ID */ | |
1583 final int tid; | |
1584 | |
1585 /* Aligners for this thread */ | |
1586 | |
1587 /** For inverted repeat alignment */ | |
1588 final IceCreamAligner ica=IceCreamAligner.makeAligner(32); | |
1589 /** For quickly aligning adapter sequence to whole read */ | |
1590 final FlatAligner fla=new FlatAligner(); | |
1591 // final MultiStateAligner9PacBioAdapter msa=alignAdapter ? new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS) : null; | |
1592 /** For slowly aligning adapter sequence sectionwise */ | |
1593 final SingleStateAlignerPacBioAdapter ssa=(alignAdapter || trimReads) ? new SingleStateAlignerPacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS, adapter.length) : null; | |
1594 /** Alternate scoring for slow adapter alignment */ | |
1595 final MultiStateAligner9PacBioAdapter2 msa2=(alignAdapter || trimReads) ? new MultiStateAligner9PacBioAdapter2() : null; | |
1596 | |
1597 final ZMWStreamer zstream; | |
1598 final EntropyTracker eTracker; | |
1599 | |
1600 final long[] adapterScoresT=new long[201]; | |
1601 final long[] repeatScoresT=new long[201]; | |
1602 final byte[] tipBufferLeft=new byte[adapterTipLen+adapterTipPad]; | |
1603 final byte[] tipBufferRight=new byte[adapterTipLen+adapterTipPad]; | |
1604 } | |
1605 | |
1606 /*--------------------------------------------------------------*/ | |
1607 /*---------------- Fields ----------------*/ | |
1608 /*--------------------------------------------------------------*/ | |
1609 | |
1610 /** Primary input file path */ | |
1611 private String in1=null; | |
1612 | |
1613 /** Primary output file path */ | |
1614 private String outg=null; | |
1615 /** Ambiguous output file path */ | |
1616 private String outa=null; | |
1617 /** Bad output file path */ | |
1618 private String outb=null; | |
1619 /** Junction output file path */ | |
1620 private String outj=null; | |
1621 /** Stats (screen) output file path */ | |
1622 private String outstats=null; | |
1623 | |
1624 /** Adapter score ratio histogram */ | |
1625 private String asrhist=null; | |
1626 | |
1627 /** Inverted repeat score ratio histogram */ | |
1628 private String irsrhist=null; | |
1629 | |
1630 /** Override input file extension */ | |
1631 private String extin=null; | |
1632 /** Override output file extension */ | |
1633 private String extout=null; | |
1634 | |
1635 private int targetQlen=352; | |
1636 private int minQlen=100; | |
1637 | |
1638 /** Make a query at most this fraction of read length */ | |
1639 private float maxQlenFraction=0.15f; | |
1640 | |
1641 /** Exit alignment early if score drops below this. | |
1642 * An aggressive optimization that may miss some low-quality inverted repeats. | |
1643 * -700 seems safe. */ | |
1644 private int minScore=-800; | |
1645 | |
1646 /** Fraction of maximum alignment score to consider as matching for initial alignment */ | |
1647 private float minRatio1=0.59f; | |
1648 | |
1649 /** Fraction of maximum alignment score to consider as matching for realignment */ | |
1650 private float minRatio2=0.64f; | |
1651 | |
1652 private float adapterRatio=0.18f; //.18 for fa, or 0.57/0.6 for ica | |
1653 private float adapterRatio2=0.325f; //0.31f normal, 0.325 timeless | |
1654 private float suspectRatio=0.85f; | |
1655 private boolean useLocality=true; | |
1656 private boolean useAltMsa=true; | |
1657 | |
1658 | |
1659 private float tipRatio=1.5f; | |
1660 | |
1661 private float longReadMult=1.5f; | |
1662 | |
1663 private float shortReadMult=1.5f; | |
1664 | |
1665 private float veryShortMult=0.35f; | |
1666 | |
1667 /** Short half of inverted repeat must be at least this fraction of read length to be classed as a triangle */ | |
1668 private float minJunctionFraction=0.4f; | |
1669 | |
1670 /** Only filter triangle reads, not all inverted repeats */ | |
1671 private boolean filterIceCreamOnly=true; | |
1672 | |
1673 /** Align again once the junction position is provisionally identified */ | |
1674 private boolean realign=true; | |
1675 | |
1676 /** For internal read array transfers */ | |
1677 private int queuelen=80; | |
1678 | |
1679 /** For grading synthetic data */ | |
1680 private boolean parseCustom; | |
1681 | |
1682 /** Input reads are CCS (full pass) */ | |
1683 private boolean CCS; | |
1684 | |
1685 private boolean modifyHeader=false; | |
1686 | |
1687 private boolean sendAmbigToGood=true; | |
1688 private boolean sendAmbigToBad=false; | |
1689 private boolean setAmbig=false; | |
1690 | |
1691 //Note: These flags are very similar... they need to be better-defined or merged. | |
1692 private boolean keepZMWsTogether=false; | |
1693 private boolean keepShortReads=true; | |
1694 | |
1695 private int format=FORMAT_TEXT; | |
1696 private static final int FORMAT_TEXT=1, FORMAT_JSON=2; | |
1697 | |
1698 /*--------------------------------------------------------------*/ | |
1699 | |
1700 /** Alignment iterations for inverted repeat calculation with ref columns and query rows */ | |
1701 protected long alignmentIters=0; | |
1702 | |
1703 /** Alignment iterations for inverted repeat calculation with query columns and ref rows */ | |
1704 protected long alignmentItersShort=0; | |
1705 | |
1706 /** Time spent in long iterations */ | |
1707 protected long elapsed=0; | |
1708 | |
1709 /** Time spent in short iterations */ | |
1710 protected long elapsedShort=0; | |
1711 | |
1712 /** Print iteration time statistics */ | |
1713 protected boolean printTiming=false; | |
1714 | |
1715 /** Number of reads processed */ | |
1716 protected long readsProcessed=0; | |
1717 /** Number of bases processed */ | |
1718 protected long basesProcessed=0; | |
1719 | |
1720 /** Number of reads retained */ | |
1721 protected long readsOut=0; | |
1722 /** Number of bases retained */ | |
1723 protected long basesOut=0; | |
1724 /** Number of junctions detected in IR reads */ | |
1725 protected long junctionsOut=0; | |
1726 | |
1727 /** Quit after processing this many input reads; -1 means no limit */ | |
1728 private long maxReads=-1; | |
1729 | |
1730 //Unused | |
1731 protected long iceCreamReads=0; | |
1732 protected long iceCreamBases=0; | |
1733 | |
1734 /** ZMWs with discarded reads */ | |
1735 protected long iceCreamZMWs=0; | |
1736 | |
1737 /** Sum of IR alignment ratios for IR reads not containing adapters */ | |
1738 protected double iceCreamRatio=0; | |
1739 | |
1740 /** Number of ratios in iceCreamRatio */ | |
1741 protected long ratiosCounted=0; | |
1742 | |
1743 /** Histogram */ | |
1744 protected final long[] adapterScores=new long[201]; | |
1745 | |
1746 /** Histogram */ | |
1747 protected final long[] repeatScores=new long[201]; | |
1748 | |
1749 /** ZMWs with a triangle read but no hidden adapters */ | |
1750 protected long missingAdapterZMWs=0; | |
1751 | |
1752 /** ZMWs with hidden adapters */ | |
1753 protected long hiddenAdapterZMWs=0; | |
1754 | |
1755 protected long basesTrimmed=0; | |
1756 protected long readsTrimmed=0; | |
1757 | |
1758 protected long lowEntropyZMWs=0; | |
1759 protected long lowEntropyReads=0; | |
1760 | |
1761 /** Total ZMWs observed */ | |
1762 protected long ZMWs=0; | |
1763 | |
1764 protected long truePositiveReads=0; | |
1765 protected long falsePositiveReads=0; | |
1766 protected long trueNegativeReads=0; | |
1767 protected long falseNegativeReads=0; | |
1768 protected long ambiguousReads=0; | |
1769 | |
1770 protected long truePositiveZMWs=0; | |
1771 protected long falsePositiveZMWs=0; | |
1772 protected long trueNegativeZMWs=0; | |
1773 protected long falseNegativeZMWs=0; | |
1774 protected long ambiguousZMWs=0; | |
1775 | |
1776 protected final int stride; | |
1777 protected final int window; | |
1778 protected final int ALIGN_ROWS; | |
1779 protected final int ALIGN_COLUMNS; | |
1780 | |
1781 private boolean timeless=true; | |
1782 protected final int maxSwScore; | |
1783 protected final int minSwScore; | |
1784 protected final int minSwScoreSuspect; | |
1785 protected final int maxImperfectSwScore; | |
1786 protected final int suspectMidpoint; | |
1787 protected final int suspectDistance=100; | |
1788 protected int npad=0; //This is for catching adapters on the tips, which are not very relevant to ice cream. Set to 35 for tip apdapters. | |
1789 | |
1790 private byte[] adapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes(); //This one seems to be correct | |
1791 // private byte[] adapter="ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT".getBytes(); | |
1792 private boolean alignAdapter=true; | |
1793 private boolean alignRC=true; | |
1794 private boolean flagLongReads=true; | |
1795 private boolean trimReads=true; | |
1796 private int minLengthAfterTrimming=40; | |
1797 private int adapterTipLen=100; | |
1798 private int adapterTipPad=35; | |
1799 | |
1800 boolean trimPolyA=false; | |
1801 | |
1802 /*--------------------------------------------------------------*/ | |
1803 /*---------------- Entropy Fields ----------------*/ | |
1804 /*--------------------------------------------------------------*/ | |
1805 | |
1806 /** Minimum entropy to be considered "complex", on a scale of 0-1 */ | |
1807 float entropyCutoff=-1; //suggested 0.55 | |
1808 /** Minimum length of a low-entropy block to fail a read */ | |
1809 int entropyLength=350; | |
1810 /** Minimum length of a low-entropy block as a fraction of read length; | |
1811 * the minimum of the two will be used */ | |
1812 float entropyFraction=0.5f; | |
1813 | |
1814 float maxMonomerFraction=0.74f; //Suggested... 0.74 | |
1815 | |
1816 /*--------------------------------------------------------------*/ | |
1817 /*---------------- Final Fields ----------------*/ | |
1818 /*--------------------------------------------------------------*/ | |
1819 | |
1820 /** Primary input file */ | |
1821 private final FileFormat ffin1; | |
1822 | |
1823 /** Primary output file */ | |
1824 private final FileFormat ffoutg; | |
1825 | |
1826 /** Ambiguous output file */ | |
1827 private final FileFormat ffouta; | |
1828 | |
1829 /** Bad output file */ | |
1830 private final FileFormat ffoutb; | |
1831 | |
1832 /** Junction output file */ | |
1833 private final FileFormat ffoutj; | |
1834 | |
1835 /** Stats output file */ | |
1836 private final FileFormat ffstats; | |
1837 | |
1838 /** Adapter score ratio histogram */ | |
1839 private final FileFormat ffasrhist; | |
1840 | |
1841 /** Inverted repeat score ratio histogram */ | |
1842 private final FileFormat ffirsrhist; | |
1843 | |
1844 private final int threads; | |
1845 | |
1846 /*--------------------------------------------------------------*/ | |
1847 /*---------------- Common Fields ----------------*/ | |
1848 /*--------------------------------------------------------------*/ | |
1849 | |
1850 /** Print status messages to this output stream */ | |
1851 private PrintStream outstream=System.err; | |
1852 /** Print verbose messages */ | |
1853 public static boolean verbose=false; | |
1854 /** True if an error was encountered */ | |
1855 public boolean errorState=false; | |
1856 /** Overwrite existing output files */ | |
1857 private boolean overwrite=false; | |
1858 /** Append to existing output files */ | |
1859 private boolean append=false; | |
1860 | |
1861 } |