Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/consensus/FixScaffoldGaps.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package consensus; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.LinkedHashMap; | |
6 import java.util.Map.Entry; | |
7 import java.util.concurrent.atomic.AtomicIntegerArray; | |
8 import java.util.concurrent.atomic.AtomicLongArray; | |
9 | |
10 import fileIO.ByteFile; | |
11 import fileIO.FileFormat; | |
12 import fileIO.ReadWrite; | |
13 import shared.Parse; | |
14 import shared.Parser; | |
15 import shared.PreParser; | |
16 import shared.ReadStats; | |
17 import shared.Shared; | |
18 import shared.Timer; | |
19 import shared.Tools; | |
20 import stream.ConcurrentReadInputStream; | |
21 import stream.ConcurrentReadOutputStream; | |
22 import stream.FastaReadInputStream; | |
23 import stream.Read; | |
24 import stream.SamLine; | |
25 import stream.SamReadStreamer; | |
26 import stream.SamStreamer; | |
27 import structures.ByteBuilder; | |
28 import structures.ListNum; | |
29 import template.Accumulator; | |
30 import template.ThreadWaiter; | |
31 import var2.SamFilter; | |
32 | |
33 /** | |
34 * Resizes scaffold gaps to represent the best estimate | |
35 * based on the insert size distribution of paired reads. | |
36 * | |
37 * @author Brian Bushnell | |
38 * @date September 11, 2019 | |
39 * | |
40 */ | |
41 public class FixScaffoldGaps implements Accumulator<FixScaffoldGaps.ProcessThread> { | |
42 | |
43 /*--------------------------------------------------------------*/ | |
44 /*---------------- Initialization ----------------*/ | |
45 /*--------------------------------------------------------------*/ | |
46 | |
47 /** | |
48 * Code entrance from the command line. | |
49 * @param args Command line arguments | |
50 */ | |
51 public static void main(String[] args){ | |
52 //Start a timer immediately upon code entrance. | |
53 Timer t=new Timer(); | |
54 | |
55 //Create an instance of this class | |
56 FixScaffoldGaps x=new FixScaffoldGaps(args); | |
57 | |
58 //Run the object | |
59 x.process(t); | |
60 | |
61 //Close the print stream if it was redirected | |
62 Shared.closeStream(x.outstream); | |
63 } | |
64 | |
65 /** | |
66 * Constructor. | |
67 * @param args Command line arguments | |
68 */ | |
69 public FixScaffoldGaps(String[] args){ | |
70 | |
71 {//Preparse block for help, config files, and outstream | |
72 PreParser pp=new PreParser(args, getClass(), false); | |
73 args=pp.args; | |
74 outstream=pp.outstream; | |
75 } | |
76 | |
77 //Set shared static variables prior to parsing | |
78 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
79 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
80 | |
81 samFilter.includeUnmapped=false; | |
82 samFilter.includeSupplimentary=false; | |
83 // samFilter.includeDuplicate=false; | |
84 samFilter.includeNonPrimary=false; | |
85 samFilter.includeQfail=false; | |
86 // samFilter.minMapq=4; | |
87 | |
88 {//Parse the arguments | |
89 final Parser parser=parse(args); | |
90 | |
91 Parser.processQuality(); | |
92 | |
93 maxReads=parser.maxReads; | |
94 overwrite=ReadStats.overwrite=parser.overwrite; | |
95 append=ReadStats.append=parser.append; | |
96 | |
97 in=parser.in1; | |
98 extin=parser.extin; | |
99 | |
100 out=parser.out1; | |
101 extout=parser.extout; | |
102 } | |
103 | |
104 { | |
105 // if("auto".equalsIgnoreCase(atomic)){Scaffold.setCA3A(Shared.threads()>8);} | |
106 // else{Scaffold.setCA3A(Parse.parseBoolean(atomic));} | |
107 samFilter.setSamtoolsFilter(); | |
108 | |
109 streamerThreads=Tools.max(1, Tools.min(streamerThreads, Shared.threads())); | |
110 assert(streamerThreads>0) : streamerThreads; | |
111 } | |
112 | |
113 validateParams(); | |
114 fixExtensions(); //Add or remove .gz or .bz2 as needed | |
115 checkFileExistence(); //Ensure files can be read and written | |
116 checkStatics(); //Adjust file-related static fields as needed for this program | |
117 | |
118 //Create output FileFormat objects | |
119 ffout=FileFormat.testOutput(out, FileFormat.FASTA, extout, true, overwrite, append, ordered); | |
120 | |
121 //Create input FileFormat objects | |
122 ffin=FileFormat.testInput(in, FileFormat.SAM, extin, true, true); | |
123 ffref=FileFormat.testInput(ref, FileFormat.FASTA, null, true, true); | |
124 } | |
125 | |
126 /*--------------------------------------------------------------*/ | |
127 /*---------------- Initialization Helpers ----------------*/ | |
128 /*--------------------------------------------------------------*/ | |
129 | |
130 /** Parse arguments from the command line */ | |
131 private Parser parse(String[] args){ | |
132 | |
133 //Create a parser object | |
134 Parser parser=new Parser(); | |
135 | |
136 //Set any necessary Parser defaults here | |
137 //parser.foo=bar; | |
138 | |
139 //Parse each argument | |
140 for(int i=0; i<args.length; i++){ | |
141 String arg=args[i]; | |
142 | |
143 //Break arguments into their constituent parts, in the form of "a=b" | |
144 String[] split=arg.split("="); | |
145 String a=split[0].toLowerCase(); | |
146 String b=split.length>1 ? split[1] : null; | |
147 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
148 | |
149 if(a.equals("verbose")){ | |
150 verbose=Parse.parseBoolean(b); | |
151 }else if(a.equals("ref") || a.equals("scaffolds")){ | |
152 ref=b; | |
153 }else if(a.equals("insertlist")){ | |
154 insertList=b; | |
155 }else if(a.equals("ordered")){ | |
156 ordered=Parse.parseBoolean(b); | |
157 }else if(a.equalsIgnoreCase("ns") || a.equalsIgnoreCase("n") || a.equalsIgnoreCase("scaffoldbreak") || a.equalsIgnoreCase("gap")){ | |
158 scaffoldBreakNs=Integer.parseInt(b); | |
159 assert(scaffoldBreakNs>0); | |
160 }else if(a.equalsIgnoreCase("mindepth")){ | |
161 minDepth=Integer.parseInt(b); | |
162 assert(minDepth>0); | |
163 }else if(a.equalsIgnoreCase("trim") || a.equalsIgnoreCase("trimFraction") || a.equalsIgnoreCase("border")){ | |
164 trimFraction=Float.parseFloat(b); | |
165 assert(trimFraction>=0 && trimFraction<=1) : "trimFraction should be between 0 and 1"; | |
166 }else if(a.equals("clearfilters") || a.equals("clearfilter")){ | |
167 if(Parse.parseBoolean(b)){ | |
168 samFilter.clear(); | |
169 } | |
170 }else if(a.equals("parse_flag_goes_here")){ | |
171 long fake_variable=Parse.parseKMG(b); | |
172 //Set a variable here | |
173 }else if(samFilter.parse(arg, a, b)){ | |
174 //do nothing | |
175 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
176 //do nothing | |
177 }else{ | |
178 outstream.println("Unknown parameter "+args[i]); | |
179 assert(false) : "Unknown parameter "+args[i]; | |
180 } | |
181 } | |
182 | |
183 return parser; | |
184 } | |
185 | |
186 /** Add or remove .gz or .bz2 as needed */ | |
187 private void fixExtensions(){ | |
188 in=Tools.fixExtension(in); | |
189 ref=Tools.fixExtension(ref); | |
190 } | |
191 | |
192 /** Ensure files can be read and written */ | |
193 private void checkFileExistence(){ | |
194 | |
195 //Ensure there is an input file | |
196 if(in==null){throw new RuntimeException("Error - an input file is required.");} | |
197 | |
198 //Ensure there is an input file | |
199 if(ref==null){throw new RuntimeException("Error - a reference file is required.");} | |
200 | |
201 //Ensure output files can be written | |
202 if(!Tools.testOutputFiles(overwrite, append, false, out)){ | |
203 outstream.println((out==null)+", "+out); | |
204 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n"); | |
205 } | |
206 | |
207 //Ensure input files can be read | |
208 if(!Tools.testInputFiles(false, true, in, ref)){ | |
209 throw new RuntimeException("\nCan't read some input files.\n"); | |
210 } | |
211 | |
212 //Ensure that no file was specified multiple times | |
213 if(!Tools.testForDuplicateFiles(true, in, ref, out)){ | |
214 throw new RuntimeException("\nSome file names were specified multiple times.\n"); | |
215 } | |
216 } | |
217 | |
218 /** Adjust file-related static fields as needed for this program */ | |
219 private static void checkStatics(){ | |
220 //Adjust the number of threads for input file reading | |
221 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
222 ByteFile.FORCE_MODE_BF2=true; | |
223 } | |
224 | |
225 assert(FastaReadInputStream.settingsOK()); | |
226 } | |
227 | |
228 /** Ensure parameter ranges are within bounds and required parameters are set */ | |
229 private boolean validateParams(){ | |
230 // assert(minfoo>0 && minfoo<=maxfoo) : minfoo+", "+maxfoo; | |
231 return true; | |
232 } | |
233 | |
234 /*--------------------------------------------------------------*/ | |
235 /*---------------- Outer Methods ----------------*/ | |
236 /*--------------------------------------------------------------*/ | |
237 | |
238 /** Create read streams and process all data */ | |
239 void process(Timer t){ | |
240 | |
241 //Turn off read validation in the input threads to increase speed | |
242 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; | |
243 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; | |
244 | |
245 //Create a read input stream | |
246 final SamStreamer ss=makeStreamer(ffin); | |
247 | |
248 //Load reference | |
249 loadReferenceCustom(); | |
250 | |
251 //Reset counters | |
252 readsProcessed=readsOut=0; | |
253 basesProcessed=basesOut=0; | |
254 | |
255 //Process the reads in separate threads | |
256 spawnThreads(ss); | |
257 | |
258 //Optionally create a read output stream | |
259 final ConcurrentReadOutputStream ros=makeCros(); | |
260 | |
261 if(verbose){outstream.println("Fixing reference.");} | |
262 | |
263 fixScaffolds(ros); | |
264 | |
265 if(verbose){outstream.println("Finished; closing streams.");} | |
266 | |
267 //Write anything that was accumulated by ReadStats | |
268 errorState|=ReadStats.writeAll(); | |
269 //Close the read streams | |
270 errorState|=ReadWrite.closeStream(ros); | |
271 | |
272 //Reset read validation | |
273 Read.VALIDATE_IN_CONSTRUCTOR=vic; | |
274 | |
275 //Report timing and results | |
276 t.stop(); | |
277 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
278 outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, scaffoldsOut, scaffoldLengthOut, 8, false)); | |
279 | |
280 outstream.println(); | |
281 outstream.println(Tools.number("Average Insert", totalAverageInsert, 2, 8)); | |
282 outstream.println(Tools.number("Gaps Unchanged", gapsUnchanged, 8)); | |
283 outstream.println(Tools.number("Gaps Widened ", gapsWidened, 8)); | |
284 outstream.println(Tools.number("Gaps Narrowed ", gapsNarrowed, 8)); | |
285 outstream.println(Tools.number("Ns Total ", nsTotal, 8)); | |
286 outstream.println(Tools.number("Ns Added ", nsAdded, 8)); | |
287 outstream.println(Tools.number("Ns Removed ", nsRemoved, 8)); | |
288 | |
289 | |
290 //Throw an exception of there was an error in a thread | |
291 if(errorState){ | |
292 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
293 } | |
294 } | |
295 | |
296 private synchronized void loadReferenceCustom(){ | |
297 assert(!loadedRef); | |
298 ConcurrentReadInputStream cris=makeRefCris(); | |
299 for(ListNum<Read> ln=cris.nextList(); ln!=null && ln.size()>0; ln=cris.nextList()) { | |
300 for(Read r : ln){ | |
301 String name=r.id; | |
302 String name2=Tools.trimToWhitespace(r.id); | |
303 Scaffold scaf=new Scaffold(name, r.bases, r.numericID); | |
304 refMap.put(name, scaf); | |
305 refMap2.put(name2, scaf); | |
306 } | |
307 } | |
308 loadedRef=true; | |
309 } | |
310 | |
311 private ConcurrentReadInputStream makeRefCris(){ | |
312 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffref, null); | |
313 cris.start(); //Start the stream | |
314 if(verbose){outstream.println("Started cris");} | |
315 boolean paired=cris.paired(); | |
316 assert(!paired) : "References should not be paired."; | |
317 return cris; | |
318 } | |
319 | |
320 private SamStreamer makeStreamer(FileFormat ff){ | |
321 if(ff==null){return null;} | |
322 SamStreamer ss=new SamReadStreamer(ff, streamerThreads, true, maxReads); | |
323 ss.start(); //Start the stream | |
324 if(verbose){outstream.println("Started Streamer");} | |
325 return ss; | |
326 } | |
327 | |
328 private ConcurrentReadOutputStream makeCros(){ | |
329 if(ffout==null){return null;} | |
330 | |
331 //Select output buffer size based on whether it needs to be ordered | |
332 final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); | |
333 | |
334 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout, null, buff, null, false); | |
335 ros.start(); //Start the stream | |
336 return ros; | |
337 } | |
338 | |
339 /*--------------------------------------------------------------*/ | |
340 /*---------------- Thread Management ----------------*/ | |
341 /*--------------------------------------------------------------*/ | |
342 | |
343 /** Spawn process threads */ | |
344 private void spawnThreads(final SamStreamer ss){ | |
345 | |
346 //Do anything necessary prior to processing | |
347 | |
348 //Determine how many threads may be used | |
349 final int threads=Shared.threads(); | |
350 | |
351 //Fill a list with ProcessThreads | |
352 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
353 for(int i=0; i<threads; i++){ | |
354 alpt.add(new ProcessThread(ss, i)); | |
355 } | |
356 | |
357 //Start the threads | |
358 for(ProcessThread pt : alpt){ | |
359 pt.start(); | |
360 } | |
361 | |
362 //Wait for threads to finish | |
363 boolean success=ThreadWaiter.waitForThreads(alpt, this); | |
364 errorState&=!success; | |
365 | |
366 //Do anything necessary after processing | |
367 totalAverageInsert=totalInsertSum/(double)totalInsertCount; | |
368 insertByPercentile=Tools.makeHistogram(insertCounts, buckets); | |
369 } | |
370 | |
371 @Override | |
372 public final void accumulate(ProcessThread pt){ | |
373 readsProcessed+=pt.readsProcessedT; | |
374 basesProcessed+=pt.basesProcessedT; | |
375 readsOut+=pt.readsOutT; | |
376 basesOut+=pt.basesOutT; | |
377 | |
378 totalInsertSum+=pt.totalInsertSumT; | |
379 totalInsertCount+=pt.totalInsertCountT; | |
380 | |
381 errorState|=(!pt.success); | |
382 } | |
383 | |
384 @Override | |
385 public final boolean success(){return !errorState;} | |
386 | |
387 /*--------------------------------------------------------------*/ | |
388 /*---------------- Inner Methods ----------------*/ | |
389 /*--------------------------------------------------------------*/ | |
390 | |
391 private void fixScaffolds(ConcurrentReadOutputStream ros){ | |
392 ByteBuilder bb=new ByteBuilder(1000000); | |
393 | |
394 ArrayList<Read> list=new ArrayList<Read>(200); | |
395 long num=0; | |
396 long lengthSum=0; | |
397 for(Entry<String, Scaffold> e : refMap.entrySet()){ | |
398 Scaffold scaf=e.getValue(); | |
399 Read r=scaf.fixScaffold(bb); | |
400 lengthSum+=r.length(); | |
401 list.add(r); | |
402 scaffoldsOut++; | |
403 scaffoldLengthOut+=r.length(); | |
404 | |
405 if(list.size()>=200 || lengthSum>=100000){ | |
406 if(ros!=null){ros.add(list, num);} | |
407 list=new ArrayList<Read>(200); | |
408 num++; | |
409 lengthSum=0; | |
410 } | |
411 } | |
412 if(list.size()>0){ | |
413 if(ros!=null){ros.add(list, num);} | |
414 } | |
415 } | |
416 | |
417 private static int calcInsertSize(SamLine sl) { | |
418 assert(sl.mapped() && sl.pairedOnSameChrom()); | |
419 assert(sl.primary()); | |
420 assert(!sl.supplementary()); | |
421 assert(sl.leftmost()); | |
422 | |
423 assert(sl.tlen>0) : sl.tlen+"\n\n"+sl; | |
424 return sl.tlen>0 ? sl.tlen : -sl.tlen; | |
425 | |
426 // final int insertSize; | |
427 // String insertTag=null; | |
428 // if(sl.optional!=null){ | |
429 // for(String s : sl.optional){ | |
430 // if(s.startsWith("X8:Z:")){ | |
431 // insertTag=s; | |
432 // break; | |
433 // } | |
434 // } | |
435 // } | |
436 // if(insertTag!=null){ | |
437 // insertSize=Integer.parseInt(insertTag.substring(5)); | |
438 // }else{ | |
439 // insertSize=sl.tlen;//This is unsafe due to indels. | |
440 // assert(false) : "Reads need insert size tags."; | |
441 // } | |
442 // assert(insertSize>0) : sl; | |
443 // | |
444 // return insertSize; | |
445 } | |
446 | |
447 /*--------------------------------------------------------------*/ | |
448 /*---------------- Inner Classes ----------------*/ | |
449 /*--------------------------------------------------------------*/ | |
450 | |
451 /** This class is static to prevent accidental writing to shared variables. | |
452 * It is safe to remove the static modifier. */ | |
453 class ProcessThread extends Thread { | |
454 | |
455 //Constructor | |
456 ProcessThread(final SamStreamer ss_, final int tid_){ | |
457 ss=ss_; | |
458 tid=tid_; | |
459 } | |
460 | |
461 //Called by start() | |
462 @Override | |
463 public void run(){ | |
464 //Do anything necessary prior to processing | |
465 | |
466 //Process the reads | |
467 processInner(); | |
468 | |
469 //Do anything necessary after processing | |
470 | |
471 //Indicate successful exit status | |
472 success=true; | |
473 } | |
474 | |
475 /** Iterate through the reads */ | |
476 void processInner(){ | |
477 | |
478 //Grab and process all lists | |
479 for(ListNum<Read> ln=ss.nextReads(); ln!=null; ln=ss.nextReads()){ | |
480 // if(verbose){outstream.println("Got list of size "+list.size());} //Disabled due to non-static access | |
481 | |
482 processList(ln); | |
483 } | |
484 | |
485 } | |
486 | |
487 void processList(ListNum<Read> ln){ | |
488 | |
489 //Grab the actual read list from the ListNum | |
490 final ArrayList<Read> reads=ln.list; | |
491 | |
492 //Loop through each read in the list | |
493 for(int idx=0; idx<reads.size(); idx++){ | |
494 final Read r=reads.get(idx); | |
495 | |
496 //Validate reads in worker threads | |
497 if(!r.validated()){r.validate(true);} | |
498 | |
499 //Track the initial length for statistics | |
500 final int initialLength=r.length(); | |
501 | |
502 //Increment counters | |
503 readsProcessedT+=r.pairCount(); | |
504 basesProcessedT+=initialLength; | |
505 | |
506 processRead(r); | |
507 } | |
508 } | |
509 | |
510 /** | |
511 * Process a read or a read pair. | |
512 * @param r Read 1 | |
513 * @param r2 Read 2 (may be null) | |
514 * @return True if the reads should be kept, false if they should be discarded. | |
515 */ | |
516 void processRead(final Read r){ | |
517 final SamLine sl=r.samline; | |
518 assert(sl!=null) : sl; | |
519 if(samFilter!=null && !samFilter.passesFilter(sl)){return;} | |
520 | |
521 //sl.nextMapped(); | |
522 if(sl.mapped() && sl.pairedOnSameChrom() && sl.properPair() && sl.primary() && !sl.supplementary() && sl.leftmost()){ | |
523 final String rname=sl.rnameS(); | |
524 Scaffold scaf=refMap.get(rname); | |
525 if(scaf==null){scaf=refMap2.get(Tools.trimToWhitespace(rname));} | |
526 assert(scaf!=null) : "Can't find graph for "+rname; | |
527 | |
528 if(scaf!=null){ | |
529 final int insertSize=calcInsertSize(sl); | |
530 insertCounts.incrementAndGet(Tools.mid(0, insertSize, insertCounts.length())); | |
531 scaf.add(sl, insertSize); | |
532 | |
533 readsOutT++; | |
534 basesOutT+=r.length(); | |
535 | |
536 totalInsertSumT+=insertSize; | |
537 totalInsertCountT++; | |
538 } | |
539 } | |
540 } | |
541 | |
542 /** Number of reads processed by this thread */ | |
543 protected long readsProcessedT=0; | |
544 /** Number of bases processed by this thread */ | |
545 protected long basesProcessedT=0; | |
546 | |
547 /** Number of reads retained by this thread */ | |
548 protected long readsOutT=0; | |
549 /** Number of bases retained by this thread */ | |
550 protected long basesOutT=0; | |
551 | |
552 protected long totalInsertSumT=0; | |
553 protected long totalInsertCountT=0; | |
554 | |
555 long insertSum=0; | |
556 | |
557 /** True only if this thread has completed successfully */ | |
558 boolean success=false; | |
559 | |
560 /** Shared input stream */ | |
561 private final SamStreamer ss; | |
562 /** Thread ID */ | |
563 final int tid; | |
564 } | |
565 | |
566 /*--------------------------------------------------------------*/ | |
567 | |
568 private class Scaffold { | |
569 | |
570 Scaffold(String name_, byte[] bases_, long numericID_){ | |
571 name=name_; | |
572 bases=bases_; | |
573 numericID=(int)numericID_; | |
574 depthArray=new AtomicIntegerArray(bases.length); | |
575 insertArray=new AtomicLongArray(bases.length); | |
576 } | |
577 | |
578 void add(SamLine sl, int insertSize){ | |
579 assert(sl.mapped() && sl.pairedOnSameChrom()); | |
580 assert(sl.primary()); | |
581 assert(!sl.supplementary()); | |
582 assert(sl.leftmost()); | |
583 | |
584 // final int insertSize=calcInsertSize(sl); | |
585 | |
586 int start=sl.pos-1; | |
587 int stop=start+sl.tlen; | |
588 | |
589 int trim=(int)(sl.length()*trimFraction); | |
590 start+=trim; | |
591 stop-=trim; | |
592 | |
593 for(int i=start; i<stop; i++){ | |
594 if(i>=0 && i<bases.length){ | |
595 depthArray.incrementAndGet(i); | |
596 insertArray.addAndGet(i, insertSize); | |
597 } | |
598 } | |
599 | |
600 } | |
601 | |
602 Read fixScaffold(ByteBuilder bb){ | |
603 int streak=0; | |
604 bb.clear(); | |
605 | |
606 if(insertList!=null){ | |
607 for(int i=0; i<bases.length; i++) { | |
608 bb.append(i).tab().append(depthArray.get(i)).tab().append(insertArray.get(i)/(Tools.max(1, depthArray.get(i)))).nl(); | |
609 } | |
610 ReadWrite.writeString(bb, insertList, false); | |
611 bb.clear(); | |
612 } | |
613 | |
614 for(int i=0; i<bases.length; i++){ | |
615 byte b=bases[i]; | |
616 if(b=='N'){ | |
617 streak++; | |
618 }else{ | |
619 if(streak>=scaffoldBreakNs && i-streak>300 && i<bases.length-300){ | |
620 int pivot=i-streak/2-1; | |
621 long depthSum=depthArray.get(pivot); | |
622 long insertSum=insertArray.get(pivot); | |
623 double avgInsert=(insertSum/(double)depthSum); | |
624 | |
625 int avgDepth=((depthArray.get(i-200-streak)+depthArray.get(i+200))/2); | |
626 int percentile=(int)(buckets*Tools.max(0.5, 1.0-depthSum/(double)(avgDepth+depthSum))); | |
627 int insertProxy=insertByPercentile[percentile]; | |
628 | |
629 // assert(false) : Arrays.toString(insertByPercentile); | |
630 | |
631 int dif=(int)Math.round(insertProxy-avgInsert); | |
632 int toAdd=Tools.max(scaffoldBreakNs, streak+dif); | |
633 | |
634 // System.err.println("totalAverageInsert="+(int)totalAverageInsert+", avg="+(int)avgInsert+", dif="+dif); | |
635 // System.err.println("proxy="+insertProxy+", percentile="+percentile+", avgDepth="+(int)avgDepth+", depth="+depthSum); | |
636 // System.err.println("pivot="+pivot+", depthSum="+depthSum+", avg="+(int)avgInsert+", streak="+streak+", dif="+dif+", toAdd="+toAdd); | |
637 | |
638 if(dif>0){ | |
639 //TODO: This is tricky because long-insert reads will be self-selected. | |
640 //Should be compared to average coverage, or nearby coverage, and then consult a size distribution histogram. | |
641 gapsWidened++; | |
642 nsAdded+=dif; | |
643 }else if(dif<0){ | |
644 gapsNarrowed++; | |
645 nsRemoved-=dif; | |
646 }else{ | |
647 gapsUnchanged++; | |
648 } | |
649 nsTotal+=toAdd; | |
650 for(int j=0; j<toAdd; j++){ | |
651 bb.append('N'); | |
652 } | |
653 } | |
654 streak=0; | |
655 bb.append(b); | |
656 } | |
657 } | |
658 | |
659 return new Read(bb.toBytes(), null, name, numericID); | |
660 } | |
661 | |
662 final int numericID; | |
663 final String name; | |
664 final byte[] bases; | |
665 final AtomicIntegerArray depthArray; | |
666 final AtomicLongArray insertArray; | |
667 | |
668 } | |
669 | |
670 /*--------------------------------------------------------------*/ | |
671 /*---------------- Fields ----------------*/ | |
672 /*--------------------------------------------------------------*/ | |
673 | |
674 /** Primary input file path */ | |
675 private String in=null; | |
676 /** Secondary input file path */ | |
677 private String ref=null; | |
678 | |
679 /** Primary output file path */ | |
680 private String out=null; | |
681 | |
682 /** Override input file extension */ | |
683 private String extin=null; | |
684 /** Override output file extension */ | |
685 private String extout=null; | |
686 | |
687 private String insertList=null; | |
688 | |
689 /*--------------------------------------------------------------*/ | |
690 | |
691 /** Number of reads processed */ | |
692 protected long readsProcessed=0; | |
693 /** Number of bases processed */ | |
694 protected long basesProcessed=0; | |
695 | |
696 /** Number of reads retained */ | |
697 protected long readsOut=0; | |
698 /** Number of bases retained */ | |
699 protected long basesOut=0; | |
700 | |
701 protected long scaffoldsOut=0; | |
702 protected long scaffoldLengthOut=0; | |
703 | |
704 protected long gapsUnchanged=0; | |
705 protected long gapsWidened=0; | |
706 protected long gapsNarrowed=0; | |
707 protected long nsAdded=0; | |
708 protected long nsRemoved=0; | |
709 protected long nsTotal=0; | |
710 | |
711 protected long totalInsertSum=0; | |
712 protected long totalInsertCount=0; | |
713 protected double totalAverageInsert; | |
714 | |
715 protected AtomicLongArray insertCounts=new AtomicLongArray(20000); | |
716 protected int[] insertByPercentile; | |
717 | |
718 /** Quit after processing this many input reads; -1 means no limit */ | |
719 private long maxReads=-1; | |
720 | |
721 /*--------------------------------------------------------------*/ | |
722 | |
723 /** Threads dedicated to reading the sam file */ | |
724 private int streamerThreads=SamStreamer.DEFAULT_THREADS; | |
725 | |
726 private boolean loadedRef=false; | |
727 | |
728 private int scaffoldBreakNs=10; | |
729 | |
730 int buckets=1000; | |
731 | |
732 private int minDepth=10; | |
733 | |
734 private float trimFraction=0.4f; | |
735 | |
736 public final SamFilter samFilter=new SamFilter(); | |
737 | |
738 /** Uses full ref names */ | |
739 public LinkedHashMap<String, Scaffold> refMap=new LinkedHashMap<String, Scaffold>(); | |
740 /** Uses truncated ref names */ | |
741 public LinkedHashMap<String, Scaffold> refMap2=new LinkedHashMap<String, Scaffold>();; | |
742 | |
743 /*--------------------------------------------------------------*/ | |
744 /*---------------- Final Fields ----------------*/ | |
745 /*--------------------------------------------------------------*/ | |
746 | |
747 /** Primary input file */ | |
748 private final FileFormat ffin; | |
749 /** Secondary input file */ | |
750 private final FileFormat ffref; | |
751 | |
752 /** Primary output file */ | |
753 private final FileFormat ffout; | |
754 | |
755 /*--------------------------------------------------------------*/ | |
756 /*---------------- Common Fields ----------------*/ | |
757 /*--------------------------------------------------------------*/ | |
758 | |
759 /** Print status messages to this output stream */ | |
760 private PrintStream outstream=System.err; | |
761 /** Print verbose messages */ | |
762 public static boolean verbose=false; | |
763 /** True if an error was encountered */ | |
764 public boolean errorState=false; | |
765 /** Overwrite existing output files */ | |
766 private boolean overwrite=false; | |
767 /** Append to existing output files */ | |
768 private boolean append=false; | |
769 /** Reads are output in input order */ | |
770 private boolean ordered=false; | |
771 | |
772 } |