Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/consensus/Lilypad.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package consensus; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.LinkedHashMap; | |
6 import java.util.Map.Entry; | |
7 import java.util.concurrent.atomic.AtomicIntegerArray; | |
8 import java.util.concurrent.atomic.AtomicLongArray; | |
9 | |
10 import dna.AminoAcid; | |
11 import fileIO.ByteFile; | |
12 import fileIO.FileFormat; | |
13 import fileIO.ReadWrite; | |
14 import shared.Parse; | |
15 import shared.Parser; | |
16 import shared.PreParser; | |
17 import shared.ReadStats; | |
18 import shared.Shared; | |
19 import shared.Timer; | |
20 import shared.Tools; | |
21 import stream.ConcurrentReadInputStream; | |
22 import stream.ConcurrentReadOutputStream; | |
23 import stream.FastaReadInputStream; | |
24 import stream.Read; | |
25 import stream.SamLine; | |
26 import stream.SamReadStreamer; | |
27 import stream.SamStreamer; | |
28 import structures.ByteBuilder; | |
29 import structures.ListNum; | |
30 import template.Accumulator; | |
31 import template.ThreadWaiter; | |
32 import var2.SamFilter; | |
33 | |
34 /** | |
35 * Scaffolds contigs based on paired read mapping. | |
36 * | |
37 * @author Brian Bushnell | |
38 * @date September 11, 2019 | |
39 * | |
40 */ | |
41 public class Lilypad implements Accumulator<Lilypad.ProcessThread> { | |
42 | |
43 /*--------------------------------------------------------------*/ | |
44 /*---------------- Initialization ----------------*/ | |
45 /*--------------------------------------------------------------*/ | |
46 | |
47 /** | |
48 * Code entrance from the command line. | |
49 * @param args Command line arguments | |
50 */ | |
51 public static void main(String[] args){ | |
52 //Start a timer immediately upon code entrance. | |
53 Timer t=new Timer(); | |
54 | |
55 //Create an instance of this class | |
56 Lilypad x=new Lilypad(args); | |
57 | |
58 //Run the object | |
59 x.process(t); | |
60 | |
61 //Close the print stream if it was redirected | |
62 Shared.closeStream(x.outstream); | |
63 } | |
64 | |
65 /** | |
66 * Constructor. | |
67 * @param args Command line arguments | |
68 */ | |
69 public Lilypad(String[] args){ | |
70 | |
71 {//Preparse block for help, config files, and outstream | |
72 PreParser pp=new PreParser(args, getClass(), false); | |
73 args=pp.args; | |
74 outstream=pp.outstream; | |
75 } | |
76 | |
77 //Set shared static variables prior to parsing | |
78 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
79 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
80 SamLine.RNAME_AS_BYTES=false; | |
81 | |
82 samFilter.includeUnmapped=false; | |
83 samFilter.includeSupplimentary=false; | |
84 // samFilter.includeDuplicate=false; | |
85 samFilter.includeNonPrimary=false; | |
86 samFilter.includeQfail=false; | |
87 samFilter.minMapq=4; | |
88 | |
89 {//Parse the arguments | |
90 final Parser parser=parse(args); | |
91 | |
92 Parser.processQuality(); | |
93 | |
94 maxReads=parser.maxReads; | |
95 overwrite=ReadStats.overwrite=parser.overwrite; | |
96 append=ReadStats.append=parser.append; | |
97 | |
98 in=parser.in1; | |
99 extin=parser.extin; | |
100 | |
101 out=parser.out1; | |
102 extout=parser.extout; | |
103 } | |
104 | |
105 { | |
106 // if("auto".equalsIgnoreCase(atomic)){Scaffold.setCA3A(Shared.threads()>8);} | |
107 // else{Scaffold.setCA3A(Parse.parseBoolean(atomic));} | |
108 samFilter.setSamtoolsFilter(); | |
109 | |
110 streamerThreads=Tools.max(1, Tools.min(streamerThreads, Shared.threads())); | |
111 assert(streamerThreads>0) : streamerThreads; | |
112 } | |
113 | |
114 validateParams(); | |
115 fixExtensions(); //Add or remove .gz or .bz2 as needed | |
116 checkFileExistence(); //Ensure files can be read and written | |
117 checkStatics(); //Adjust file-related static fields as needed for this program | |
118 | |
119 //Create output FileFormat objects | |
120 ffout=FileFormat.testOutput(out, FileFormat.FASTA, extout, true, overwrite, append, ordered); | |
121 | |
122 //Create input FileFormat objects | |
123 ffin=FileFormat.testInput(in, FileFormat.SAM, extin, true, true); | |
124 ffref=FileFormat.testInput(ref, FileFormat.FASTA, null, true, true); | |
125 } | |
126 | |
127 /*--------------------------------------------------------------*/ | |
128 /*---------------- Initialization Helpers ----------------*/ | |
129 /*--------------------------------------------------------------*/ | |
130 | |
131 /** Parse arguments from the command line */ | |
132 private Parser parse(String[] args){ | |
133 | |
134 //Create a parser object | |
135 Parser parser=new Parser(); | |
136 | |
137 //Set any necessary Parser defaults here | |
138 //parser.foo=bar; | |
139 | |
140 //Parse each argument | |
141 for(int i=0; i<args.length; i++){ | |
142 String arg=args[i]; | |
143 | |
144 //Break arguments into their constituent parts, in the form of "a=b" | |
145 String[] split=arg.split("="); | |
146 String a=split[0].toLowerCase(); | |
147 String b=split.length>1 ? split[1] : null; | |
148 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
149 | |
150 if(a.equals("verbose")){ | |
151 verbose=Parse.parseBoolean(b); | |
152 }else if(a.equals("ref") || a.equals("scaffolds")){ | |
153 ref=b; | |
154 }else if(a.equals("insertlist")){ | |
155 insertList=b; | |
156 }else if(a.equals("ordered")){ | |
157 ordered=Parse.parseBoolean(b); | |
158 }else if(a.equalsIgnoreCase("sameStrandPairs")){ | |
159 sameStrandPairs=Parse.parseBoolean(b); | |
160 }else if(a.equalsIgnoreCase("ns") || a.equalsIgnoreCase("n") || a.equalsIgnoreCase("scaffoldbreak") || a.equalsIgnoreCase("gap") || a.equalsIgnoreCase("mingap")){ | |
161 scaffoldBreakNs=Integer.parseInt(b); | |
162 assert(scaffoldBreakNs>0); | |
163 }else if(a.equalsIgnoreCase("mindepth")){ | |
164 minDepth=Integer.parseInt(b); | |
165 assert(minDepth>0); | |
166 }else if(a.equalsIgnoreCase("maxinsert")){ | |
167 maxPairDist=Parse.parseIntKMG(b); | |
168 }else if(a.equalsIgnoreCase("minWeightRatio") || a.equalsIgnoreCase("minwr")){ | |
169 minWeightRatio=Float.parseFloat(b); | |
170 }else if(a.equalsIgnoreCase("minStrandRatio") || a.equalsIgnoreCase("minsr")){ | |
171 minStrandRatio=Float.parseFloat(b); | |
172 }else if(a.equals("clearfilters") || a.equals("clearfilter")){ | |
173 if(Parse.parseBoolean(b)){ | |
174 samFilter.clear(); | |
175 } | |
176 }else if(a.equals("parse_flag_goes_here")){ | |
177 long fake_variable=Parse.parseKMG(b); | |
178 //Set a variable here | |
179 }else if(samFilter.parse(arg, a, b)){ | |
180 //do nothing | |
181 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
182 //do nothing | |
183 }else{ | |
184 outstream.println("Unknown parameter "+args[i]); | |
185 assert(false) : "Unknown parameter "+args[i]; | |
186 } | |
187 } | |
188 | |
189 return parser; | |
190 } | |
191 | |
192 /** Add or remove .gz or .bz2 as needed */ | |
193 private void fixExtensions(){ | |
194 in=Tools.fixExtension(in); | |
195 ref=Tools.fixExtension(ref); | |
196 } | |
197 | |
198 /** Ensure files can be read and written */ | |
199 private void checkFileExistence(){ | |
200 | |
201 //Ensure there is an input file | |
202 if(in==null){throw new RuntimeException("Error - an input file is required.");} | |
203 | |
204 //Ensure there is an input file | |
205 if(ref==null){throw new RuntimeException("Error - a reference file is required.");} | |
206 | |
207 //Ensure output files can be written | |
208 if(!Tools.testOutputFiles(overwrite, append, false, out)){ | |
209 outstream.println((out==null)+", "+out); | |
210 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n"); | |
211 } | |
212 | |
213 //Ensure input files can be read | |
214 if(!Tools.testInputFiles(false, true, in, ref)){ | |
215 throw new RuntimeException("\nCan't read some input files.\n"); | |
216 } | |
217 | |
218 //Ensure that no file was specified multiple times | |
219 if(!Tools.testForDuplicateFiles(true, in, ref, out)){ | |
220 throw new RuntimeException("\nSome file names were specified multiple times.\n"); | |
221 } | |
222 } | |
223 | |
224 /** Adjust file-related static fields as needed for this program */ | |
225 private static void checkStatics(){ | |
226 //Adjust the number of threads for input file reading | |
227 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
228 ByteFile.FORCE_MODE_BF2=true; | |
229 } | |
230 | |
231 assert(FastaReadInputStream.settingsOK()); | |
232 } | |
233 | |
234 /** Ensure parameter ranges are within bounds and required parameters are set */ | |
235 private boolean validateParams(){ | |
236 // assert(minfoo>0 && minfoo<=maxfoo) : minfoo+", "+maxfoo; | |
237 return true; | |
238 } | |
239 | |
240 /*--------------------------------------------------------------*/ | |
241 /*---------------- Outer Methods ----------------*/ | |
242 /*--------------------------------------------------------------*/ | |
243 | |
244 /** Create read streams and process all data */ | |
245 void process(Timer t){ | |
246 | |
247 //Turn off read validation in the input threads to increase speed | |
248 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; | |
249 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4; | |
250 | |
251 //Create a read input stream | |
252 final SamStreamer ss=makeStreamer(ffin); | |
253 | |
254 //Load reference | |
255 loadReferenceCustom(); | |
256 | |
257 //Reset counters | |
258 readsProcessed=readsOut=0; | |
259 basesProcessed=basesOut=0; | |
260 | |
261 //Process the reads in separate threads | |
262 spawnThreads(ss); | |
263 | |
264 //Optionally create a read output stream | |
265 final ConcurrentReadOutputStream ros=makeCros(); | |
266 | |
267 if(verbose){outstream.println("Fixing reference.");} | |
268 | |
269 makeScaffolds(ros); | |
270 | |
271 if(verbose){outstream.println("Finished; closing streams.");} | |
272 | |
273 //Write anything that was accumulated by ReadStats | |
274 errorState|=ReadStats.writeAll(); | |
275 //Close the read streams | |
276 errorState|=ReadWrite.closeStream(ros); | |
277 | |
278 //Reset read validation | |
279 Read.VALIDATE_IN_CONSTRUCTOR=vic; | |
280 | |
281 //Report timing and results | |
282 t.stop(); | |
283 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
284 outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, scaffoldsOut, scaffoldLengthOut, 8, false)); | |
285 | |
286 outstream.println(); | |
287 outstream.println(Tools.number("Average Insert", totalAverageInsert, 2, 8)); | |
288 outstream.println(Tools.number("Joins Made ", gapsAdded, 8)); | |
289 outstream.println(Tools.number("Ns Added ", nsAdded, 8)); | |
290 outstream.println(Tools.number("Contigs In ", refMap.size(), 8)); | |
291 outstream.println(Tools.number("Scaffolds Out ", scaffoldsOut, 8)); | |
292 | |
293 | |
294 //Throw an exception of there was an error in a thread | |
295 if(errorState){ | |
296 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
297 } | |
298 } | |
299 | |
300 private synchronized void loadReferenceCustom(){ | |
301 assert(!loadedRef); | |
302 ConcurrentReadInputStream cris=makeRefCris(); | |
303 for(ListNum<Read> ln=cris.nextList(); ln!=null && ln.size()>0; ln=cris.nextList()) { | |
304 for(Read r : ln){ | |
305 String name=r.id; | |
306 String name2=Tools.trimToWhitespace(r.id); | |
307 Contig cont=new Contig(name, r.bases, r.numericID); | |
308 refMap.put(name, cont); | |
309 refMap2.put(name2, cont); | |
310 } | |
311 cris.returnList(ln); | |
312 } | |
313 ReadWrite.closeStream(cris); | |
314 loadedRef=true; | |
315 } | |
316 | |
317 private ConcurrentReadInputStream makeRefCris(){ | |
318 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffref, null); | |
319 cris.start(); //Start the stream | |
320 if(verbose){outstream.println("Started cris");} | |
321 boolean paired=cris.paired(); | |
322 assert(!paired) : "References should not be paired."; | |
323 return cris; | |
324 } | |
325 | |
326 private SamStreamer makeStreamer(FileFormat ff){ | |
327 if(ff==null){return null;} | |
328 SamStreamer ss=new SamReadStreamer(ff, streamerThreads, true, maxReads); | |
329 ss.start(); //Start the stream | |
330 if(verbose){outstream.println("Started Streamer");} | |
331 return ss; | |
332 } | |
333 | |
334 private ConcurrentReadOutputStream makeCros(){ | |
335 if(ffout==null){return null;} | |
336 | |
337 //Select output buffer size based on whether it needs to be ordered | |
338 final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8); | |
339 | |
340 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout, null, buff, null, false); | |
341 ros.start(); //Start the stream | |
342 return ros; | |
343 } | |
344 | |
345 /*--------------------------------------------------------------*/ | |
346 /*---------------- Thread Management ----------------*/ | |
347 /*--------------------------------------------------------------*/ | |
348 | |
349 /** Spawn process threads */ | |
350 private void spawnThreads(final SamStreamer ss){ | |
351 | |
352 //Do anything necessary prior to processing | |
353 | |
354 //Determine how many threads may be used | |
355 final int threads=Shared.threads(); | |
356 | |
357 //Fill a list with ProcessThreads | |
358 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
359 for(int i=0; i<threads; i++){ | |
360 alpt.add(new ProcessThread(ss, i)); | |
361 } | |
362 | |
363 //Start the threads | |
364 for(ProcessThread pt : alpt){ | |
365 pt.start(); | |
366 } | |
367 | |
368 //Wait for threads to finish | |
369 boolean success=ThreadWaiter.waitForThreads(alpt, this); | |
370 errorState&=!success; | |
371 | |
372 //Do anything necessary after processing | |
373 totalAverageInsert=totalInsertSum/(double)totalInsertCount; | |
374 insertByPercentile=Tools.makeHistogram(insertCounts, buckets); | |
375 } | |
376 | |
377 @Override | |
378 public final void accumulate(ProcessThread pt){ | |
379 readsProcessed+=pt.readsProcessedT; | |
380 basesProcessed+=pt.basesProcessedT; | |
381 readsOut+=pt.readsOutT; | |
382 basesOut+=pt.basesOutT; | |
383 | |
384 totalInsertSum+=pt.totalInsertSumT; | |
385 totalInsertCount+=pt.totalInsertCountT; | |
386 | |
387 errorState|=(!pt.success); | |
388 } | |
389 | |
390 @Override | |
391 public final boolean success(){return !errorState;} | |
392 | |
393 /*--------------------------------------------------------------*/ | |
394 /*---------------- Inner Methods ----------------*/ | |
395 /*--------------------------------------------------------------*/ | |
396 | |
397 private void makeScaffolds(ConcurrentReadOutputStream ros){ | |
398 ByteBuilder bb=new ByteBuilder(1000000); | |
399 | |
400 ArrayList<Read> list=new ArrayList<Read>(200); | |
401 long num=0; | |
402 long lengthSum=0; | |
403 for(Entry<String, Contig> e : refMap.entrySet()){ | |
404 Contig cont=e.getValue(); | |
405 if(!cont.processed()){ | |
406 Read r=cont.makeScaffold(bb); | |
407 assert(r!=null); | |
408 | |
409 lengthSum+=r.length(); | |
410 list.add(r); | |
411 scaffoldsOut++; | |
412 scaffoldLengthOut+=r.length(); | |
413 | |
414 if(list.size()>=200 || lengthSum>=100000){ | |
415 if(ros!=null){ros.add(list, num);} | |
416 list=new ArrayList<Read>(200); | |
417 num++; | |
418 lengthSum=0; | |
419 } | |
420 assert(cont.processed()); | |
421 } | |
422 } | |
423 if(list.size()>0){ | |
424 if(ros!=null){ros.add(list, num);} | |
425 } | |
426 } | |
427 | |
428 private static int calcInsertSize(SamLine sl) { | |
429 assert(sl.mapped() && sl.pairedOnSameChrom()); | |
430 assert(sl.primary()); | |
431 assert(!sl.supplementary()); | |
432 assert(sl.leftmost()); | |
433 | |
434 assert(sl.tlen>0) : sl.tlen+"\n\n"+sl; | |
435 return sl.tlen>0 ? sl.tlen : -sl.tlen; | |
436 | |
437 // final int insertSize; | |
438 // String insertTag=null; | |
439 // if(sl.optional!=null){ | |
440 // for(String s : sl.optional){ | |
441 // if(s.startsWith("X8:Z:")){ | |
442 // insertTag=s; | |
443 // break; | |
444 // } | |
445 // } | |
446 // } | |
447 // if(insertTag!=null){ | |
448 // insertSize=Integer.parseInt(insertTag.substring(5)); | |
449 // }else{ | |
450 // insertSize=sl.tlen;//This is unsafe due to indels. | |
451 // assert(false) : "Reads need insert size tags."; | |
452 // } | |
453 // assert(insertSize>0) : sl; | |
454 // | |
455 // return insertSize; | |
456 } | |
457 | |
458 private Contig getScaffold(String rname){ | |
459 Contig scaf=refMap.get(rname); | |
460 if(scaf==null){scaf=refMap2.get(Tools.trimToWhitespace(rname));} | |
461 assert(scaf!=null) : "Can't find graph for "+rname; | |
462 return scaf; | |
463 } | |
464 | |
465 /*--------------------------------------------------------------*/ | |
466 /*---------------- Inner Classes ----------------*/ | |
467 /*--------------------------------------------------------------*/ | |
468 | |
469 /** This class is static to prevent accidental writing to shared variables. | |
470 * It is safe to remove the static modifier. */ | |
471 class ProcessThread extends Thread { | |
472 | |
473 //Constructor | |
474 ProcessThread(final SamStreamer ss_, final int tid_){ | |
475 ss=ss_; | |
476 tid=tid_; | |
477 } | |
478 | |
479 //Called by start() | |
480 @Override | |
481 public void run(){ | |
482 //Do anything necessary prior to processing | |
483 | |
484 //Process the reads | |
485 processInner(); | |
486 | |
487 //Do anything necessary after processing | |
488 | |
489 //Indicate successful exit status | |
490 success=true; | |
491 } | |
492 | |
493 /** Iterate through the reads */ | |
494 void processInner(){ | |
495 | |
496 //Grab and process all lists | |
497 for(ListNum<Read> ln=ss.nextReads(); ln!=null; ln=ss.nextReads()){ | |
498 // if(verbose){outstream.println("Got list of size "+list.size());} //Disabled due to non-static access | |
499 | |
500 processList(ln); | |
501 } | |
502 | |
503 } | |
504 | |
505 void processList(ListNum<Read> ln){ | |
506 | |
507 //Grab the actual read list from the ListNum | |
508 final ArrayList<Read> reads=ln.list; | |
509 | |
510 //Loop through each read in the list | |
511 for(int idx=0; idx<reads.size(); idx++){ | |
512 final Read r=reads.get(idx); | |
513 | |
514 //Validate reads in worker threads | |
515 if(!r.validated()){r.validate(true);} | |
516 | |
517 //Track the initial length for statistics | |
518 final int initialLength=r.length(); | |
519 | |
520 //Increment counters | |
521 readsProcessedT+=r.pairCount(); | |
522 basesProcessedT+=initialLength; | |
523 | |
524 processRead(r); | |
525 } | |
526 } | |
527 | |
528 /** | |
529 * Process a read or a read pair. | |
530 * @param r Read 1 | |
531 * @param r2 Read 2 (may be null) | |
532 * @return True if the reads should be kept, false if they should be discarded. | |
533 */ | |
534 void processRead(final Read r){ | |
535 final SamLine sl=r.samline; | |
536 assert(sl!=null) : sl; | |
537 if(samFilter!=null && !samFilter.passesFilter(sl)){return;} | |
538 | |
539 //sl.nextMapped(); | |
540 if(sl.mapped() && sl.primary() && !sl.supplementary()){ | |
541 final String rname=sl.rnameS(); | |
542 Contig scaf=getScaffold(rname); | |
543 if(scaf!=null){ | |
544 if(sl.pairedOnSameChrom() && sl.properPair() && sl.leftmost()){ | |
545 final int insertSize=calcInsertSize(sl); | |
546 insertCounts.incrementAndGet(Tools.mid(0, insertSize, insertCounts.length()-1)); | |
547 totalInsertSumT+=insertSize; | |
548 totalInsertCountT++; | |
549 } | |
550 scaf.add(sl); | |
551 | |
552 readsOutT++; | |
553 basesOutT+=r.length(); | |
554 } | |
555 } | |
556 if(sl.mapped() && sl.pairedOnSameChrom() && sl.properPair() && sl.primary() && !sl.supplementary() && sl.leftmost()){ | |
557 final String rname=sl.rnameS(); | |
558 | |
559 Contig scaf=getScaffold(rname); | |
560 if(scaf!=null){ | |
561 final int insertSize=calcInsertSize(sl); | |
562 insertCounts.incrementAndGet(Tools.mid(0, insertSize, insertCounts.length()-1)); | |
563 scaf.add(sl); | |
564 | |
565 readsOutT++; | |
566 basesOutT+=r.length(); | |
567 | |
568 totalInsertSumT+=insertSize; | |
569 totalInsertCountT++; | |
570 } | |
571 } | |
572 } | |
573 | |
574 /** Number of reads processed by this thread */ | |
575 protected long readsProcessedT=0; | |
576 /** Number of bases processed by this thread */ | |
577 protected long basesProcessedT=0; | |
578 | |
579 /** Number of reads retained by this thread */ | |
580 protected long readsOutT=0; | |
581 /** Number of bases retained by this thread */ | |
582 protected long basesOutT=0; | |
583 | |
584 protected long totalInsertSumT=0; | |
585 protected long totalInsertCountT=0; | |
586 | |
587 long insertSum=0; | |
588 | |
589 /** True only if this thread has completed successfully */ | |
590 boolean success=false; | |
591 | |
592 /** Shared input stream */ | |
593 private final SamStreamer ss; | |
594 /** Thread ID */ | |
595 final int tid; | |
596 } | |
597 | |
598 Contig findLeftmost(Contig source){ | |
599 if(verbose){System.err.println("findLeftmost("+source.name+")");} | |
600 while(true) { | |
601 assert(!source.processed()); | |
602 if(source.processed()){return null;} | |
603 source.processedLeft=true; | |
604 Edge se=source.bestEdge(true); | |
605 if(verbose){System.err.println("Found source edge "+se);} | |
606 if(se==null){return source;} | |
607 Contig dest=se.dest; | |
608 if(dest.processed()){ | |
609 if(verbose){System.err.println("Dest was processed; returning.");} | |
610 return source; | |
611 } | |
612 if(se.sameStrand()){ | |
613 if(source.strand==dest.strand){ | |
614 | |
615 }else{ | |
616 if(verbose){System.err.println("Flipping "+dest.name);} | |
617 dest.flip(); | |
618 } | |
619 }else{ | |
620 if(source.strand==dest.strand){ | |
621 if(verbose){System.err.println("Flipping "+dest.name);} | |
622 dest.flip(); | |
623 }else{ | |
624 | |
625 } | |
626 } | |
627 Edge de=dest.bestEdge(false); | |
628 if(verbose){System.err.println("Found dest edge "+de);} | |
629 if(de==null || de.dest!=source){ | |
630 if(dest.strand==1){dest.flip();} | |
631 if(verbose){System.err.println("Dest edge did not match; returning.");} | |
632 return source; | |
633 } | |
634 source=dest; | |
635 if(verbose){System.err.println("Migrated to next node.");} | |
636 } | |
637 } | |
638 | |
639 Read expandRight(final Contig source0, ByteBuilder bb){ | |
640 if(verbose){System.err.println("expandRight("+source0.name+")");} | |
641 bb.clear(); | |
642 Contig source=source0; | |
643 while(true) { | |
644 assert(!source.processedRight); | |
645 if(source.processedRight){return null;} | |
646 if(source.strand==1){ | |
647 Tools.reverseInPlace(source.depthArray); | |
648 } | |
649 source.processedRight=true; | |
650 bb.append(source.bases); | |
651 Edge se=source.bestEdge(false); | |
652 if(verbose){System.err.println("Found source edge "+se);} | |
653 if(se==null){break;} | |
654 Contig dest=se.dest; | |
655 if(dest.processedRight){ | |
656 if(verbose){System.err.println("Dest was processed; returning.");} | |
657 break; | |
658 } | |
659 if(se.sameStrand()){ | |
660 if(source.strand==dest.strand){ | |
661 | |
662 }else{ | |
663 if(verbose){System.err.println("Flipping "+dest.name);} | |
664 dest.flip(); | |
665 } | |
666 }else{ | |
667 if(source.strand==dest.strand){ | |
668 if(verbose){System.err.println("Flipping "+dest.name);} | |
669 dest.flip(); | |
670 }else{ | |
671 | |
672 } | |
673 } | |
674 Edge de=dest.bestEdge(true); | |
675 if(verbose){System.err.println("Found dest edge "+de);} | |
676 if(de==null || de.dest!=source){ | |
677 if(verbose){System.err.println("Dest edge did not match; returning.");} | |
678 if(dest.strand==1){dest.flip();} | |
679 break; | |
680 } | |
681 | |
682 //Now append Ns | |
683 int observedLength=(int)(se.distanceSum/se.count()); | |
684 long depth=se.count(); | |
685 int depthProxyIndex=(source.length()-Tools.min(source.length()/2, 300)); | |
686 long depthProxy=source.depthArray.get(depthProxyIndex); | |
687 int percentile=(int)(buckets*depth/(float)(depth+depthProxy)); | |
688 int inferredLength=insertByPercentile[percentile]; | |
689 | |
690 int Ns=(Tools.max(scaffoldBreakNs, inferredLength-observedLength)); | |
691 for(int i=0; i<Ns; i++){bb.append('N');} | |
692 source=dest; | |
693 | |
694 gapsAdded++; | |
695 nsAdded+=Ns; | |
696 } | |
697 Read r=new Read(bb.toBytes(), null, source0.name, source0.numericID); | |
698 return r; | |
699 } | |
700 | |
701 /*--------------------------------------------------------------*/ | |
702 | |
703 private class Contig { | |
704 | |
705 Contig(String name_, byte[] bases_, long numericID_){ | |
706 name=name_; | |
707 bases=bases_; | |
708 numericID=(int)numericID_; | |
709 depthArray=new AtomicIntegerArray(bases.length); | |
710 } | |
711 | |
712 public Read makeScaffold(ByteBuilder bb) { | |
713 assert(!processed()); | |
714 Contig leftmost=findLeftmost(this); | |
715 return expandRight(leftmost, bb); | |
716 } | |
717 | |
718 Edge bestEdge(boolean left) { | |
719 final LinkedHashMap<String, Edge> map=(left ? leftEdgeMap : rightEdgeMap); | |
720 if(map.isEmpty()){return null;} | |
721 long weightSum=0; | |
722 long countSum=0; | |
723 Edge best=null; | |
724 for(Entry<String, Edge> entry : map.entrySet()){ | |
725 Edge e=entry.getValue(); | |
726 weightSum+=e.weight; | |
727 countSum+=e.count(); | |
728 if(best==null || e.weight>best.weight){best=e;} | |
729 } | |
730 if(best.count()<minDepth){return null;} | |
731 if(weightSum*minWeightRatio>best.weight){return null;} | |
732 if(best.strandRatio()<minStrandRatio){return null;} | |
733 if(best.badCount>0.5*best.count()){return null;} | |
734 return best; | |
735 } | |
736 | |
737 void add(SamLine sl){ | |
738 assert(sl.mapped() && sl.primary() && !sl.supplementary()); | |
739 if(sl.nextMapped()){ | |
740 if(sl.pairedOnSameChrom()){ | |
741 if(!sl.properPair()){ | |
742 addCoverageSingleton(sl); | |
743 }else if(sl.leftmost()){ | |
744 addCoveragePaired(sl); | |
745 } | |
746 }else{ | |
747 addCoverageSingleton(sl); | |
748 handleMixedPair(sl); | |
749 } | |
750 }else{ | |
751 addCoverageSingleton(sl); | |
752 } | |
753 } | |
754 | |
755 private void addCoverageSingleton(SamLine sl){ | |
756 assert(sl.cigar!=null); | |
757 int start=sl.pos-1; | |
758 int stop=start+SamLine.calcCigarLength(sl.cigar, false, false); | |
759 | |
760 for(int i=start; i<stop; i++){ | |
761 if(i>=0 && i<bases.length){ | |
762 depthArray.incrementAndGet(i); | |
763 } | |
764 } | |
765 } | |
766 | |
767 private void addCoveragePaired(SamLine sl){ | |
768 assert(sl.cigar!=null); | |
769 assert(sl.leftmost() && sl.pairedOnSameChrom() && sl.nextMapped()); | |
770 int start=sl.pos-1; | |
771 int stop=start+sl.tlen; | |
772 | |
773 for(int i=start; i<stop; i++){ | |
774 if(i>=0 && i<bases.length){ | |
775 depthArray.incrementAndGet(i); | |
776 } | |
777 } | |
778 } | |
779 | |
780 /** Reads mapping to different contigs */ | |
781 private void handleMixedPair(SamLine sl){ | |
782 assert(sl.mapped() && sl.nextMapped() && !sl.pairedOnSameChrom()); | |
783 String rname=sl.rnameS(); | |
784 String rnext=sl.rnextS(); | |
785 if(rname.equals(rnext)){return;} | |
786 | |
787 final boolean left=(sl.strand()==1); | |
788 LinkedHashMap<String, Edge> map=(left ? leftEdgeMap : rightEdgeMap); | |
789 Edge e=map.get(rnext); | |
790 Contig dest=null; | |
791 if(e==null){ | |
792 dest=getScaffold(rnext); | |
793 if(dest==null){return;} | |
794 e=new Edge(this, dest, left); | |
795 map.put(rnext, e); | |
796 } | |
797 e.add(sl); | |
798 } | |
799 | |
800 void flip(){//Be careful with this | |
801 AminoAcid.reverseComplementBasesInPlace(bases); | |
802 strand^=1; | |
803 LinkedHashMap<String, Edge> temp=leftEdgeMap; | |
804 leftEdgeMap=rightEdgeMap; | |
805 rightEdgeMap=temp; | |
806 } | |
807 | |
808 int length(){return bases.length;} | |
809 | |
810 final int numericID; | |
811 final String name; | |
812 final byte[] bases; | |
813 final AtomicIntegerArray depthArray; | |
814 int strand=0; | |
815 | |
816 boolean processedLeft=false; | |
817 boolean processedRight=false; | |
818 boolean processed(){return processedLeft || processedRight;} | |
819 | |
820 LinkedHashMap<String, Edge> leftEdgeMap=new LinkedHashMap<String, Edge>(); | |
821 LinkedHashMap<String, Edge> rightEdgeMap=new LinkedHashMap<String, Edge>(); | |
822 } | |
823 | |
824 private class Edge{ | |
825 | |
826 Edge(Contig source_, Contig dest_, boolean left_){ | |
827 source=source_; | |
828 dest=dest_; | |
829 leftEdge=left_; | |
830 } | |
831 | |
832 void add(SamLine sl){ | |
833 final boolean sameStrandReads=(sl.strand()==sl.mateStrand()); | |
834 final boolean sameStrandContigs=(sameStrandPairs==sameStrandReads); | |
835 final int spos, dpos; | |
836 if(leftEdge){ | |
837 spos=sl.pos+sl.calcCigarLength(true, false)-1; | |
838 dpos=(sameStrandContigs ? dest.length()-sl.pnext : sl.pnext+sl.length())-1; | |
839 }else{ | |
840 spos=source.length()-sl.pos-1; | |
841 dpos=(sameStrandContigs ? sl.pnext+sl.length() : dest.length()-sl.pnext)-1; | |
842 } | |
843 final int distance=spos+dpos; | |
844 | |
845 if(distance>maxPairDist){ | |
846 | |
847 // assert(false) : "distance="+distance+", spos="+spos+", dpos="+dpos+", sameStrandContigs="+sameStrandContigs+ | |
848 // "\nsl.pos="+sl.pos+", sl.pnext="+sl.pnext+", strand="+sl.strand()+", nextStrand="+sl.mateStrand()+", left="+leftEdge | |
849 // +"\n"+sl; | |
850 // badCount++; | |
851 return; | |
852 } | |
853 | |
854 distanceSum+=distance; | |
855 | |
856 weight+=sl.mapq; | |
857 if(sameStrandContigs){ | |
858 sameStrandCount++; | |
859 }else{ | |
860 difStrandCount++; | |
861 } | |
862 // assert(false) : weight; | |
863 } | |
864 | |
865 public float strandRatio() { | |
866 return Tools.max(sameStrandCount, difStrandCount)/(float)(sameStrandCount+difStrandCount); | |
867 } | |
868 | |
869 public boolean sameStrand(){ | |
870 return sameStrandCount>=difStrandCount; | |
871 } | |
872 | |
873 @Override | |
874 public String toString(){ | |
875 return "("+source.name+"->"+dest.name+", "+(leftEdge ? "left" : "right")+", weight="+weight+ | |
876 ", same="+sameStrandCount+", dif="+difStrandCount+", bad="+badCount+")"; | |
877 } | |
878 | |
879 long count(){return sameStrandCount+difStrandCount;} | |
880 | |
881 final Contig source; | |
882 final Contig dest; | |
883 long sameStrandCount; | |
884 long difStrandCount; | |
885 long distanceSum; | |
886 long weight; | |
887 long badCount; | |
888 final boolean leftEdge; | |
889 } | |
890 | |
891 /*--------------------------------------------------------------*/ | |
892 /*---------------- Fields ----------------*/ | |
893 /*--------------------------------------------------------------*/ | |
894 | |
895 /** Primary input file path */ | |
896 private String in=null; | |
897 /** Secondary input file path */ | |
898 private String ref=null; | |
899 | |
900 /** Primary output file path */ | |
901 private String out=null; | |
902 | |
903 /** Override input file extension */ | |
904 private String extin=null; | |
905 /** Override output file extension */ | |
906 private String extout=null; | |
907 | |
908 private String insertList=null; | |
909 | |
910 /*--------------------------------------------------------------*/ | |
911 | |
912 /** Number of reads processed */ | |
913 protected long readsProcessed=0; | |
914 /** Number of bases processed */ | |
915 protected long basesProcessed=0; | |
916 | |
917 /** Number of reads retained */ | |
918 protected long readsOut=0; | |
919 /** Number of bases retained */ | |
920 protected long basesOut=0; | |
921 | |
922 protected long scaffoldsOut=0; | |
923 protected long scaffoldLengthOut=0; | |
924 | |
925 protected long totalInsertSum=0; | |
926 protected long totalInsertCount=0; | |
927 protected double totalAverageInsert; | |
928 | |
929 /** Quit after processing this many input reads; -1 means no limit */ | |
930 private long maxReads=-1; | |
931 | |
932 boolean sameStrandPairs=false; | |
933 | |
934 int gapsAdded=0; | |
935 long nsAdded=0; | |
936 | |
937 /*--------------------------------------------------------------*/ | |
938 | |
939 /** Threads dedicated to reading the sam file */ | |
940 private int streamerThreads=SamStreamer.DEFAULT_THREADS; | |
941 | |
942 private boolean loadedRef=false; | |
943 | |
944 private int minDepth=4; | |
945 | |
946 private float minWeightRatio=0.8f; | |
947 private float minStrandRatio=0.8f; | |
948 | |
949 private int scaffoldBreakNs=10; | |
950 | |
951 private int maxPairDist=3000; | |
952 | |
953 private int buckets=1000; | |
954 protected AtomicLongArray insertCounts=new AtomicLongArray(20000); | |
955 protected int[] insertByPercentile; | |
956 | |
957 public final SamFilter samFilter=new SamFilter(); | |
958 | |
959 /** Uses full ref names */ | |
960 public LinkedHashMap<String, Contig> refMap=new LinkedHashMap<String, Contig>(); | |
961 /** Uses truncated ref names */ | |
962 public LinkedHashMap<String, Contig> refMap2=new LinkedHashMap<String, Contig>(); | |
963 | |
964 /*--------------------------------------------------------------*/ | |
965 /*---------------- Final Fields ----------------*/ | |
966 /*--------------------------------------------------------------*/ | |
967 | |
968 /** Primary input file */ | |
969 private final FileFormat ffin; | |
970 /** Secondary input file */ | |
971 private final FileFormat ffref; | |
972 | |
973 /** Primary output file */ | |
974 private final FileFormat ffout; | |
975 | |
976 /*--------------------------------------------------------------*/ | |
977 /*---------------- Common Fields ----------------*/ | |
978 /*--------------------------------------------------------------*/ | |
979 | |
980 /** Print status messages to this output stream */ | |
981 private PrintStream outstream=System.err; | |
982 /** Print verbose messages */ | |
983 public static boolean verbose=false; | |
984 /** True if an error was encountered */ | |
985 public boolean errorState=false; | |
986 /** Overwrite existing output files */ | |
987 private boolean overwrite=false; | |
988 /** Append to existing output files */ | |
989 private boolean append=false; | |
990 /** Reads are output in input order */ | |
991 private boolean ordered=false; | |
992 | |
993 } |