Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/SplitByTaxa.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.File; | |
4 import java.io.PrintStream; | |
5 import java.util.ArrayList; | |
6 import java.util.HashMap; | |
7 | |
8 import fileIO.ByteFile; | |
9 import fileIO.FileFormat; | |
10 import fileIO.ReadWrite; | |
11 import shared.Parse; | |
12 import shared.Parser; | |
13 import shared.PreParser; | |
14 import shared.ReadStats; | |
15 import shared.Shared; | |
16 import shared.Timer; | |
17 import shared.Tools; | |
18 import stream.ConcurrentReadInputStream; | |
19 import stream.ConcurrentReadOutputStream; | |
20 import stream.FASTQ; | |
21 import stream.FastaReadInputStream; | |
22 import stream.Read; | |
23 import structures.ListNum; | |
24 | |
25 /** | |
26 * Splits sequences into separate per-taxon output files according to their taxonomy, | |
27 * as determined by the sequence name. Sequences should | |
28 * be labeled with a gi number or NCBI taxID. | |
29 * | |
30 * @author Brian Bushnell | |
31 * @date November 23, 2015 | |
32 * | |
33 */ | |
34 public class SplitByTaxa { | |
35 | |
36 /*--------------------------------------------------------------*/ | |
37 /*---------------- Initialization ----------------*/ | |
38 /*--------------------------------------------------------------*/ | |
39 | |
40 /** | |
41 * Code entrance from the command line. | |
42 * @param args Command line arguments | |
43 */ | |
44 public static void main(String[] args){ | |
45 Timer t=new Timer(); | |
46 SplitByTaxa x=new SplitByTaxa(args); | |
47 x.process(t); | |
48 | |
49 //Close the print stream if it was redirected | |
50 Shared.closeStream(x.outstream); | |
51 } | |
52 | |
53 /** | |
54 * Constructor. | |
55 * @param args Command line arguments | |
56 */ | |
57 public SplitByTaxa(String[] args){ | |
58 | |
59 {//Preparse block for help, config files, and outstream | |
60 PreParser pp=new PreParser(args, getClass(), false); | |
61 args=pp.args; | |
62 outstream=pp.outstream; | |
63 } | |
64 | |
65 //Set some shared static variables | |
66 Shared.capBuffers(4); | |
67 ReadWrite.USE_UNPIGZ=true; | |
68 ReadWrite.USE_PIGZ=false; | |
69 ReadWrite.USE_GZIP=false; | |
70 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
71 | |
72 boolean setInterleaved=false; //Whether interleaved was explicitly set. | |
73 String giTableFile=null; | |
74 String taxTreeFile=null; | |
75 | |
76 //Create a parser object | |
77 Parser parser=new Parser(); | |
78 | |
79 //Parse each argument | |
80 for(int i=0; i<args.length; i++){ | |
81 String arg=args[i]; | |
82 | |
83 //Break arguments into their constituent parts, in the form of "a=b" | |
84 String[] split=arg.split("="); | |
85 String a=split[0].toLowerCase(); | |
86 String b=split.length>1 ? split[1] : null; | |
87 | |
88 if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
89 //do nothing | |
90 }else if(a.equals("verbose")){ | |
91 verbose=Parse.parseBoolean(b); | |
92 }else if(a.equals("taxlevel") || a.equals("tl") || a.equals("level") || a.equals("lv")){ | |
93 taxLevelE=TaxTree.parseLevelExtended(b); | |
94 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){ | |
95 giTableFile=b; | |
96 }else if(a.equals("tree") || a.equals("taxtree")){ | |
97 taxTreeFile=b; | |
98 }else{ | |
99 outstream.println("Unknown parameter "+args[i]); | |
100 assert(false) : "Unknown parameter "+args[i]; | |
101 // throw new RuntimeException("Unknown parameter "+args[i]); | |
102 } | |
103 } | |
104 | |
105 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();} | |
106 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();} | |
107 | |
108 {//Process parser fields | |
109 Parser.processQuality(); | |
110 | |
111 maxReads=parser.maxReads; | |
112 | |
113 overwrite=ReadStats.overwrite=parser.overwrite; | |
114 append=ReadStats.append=parser.append; | |
115 setInterleaved=parser.setInterleaved; | |
116 | |
117 in1=parser.in1; | |
118 in2=parser.in2; | |
119 | |
120 out1=parser.out1; | |
121 out2=parser.out2; | |
122 | |
123 extin=parser.extin; | |
124 extout=parser.extout; | |
125 } | |
126 | |
127 //Do input file # replacement | |
128 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ | |
129 in2=in1.replace("#", "2"); | |
130 in1=in1.replace("#", "1"); | |
131 } | |
132 | |
133 //Do output file # replacement | |
134 if(out1!=null && out2==null && out1.indexOf('#')>-1){ | |
135 out2=out1.replace("#", "2"); | |
136 out1=out1.replace("#", "1"); | |
137 } | |
138 | |
139 assert(out1==null || out1.contains("%")) : "Output filename must contain % symbol."; | |
140 assert(out2==null || out2.contains("%")) : "Output filename must contain % symbol."; | |
141 assert(taxTreeFile!=null) : "This requires a taxtree file. On NERSC systems, set tree=auto."; | |
142 | |
143 //Adjust interleaved detection based on the number of input files | |
144 if(in2!=null){ | |
145 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} | |
146 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; | |
147 } | |
148 | |
149 assert(FastaReadInputStream.settingsOK()); | |
150 | |
151 //Ensure there is an input file | |
152 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} | |
153 | |
154 //Adjust the number of threads for input file reading | |
155 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
156 ByteFile.FORCE_MODE_BF2=true; | |
157 } | |
158 | |
159 //Ensure out2 is not set without out1 | |
160 if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");} | |
161 | |
162 //Adjust interleaved settings based on number of output files | |
163 if(!setInterleaved){ | |
164 assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; | |
165 if(in2!=null){ //If there are 2 input streams. | |
166 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; | |
167 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); | |
168 }else{ //There is one input stream. | |
169 if(out2!=null){ | |
170 FASTQ.FORCE_INTERLEAVED=true; | |
171 FASTQ.TEST_INTERLEAVED=false; | |
172 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); | |
173 } | |
174 } | |
175 } | |
176 | |
177 //Ensure output files can be written | |
178 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){ | |
179 outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2); | |
180 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); | |
181 } | |
182 | |
183 //Ensure input files can be read | |
184 if(!Tools.testInputFiles(false, true, in1, in2)){ | |
185 throw new RuntimeException("\nCan't read some input files.\n"); | |
186 } | |
187 | |
188 //Ensure that no file was specified multiple times | |
189 if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){ | |
190 throw new RuntimeException("\nSome file names were specified multiple times.\n"); | |
191 } | |
192 | |
193 //Create input FileFormat objects | |
194 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); | |
195 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); | |
196 | |
197 TaxFilter.loadGiTable(giTableFile); | |
198 tree=TaxFilter.loadTree(taxTreeFile); | |
199 } | |
200 | |
201 /*--------------------------------------------------------------*/ | |
202 /*---------------- Outer Methods ----------------*/ | |
203 /*--------------------------------------------------------------*/ | |
204 | |
205 /** Create read streams and process all data */ | |
206 public void process(Timer t){ | |
207 | |
208 //Create a read input stream | |
209 final ConcurrentReadInputStream cris; | |
210 { | |
211 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null); | |
212 cris.start(); //Start the stream | |
213 if(verbose){outstream.println("Started cris");} | |
214 } | |
215 boolean paired=cris.paired(); | |
216 if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} | |
217 | |
218 //Reset counters | |
219 readsProcessed=0; | |
220 basesProcessed=0; | |
221 | |
222 final HashMap<String, ConcurrentReadOutputStream> map=new HashMap<String, ConcurrentReadOutputStream>(); | |
223 | |
224 //Process the read stream | |
225 processInner(cris, map); | |
226 | |
227 if(verbose){outstream.println("Finished; closing streams.");} | |
228 | |
229 //Write anything that was accumulated by ReadStats | |
230 errorState|=ReadStats.writeAll(); | |
231 //Close the read streams | |
232 errorState|=ReadWrite.closeStream(cris); | |
233 | |
234 for(ConcurrentReadOutputStream ros : map.values()){ | |
235 ReadWrite.closeStream(ros); | |
236 } | |
237 | |
238 //Report timing and results | |
239 t.stop(); | |
240 outstream.println("Reads In: \t"+readsProcessed+" reads \t"+basesProcessed+" bases"); | |
241 outstream.println(); | |
242 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
243 | |
244 //Throw an exception of there was an error in a thread | |
245 if(errorState){ | |
246 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
247 } | |
248 } | |
249 | |
250 /** Iterate through the reads */ | |
251 void processInner(final ConcurrentReadInputStream cris, HashMap<String, ConcurrentReadOutputStream> map){ | |
252 | |
253 { | |
254 //Grab the first ListNum of reads | |
255 ListNum<Read> ln=cris.nextList(); | |
256 //Grab the actual read list from the ListNum | |
257 ArrayList<Read> reads=(ln!=null ? ln.list : null); | |
258 | |
259 //Check to ensure pairing is as expected | |
260 if(reads!=null && !reads.isEmpty()){ | |
261 Read r=reads.get(0); | |
262 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); | |
263 } | |
264 | |
265 //As long as there is a nonempty read list... | |
266 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning | |
267 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} | |
268 | |
269 //Loop through each read in the list | |
270 for(int idx=0; idx<reads.size(); idx++){ | |
271 final Read r1=reads.get(idx); | |
272 | |
273 //Track the initial length for statistics | |
274 final int initialLength1=r1.length(); | |
275 final int initialLength2=(r1.mateLength()); | |
276 | |
277 //Increment counters | |
278 readsProcessed+=r1.pairCount(); | |
279 basesProcessed+=initialLength1+initialLength2; | |
280 | |
281 TaxNode tn=tree.parseNodeFromHeader(r1.id, true); | |
282 if(tn==null){tn=tree.getNodeByName(r1.id);} | |
283 if(tn==null){tn=unknown;} | |
284 while(tn.levelExtended<taxLevelE && tn.id!=tn.pid){tn=tree.getNode(tn.pid);} | |
285 | |
286 if(out1!=null){ | |
287 ConcurrentReadOutputStream ros=map.get(tn.name); | |
288 if(ros==null){ | |
289 final int buff=4; | |
290 FileFormat ffout1=null, ffout2=null; | |
291 ffout1=FileFormat.testOutput(out1.replaceFirst("%", tn.name.replaceAll("\\s+", "_").replaceAll("[/\\\\]", "")), FileFormat.FASTQ, extout, false, overwrite, append, false); | |
292 if(out2!=null){ffout2=FileFormat.testOutput(out2.replaceFirst("%", tn.name.replaceAll("\\s+", "_").replaceAll("[/\\\\]", "")), FileFormat.FASTQ, extout, false, overwrite, append, false);} | |
293 ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false); | |
294 ros.start(); //Start the stream | |
295 map.put(tn.name, ros); | |
296 } | |
297 ArrayList<Read> temp=new ArrayList<Read>(1); //Kind of inefficient | |
298 temp.add(r1); | |
299 ros.add(temp, 0); | |
300 } | |
301 } | |
302 | |
303 //Notify the input stream that the list was used | |
304 cris.returnList(ln); | |
305 if(verbose){outstream.println("Returned a list.");} | |
306 | |
307 //Fetch a new list | |
308 ln=cris.nextList(); | |
309 reads=(ln!=null ? ln.list : null); | |
310 } | |
311 | |
312 //Notify the input stream that the final list was used | |
313 if(ln!=null){ | |
314 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); | |
315 } | |
316 } | |
317 | |
318 //Do anything necessary after processing | |
319 | |
320 } | |
321 | |
322 /*--------------------------------------------------------------*/ | |
323 /*---------------- Inner Methods ----------------*/ | |
324 /*--------------------------------------------------------------*/ | |
325 | |
326 /*--------------------------------------------------------------*/ | |
327 /*---------------- Fields ----------------*/ | |
328 /*--------------------------------------------------------------*/ | |
329 | |
330 /** Primary input file path */ | |
331 private String in1=null; | |
332 /** Secondary input file path */ | |
333 private String in2=null; | |
334 | |
335 /** Primary output file path */ | |
336 private String out1=null; | |
337 /** Secondary output file path */ | |
338 private String out2=null; | |
339 | |
340 /** Override input file extension */ | |
341 private String extin=null; | |
342 /** Override output file extension */ | |
343 private String extout=null; | |
344 | |
345 /** The actual filter */ | |
346 private int taxLevelE=TaxTree.stringToLevelExtended("phylum"); | |
347 | |
348 /*--------------------------------------------------------------*/ | |
349 | |
350 /** Number of reads processed */ | |
351 protected long readsProcessed=0; | |
352 /** Number of bases processed */ | |
353 protected long basesProcessed=0; | |
354 | |
355 /** Quit after processing this many input reads; -1 means no limit */ | |
356 private long maxReads=-1; | |
357 | |
358 /*--------------------------------------------------------------*/ | |
359 /*---------------- Final Fields ----------------*/ | |
360 /*--------------------------------------------------------------*/ | |
361 | |
362 /** Primary input file */ | |
363 private final FileFormat ffin1; | |
364 /** Secondary input file */ | |
365 private final FileFormat ffin2; | |
366 | |
367 private final TaxTree tree; | |
368 | |
369 private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN"); | |
370 | |
371 /*--------------------------------------------------------------*/ | |
372 /*---------------- Common Fields ----------------*/ | |
373 /*--------------------------------------------------------------*/ | |
374 | |
375 /** Print status messages to this output stream */ | |
376 private PrintStream outstream=System.err; | |
377 /** Print verbose messages */ | |
378 public static boolean verbose=false; | |
379 /** True if an error was encountered */ | |
380 public boolean errorState=false; | |
381 /** Overwrite existing output files */ | |
382 private boolean overwrite=false; | |
383 /** Append to existing output files */ | |
384 private boolean append=false; | |
385 /** This flag has no effect on singlethreaded programs */ | |
386 private final boolean ordered=false; | |
387 | |
388 } |