comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/AnalyzeGenes.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Locale;
7 import java.util.concurrent.atomic.AtomicInteger;
8
9 import fileIO.ByteFile;
10 import fileIO.ByteStreamWriter;
11 import fileIO.FileFormat;
12 import fileIO.ReadWrite;
13 import shared.Parse;
14 import shared.Parser;
15 import shared.PreParser;
16 import shared.Shared;
17 import shared.Timer;
18 import shared.Tools;
19 import structures.ByteBuilder;
20 import structures.IntList;
21
22 /**
23 * This class is designed to analyze paired prokaryotic fna and gff files
24 * to calculate the patterns in coding and noncoding frames, start and stop sites.
25 * It outputs a pgm file.
26 * @author Brian Bushnell
27 * @date Sep 27, 2018
28 *
29 */
30 public class AnalyzeGenes {
31
32 /*--------------------------------------------------------------*/
33 /*---------------- Initialization ----------------*/
34 /*--------------------------------------------------------------*/
35
36 /**
37 * Code entrance from the command line.
38 * @param args Command line arguments
39 */
40 public static void main(String[] args){
41 //Start a timer immediately upon code entrance.
42 Timer t=new Timer();
43
44 //Create an instance of this class
45 AnalyzeGenes x=new AnalyzeGenes(args);
46
47 //Run the object
48 x.process(t);
49
50 //Close the print stream if it was redirected
51 Shared.closeStream(x.outstream);
52 }
53
/**
 * Builds a fully configured instance from command-line arguments.
 * Steps, in order: pre-parse (may replace args and outstream), set shared
 * compression statics, parse arguments, optionally load consensus sequences,
 * normalize and validate file names, then pick the thread count and output format.
 * @param args Command line arguments
 */
public AnalyzeGenes(String[] args){

	//Preparse for help, config files, and outstream
	final PreParser preParser=new PreParser(args, null/*getClass()*/, false);
	args=preParser.args;
	outstream=preParser.outstream;

	//Shared static variables are set prior to parsing
	ReadWrite.USE_UNPIGZ=true;
	ReadWrite.USE_PIGZ=true;
	ReadWrite.MAX_ZIP_THREADS=Shared.threads();

	//Parse the arguments and copy out the settings this class keeps
	final Parser parsed=parse(args);
	overwrite=parsed.overwrite;
	append=parsed.append;
	out=parsed.out1;

	//Consensus sequences are only loaded when ribosomal alignment is enabled
	if(alignRibo){
		ProkObject.loadConsensusSequenceFromFile(false, false);
	}

	fixExtensions(); //Add or remove .gz or .bz2 as needed
	checkFileExistence(); //Ensure files can be read and written
	checkStatics(); //Adjust file-related static fields as needed for this program

	//Thread count is bounded by the number of input files, the shared thread
	//setting, and max(32, half the logical processors)
	final int fileCount=fnaList.size();
	final int sharedThreads=Shared.threads();
	final int processorCap=Tools.max(32, Shared.CALC_LOGICAL_PROCESSORS()/2);
	threads=Tools.min(fileCount, sharedThreads, processorCap);

	ffout=FileFormat.testOutput(out, FileFormat.PGM, null, true, overwrite, append, false);
}
92
93 /*--------------------------------------------------------------*/
94 /*---------------- Initialization Helpers ----------------*/
95 /*--------------------------------------------------------------*/
96
97 /** Parse arguments from the command line */
98 private Parser parse(String[] args){
99
100 Parser parser=new Parser();
101 parser.overwrite=overwrite;
102 for(int i=0; i<args.length; i++){
103 String arg=args[i];
104 String[] split=arg.split("=");
105 String a=split[0].toLowerCase();
106 String b=split.length>1 ? split[1] : null;
107 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
108
109 // outstream.println(arg+", "+a+", "+b);
110 if(PGMTools.parseStatic(arg, a, b)){
111 //do nothing
112 }else if(a.equals("in") || a.equals("infna") || a.equals("fnain") || a.equals("fna") || a.equals("ref")){
113 assert(b!=null);
114 Tools.addFiles(b, fnaList);
115 }else if(a.equals("gff") || a.equals("ingff") || a.equals("gffin")){
116 assert(b!=null);
117 Tools.addFiles(b, gffList);
118 }else if(a.equals("verbose")){
119 verbose=Parse.parseBoolean(b);
120 ReadWrite.verbose=verbose;
121 }else if(a.equals("alignribo") || a.equals("align")){
122 alignRibo=Parse.parseBoolean(b);
123 }else if(a.equals("adjustendpoints")){
124 adjustEndpoints=Parse.parseBoolean(b);
125 }
126
127 else if(ProkObject.parse(arg, a, b)){}
128
129 else if(parser.parse(arg, a, b)){
130 //do nothing
131 }else if(arg.indexOf('=')<0 && new File(arg).exists() && FileFormat.isFastaFile(arg)){
132 fnaList.add(arg);
133 }else{
134 outstream.println("Unknown parameter "+args[i]);
135 assert(false) : "Unknown parameter "+args[i];
136 // throw new RuntimeException("Unknown parameter "+args[i]);
137 }
138 }
139
140 if(gffList.isEmpty()){
141 for(String s : fnaList){
142 String prefix=ReadWrite.stripExtension(s);
143 String gff=prefix+".gff";
144 File f=new File(gff);
145 if(!f.exists()){
146 String gz=gff+".gz";
147 f=new File(gz);
148 assert(f.exists() && f.canRead()) : "Can't read file "+gff;
149 gff=gz;
150 }
151 gffList.add(gff);
152 }
153 }
154 assert(gffList.size()==fnaList.size()) : "Number of fna and gff files do not match: "+fnaList.size()+", "+gffList.size();
155 return parser;
156 }
157
158 /** Add or remove .gz or .bz2 as needed */
159 private void fixExtensions(){
160 fnaList=Tools.fixExtension(fnaList);
161 gffList=Tools.fixExtension(gffList);
162 if(fnaList.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
163 }
164
165 /** Ensure files can be read and written */
166 private void checkFileExistence(){
167 //Ensure output files can be written
168 if(!Tools.testOutputFiles(overwrite, append, false, out)){
169 outstream.println((out==null)+", "+out);
170 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n");
171 }
172
173 //Ensure input files can be read
174 ArrayList<String> foo=new ArrayList<String>();
175 foo.addAll(fnaList);
176 foo.addAll(gffList);
177 if(!Tools.testInputFiles(false, true, foo.toArray(new String[0]))){
178 throw new RuntimeException("\nCan't read some input files.\n");
179 }
180
181 //Ensure that no file was specified multiple times
182 foo.add(out);
183 if(!Tools.testForDuplicateFiles(true, foo.toArray(new String[0]))){
184 throw new RuntimeException("\nSome file names were specified multiple times.\n");
185 }
186 }
187
188 /** Adjust file-related static fields as needed for this program */
189 private static void checkStatics(){
190 //Adjust the number of threads for input file reading
191 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
192 ByteFile.FORCE_MODE_BF2=true;
193 }
194 }
195
196 /*--------------------------------------------------------------*/
197 /*---------------- Outer Methods ----------------*/
198 /*--------------------------------------------------------------*/
199
200 void process(Timer t){
201
202 final GeneModel pgm;
203 if(Shared.threads()<2 || fnaList.size()<2){
204 pgm=makeModelST();
205 }else{
206 pgm=spawnThreads();
207 }
208
209 ByteStreamWriter bsw=ByteStreamWriter.makeBSW(ffout);
210
211 ByteBuilder bb=new ByteBuilder();
212 pgm.appendTo(bb);
213 bytesOut+=bb.length;
214
215 if(bsw!=null){
216 bsw.addJob(bb);
217 errorState|=bsw.poisonAndWait();
218 }
219
220 t.stop();
221
222 outstream.println(timeReadsBasesGenesProcessed(t, pgm.readsProcessed, pgm.basesProcessed, pgm.genesProcessed, pgm.filesProcessed, 8));
223
224 outstream.println();
225 outstream.println(typesProcessed(pgm, 12));
226
227 //outstream.println("Bytes Out: \t"+bytesOut);
228
229 if(errorState){
230 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
231 }
232 }
233
234 private static String timeReadsBasesGenesProcessed(Timer t, long readsProcessed, long basesProcessed, long genesProcessed, long filesProcessed, int pad){
235 return ("Time: \t"+t+"\n"+readsBasesGenesProcessed(t.elapsed, readsProcessed, basesProcessed, genesProcessed, filesProcessed, pad));
236 }
237
238 private static String readsBasesGenesProcessed(long elapsed, long reads, long bases, long genes, long files, int pad){
239 double rpnano=reads/(double)elapsed;
240 double bpnano=bases/(double)elapsed;
241 double gpnano=genes/(double)elapsed;
242 double fpnano=files/(double)elapsed;
243
244 String rstring=Tools.padKM(reads, pad);
245 String bstring=Tools.padKM(bases, pad);
246 String gstring=Tools.padKM(genes, pad);
247 String fstring=Tools.padKM(files, pad);
248 ByteBuilder sb=new ByteBuilder();
249 sb.append("Files Processed: ").append(fstring).append(String.format(Locale.ROOT, " \t%.2f files/sec", fpnano*1000000000)).append('\n');
250 sb.append("Sequences Processed:").append(rstring).append(String.format(Locale.ROOT, " \t%.2fk seqs/sec", rpnano*1000000)).append('\n');
251 sb.append("Genes Processed: ").append(gstring).append(String.format(Locale.ROOT, " \t%.2fk genes/sec", gpnano*1000000)).append('\n');
252 sb.append("Bases Processed: ").append(bstring).append(String.format(Locale.ROOT, " \t%.2fm bases/sec", bpnano*1000));
253 return sb.toString();
254 }
255
256 private static String typesProcessed(GeneModel pgm, int pad){
257
258 ByteBuilder sb=new ByteBuilder();
259 sb.append("CDS: "+Tools.padLeft(pgm.statsCDS.lengthCount, pad)).nl();
260 sb.append("tRNA: "+Tools.padLeft(pgm.statstRNA.lengthCount, pad)).nl();
261 sb.append("16S: "+Tools.padLeft(pgm.stats16S.lengthCount, pad)).nl();
262 sb.append("23S: "+Tools.padLeft(pgm.stats23S.lengthCount, pad)).nl();
263 sb.append("5S: "+Tools.padLeft(pgm.stats5S.lengthCount, pad)).nl();
264 sb.append("18S: "+Tools.padLeft(pgm.stats18S.lengthCount, pad));
265 return sb.toString();
266 }
267
268 /*--------------------------------------------------------------*/
269 /*---------------- Inner Methods ----------------*/
270 /*--------------------------------------------------------------*/
271
272 //TODO: Process each file in a thread.
273 private GeneModel makeModelST(){
274 GeneModel pgmSum=new GeneModel(true);
275
276 for(int i=0; i<fnaList.size(); i++){
277 String fna=fnaList.get(i);
278 String gff=gffList.get(i);
279 pgmSum.process(fna, gff);
280 }
281 return pgmSum;
282 }
283
284 /*--------------------------------------------------------------*/
285 /*---------------- Thread Management ----------------*/
286 /*--------------------------------------------------------------*/
287
288 /** Spawn process threads */
289 private GeneModel spawnThreads(){
290
291 //Do anything necessary prior to processing
292
293 final AtomicInteger aint=new AtomicInteger(0);
294
295 //Fill a list with FileThreads
296 ArrayList<FileThread> alpt=new ArrayList<FileThread>(threads);
297 for(int i=0; i<threads; i++){
298 alpt.add(new FileThread(aint));
299 }
300
301 //Start the threads
302 for(FileThread pt : alpt){
303 pt.start();
304 }
305
306 //Wait for threads to finish
307 GeneModel pgm=waitForThreads(alpt);
308
309 //Do anything necessary after processing
310 return pgm;
311 }
312
313 private GeneModel waitForThreads(ArrayList<FileThread> alpt){
314
315 GeneModel pgm=new GeneModel(false);
316
317 //Wait for completion of all threads
318 boolean success=true;
319 for(FileThread pt : alpt){
320
321 //Wait until this thread has terminated
322 while(pt.getState()!=Thread.State.TERMINATED){
323 try {
324 //Attempt a join operation
325 pt.join();
326 } catch (InterruptedException e) {
327 //Potentially handle this, if it is expected to occur
328 e.printStackTrace();
329 }
330 }
331
332 //Accumulate per-thread statistics
333 pgm.add(pt.pgm);
334
335 success&=pt.success;
336 errorState|=pt.errorStateT;
337 }
338
339 //Track whether any threads failed
340 if(!success){errorState=true;}
341 return pgm;
342 }
343
344 /*--------------------------------------------------------------*/
345 /*---------------- Inner Classes ----------------*/
346 /*--------------------------------------------------------------*/
347
348 private class FileThread extends Thread {
349
350 FileThread(AtomicInteger fnum_){
351 fnum=fnum_;
352 pgm=new GeneModel(true);
353 }
354
355 @Override
356 public void run(){
357 for(int i=fnum.getAndIncrement(); i<fnaList.size(); i=fnum.getAndIncrement()){
358 String fna=fnaList.get(i);
359 String gff=gffList.get(i);
360 errorStateT=pgm.process(fna, gff)|errorState;
361 // System.err.println("Processed "+fna+" in "+this.toString());
362 }
363 success=true;
364 }
365
366 private final AtomicInteger fnum;
367 private final GeneModel pgm;
368 boolean errorStateT=false;
369 boolean success=false;
370 }
371
372 /*--------------------------------------------------------------*/
373 /*---------------- Fields ----------------*/
374 /*--------------------------------------------------------------*/
375
376 private ArrayList<String> fnaList=new ArrayList<String>();
377 private ArrayList<String> gffList=new ArrayList<String>();
378 private IntList taxList=new IntList();
379 private String out=null;
380
381 /*--------------------------------------------------------------*/
382
383 private long bytesOut=0;
384 static boolean alignRibo=true;
385 static boolean adjustEndpoints=true;
386
387 /*--------------------------------------------------------------*/
388 /*---------------- Final Fields ----------------*/
389 /*--------------------------------------------------------------*/
390
391 private final FileFormat ffout;
392 private final int threads;
393
394 /*--------------------------------------------------------------*/
395 /*---------------- Common Fields ----------------*/
396 /*--------------------------------------------------------------*/
397
398 private PrintStream outstream=System.err;
399 public static boolean verbose=false;
400 public boolean errorState=false;
401 private boolean overwrite=true;
402 private boolean append=false;
403
404 }
405