Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/CompareGff.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package gff; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.HashMap; | |
6 import java.util.Locale; | |
7 import java.util.Map.Entry; | |
8 | |
9 import fileIO.ByteFile; | |
10 import fileIO.FileFormat; | |
11 import fileIO.ReadWrite; | |
12 import prok.ProkObject; | |
13 import shared.Parse; | |
14 import shared.Parser; | |
15 import shared.PreParser; | |
16 import shared.Shared; | |
17 import shared.Timer; | |
18 import shared.Tools; | |
19 import structures.StringNum; | |
20 | |
21 /** | |
22 * Compares gff files for the purpose of grading gene-calling. | |
23 * @author Brian Bushnell | |
24 * @date October 3, 2018 | |
25 * | |
26 */ | |
27 public class CompareGff { | |
28 | |
29 /*--------------------------------------------------------------*/ | |
30 /*---------------- Initialization ----------------*/ | |
31 /*--------------------------------------------------------------*/ | |
32 | |
33 /** | |
34 * Code entrance from the command line. | |
35 * @param args Command line arguments | |
36 */ | |
37 public static void main(String[] args){ | |
38 //Start a timer immediately upon code entrance. | |
39 Timer t=new Timer(); | |
40 | |
41 //Create an instance of this class | |
42 CompareGff x=new CompareGff(args); | |
43 | |
44 //Run the object | |
45 x.process(t); | |
46 | |
47 //Close the print stream if it was redirected | |
48 Shared.closeStream(x.outstream); | |
49 } | |
50 | |
/**
 * Builds a comparer from command-line arguments.
 * Runs the pre-parser, sets shared compression statics, parses the
 * arguments, validates the two input paths, and creates the FileFormat
 * objects for the query and reference files.
 * @param args Command line arguments
 */
public CompareGff(String[] args){

	//Preparse step: handles help, config files, and outstream
	final PreParser preParser=new PreParser(args, getClass(), false);
	args=preParser.args;
	outstream=preParser.outstream;

	//Shared static variables must be set before parsing
	ReadWrite.USE_UNPIGZ=true;
	ReadWrite.USE_PIGZ=true;
	ReadWrite.MAX_ZIP_THREADS=Shared.threads();

	//Parse the arguments; parse() also fills in ref, maxLines and verbose
	final Parser parsed=parse(args);
	overwrite=parsed.overwrite;
	append=parsed.append;
	in=parsed.in1;

	//Add or remove .gz or .bz2 as needed
	fixExtensions();
	//Ensure files can be read
	checkFileExistence();
	//Adjust file-related static fields as needed for this program
	checkStatics();

	//Both inputs are opened as GFF
	ffin=FileFormat.testInput(in, FileFormat.GFF, null, true, true);
	ffref=FileFormat.testInput(ref, FileFormat.GFF, null, true, true);
}
82 | |
83 /*--------------------------------------------------------------*/ | |
84 /*---------------- Initialization Helpers ----------------*/ | |
85 /*--------------------------------------------------------------*/ | |
86 | |
87 /** Parse arguments from the command line */ | |
88 private Parser parse(String[] args){ | |
89 | |
90 Parser parser=new Parser(); | |
91 for(int i=0; i<args.length; i++){ | |
92 String arg=args[i]; | |
93 String[] split=arg.split("="); | |
94 String a=split[0].toLowerCase(); | |
95 String b=split.length>1 ? split[1] : null; | |
96 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
97 | |
98 if(a.equals("ref")){ | |
99 ref=b; | |
100 }else if(a.equals("lines")){ | |
101 maxLines=Long.parseLong(b); | |
102 if(maxLines<0){maxLines=Long.MAX_VALUE;} | |
103 }else if(a.equals("verbose")){ | |
104 verbose=Parse.parseBoolean(b); | |
105 // ByteFile1.verbose=verbose; | |
106 // ByteFile2.verbose=verbose; | |
107 // ReadWrite.verbose=verbose; | |
108 }else if(parser.parse(arg, a, b)){ | |
109 //do nothing | |
110 }else if(i==0 && arg.indexOf('=')<0){ | |
111 parser.in1=arg; | |
112 }else if(i==1 && arg.indexOf('=')<0 && ref==null){ | |
113 ref=arg; | |
114 }else{ | |
115 outstream.println("Unknown parameter "+args[i]); | |
116 assert(false) : "Unknown parameter "+args[i]; | |
117 // throw new RuntimeException("Unknown parameter "+args[i]); | |
118 } | |
119 } | |
120 | |
121 return parser; | |
122 } | |
123 | |
124 /** Add or remove .gz or .bz2 as needed */ | |
125 private void fixExtensions(){ | |
126 in=Tools.fixExtension(in); | |
127 ref=Tools.fixExtension(ref); | |
128 if(in==null || ref==null){throw new RuntimeException("Error - at least two input files are required.");} | |
129 } | |
130 | |
131 /** Ensure files can be read and written */ | |
132 private void checkFileExistence(){ | |
133 | |
134 //Ensure input files can be read | |
135 if(!Tools.testInputFiles(true, true, in, ref)){ | |
136 throw new RuntimeException("\nCan't read some input files.\n"); | |
137 } | |
138 } | |
139 | |
140 /** Adjust file-related static fields as needed for this program */ | |
141 private static void checkStatics(){ | |
142 //Adjust the number of threads for input file reading | |
143 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
144 ByteFile.FORCE_MODE_BF2=true; | |
145 } | |
146 | |
147 // if(!ByteFile.FORCE_MODE_BF2){ | |
148 // ByteFile.FORCE_MODE_BF2=false; | |
149 // ByteFile.FORCE_MODE_BF1=true; | |
150 // } | |
151 } | |
152 | |
153 /*--------------------------------------------------------------*/ | |
154 /*---------------- Outer Methods ----------------*/ | |
155 /*--------------------------------------------------------------*/ | |
156 | |
157 void process(Timer t){ | |
158 | |
159 ByteFile bf=ByteFile.makeByteFile(ffin); | |
160 | |
161 processInner(bf); | |
162 | |
163 errorState|=bf.close(); | |
164 | |
165 t.stop(); | |
166 | |
167 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); | |
168 | |
169 outstream.println(); | |
170 outstream.println("Ref count: \t"+refCount); | |
171 outstream.println("Query count: \t"+queryCount); | |
172 | |
173 outstream.println(); | |
174 outstream.println("Ref-relative counts:"); | |
175 outstream.println("True Positive Start: \t"+truePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart*100.0/refCount))); | |
176 outstream.println("True Positive Stop: \t"+truePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop*100.0/refCount))); | |
177 // outstream.println("False Positive Start:\t"+falsePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart*100.0/refCount))); | |
178 // outstream.println("False Positive Stop: \t"+falsePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop*100.0/refCount))); | |
179 outstream.println("False Negative Start:\t"+falseNegativeStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStart*100.0/refCount))); | |
180 outstream.println("False Negative Stop: \t"+falseNegativeStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStop*100.0/refCount))); | |
181 | |
182 outstream.println(); | |
183 outstream.println("Query-relative counts:"); | |
184 outstream.println("True Positive Start: \t"+truePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart2*100.0/queryCount))); | |
185 outstream.println("True Positive Stop: \t"+truePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop2*100.0/queryCount))); | |
186 outstream.println("False Positive Start:\t"+falsePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart2*100.0/queryCount))); | |
187 outstream.println("False Positive Stop: \t"+falsePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop2*100.0/queryCount))); | |
188 | |
189 outstream.println(); | |
190 outstream.println("SNR: \t"+String.format(Locale.ROOT, "%.4f", 10*Math.log10((truePositiveStart2+truePositiveStop2+0.1)/(falsePositiveStart2+falsePositiveStop2+0.1)))); | |
191 | |
192 if(errorState){ | |
193 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
194 } | |
195 } | |
196 | |
197 /*--------------------------------------------------------------*/ | |
198 /*---------------- Inner Methods ----------------*/ | |
199 /*--------------------------------------------------------------*/ | |
200 | |
201 @SuppressWarnings("unchecked") | |
202 private void processInner(ByteFile bf){ | |
203 byte[] line=bf.nextLine(); | |
204 | |
205 { | |
206 ArrayList<GffLine> refLines=GffLine.loadGffFile(ffref, "CDS,rRNA,tRNA", true); | |
207 | |
208 refCount=refLines.size(); | |
209 lineMap=new HashMap<StringNum, GffLine>(); | |
210 startCountMap=new HashMap<StringNum, Integer>(); | |
211 stopCountMap=new HashMap<StringNum, Integer>(); | |
212 | |
213 for(GffLine gline : refLines){ | |
214 final int stop=gline.trueStop(); | |
215 StringNum sn=new StringNum(gline.seqid, stop); | |
216 lineMap.put(sn, gline); | |
217 startCountMap.put(sn, 0); | |
218 stopCountMap.put(sn, 0); | |
219 assert(lineMap.get(sn)==gline); | |
220 // assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet(); | |
221 } | |
222 if(verbose){ | |
223 System.err.println(lineMap); | |
224 System.err.println(startCountMap); | |
225 System.err.println(stopCountMap); | |
226 } | |
227 } | |
228 | |
229 while(line!=null){ | |
230 if(line.length>0){ | |
231 if(maxLines>0 && linesProcessed>=maxLines){break;} | |
232 linesProcessed++; | |
233 bytesProcessed+=(line.length+1); | |
234 | |
235 final boolean valid=(line[0]!='#'); | |
236 if(valid){ | |
237 queryCount++; | |
238 GffLine gline=new GffLine(line); | |
239 processLine(gline); | |
240 } | |
241 } | |
242 line=bf.nextLine(); | |
243 } | |
244 | |
245 for(Entry<StringNum, Integer> e : startCountMap.entrySet()){ | |
246 if(e.getValue()<1){ | |
247 falseNegativeStart++; | |
248 } | |
249 } | |
250 for(Entry<StringNum, Integer> e : stopCountMap.entrySet()){ | |
251 if(e.getValue()<1){ | |
252 falseNegativeStop++; | |
253 } | |
254 } | |
255 } | |
256 | |
257 private void processLine(GffLine gline){ | |
258 // boolean cds=gline.type.equals("CDS"); | |
259 // boolean trna=gline.type.equals("tRNA"); | |
260 // boolean rrna=gline.type.equals("rRNA"); | |
261 // if(!cds && !trna && !rrna){return;} | |
262 // if(cds && !ProkObject.callCDS){return;} | |
263 // if(trna && !ProkObject.calltRNA){return;} | |
264 // if(rrna){ | |
265 // int type=gline.prokType(); | |
266 // if(ProkObject.processType(type)){return;} | |
267 // } | |
268 int type=gline.prokType(); | |
269 if(!ProkObject.processType(type)){return;} | |
270 | |
271 final int stop=gline.trueStop(); | |
272 final int start=gline.trueStart(); | |
273 | |
274 // System.err.println("Considering "+start+", "+stop); | |
275 | |
276 StringNum sn=new StringNum(gline.seqid, stop); | |
277 GffLine refline=lineMap.get(sn); | |
278 | |
279 boolean fail=(refline==null || refline.strand!=gline.strand || !refline.type.equals(gline.type)); | |
280 if(fail){ | |
281 if(verbose){ | |
282 System.err.println("Can't find "+sn+"\n"+gline+"\n"+refline); | |
283 assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet(); | |
284 } | |
285 falsePositiveStart++; | |
286 falsePositiveStop++; | |
287 falsePositiveStart2++; | |
288 falsePositiveStop2++; | |
289 }else{ | |
290 assert(stop==refline.trueStop()); | |
291 truePositiveStop++; | |
292 truePositiveStop2++; | |
293 stopCountMap.put(sn, stopCountMap.get(sn)+1); | |
294 if(start==refline.trueStart()){ | |
295 truePositiveStart++; | |
296 truePositiveStart2++; | |
297 startCountMap.put(sn, startCountMap.get(sn)+1); | |
298 }else{ | |
299 falsePositiveStart++; | |
300 falsePositiveStart2++; | |
301 } | |
302 } | |
303 } | |
304 | |
305 /*--------------------------------------------------------------*/ | |
306 /*---------------- Fields ----------------*/ | |
307 /*--------------------------------------------------------------*/ | |
308 | |
309 private String in=null; | |
310 private String ref=null; | |
311 | |
312 | |
313 /*--------------------------------------------------------------*/ | |
314 | |
315 private HashMap<StringNum, GffLine> lineMap; | |
316 private HashMap<StringNum, Integer> startCountMap; | |
317 private HashMap<StringNum, Integer> stopCountMap; | |
318 | |
319 // private HashMap<Integer, ArrayList<GffLine>> map; | |
320 // private HashSet<Integer> stopSet; | |
321 // private HashSet<Integer> startSet; | |
322 // private HashSet<Integer> stopSetM; | |
323 // private HashSet<Integer> startSetM; | |
324 | |
325 private long linesProcessed=0; | |
326 private long linesOut=0; | |
327 private long bytesProcessed=0; | |
328 private long bytesOut=0; | |
329 | |
330 private long maxLines=Long.MAX_VALUE; | |
331 | |
332 private long falsePositiveStart=0; | |
333 private long falsePositiveStop=0; | |
334 private long truePositiveStart=0; | |
335 private long truePositiveStop=0; | |
336 private long falseNegativeStart=0; | |
337 private long falseNegativeStop=0; | |
338 | |
339 private long falsePositiveStart2=0; | |
340 private long falsePositiveStop2=0; | |
341 private long truePositiveStart2=0; | |
342 private long truePositiveStop2=0; | |
343 | |
344 private long refCount=0; | |
345 private long queryCount=0; | |
346 | |
347 /*--------------------------------------------------------------*/ | |
348 /*---------------- Final Fields ----------------*/ | |
349 /*--------------------------------------------------------------*/ | |
350 | |
351 private final FileFormat ffin; | |
352 private final FileFormat ffref; | |
353 | |
354 /*--------------------------------------------------------------*/ | |
355 /*---------------- Common Fields ----------------*/ | |
356 /*--------------------------------------------------------------*/ | |
357 | |
358 private PrintStream outstream=System.err; | |
359 public static boolean verbose=false; | |
360 public boolean errorState=false; | |
361 private boolean overwrite=false; | |
362 private boolean append=false; | |
363 | |
364 } |