jpayne@68
|
1 package gff;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.PrintStream;
|
jpayne@68
|
4 import java.util.ArrayList;
|
jpayne@68
|
5 import java.util.HashMap;
|
jpayne@68
|
6 import java.util.Locale;
|
jpayne@68
|
7 import java.util.Map.Entry;
|
jpayne@68
|
8
|
jpayne@68
|
9 import fileIO.ByteFile;
|
jpayne@68
|
10 import fileIO.FileFormat;
|
jpayne@68
|
11 import fileIO.ReadWrite;
|
jpayne@68
|
12 import prok.ProkObject;
|
jpayne@68
|
13 import shared.Parse;
|
jpayne@68
|
14 import shared.Parser;
|
jpayne@68
|
15 import shared.PreParser;
|
jpayne@68
|
16 import shared.Shared;
|
jpayne@68
|
17 import shared.Timer;
|
jpayne@68
|
18 import shared.Tools;
|
jpayne@68
|
19 import structures.StringNum;
|
jpayne@68
|
20
|
jpayne@68
|
21 /**
|
jpayne@68
|
22 * Compares gff files for the purpose of grading gene-calling.
|
jpayne@68
|
23 * @author Brian Bushnell
|
jpayne@68
|
24 * @date October 3, 2018
|
jpayne@68
|
25 *
|
jpayne@68
|
26 */
|
jpayne@68
|
27 public class CompareGff {
|
jpayne@68
|
28
|
jpayne@68
|
29 /*--------------------------------------------------------------*/
|
jpayne@68
|
30 /*---------------- Initialization ----------------*/
|
jpayne@68
|
31 /*--------------------------------------------------------------*/
|
jpayne@68
|
32
|
jpayne@68
|
33 /**
|
jpayne@68
|
34 * Code entrance from the command line.
|
jpayne@68
|
35 * @param args Command line arguments
|
jpayne@68
|
36 */
|
jpayne@68
|
37 public static void main(String[] args){
|
jpayne@68
|
38 //Start a timer immediately upon code entrance.
|
jpayne@68
|
39 Timer t=new Timer();
|
jpayne@68
|
40
|
jpayne@68
|
41 //Create an instance of this class
|
jpayne@68
|
42 CompareGff x=new CompareGff(args);
|
jpayne@68
|
43
|
jpayne@68
|
44 //Run the object
|
jpayne@68
|
45 x.process(t);
|
jpayne@68
|
46
|
jpayne@68
|
47 //Close the print stream if it was redirected
|
jpayne@68
|
48 Shared.closeStream(x.outstream);
|
jpayne@68
|
49 }
|
jpayne@68
|
50
|
jpayne@68
|
51 /**
|
jpayne@68
|
52 * Constructor.
|
jpayne@68
|
53 * @param args Command line arguments
|
jpayne@68
|
54 */
|
jpayne@68
|
55 public CompareGff(String[] args){
|
jpayne@68
|
56
|
jpayne@68
|
57 {//Preparse block for help, config files, and outstream
|
jpayne@68
|
58 PreParser pp=new PreParser(args, getClass(), false);
|
jpayne@68
|
59 args=pp.args;
|
jpayne@68
|
60 outstream=pp.outstream;
|
jpayne@68
|
61 }
|
jpayne@68
|
62
|
jpayne@68
|
63 //Set shared static variables prior to parsing
|
jpayne@68
|
64 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
|
jpayne@68
|
65 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
|
jpayne@68
|
66
|
jpayne@68
|
67 {//Parse the arguments
|
jpayne@68
|
68 final Parser parser=parse(args);
|
jpayne@68
|
69 overwrite=parser.overwrite;
|
jpayne@68
|
70 append=parser.append;
|
jpayne@68
|
71
|
jpayne@68
|
72 in=parser.in1;
|
jpayne@68
|
73 }
|
jpayne@68
|
74
|
jpayne@68
|
75 fixExtensions(); //Add or remove .gz or .bz2 as needed
|
jpayne@68
|
76 checkFileExistence(); //Ensure files can be read and written
|
jpayne@68
|
77 checkStatics(); //Adjust file-related static fields as needed for this program
|
jpayne@68
|
78
|
jpayne@68
|
79 ffin=FileFormat.testInput(in, FileFormat.GFF, null, true, true);
|
jpayne@68
|
80 ffref=FileFormat.testInput(ref, FileFormat.GFF, null, true, true);
|
jpayne@68
|
81 }
|
jpayne@68
|
82
|
jpayne@68
|
83 /*--------------------------------------------------------------*/
|
jpayne@68
|
84 /*---------------- Initialization Helpers ----------------*/
|
jpayne@68
|
85 /*--------------------------------------------------------------*/
|
jpayne@68
|
86
|
jpayne@68
|
87 /** Parse arguments from the command line */
|
jpayne@68
|
88 private Parser parse(String[] args){
|
jpayne@68
|
89
|
jpayne@68
|
90 Parser parser=new Parser();
|
jpayne@68
|
91 for(int i=0; i<args.length; i++){
|
jpayne@68
|
92 String arg=args[i];
|
jpayne@68
|
93 String[] split=arg.split("=");
|
jpayne@68
|
94 String a=split[0].toLowerCase();
|
jpayne@68
|
95 String b=split.length>1 ? split[1] : null;
|
jpayne@68
|
96 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
|
jpayne@68
|
97
|
jpayne@68
|
98 if(a.equals("ref")){
|
jpayne@68
|
99 ref=b;
|
jpayne@68
|
100 }else if(a.equals("lines")){
|
jpayne@68
|
101 maxLines=Long.parseLong(b);
|
jpayne@68
|
102 if(maxLines<0){maxLines=Long.MAX_VALUE;}
|
jpayne@68
|
103 }else if(a.equals("verbose")){
|
jpayne@68
|
104 verbose=Parse.parseBoolean(b);
|
jpayne@68
|
105 // ByteFile1.verbose=verbose;
|
jpayne@68
|
106 // ByteFile2.verbose=verbose;
|
jpayne@68
|
107 // ReadWrite.verbose=verbose;
|
jpayne@68
|
108 }else if(parser.parse(arg, a, b)){
|
jpayne@68
|
109 //do nothing
|
jpayne@68
|
110 }else if(i==0 && arg.indexOf('=')<0){
|
jpayne@68
|
111 parser.in1=arg;
|
jpayne@68
|
112 }else if(i==1 && arg.indexOf('=')<0 && ref==null){
|
jpayne@68
|
113 ref=arg;
|
jpayne@68
|
114 }else{
|
jpayne@68
|
115 outstream.println("Unknown parameter "+args[i]);
|
jpayne@68
|
116 assert(false) : "Unknown parameter "+args[i];
|
jpayne@68
|
117 // throw new RuntimeException("Unknown parameter "+args[i]);
|
jpayne@68
|
118 }
|
jpayne@68
|
119 }
|
jpayne@68
|
120
|
jpayne@68
|
121 return parser;
|
jpayne@68
|
122 }
|
jpayne@68
|
123
|
jpayne@68
|
124 /** Add or remove .gz or .bz2 as needed */
|
jpayne@68
|
125 private void fixExtensions(){
|
jpayne@68
|
126 in=Tools.fixExtension(in);
|
jpayne@68
|
127 ref=Tools.fixExtension(ref);
|
jpayne@68
|
128 if(in==null || ref==null){throw new RuntimeException("Error - at least two input files are required.");}
|
jpayne@68
|
129 }
|
jpayne@68
|
130
|
jpayne@68
|
131 /** Ensure files can be read and written */
|
jpayne@68
|
132 private void checkFileExistence(){
|
jpayne@68
|
133
|
jpayne@68
|
134 //Ensure input files can be read
|
jpayne@68
|
135 if(!Tools.testInputFiles(true, true, in, ref)){
|
jpayne@68
|
136 throw new RuntimeException("\nCan't read some input files.\n");
|
jpayne@68
|
137 }
|
jpayne@68
|
138 }
|
jpayne@68
|
139
|
jpayne@68
|
140 /** Adjust file-related static fields as needed for this program */
|
jpayne@68
|
141 private static void checkStatics(){
|
jpayne@68
|
142 //Adjust the number of threads for input file reading
|
jpayne@68
|
143 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
|
jpayne@68
|
144 ByteFile.FORCE_MODE_BF2=true;
|
jpayne@68
|
145 }
|
jpayne@68
|
146
|
jpayne@68
|
147 // if(!ByteFile.FORCE_MODE_BF2){
|
jpayne@68
|
148 // ByteFile.FORCE_MODE_BF2=false;
|
jpayne@68
|
149 // ByteFile.FORCE_MODE_BF1=true;
|
jpayne@68
|
150 // }
|
jpayne@68
|
151 }
|
jpayne@68
|
152
|
jpayne@68
|
153 /*--------------------------------------------------------------*/
|
jpayne@68
|
154 /*---------------- Outer Methods ----------------*/
|
jpayne@68
|
155 /*--------------------------------------------------------------*/
|
jpayne@68
|
156
|
jpayne@68
|
157 void process(Timer t){
|
jpayne@68
|
158
|
jpayne@68
|
159 ByteFile bf=ByteFile.makeByteFile(ffin);
|
jpayne@68
|
160
|
jpayne@68
|
161 processInner(bf);
|
jpayne@68
|
162
|
jpayne@68
|
163 errorState|=bf.close();
|
jpayne@68
|
164
|
jpayne@68
|
165 t.stop();
|
jpayne@68
|
166
|
jpayne@68
|
167 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
|
jpayne@68
|
168
|
jpayne@68
|
169 outstream.println();
|
jpayne@68
|
170 outstream.println("Ref count: \t"+refCount);
|
jpayne@68
|
171 outstream.println("Query count: \t"+queryCount);
|
jpayne@68
|
172
|
jpayne@68
|
173 outstream.println();
|
jpayne@68
|
174 outstream.println("Ref-relative counts:");
|
jpayne@68
|
175 outstream.println("True Positive Start: \t"+truePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart*100.0/refCount)));
|
jpayne@68
|
176 outstream.println("True Positive Stop: \t"+truePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop*100.0/refCount)));
|
jpayne@68
|
177 // outstream.println("False Positive Start:\t"+falsePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart*100.0/refCount)));
|
jpayne@68
|
178 // outstream.println("False Positive Stop: \t"+falsePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop*100.0/refCount)));
|
jpayne@68
|
179 outstream.println("False Negative Start:\t"+falseNegativeStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStart*100.0/refCount)));
|
jpayne@68
|
180 outstream.println("False Negative Stop: \t"+falseNegativeStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStop*100.0/refCount)));
|
jpayne@68
|
181
|
jpayne@68
|
182 outstream.println();
|
jpayne@68
|
183 outstream.println("Query-relative counts:");
|
jpayne@68
|
184 outstream.println("True Positive Start: \t"+truePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart2*100.0/queryCount)));
|
jpayne@68
|
185 outstream.println("True Positive Stop: \t"+truePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop2*100.0/queryCount)));
|
jpayne@68
|
186 outstream.println("False Positive Start:\t"+falsePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart2*100.0/queryCount)));
|
jpayne@68
|
187 outstream.println("False Positive Stop: \t"+falsePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop2*100.0/queryCount)));
|
jpayne@68
|
188
|
jpayne@68
|
189 outstream.println();
|
jpayne@68
|
190 outstream.println("SNR: \t"+String.format(Locale.ROOT, "%.4f", 10*Math.log10((truePositiveStart2+truePositiveStop2+0.1)/(falsePositiveStart2+falsePositiveStop2+0.1))));
|
jpayne@68
|
191
|
jpayne@68
|
192 if(errorState){
|
jpayne@68
|
193 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
|
jpayne@68
|
194 }
|
jpayne@68
|
195 }
|
jpayne@68
|
196
|
jpayne@68
|
197 /*--------------------------------------------------------------*/
|
jpayne@68
|
198 /*---------------- Inner Methods ----------------*/
|
jpayne@68
|
199 /*--------------------------------------------------------------*/
|
jpayne@68
|
200
|
jpayne@68
|
201 @SuppressWarnings("unchecked")
|
jpayne@68
|
202 private void processInner(ByteFile bf){
|
jpayne@68
|
203 byte[] line=bf.nextLine();
|
jpayne@68
|
204
|
jpayne@68
|
205 {
|
jpayne@68
|
206 ArrayList<GffLine> refLines=GffLine.loadGffFile(ffref, "CDS,rRNA,tRNA", true);
|
jpayne@68
|
207
|
jpayne@68
|
208 refCount=refLines.size();
|
jpayne@68
|
209 lineMap=new HashMap<StringNum, GffLine>();
|
jpayne@68
|
210 startCountMap=new HashMap<StringNum, Integer>();
|
jpayne@68
|
211 stopCountMap=new HashMap<StringNum, Integer>();
|
jpayne@68
|
212
|
jpayne@68
|
213 for(GffLine gline : refLines){
|
jpayne@68
|
214 final int stop=gline.trueStop();
|
jpayne@68
|
215 StringNum sn=new StringNum(gline.seqid, stop);
|
jpayne@68
|
216 lineMap.put(sn, gline);
|
jpayne@68
|
217 startCountMap.put(sn, 0);
|
jpayne@68
|
218 stopCountMap.put(sn, 0);
|
jpayne@68
|
219 assert(lineMap.get(sn)==gline);
|
jpayne@68
|
220 // assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet();
|
jpayne@68
|
221 }
|
jpayne@68
|
222 if(verbose){
|
jpayne@68
|
223 System.err.println(lineMap);
|
jpayne@68
|
224 System.err.println(startCountMap);
|
jpayne@68
|
225 System.err.println(stopCountMap);
|
jpayne@68
|
226 }
|
jpayne@68
|
227 }
|
jpayne@68
|
228
|
jpayne@68
|
229 while(line!=null){
|
jpayne@68
|
230 if(line.length>0){
|
jpayne@68
|
231 if(maxLines>0 && linesProcessed>=maxLines){break;}
|
jpayne@68
|
232 linesProcessed++;
|
jpayne@68
|
233 bytesProcessed+=(line.length+1);
|
jpayne@68
|
234
|
jpayne@68
|
235 final boolean valid=(line[0]!='#');
|
jpayne@68
|
236 if(valid){
|
jpayne@68
|
237 queryCount++;
|
jpayne@68
|
238 GffLine gline=new GffLine(line);
|
jpayne@68
|
239 processLine(gline);
|
jpayne@68
|
240 }
|
jpayne@68
|
241 }
|
jpayne@68
|
242 line=bf.nextLine();
|
jpayne@68
|
243 }
|
jpayne@68
|
244
|
jpayne@68
|
245 for(Entry<StringNum, Integer> e : startCountMap.entrySet()){
|
jpayne@68
|
246 if(e.getValue()<1){
|
jpayne@68
|
247 falseNegativeStart++;
|
jpayne@68
|
248 }
|
jpayne@68
|
249 }
|
jpayne@68
|
250 for(Entry<StringNum, Integer> e : stopCountMap.entrySet()){
|
jpayne@68
|
251 if(e.getValue()<1){
|
jpayne@68
|
252 falseNegativeStop++;
|
jpayne@68
|
253 }
|
jpayne@68
|
254 }
|
jpayne@68
|
255 }
|
jpayne@68
|
256
|
jpayne@68
|
257 private void processLine(GffLine gline){
|
jpayne@68
|
258 // boolean cds=gline.type.equals("CDS");
|
jpayne@68
|
259 // boolean trna=gline.type.equals("tRNA");
|
jpayne@68
|
260 // boolean rrna=gline.type.equals("rRNA");
|
jpayne@68
|
261 // if(!cds && !trna && !rrna){return;}
|
jpayne@68
|
262 // if(cds && !ProkObject.callCDS){return;}
|
jpayne@68
|
263 // if(trna && !ProkObject.calltRNA){return;}
|
jpayne@68
|
264 // if(rrna){
|
jpayne@68
|
265 // int type=gline.prokType();
|
jpayne@68
|
266 // if(ProkObject.processType(type)){return;}
|
jpayne@68
|
267 // }
|
jpayne@68
|
268 int type=gline.prokType();
|
jpayne@68
|
269 if(!ProkObject.processType(type)){return;}
|
jpayne@68
|
270
|
jpayne@68
|
271 final int stop=gline.trueStop();
|
jpayne@68
|
272 final int start=gline.trueStart();
|
jpayne@68
|
273
|
jpayne@68
|
274 // System.err.println("Considering "+start+", "+stop);
|
jpayne@68
|
275
|
jpayne@68
|
276 StringNum sn=new StringNum(gline.seqid, stop);
|
jpayne@68
|
277 GffLine refline=lineMap.get(sn);
|
jpayne@68
|
278
|
jpayne@68
|
279 boolean fail=(refline==null || refline.strand!=gline.strand || !refline.type.equals(gline.type));
|
jpayne@68
|
280 if(fail){
|
jpayne@68
|
281 if(verbose){
|
jpayne@68
|
282 System.err.println("Can't find "+sn+"\n"+gline+"\n"+refline);
|
jpayne@68
|
283 assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet();
|
jpayne@68
|
284 }
|
jpayne@68
|
285 falsePositiveStart++;
|
jpayne@68
|
286 falsePositiveStop++;
|
jpayne@68
|
287 falsePositiveStart2++;
|
jpayne@68
|
288 falsePositiveStop2++;
|
jpayne@68
|
289 }else{
|
jpayne@68
|
290 assert(stop==refline.trueStop());
|
jpayne@68
|
291 truePositiveStop++;
|
jpayne@68
|
292 truePositiveStop2++;
|
jpayne@68
|
293 stopCountMap.put(sn, stopCountMap.get(sn)+1);
|
jpayne@68
|
294 if(start==refline.trueStart()){
|
jpayne@68
|
295 truePositiveStart++;
|
jpayne@68
|
296 truePositiveStart2++;
|
jpayne@68
|
297 startCountMap.put(sn, startCountMap.get(sn)+1);
|
jpayne@68
|
298 }else{
|
jpayne@68
|
299 falsePositiveStart++;
|
jpayne@68
|
300 falsePositiveStart2++;
|
jpayne@68
|
301 }
|
jpayne@68
|
302 }
|
jpayne@68
|
303 }
|
jpayne@68
|
304
|
jpayne@68
|
305 /*--------------------------------------------------------------*/
|
jpayne@68
|
306 /*---------------- Fields ----------------*/
|
jpayne@68
|
307 /*--------------------------------------------------------------*/
|
jpayne@68
|
308
|
jpayne@68
|
309 private String in=null;
|
jpayne@68
|
310 private String ref=null;
|
jpayne@68
|
311
|
jpayne@68
|
312
|
jpayne@68
|
313 /*--------------------------------------------------------------*/
|
jpayne@68
|
314
|
jpayne@68
|
315 private HashMap<StringNum, GffLine> lineMap;
|
jpayne@68
|
316 private HashMap<StringNum, Integer> startCountMap;
|
jpayne@68
|
317 private HashMap<StringNum, Integer> stopCountMap;
|
jpayne@68
|
318
|
jpayne@68
|
319 // private HashMap<Integer, ArrayList<GffLine>> map;
|
jpayne@68
|
320 // private HashSet<Integer> stopSet;
|
jpayne@68
|
321 // private HashSet<Integer> startSet;
|
jpayne@68
|
322 // private HashSet<Integer> stopSetM;
|
jpayne@68
|
323 // private HashSet<Integer> startSetM;
|
jpayne@68
|
324
|
jpayne@68
|
325 private long linesProcessed=0;
|
jpayne@68
|
326 private long linesOut=0;
|
jpayne@68
|
327 private long bytesProcessed=0;
|
jpayne@68
|
328 private long bytesOut=0;
|
jpayne@68
|
329
|
jpayne@68
|
330 private long maxLines=Long.MAX_VALUE;
|
jpayne@68
|
331
|
jpayne@68
|
332 private long falsePositiveStart=0;
|
jpayne@68
|
333 private long falsePositiveStop=0;
|
jpayne@68
|
334 private long truePositiveStart=0;
|
jpayne@68
|
335 private long truePositiveStop=0;
|
jpayne@68
|
336 private long falseNegativeStart=0;
|
jpayne@68
|
337 private long falseNegativeStop=0;
|
jpayne@68
|
338
|
jpayne@68
|
339 private long falsePositiveStart2=0;
|
jpayne@68
|
340 private long falsePositiveStop2=0;
|
jpayne@68
|
341 private long truePositiveStart2=0;
|
jpayne@68
|
342 private long truePositiveStop2=0;
|
jpayne@68
|
343
|
jpayne@68
|
344 private long refCount=0;
|
jpayne@68
|
345 private long queryCount=0;
|
jpayne@68
|
346
|
jpayne@68
|
347 /*--------------------------------------------------------------*/
|
jpayne@68
|
348 /*---------------- Final Fields ----------------*/
|
jpayne@68
|
349 /*--------------------------------------------------------------*/
|
jpayne@68
|
350
|
jpayne@68
|
351 private final FileFormat ffin;
|
jpayne@68
|
352 private final FileFormat ffref;
|
jpayne@68
|
353
|
jpayne@68
|
354 /*--------------------------------------------------------------*/
|
jpayne@68
|
355 /*---------------- Common Fields ----------------*/
|
jpayne@68
|
356 /*--------------------------------------------------------------*/
|
jpayne@68
|
357
|
jpayne@68
|
358 private PrintStream outstream=System.err;
|
jpayne@68
|
359 public static boolean verbose=false;
|
jpayne@68
|
360 public boolean errorState=false;
|
jpayne@68
|
361 private boolean overwrite=false;
|
jpayne@68
|
362 private boolean append=false;
|
jpayne@68
|
363
|
jpayne@68
|
364 }
|