annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/CompareGff.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package gff;
jpayne@68 2
jpayne@68 3 import java.io.PrintStream;
jpayne@68 4 import java.util.ArrayList;
jpayne@68 5 import java.util.HashMap;
jpayne@68 6 import java.util.Locale;
jpayne@68 7 import java.util.Map.Entry;
jpayne@68 8
jpayne@68 9 import fileIO.ByteFile;
jpayne@68 10 import fileIO.FileFormat;
jpayne@68 11 import fileIO.ReadWrite;
jpayne@68 12 import prok.ProkObject;
jpayne@68 13 import shared.Parse;
jpayne@68 14 import shared.Parser;
jpayne@68 15 import shared.PreParser;
jpayne@68 16 import shared.Shared;
jpayne@68 17 import shared.Timer;
jpayne@68 18 import shared.Tools;
jpayne@68 19 import structures.StringNum;
jpayne@68 20
jpayne@68 21 /**
jpayne@68 22 * Compares gff files for the purpose of grading gene-calling.
jpayne@68 23 * @author Brian Bushnell
jpayne@68 24 * @date October 3, 2018
jpayne@68 25 *
jpayne@68 26 */
jpayne@68 27 public class CompareGff {
jpayne@68 28
jpayne@68 29 /*--------------------------------------------------------------*/
jpayne@68 30 /*---------------- Initialization ----------------*/
jpayne@68 31 /*--------------------------------------------------------------*/
jpayne@68 32
jpayne@68 33 /**
jpayne@68 34 * Code entrance from the command line.
jpayne@68 35 * @param args Command line arguments
jpayne@68 36 */
jpayne@68 37 public static void main(String[] args){
jpayne@68 38 //Start a timer immediately upon code entrance.
jpayne@68 39 Timer t=new Timer();
jpayne@68 40
jpayne@68 41 //Create an instance of this class
jpayne@68 42 CompareGff x=new CompareGff(args);
jpayne@68 43
jpayne@68 44 //Run the object
jpayne@68 45 x.process(t);
jpayne@68 46
jpayne@68 47 //Close the print stream if it was redirected
jpayne@68 48 Shared.closeStream(x.outstream);
jpayne@68 49 }
jpayne@68 50
jpayne@68 51 /**
jpayne@68 52 * Constructor.
jpayne@68 53 * @param args Command line arguments
jpayne@68 54 */
jpayne@68 55 public CompareGff(String[] args){
jpayne@68 56
jpayne@68 57 {//Preparse block for help, config files, and outstream
jpayne@68 58 PreParser pp=new PreParser(args, getClass(), false);
jpayne@68 59 args=pp.args;
jpayne@68 60 outstream=pp.outstream;
jpayne@68 61 }
jpayne@68 62
jpayne@68 63 //Set shared static variables prior to parsing
jpayne@68 64 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
jpayne@68 65 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
jpayne@68 66
jpayne@68 67 {//Parse the arguments
jpayne@68 68 final Parser parser=parse(args);
jpayne@68 69 overwrite=parser.overwrite;
jpayne@68 70 append=parser.append;
jpayne@68 71
jpayne@68 72 in=parser.in1;
jpayne@68 73 }
jpayne@68 74
jpayne@68 75 fixExtensions(); //Add or remove .gz or .bz2 as needed
jpayne@68 76 checkFileExistence(); //Ensure files can be read and written
jpayne@68 77 checkStatics(); //Adjust file-related static fields as needed for this program
jpayne@68 78
jpayne@68 79 ffin=FileFormat.testInput(in, FileFormat.GFF, null, true, true);
jpayne@68 80 ffref=FileFormat.testInput(ref, FileFormat.GFF, null, true, true);
jpayne@68 81 }
jpayne@68 82
jpayne@68 83 /*--------------------------------------------------------------*/
jpayne@68 84 /*---------------- Initialization Helpers ----------------*/
jpayne@68 85 /*--------------------------------------------------------------*/
jpayne@68 86
jpayne@68 87 /** Parse arguments from the command line */
jpayne@68 88 private Parser parse(String[] args){
jpayne@68 89
jpayne@68 90 Parser parser=new Parser();
jpayne@68 91 for(int i=0; i<args.length; i++){
jpayne@68 92 String arg=args[i];
jpayne@68 93 String[] split=arg.split("=");
jpayne@68 94 String a=split[0].toLowerCase();
jpayne@68 95 String b=split.length>1 ? split[1] : null;
jpayne@68 96 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
jpayne@68 97
jpayne@68 98 if(a.equals("ref")){
jpayne@68 99 ref=b;
jpayne@68 100 }else if(a.equals("lines")){
jpayne@68 101 maxLines=Long.parseLong(b);
jpayne@68 102 if(maxLines<0){maxLines=Long.MAX_VALUE;}
jpayne@68 103 }else if(a.equals("verbose")){
jpayne@68 104 verbose=Parse.parseBoolean(b);
jpayne@68 105 // ByteFile1.verbose=verbose;
jpayne@68 106 // ByteFile2.verbose=verbose;
jpayne@68 107 // ReadWrite.verbose=verbose;
jpayne@68 108 }else if(parser.parse(arg, a, b)){
jpayne@68 109 //do nothing
jpayne@68 110 }else if(i==0 && arg.indexOf('=')<0){
jpayne@68 111 parser.in1=arg;
jpayne@68 112 }else if(i==1 && arg.indexOf('=')<0 && ref==null){
jpayne@68 113 ref=arg;
jpayne@68 114 }else{
jpayne@68 115 outstream.println("Unknown parameter "+args[i]);
jpayne@68 116 assert(false) : "Unknown parameter "+args[i];
jpayne@68 117 // throw new RuntimeException("Unknown parameter "+args[i]);
jpayne@68 118 }
jpayne@68 119 }
jpayne@68 120
jpayne@68 121 return parser;
jpayne@68 122 }
jpayne@68 123
jpayne@68 124 /** Add or remove .gz or .bz2 as needed */
jpayne@68 125 private void fixExtensions(){
jpayne@68 126 in=Tools.fixExtension(in);
jpayne@68 127 ref=Tools.fixExtension(ref);
jpayne@68 128 if(in==null || ref==null){throw new RuntimeException("Error - at least two input files are required.");}
jpayne@68 129 }
jpayne@68 130
jpayne@68 131 /** Ensure files can be read and written */
jpayne@68 132 private void checkFileExistence(){
jpayne@68 133
jpayne@68 134 //Ensure input files can be read
jpayne@68 135 if(!Tools.testInputFiles(true, true, in, ref)){
jpayne@68 136 throw new RuntimeException("\nCan't read some input files.\n");
jpayne@68 137 }
jpayne@68 138 }
jpayne@68 139
jpayne@68 140 /** Adjust file-related static fields as needed for this program */
jpayne@68 141 private static void checkStatics(){
jpayne@68 142 //Adjust the number of threads for input file reading
jpayne@68 143 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
jpayne@68 144 ByteFile.FORCE_MODE_BF2=true;
jpayne@68 145 }
jpayne@68 146
jpayne@68 147 // if(!ByteFile.FORCE_MODE_BF2){
jpayne@68 148 // ByteFile.FORCE_MODE_BF2=false;
jpayne@68 149 // ByteFile.FORCE_MODE_BF1=true;
jpayne@68 150 // }
jpayne@68 151 }
jpayne@68 152
jpayne@68 153 /*--------------------------------------------------------------*/
jpayne@68 154 /*---------------- Outer Methods ----------------*/
jpayne@68 155 /*--------------------------------------------------------------*/
jpayne@68 156
jpayne@68 157 void process(Timer t){
jpayne@68 158
jpayne@68 159 ByteFile bf=ByteFile.makeByteFile(ffin);
jpayne@68 160
jpayne@68 161 processInner(bf);
jpayne@68 162
jpayne@68 163 errorState|=bf.close();
jpayne@68 164
jpayne@68 165 t.stop();
jpayne@68 166
jpayne@68 167 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
jpayne@68 168
jpayne@68 169 outstream.println();
jpayne@68 170 outstream.println("Ref count: \t"+refCount);
jpayne@68 171 outstream.println("Query count: \t"+queryCount);
jpayne@68 172
jpayne@68 173 outstream.println();
jpayne@68 174 outstream.println("Ref-relative counts:");
jpayne@68 175 outstream.println("True Positive Start: \t"+truePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart*100.0/refCount)));
jpayne@68 176 outstream.println("True Positive Stop: \t"+truePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop*100.0/refCount)));
jpayne@68 177 // outstream.println("False Positive Start:\t"+falsePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart*100.0/refCount)));
jpayne@68 178 // outstream.println("False Positive Stop: \t"+falsePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop*100.0/refCount)));
jpayne@68 179 outstream.println("False Negative Start:\t"+falseNegativeStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStart*100.0/refCount)));
jpayne@68 180 outstream.println("False Negative Stop: \t"+falseNegativeStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStop*100.0/refCount)));
jpayne@68 181
jpayne@68 182 outstream.println();
jpayne@68 183 outstream.println("Query-relative counts:");
jpayne@68 184 outstream.println("True Positive Start: \t"+truePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart2*100.0/queryCount)));
jpayne@68 185 outstream.println("True Positive Stop: \t"+truePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop2*100.0/queryCount)));
jpayne@68 186 outstream.println("False Positive Start:\t"+falsePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart2*100.0/queryCount)));
jpayne@68 187 outstream.println("False Positive Stop: \t"+falsePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop2*100.0/queryCount)));
jpayne@68 188
jpayne@68 189 outstream.println();
jpayne@68 190 outstream.println("SNR: \t"+String.format(Locale.ROOT, "%.4f", 10*Math.log10((truePositiveStart2+truePositiveStop2+0.1)/(falsePositiveStart2+falsePositiveStop2+0.1))));
jpayne@68 191
jpayne@68 192 if(errorState){
jpayne@68 193 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
jpayne@68 194 }
jpayne@68 195 }
jpayne@68 196
jpayne@68 197 /*--------------------------------------------------------------*/
jpayne@68 198 /*---------------- Inner Methods ----------------*/
jpayne@68 199 /*--------------------------------------------------------------*/
jpayne@68 200
jpayne@68 201 @SuppressWarnings("unchecked")
jpayne@68 202 private void processInner(ByteFile bf){
jpayne@68 203 byte[] line=bf.nextLine();
jpayne@68 204
jpayne@68 205 {
jpayne@68 206 ArrayList<GffLine> refLines=GffLine.loadGffFile(ffref, "CDS,rRNA,tRNA", true);
jpayne@68 207
jpayne@68 208 refCount=refLines.size();
jpayne@68 209 lineMap=new HashMap<StringNum, GffLine>();
jpayne@68 210 startCountMap=new HashMap<StringNum, Integer>();
jpayne@68 211 stopCountMap=new HashMap<StringNum, Integer>();
jpayne@68 212
jpayne@68 213 for(GffLine gline : refLines){
jpayne@68 214 final int stop=gline.trueStop();
jpayne@68 215 StringNum sn=new StringNum(gline.seqid, stop);
jpayne@68 216 lineMap.put(sn, gline);
jpayne@68 217 startCountMap.put(sn, 0);
jpayne@68 218 stopCountMap.put(sn, 0);
jpayne@68 219 assert(lineMap.get(sn)==gline);
jpayne@68 220 // assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet();
jpayne@68 221 }
jpayne@68 222 if(verbose){
jpayne@68 223 System.err.println(lineMap);
jpayne@68 224 System.err.println(startCountMap);
jpayne@68 225 System.err.println(stopCountMap);
jpayne@68 226 }
jpayne@68 227 }
jpayne@68 228
jpayne@68 229 while(line!=null){
jpayne@68 230 if(line.length>0){
jpayne@68 231 if(maxLines>0 && linesProcessed>=maxLines){break;}
jpayne@68 232 linesProcessed++;
jpayne@68 233 bytesProcessed+=(line.length+1);
jpayne@68 234
jpayne@68 235 final boolean valid=(line[0]!='#');
jpayne@68 236 if(valid){
jpayne@68 237 queryCount++;
jpayne@68 238 GffLine gline=new GffLine(line);
jpayne@68 239 processLine(gline);
jpayne@68 240 }
jpayne@68 241 }
jpayne@68 242 line=bf.nextLine();
jpayne@68 243 }
jpayne@68 244
jpayne@68 245 for(Entry<StringNum, Integer> e : startCountMap.entrySet()){
jpayne@68 246 if(e.getValue()<1){
jpayne@68 247 falseNegativeStart++;
jpayne@68 248 }
jpayne@68 249 }
jpayne@68 250 for(Entry<StringNum, Integer> e : stopCountMap.entrySet()){
jpayne@68 251 if(e.getValue()<1){
jpayne@68 252 falseNegativeStop++;
jpayne@68 253 }
jpayne@68 254 }
jpayne@68 255 }
jpayne@68 256
jpayne@68 257 private void processLine(GffLine gline){
jpayne@68 258 // boolean cds=gline.type.equals("CDS");
jpayne@68 259 // boolean trna=gline.type.equals("tRNA");
jpayne@68 260 // boolean rrna=gline.type.equals("rRNA");
jpayne@68 261 // if(!cds && !trna && !rrna){return;}
jpayne@68 262 // if(cds && !ProkObject.callCDS){return;}
jpayne@68 263 // if(trna && !ProkObject.calltRNA){return;}
jpayne@68 264 // if(rrna){
jpayne@68 265 // int type=gline.prokType();
jpayne@68 266 // if(ProkObject.processType(type)){return;}
jpayne@68 267 // }
jpayne@68 268 int type=gline.prokType();
jpayne@68 269 if(!ProkObject.processType(type)){return;}
jpayne@68 270
jpayne@68 271 final int stop=gline.trueStop();
jpayne@68 272 final int start=gline.trueStart();
jpayne@68 273
jpayne@68 274 // System.err.println("Considering "+start+", "+stop);
jpayne@68 275
jpayne@68 276 StringNum sn=new StringNum(gline.seqid, stop);
jpayne@68 277 GffLine refline=lineMap.get(sn);
jpayne@68 278
jpayne@68 279 boolean fail=(refline==null || refline.strand!=gline.strand || !refline.type.equals(gline.type));
jpayne@68 280 if(fail){
jpayne@68 281 if(verbose){
jpayne@68 282 System.err.println("Can't find "+sn+"\n"+gline+"\n"+refline);
jpayne@68 283 assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet();
jpayne@68 284 }
jpayne@68 285 falsePositiveStart++;
jpayne@68 286 falsePositiveStop++;
jpayne@68 287 falsePositiveStart2++;
jpayne@68 288 falsePositiveStop2++;
jpayne@68 289 }else{
jpayne@68 290 assert(stop==refline.trueStop());
jpayne@68 291 truePositiveStop++;
jpayne@68 292 truePositiveStop2++;
jpayne@68 293 stopCountMap.put(sn, stopCountMap.get(sn)+1);
jpayne@68 294 if(start==refline.trueStart()){
jpayne@68 295 truePositiveStart++;
jpayne@68 296 truePositiveStart2++;
jpayne@68 297 startCountMap.put(sn, startCountMap.get(sn)+1);
jpayne@68 298 }else{
jpayne@68 299 falsePositiveStart++;
jpayne@68 300 falsePositiveStart2++;
jpayne@68 301 }
jpayne@68 302 }
jpayne@68 303 }
jpayne@68 304
jpayne@68 305 /*--------------------------------------------------------------*/
jpayne@68 306 /*---------------- Fields ----------------*/
jpayne@68 307 /*--------------------------------------------------------------*/
jpayne@68 308
jpayne@68 309 private String in=null;
jpayne@68 310 private String ref=null;
jpayne@68 311
jpayne@68 312
jpayne@68 313 /*--------------------------------------------------------------*/
jpayne@68 314
jpayne@68 315 private HashMap<StringNum, GffLine> lineMap;
jpayne@68 316 private HashMap<StringNum, Integer> startCountMap;
jpayne@68 317 private HashMap<StringNum, Integer> stopCountMap;
jpayne@68 318
jpayne@68 319 // private HashMap<Integer, ArrayList<GffLine>> map;
jpayne@68 320 // private HashSet<Integer> stopSet;
jpayne@68 321 // private HashSet<Integer> startSet;
jpayne@68 322 // private HashSet<Integer> stopSetM;
jpayne@68 323 // private HashSet<Integer> startSetM;
jpayne@68 324
jpayne@68 325 private long linesProcessed=0;
jpayne@68 326 private long linesOut=0;
jpayne@68 327 private long bytesProcessed=0;
jpayne@68 328 private long bytesOut=0;
jpayne@68 329
jpayne@68 330 private long maxLines=Long.MAX_VALUE;
jpayne@68 331
jpayne@68 332 private long falsePositiveStart=0;
jpayne@68 333 private long falsePositiveStop=0;
jpayne@68 334 private long truePositiveStart=0;
jpayne@68 335 private long truePositiveStop=0;
jpayne@68 336 private long falseNegativeStart=0;
jpayne@68 337 private long falseNegativeStop=0;
jpayne@68 338
jpayne@68 339 private long falsePositiveStart2=0;
jpayne@68 340 private long falsePositiveStop2=0;
jpayne@68 341 private long truePositiveStart2=0;
jpayne@68 342 private long truePositiveStop2=0;
jpayne@68 343
jpayne@68 344 private long refCount=0;
jpayne@68 345 private long queryCount=0;
jpayne@68 346
jpayne@68 347 /*--------------------------------------------------------------*/
jpayne@68 348 /*---------------- Final Fields ----------------*/
jpayne@68 349 /*--------------------------------------------------------------*/
jpayne@68 350
jpayne@68 351 private final FileFormat ffin;
jpayne@68 352 private final FileFormat ffref;
jpayne@68 353
jpayne@68 354 /*--------------------------------------------------------------*/
jpayne@68 355 /*---------------- Common Fields ----------------*/
jpayne@68 356 /*--------------------------------------------------------------*/
jpayne@68 357
jpayne@68 358 private PrintStream outstream=System.err;
jpayne@68 359 public static boolean verbose=false;
jpayne@68 360 public boolean errorState=false;
jpayne@68 361 private boolean overwrite=false;
jpayne@68 362 private boolean append=false;
jpayne@68 363
jpayne@68 364 }