Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/CompareGff.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/CompareGff.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,364 @@ +package gff; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map.Entry; + +import fileIO.ByteFile; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import prok.ProkObject; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import structures.StringNum; + +/** + * Compares gff files for the purpose of grading gene-calling. + * @author Brian Bushnell + * @date October 3, 2018 + * + */ +public class CompareGff { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + CompareGff x=new CompareGff(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public CompareGff(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + //Set shared static variables prior to parsing + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + + {//Parse the arguments + final Parser parser=parse(args); + overwrite=parser.overwrite; + append=parser.append; + + in=parser.in1; + } + + fixExtensions(); //Add or remove .gz or .bz2 as needed + checkFileExistence(); //Ensure files can be read and written + checkStatics(); //Adjust file-related static fields as needed for this program + + ffin=FileFormat.testInput(in, FileFormat.GFF, null, true, true); + ffref=FileFormat.testInput(ref, FileFormat.GFF, null, true, true); + } + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Helpers ----------------*/ + /*--------------------------------------------------------------*/ + + /** Parse arguments from the command line */ + private Parser parse(String[] args){ + + Parser parser=new Parser(); + for(int i=0; i<args.length; i++){ + String arg=args[i]; + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("ref")){ + ref=b; + }else if(a.equals("lines")){ + maxLines=Long.parseLong(b); + if(maxLines<0){maxLines=Long.MAX_VALUE;} + }else if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); +// ByteFile1.verbose=verbose; +// ByteFile2.verbose=verbose; +// ReadWrite.verbose=verbose; + }else if(parser.parse(arg, a, b)){ + //do nothing + }else if(i==0 && arg.indexOf('=')<0){ + parser.in1=arg; + }else if(i==1 && arg.indexOf('=')<0 && ref==null){ + ref=arg; + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + return parser; + } + + /** Add or remove .gz or .bz2 as needed */ + private void fixExtensions(){ + in=Tools.fixExtension(in); + ref=Tools.fixExtension(ref); + if(in==null || ref==null){throw new RuntimeException("Error - at least two input files are required.");} + } + + /** Ensure files can be read and written */ + private void checkFileExistence(){ + + //Ensure input files can be read + if(!Tools.testInputFiles(true, true, in, ref)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + } + + /** Adjust file-related static fields as needed for this program */ + private static void checkStatics(){ + //Adjust the number of threads for input file reading + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + +// if(!ByteFile.FORCE_MODE_BF2){ +// ByteFile.FORCE_MODE_BF2=false; +// ByteFile.FORCE_MODE_BF1=true; +// } + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + void process(Timer t){ + + ByteFile bf=ByteFile.makeByteFile(ffin); + + processInner(bf); + + errorState|=bf.close(); + + t.stop(); + + outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); + + outstream.println(); + outstream.println("Ref count: \t"+refCount); + outstream.println("Query count: \t"+queryCount); + + outstream.println(); + outstream.println("Ref-relative counts:"); + outstream.println("True Positive Start: \t"+truePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart*100.0/refCount))); + outstream.println("True Positive Stop: \t"+truePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop*100.0/refCount))); +// outstream.println("False Positive Start:\t"+falsePositiveStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart*100.0/refCount))); +// outstream.println("False Positive Stop: \t"+falsePositiveStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop*100.0/refCount))); + outstream.println("False Negative Start:\t"+falseNegativeStart+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStart*100.0/refCount))); + outstream.println("False Negative Stop: \t"+falseNegativeStop+"\t"+(String.format(Locale.ROOT, "%.3f%%", falseNegativeStop*100.0/refCount))); + + outstream.println(); + outstream.println("Query-relative counts:"); + outstream.println("True Positive Start: \t"+truePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStart2*100.0/queryCount))); + outstream.println("True Positive Stop: \t"+truePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", truePositiveStop2*100.0/queryCount))); + outstream.println("False Positive Start:\t"+falsePositiveStart2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStart2*100.0/queryCount))); + outstream.println("False Positive Stop: \t"+falsePositiveStop2+"\t"+(String.format(Locale.ROOT, "%.3f%%", falsePositiveStop2*100.0/queryCount))); + + outstream.println(); + outstream.println("SNR: \t"+String.format(Locale.ROOT, "%.4f", 10*Math.log10((truePositiveStart2+truePositiveStop2+0.1)/(falsePositiveStart2+falsePositiveStop2+0.1)))); + + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + @SuppressWarnings("unchecked") + private void processInner(ByteFile bf){ + byte[] line=bf.nextLine(); + + { + ArrayList<GffLine> refLines=GffLine.loadGffFile(ffref, "CDS,rRNA,tRNA", true); + + refCount=refLines.size(); + lineMap=new HashMap<StringNum, GffLine>(); + startCountMap=new HashMap<StringNum, Integer>(); + stopCountMap=new HashMap<StringNum, Integer>(); + + for(GffLine gline : refLines){ + final int stop=gline.trueStop(); + StringNum sn=new StringNum(gline.seqid, stop); + lineMap.put(sn, gline); + startCountMap.put(sn, 0); + stopCountMap.put(sn, 0); + assert(lineMap.get(sn)==gline); +// assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet(); + } + if(verbose){ + System.err.println(lineMap); + System.err.println(startCountMap); + System.err.println(stopCountMap); + } + } + + while(line!=null){ + if(line.length>0){ + if(maxLines>0 && linesProcessed>=maxLines){break;} + linesProcessed++; + bytesProcessed+=(line.length+1); + + final boolean valid=(line[0]!='#'); + if(valid){ + queryCount++; + GffLine gline=new GffLine(line); + processLine(gline); + } + } + line=bf.nextLine(); + } + + for(Entry<StringNum, Integer> e : startCountMap.entrySet()){ + if(e.getValue()<1){ + falseNegativeStart++; + } + } + for(Entry<StringNum, Integer> e : stopCountMap.entrySet()){ + if(e.getValue()<1){ + falseNegativeStop++; + } + } + } + + private void processLine(GffLine gline){ +// boolean cds=gline.type.equals("CDS"); +// boolean trna=gline.type.equals("tRNA"); +// boolean rrna=gline.type.equals("rRNA"); +// if(!cds && !trna && !rrna){return;} +// if(cds && !ProkObject.callCDS){return;} +// if(trna && !ProkObject.calltRNA){return;} +// if(rrna){ +// int type=gline.prokType(); +// if(ProkObject.processType(type)){return;} +// } + int type=gline.prokType(); + if(!ProkObject.processType(type)){return;} + + final int stop=gline.trueStop(); + final int start=gline.trueStart(); + +// System.err.println("Considering "+start+", "+stop); + + StringNum sn=new StringNum(gline.seqid, stop); + GffLine refline=lineMap.get(sn); + + boolean fail=(refline==null || refline.strand!=gline.strand || !refline.type.equals(gline.type)); + if(fail){ + if(verbose){ + System.err.println("Can't find "+sn+"\n"+gline+"\n"+refline); + assert(false) : "\n\nsn='"+sn+"'\n"+lineMap.containsKey(sn)+"\n"+lineMap.keySet(); + } + falsePositiveStart++; + falsePositiveStop++; + falsePositiveStart2++; + falsePositiveStop2++; + }else{ + assert(stop==refline.trueStop()); + truePositiveStop++; + truePositiveStop2++; + stopCountMap.put(sn, stopCountMap.get(sn)+1); + if(start==refline.trueStart()){ + truePositiveStart++; + truePositiveStart2++; + startCountMap.put(sn, startCountMap.get(sn)+1); + }else{ + falsePositiveStart++; + falsePositiveStart2++; + } + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private String in=null; + private String ref=null; + + + /*--------------------------------------------------------------*/ + + private HashMap<StringNum, GffLine> lineMap; + private HashMap<StringNum, Integer> startCountMap; + private HashMap<StringNum, Integer> stopCountMap; + +// private HashMap<Integer, ArrayList<GffLine>> map; +// private HashSet<Integer> stopSet; +// private HashSet<Integer> startSet; +// private HashSet<Integer> stopSetM; +// private HashSet<Integer> startSetM; + + private long linesProcessed=0; + private long linesOut=0; + private long bytesProcessed=0; + private long bytesOut=0; + + private long maxLines=Long.MAX_VALUE; + + private long falsePositiveStart=0; + private long falsePositiveStop=0; + private long truePositiveStart=0; + private long truePositiveStop=0; + private long falseNegativeStart=0; + private long falseNegativeStop=0; + + private long falsePositiveStart2=0; + private long falsePositiveStop2=0; + private long truePositiveStart2=0; + private long truePositiveStop2=0; + + private long refCount=0; + private long queryCount=0; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private final FileFormat ffin; + private final FileFormat ffref; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + public boolean errorState=false; + private boolean overwrite=false; + private boolean append=false; + +}