Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/GffLine.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/GffLine.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,389 @@ +package gff; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Locale; + +import dna.Data; +import fileIO.ByteFile; +import fileIO.FileFormat; +import prok.ProkObject; +import shared.Parse; +import shared.Shared; +import shared.Tools; +import structures.ByteBuilder; +import var2.ScafMap; +import var2.VCFLine; +import var2.Var; + +/** + * Used by both the var2 and prok packages for processing gff files. + * @author Brian Bushnell + * @date Sep 12, 2018 + * + */ +public class GffLine { + + //#seqid source type start end score strand phase attributes + public GffLine(byte[] line){ + int a=0, b=0; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 0: "+new String(line); + seqid=parseSeqid ? intern(new String(line, a, b-a)) : null; +// assert(seqid==null || seqid.equals(new String(line, a, b-a))); +// assert(seqid!=null) : new String(line, a, b-a)+", "+a+", "+b+"\n"+line; + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 1: "+new String(line); + if(b==a+1 && line[a]=='.'){source=DOTS;} + else{source=paseSource ? intern(new String(line, a, b-a)) : null;} + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 2: "+new String(line); + if(b==a+1 && line[a]=='.'){type=DOTS;} + else{ + try {//This was to catch a probably intermittent hardware error; can't replicate. + type=(parseType ? intern(new String(line, a, b-a)) : null); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.err.println("\n"+new String(line)+"\n"+a+", "+b+", "+(b-a)); + assert(false); + } + } + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 3: "+new String(line); + start=Parse.parseInt(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 4: "+new String(line); + stop=Parse.parseInt(line, a, b); + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + if(b<=a){ + //Badly formatted line; common in IMG + return; + } + assert(b>a) : "Missing field 5: "+new String(line); + if(b==a+1 && line[a]=='.'){score=-1;} + else{score=Parse.parseFloat(line, a, b);} + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 6: "+new String(line); + assert(b==a+1); + strand=find(line[a], STRANDS); +// assert(strand>0) : line[a]+", "+Arrays.toString(STRANDS)+", "+(char)line[b]; + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 7: "+new String(line); + assert(b==a+1); + if(line[a]=='.'){phase=-1;} + else{phase=Parse.parseInt(line, a, b);} + b++; + a=b; + + while(b<line.length && line[b]!='\t'){b++;} + assert(b>a) : "Missing field 8: "+new String(line); + if(b==a+1 && line[a]=='.'){attributes=DOTS;} + else{attributes=parseAttributes ? new String(line, a, b-a) : null;} + b++; + a=b; + +// assert(strand>=0) : "\n"+this.toString()+"\n"+new String(line); + } + + public GffLine(VCFLine vcf){ + seqid=vcf.scaf; + source=DOTS; + type="sequence_variant_obs"; + start=vcf.start()+1; + stop=vcf.stop()+1; + score=(float)vcf.qual; + strand=PLUS; + phase=-1; + final int vtype=vcf.type(); + ByteBuilder bb=new ByteBuilder(16); + bb.append("ID=").append(Var.typeArray[vtype]).append(' '); + if(vtype==Var.SUB){ + bb.append(vcf.ref).append('>').append(vcf.alt); + }else if(vtype==Var.DEL){ + bb.append("length ").append(vcf.reflen()-vcf.readlen()); + }else if(vtype==Var.INS){ + int offset=vcf.reflen(); + int length=vcf.readlen()-offset; + bb.append(vcf.alt, offset, length); + }else if(vtype==Var.NOCALL){ + bb.append("length ").append(vcf.reflen()); + } + attributes=bb.toString(); + bb.clear(); + } + + public GffLine(Var v, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ + seqid=v.scafName(); + source=DOTS; + type="sequence_variant_obs"; + start=v.start+1; + stop=Tools.max(v.start+1, v.stop); + score=(float)v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map); + strand=PLUS; + phase=-1; + final int vtype=v.type(); + ByteBuilder bb=new ByteBuilder(16); + bb.append("ID=").append(Var.typeArray[vtype]); + if(vtype==Var.SUB || vtype==Var.INS){ + bb.append(' ').append(v.allele); + }else if(vtype==Var.DEL || vtype==Var.NOCALL){ + bb.append(" length ").append(v.reflen()); + }else{assert(false) : vtype+"\n"+v;} + attributes=bb.toString(); + bb.clear(); + } + + public GffLine(Var v){ + seqid=v.scafName(); + source="BBTools"; + type="sequence_variant_obs"; + start=v.start+1; + stop=Tools.max(v.start+1, v.stop); + score=-1; + strand=PLUS; + phase=-1; + final int vtype=v.type(); + ByteBuilder bb=new ByteBuilder(16); + bb.append("ID=").append(Var.typeArray[vtype]); + if(vtype==Var.SUB || vtype==Var.INS){ + bb.append(' ').append(v.allele); + }else if(vtype==Var.DEL || vtype==Var.NOCALL){ + bb.append(" length ").append(v.reflen()); + }else{assert(false) : vtype+"\n"+v;} + attributes=bb.toString(); + bb.clear(); + } + + public static ArrayList<GffLine> loadGffFile(String fname, String types, boolean banUnprocessed){ + FileFormat ff=FileFormat.testInput(fname, FileFormat.GFF, null, false, false); + return loadGffFile(ff, types, banUnprocessed); + } + + public static ArrayList<GffLine>[] loadGffFileByType(FileFormat ff, String types, boolean banUnprocessed){ + ArrayList<GffLine> list=loadGffFile(ff, types, banUnprocessed); + String[] typeArray=types.split(","); + ArrayList<GffLine>[] lists=new ArrayList[typeArray.length]; + for(int i=0; i<typeArray.length; i++){ + String type=typeArray[i]; + lists[i]=new ArrayList<GffLine>(); + for(GffLine gline : list){ + if(gline.type.equals(type)){ + lists[i].add(gline); + } + } + } + return lists; + } + + public static ArrayList<GffLine> loadGffFile(FileFormat ff, String types, boolean banUnprocessed){ + HashSet<String> set=null; + if(types!=null){ + String[] split=types.split(","); + set=new HashSet<String>(split.length*2); + for(String s : split){ + set.add(s); + } + } + + ArrayList<GffLine> list=new ArrayList<GffLine>(); + ByteFile bf=ByteFile.makeByteFile(ff); + for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ + if(line[0]=='#'){ + //skip + }else{ + GffLine gline=new GffLine(line); + assert(gline.strand>=0) : "\n"+gline.toString()+"\n"+new String(line)+"\n"; + if(set==null || (gline.type!=null && set.contains(gline.type))){ + if(!banUnprocessed || ProkObject.processType(gline.prokType())){ + list.add(gline); + } + } + } + } + + boolean error=bf.close(); + assert(!error) : "Problem with file "+ff.name(); + return list; + } + + public static void toText(ByteBuilder bb, Var v, double properPairRate, double totalQualityAvg, + double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ +// assert(false); + bb.append(v.scafName(map)).append('\t'); + bb.append('.').append('\t'); + bb.append("sequence_variant_obs").append('\t'); + bb.append(v.start+1).append('\t'); + bb.append(Tools.max(v.start+1, v.stop)).append('\t'); + bb.append(v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map), 2).append('\t'); + bb.append('+').append('\t'); + bb.append('.').append('\t'); +// System.err.println(v.typeString()+", "+v.start+", "+v.stop); + final int vtype=v.type(); + bb.append("ID=").append(Var.typeArray[vtype]); + if(vtype==Var.SUB || vtype==Var.INS){ + bb.append(' ').append(v.allele); + }else if(vtype==Var.DEL || vtype==Var.NOCALL){ + bb.append(" length ").append(v.reflen()); + }else{assert(false) : vtype+"\n"+v;} + } + + public static String toHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, + long reads, long pairs, long properPairs, long bases, String ref){ + StringBuilder sb=new StringBuilder(); + + final double readLengthAvg=bases/Tools.max(1.0, reads); + sb.append("##gff-version 3\n"); + sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n"); + sb.append("#ploidy\t"+ploidy+"\n"); + sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity)); + sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction)); + sb.append("#reads\t"+reads+"\n"); + sb.append("#pairedReads\t"+pairs+"\n"); + sb.append("#properlyPairedReads\t"+properPairs+"\n"); + sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg)); + sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate)); + sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg)); + sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg)); + if(ref!=null){sb.append("#reference\t"+ref+"\n");} + + sb.append("#seqid source type start end score strand phase attributes"); + return sb.toString(); + } + + @Override + public String toString(){ + ByteBuilder bb=new ByteBuilder(); + appendTo(bb); + return bb.toString(); + } + + public ByteBuilder appendTo(ByteBuilder bb){ + bb.append(seqid==null ? "." : seqid).append('\t'); + bb.append(source==null ? "." : source).append('\t'); + bb.append(type==null ? "." : type).append('\t'); + bb.append(start).append('\t'); + bb.append(stop).append('\t'); + if(score<0){bb.append('.').append('\t');} + else{bb.append(score, 2).append('\t');} + + bb.append((strand>=0 ? STRANDS[strand] : (byte)'.')).append('\t'); + + if(phase<0){bb.append('.').append('\t');} + else{bb.append(phase).append('\t');} + + bb.append(attributes==null ? "." : attributes); + return bb; + } + + public int length() { + return stop-start+1; + } + + private static int find(byte a, byte[] array){ + for(int i=0; i<array.length; i++){ + if(array[i]==a){return i;} + } + return -1; + } + + private static String intern(String s){ + return Data.forceIntern(s); + } + + @Override + public int hashCode(){ + return trueStop()^seqid.hashCode(); + } + + @Override + public boolean equals(Object o){ + GffLine b=(GffLine)o; + if(start!=b.start){return false;} + if(stop!=b.stop){return false;} + if(strand!=b.strand){return false;} + if(!seqid.equals(b.seqid)){return false;} + if(!type.equals(b.type)){return false;} + return true; + } + + public int trueStart(){ + return strand==0 ? start : stop; + } + + public int trueStop(){ + return strand==0 ? stop : start; + } + + public final int prokType(){ + if(type.equals("CDS")){ + return ProkObject.CDS; + }else if(type.equals("tRNA")){ + return ProkObject.tRNA; + }else if(type.equals("rRNA")){ + if(attributes.contains("16S")){ + return ProkObject.r16S; + }else if(attributes.contains("23S")){ + return ProkObject.r23S; + }else if(attributes.contains("18S")){ + return ProkObject.r18S; + }else if(attributes.contains("5S") && length()<300){ + return ProkObject.r5S; + } + } + return -1; + } + + public final boolean partial(){return attributes!=null && attributes.contains("partial=true");} + + public final boolean inbounds(int scaflen){return start>=0 && stop<scaflen;} + + public String seqid; + public String source; + public String type; + public int start; + public int stop; + public float score; + public int strand; + public int phase; + public String attributes; + + private static final byte[] STRANDS=new byte[] {'+', '-', '?', '.'}; + public static final int PLUS=0, MINUS=1, QMARK=2, DOT=3; + public static final String DOTS="."; + + public static boolean parseSeqid=true; + public static boolean paseSource=false; + public static boolean parseType=true; + public static boolean parseScore=false; + public static boolean parseAttributes=true; + +// public static boolean parseSeqid=true; +// public static boolean paseSource=true; +// public static boolean parseType=true; +// public static boolean parseScore=true; +// public static boolean parseAttributes=true; + +}