jpayne@68: package gff; jpayne@68: jpayne@68: import java.util.ArrayList; jpayne@68: import java.util.HashSet; jpayne@68: import java.util.Locale; jpayne@68: jpayne@68: import dna.Data; jpayne@68: import fileIO.ByteFile; jpayne@68: import fileIO.FileFormat; jpayne@68: import prok.ProkObject; jpayne@68: import shared.Parse; jpayne@68: import shared.Shared; jpayne@68: import shared.Tools; jpayne@68: import structures.ByteBuilder; jpayne@68: import var2.ScafMap; jpayne@68: import var2.VCFLine; jpayne@68: import var2.Var; jpayne@68: jpayne@68: /** jpayne@68: * Used by both the var2 and prok packages for processing gff files. jpayne@68: * @author Brian Bushnell jpayne@68: * @date Sep 12, 2018 jpayne@68: * jpayne@68: */ jpayne@68: public class GffLine { jpayne@68: jpayne@68: //#seqid source type start end score strand phase attributes jpayne@68: public GffLine(byte[] line){ jpayne@68: int a=0, b=0; jpayne@68: jpayne@68: while(ba) : "Missing field 0: "+new String(line); jpayne@68: seqid=parseSeqid ? intern(new String(line, a, b-a)) : null; jpayne@68: // assert(seqid==null || seqid.equals(new String(line, a, b-a))); jpayne@68: // assert(seqid!=null) : new String(line, a, b-a)+", "+a+", "+b+"\n"+line; jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 1: "+new String(line); jpayne@68: if(b==a+1 && line[a]=='.'){source=DOTS;} jpayne@68: else{source=paseSource ? intern(new String(line, a, b-a)) : null;} jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 2: "+new String(line); jpayne@68: if(b==a+1 && line[a]=='.'){type=DOTS;} jpayne@68: else{ jpayne@68: try {//This was to catch a probably intermittent hardware error; can't replicate. jpayne@68: type=(parseType ? intern(new String(line, a, b-a)) : null); jpayne@68: } catch (Exception e) { jpayne@68: // TODO Auto-generated catch block jpayne@68: e.printStackTrace(); jpayne@68: System.err.println("\n"+new String(line)+"\n"+a+", "+b+", "+(b-a)); jpayne@68: assert(false); jpayne@68: } jpayne@68: } jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 3: "+new String(line); jpayne@68: start=Parse.parseInt(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 4: "+new String(line); jpayne@68: stop=Parse.parseInt(line, a, b); jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 5: "+new String(line); jpayne@68: if(b==a+1 && line[a]=='.'){score=-1;} jpayne@68: else{score=Parse.parseFloat(line, a, b);} jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 6: "+new String(line); jpayne@68: assert(b==a+1); jpayne@68: strand=find(line[a], STRANDS); jpayne@68: // assert(strand>0) : line[a]+", "+Arrays.toString(STRANDS)+", "+(char)line[b]; jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 7: "+new String(line); jpayne@68: assert(b==a+1); jpayne@68: if(line[a]=='.'){phase=-1;} jpayne@68: else{phase=Parse.parseInt(line, a, b);} jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: while(ba) : "Missing field 8: "+new String(line); jpayne@68: if(b==a+1 && line[a]=='.'){attributes=DOTS;} jpayne@68: else{attributes=parseAttributes ? new String(line, a, b-a) : null;} jpayne@68: b++; jpayne@68: a=b; jpayne@68: jpayne@68: // assert(strand>=0) : "\n"+this.toString()+"\n"+new String(line); jpayne@68: } jpayne@68: jpayne@68: public GffLine(VCFLine vcf){ jpayne@68: seqid=vcf.scaf; jpayne@68: source=DOTS; jpayne@68: type="sequence_variant_obs"; jpayne@68: start=vcf.start()+1; jpayne@68: stop=vcf.stop()+1; jpayne@68: score=(float)vcf.qual; jpayne@68: strand=PLUS; jpayne@68: phase=-1; jpayne@68: final int vtype=vcf.type(); jpayne@68: ByteBuilder bb=new ByteBuilder(16); jpayne@68: bb.append("ID=").append(Var.typeArray[vtype]).append(' '); jpayne@68: if(vtype==Var.SUB){ jpayne@68: bb.append(vcf.ref).append('>').append(vcf.alt); jpayne@68: }else if(vtype==Var.DEL){ jpayne@68: bb.append("length ").append(vcf.reflen()-vcf.readlen()); jpayne@68: }else if(vtype==Var.INS){ jpayne@68: int offset=vcf.reflen(); jpayne@68: int length=vcf.readlen()-offset; jpayne@68: bb.append(vcf.alt, offset, length); jpayne@68: }else if(vtype==Var.NOCALL){ jpayne@68: bb.append("length ").append(vcf.reflen()); jpayne@68: } jpayne@68: attributes=bb.toString(); jpayne@68: bb.clear(); jpayne@68: } jpayne@68: jpayne@68: public GffLine(Var v, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ jpayne@68: seqid=v.scafName(); jpayne@68: source=DOTS; jpayne@68: type="sequence_variant_obs"; jpayne@68: start=v.start+1; jpayne@68: stop=Tools.max(v.start+1, v.stop); jpayne@68: score=(float)v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map); jpayne@68: strand=PLUS; jpayne@68: phase=-1; jpayne@68: final int vtype=v.type(); jpayne@68: ByteBuilder bb=new ByteBuilder(16); jpayne@68: bb.append("ID=").append(Var.typeArray[vtype]); jpayne@68: if(vtype==Var.SUB || vtype==Var.INS){ jpayne@68: bb.append(' ').append(v.allele); jpayne@68: }else if(vtype==Var.DEL || vtype==Var.NOCALL){ jpayne@68: bb.append(" length ").append(v.reflen()); jpayne@68: }else{assert(false) : vtype+"\n"+v;} jpayne@68: attributes=bb.toString(); jpayne@68: bb.clear(); jpayne@68: } jpayne@68: jpayne@68: public GffLine(Var v){ jpayne@68: seqid=v.scafName(); jpayne@68: source="BBTools"; jpayne@68: type="sequence_variant_obs"; jpayne@68: start=v.start+1; jpayne@68: stop=Tools.max(v.start+1, v.stop); jpayne@68: score=-1; jpayne@68: strand=PLUS; jpayne@68: phase=-1; jpayne@68: final int vtype=v.type(); jpayne@68: ByteBuilder bb=new ByteBuilder(16); jpayne@68: bb.append("ID=").append(Var.typeArray[vtype]); jpayne@68: if(vtype==Var.SUB || vtype==Var.INS){ jpayne@68: bb.append(' ').append(v.allele); jpayne@68: }else if(vtype==Var.DEL || vtype==Var.NOCALL){ jpayne@68: bb.append(" length ").append(v.reflen()); jpayne@68: }else{assert(false) : vtype+"\n"+v;} jpayne@68: attributes=bb.toString(); jpayne@68: bb.clear(); jpayne@68: } jpayne@68: jpayne@68: public static ArrayList loadGffFile(String fname, String types, boolean banUnprocessed){ jpayne@68: FileFormat ff=FileFormat.testInput(fname, FileFormat.GFF, null, false, false); jpayne@68: return loadGffFile(ff, types, banUnprocessed); jpayne@68: } jpayne@68: jpayne@68: public static ArrayList[] loadGffFileByType(FileFormat ff, String types, boolean banUnprocessed){ jpayne@68: ArrayList list=loadGffFile(ff, types, banUnprocessed); jpayne@68: String[] typeArray=types.split(","); jpayne@68: ArrayList[] lists=new ArrayList[typeArray.length]; jpayne@68: for(int i=0; i(); jpayne@68: for(GffLine gline : list){ jpayne@68: if(gline.type.equals(type)){ jpayne@68: lists[i].add(gline); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return lists; jpayne@68: } jpayne@68: jpayne@68: public static ArrayList loadGffFile(FileFormat ff, String types, boolean banUnprocessed){ jpayne@68: HashSet set=null; jpayne@68: if(types!=null){ jpayne@68: String[] split=types.split(","); jpayne@68: set=new HashSet(split.length*2); jpayne@68: for(String s : split){ jpayne@68: set.add(s); jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: ArrayList list=new ArrayList(); jpayne@68: ByteFile bf=ByteFile.makeByteFile(ff); jpayne@68: for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ jpayne@68: if(line[0]=='#'){ jpayne@68: //skip jpayne@68: }else{ jpayne@68: GffLine gline=new GffLine(line); jpayne@68: assert(gline.strand>=0) : "\n"+gline.toString()+"\n"+new String(line)+"\n"; jpayne@68: if(set==null || (gline.type!=null && set.contains(gline.type))){ jpayne@68: if(!banUnprocessed || ProkObject.processType(gline.prokType())){ jpayne@68: list.add(gline); jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: jpayne@68: boolean error=bf.close(); jpayne@68: assert(!error) : "Problem with file "+ff.name(); jpayne@68: return list; jpayne@68: } jpayne@68: jpayne@68: public static void toText(ByteBuilder bb, Var v, double properPairRate, double totalQualityAvg, jpayne@68: double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ jpayne@68: // assert(false); jpayne@68: bb.append(v.scafName(map)).append('\t'); jpayne@68: bb.append('.').append('\t'); jpayne@68: bb.append("sequence_variant_obs").append('\t'); jpayne@68: bb.append(v.start+1).append('\t'); jpayne@68: bb.append(Tools.max(v.start+1, v.stop)).append('\t'); jpayne@68: bb.append(v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map), 2).append('\t'); jpayne@68: bb.append('+').append('\t'); jpayne@68: bb.append('.').append('\t'); jpayne@68: // System.err.println(v.typeString()+", "+v.start+", "+v.stop); jpayne@68: final int vtype=v.type(); jpayne@68: bb.append("ID=").append(Var.typeArray[vtype]); jpayne@68: if(vtype==Var.SUB || vtype==Var.INS){ jpayne@68: bb.append(' ').append(v.allele); jpayne@68: }else if(vtype==Var.DEL || vtype==Var.NOCALL){ jpayne@68: bb.append(" length ").append(v.reflen()); jpayne@68: }else{assert(false) : vtype+"\n"+v;} jpayne@68: } jpayne@68: jpayne@68: public static String toHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, jpayne@68: long reads, long pairs, long properPairs, long bases, String ref){ jpayne@68: StringBuilder sb=new StringBuilder(); jpayne@68: jpayne@68: final double readLengthAvg=bases/Tools.max(1.0, reads); jpayne@68: sb.append("##gff-version 3\n"); jpayne@68: sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n"); jpayne@68: sb.append("#ploidy\t"+ploidy+"\n"); jpayne@68: sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity)); jpayne@68: sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction)); jpayne@68: sb.append("#reads\t"+reads+"\n"); jpayne@68: sb.append("#pairedReads\t"+pairs+"\n"); jpayne@68: sb.append("#properlyPairedReads\t"+properPairs+"\n"); jpayne@68: sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg)); jpayne@68: sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate)); jpayne@68: sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg)); jpayne@68: sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg)); jpayne@68: if(ref!=null){sb.append("#reference\t"+ref+"\n");} jpayne@68: jpayne@68: sb.append("#seqid source type start end score strand phase attributes"); jpayne@68: return sb.toString(); jpayne@68: } jpayne@68: jpayne@68: @Override jpayne@68: public String toString(){ jpayne@68: ByteBuilder bb=new ByteBuilder(); jpayne@68: appendTo(bb); jpayne@68: return bb.toString(); jpayne@68: } jpayne@68: jpayne@68: public ByteBuilder appendTo(ByteBuilder bb){ jpayne@68: bb.append(seqid==null ? "." : seqid).append('\t'); jpayne@68: bb.append(source==null ? "." : source).append('\t'); jpayne@68: bb.append(type==null ? "." : type).append('\t'); jpayne@68: bb.append(start).append('\t'); jpayne@68: bb.append(stop).append('\t'); jpayne@68: if(score<0){bb.append('.').append('\t');} jpayne@68: else{bb.append(score, 2).append('\t');} jpayne@68: jpayne@68: bb.append((strand>=0 ? STRANDS[strand] : (byte)'.')).append('\t'); jpayne@68: jpayne@68: if(phase<0){bb.append('.').append('\t');} jpayne@68: else{bb.append(phase).append('\t');} jpayne@68: jpayne@68: bb.append(attributes==null ? "." : attributes); jpayne@68: return bb; jpayne@68: } jpayne@68: jpayne@68: public int length() { jpayne@68: return stop-start+1; jpayne@68: } jpayne@68: jpayne@68: private static int find(byte a, byte[] array){ jpayne@68: for(int i=0; i=0 && stop